← Back to Data Science

All Topics

Advertisement

Learn/Data Science/Data Preprocessing

Handling Missing Data

Topic: Missing Data

Advertisement

Understanding Missing Data

Missing data is one of the most common problems in data science. Understanding the type and pattern of missing data is crucial for choosing the right imputation strategy.

Types of Missing Data

Type | Description | Example | Cause
MCAR | Missing Completely At Random | Random survey respondents skip questions | Technical errors, accidents
MAR | Missing At Random | Women less likely to answer income questions | Systematic but predictable
MNAR | Missing Not At Random | Sick patients drop out of health study | Related to the missing value itself
import pandas as pd
import numpy as np

# Create dataset with different missing patterns.
# The seed is fixed so every mask below is reproducible run to run.
np.random.seed(42)
n = 100

df = pd.DataFrame({
    'age': np.random.randint(20, 60, n),
    'income': np.random.normal(50000, 10000, n),
    'score': np.random.uniform(60, 100, n),
    'education': np.random.choice(['High School', 'Bachelor', 'Master'], n)
})

# MCAR - completely random missing: every income has the same 10% chance of
# being dropped, independent of any column.
mask_mcar = np.random.random(n) < 0.1
df.loc[mask_mcar, 'income'] = np.nan

# MAR - missing based on another variable (e.g., age): only rows with
# age > 40 can lose their score, each with a 20% chance.
mask_mar = (df['age'] > 40) & (np.random.random(n) < 0.2)
df.loc[mask_mar, 'score'] = np.nan

# MNAR - missing related to the value itself.
# Here high earners (income > 60000) within the first 30 rows lose their
# education value. The row restriction is combined into a full-length mask:
# a boolean indexer passed to .loc must cover every row of df, so slicing the
# mask down to 30 entries would not align with the 100-row frame.
mask_mnar = df['income'] > 60000
df.loc[mask_mnar & (df.index < 30), 'education'] = np.nan

print("Missing value analysis:")
print(df.isnull().sum())
print("\nMissing percentage:")
print((df.isnull().sum() / len(df) * 100).round(2))

Visualizing Missing Data

import matplotlib.pyplot as plt
import seaborn as sns

# Heatmap of the null mask: one row per record, one column per feature,
# with a cell highlighted wherever the value is missing
null_mask = df.isnull()
plt.figure(figsize=(10, 6))
sns.heatmap(null_mask, cbar=False, cmap='viridis', yticklabels=False)
plt.title('Missing Data Heatmap')
plt.xlabel('Columns')
plt.ylabel('Rows')
plt.tight_layout()
plt.show()

# Horizontal bar chart: share of missing values per column, smallest first
missing_counts = null_mask.sum()
missing_pct = missing_counts.div(len(df)).mul(100).sort_values(ascending=True)

plt.figure(figsize=(10, 5))
missing_pct.plot.barh(color='coral')
plt.xlabel('Percentage Missing')
plt.title('Missing Data by Column')
plt.tight_layout()
plt.show()

# Correlate the missingness indicators to see which columns tend to be
# missing together
print("Missing value correlations:")
missing_corr = null_mask.corr()
print(missing_corr)

Imputation Methods

# Simple imputation methods
from sklearn.impute import SimpleImputer

# Columns handled by the numeric strategies below
numeric_cols = ['income', 'score']

# Mean imputation: replace each gap with the column average
imputer_mean = SimpleImputer(strategy='mean')
mean_filled = imputer_mean.fit_transform(df[numeric_cols])
df_mean = pd.DataFrame(mean_filled, columns=numeric_cols)

# Median imputation (better for skewed data)
imputer_median = SimpleImputer(strategy='median')
median_filled = imputer_median.fit_transform(df[numeric_cols])
df_median = pd.DataFrame(median_filled, columns=numeric_cols)

# Mode imputation (for categorical): fill with the most frequent category
imputer_mode = SimpleImputer(strategy='most_frequent')
mode_filled = imputer_mode.fit_transform(df[['education']])
df_mode = pd.DataFrame(mode_filled, columns=['education'])

# Constant imputation: every gap is replaced by the same fixed value (0).
# The imputer is only constructed here; it is not fitted in this snippet.
imputer_constant = SimpleImputer(fill_value=0, strategy='constant')

Advanced Imputation Techniques

# KNN Imputation
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Numeric columns that take part in the neighbour search
knn_cols = ['age', 'income', 'score']

# Standardise the columns before KNN imputation so they share a common scale
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[knn_cols])

# Fill each gap from the 5 nearest rows
knn_imputer = KNNImputer(n_neighbors=5)
df_imputed = knn_imputer.fit_transform(df_scaled)

# Undo the scaling so the filled values are back in the original units
restored = scaler.inverse_transform(df_imputed)
df_filled = pd.DataFrame(restored, columns=knn_cols)

# Iterative Imputer (uses other features to predict missing)
# NOTE: scikit-learn documents IterativeImputer as experimental; the
# enable_iterative_imputer import is what makes the class importable from
# sklearn.impute, so these two imports must stay in this order.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# At most 10 imputation rounds; random_state pins the result for repeatability.
# Unlike the KNN example, the output is not wrapped back into a DataFrame here.
iter_imputer = IterativeImputer(max_iter=10, random_state=42)
df_iterative = iter_imputer.fit_transform(df[['age', 'income', 'score']])

Multiple Imputation

# Multiple imputation approach
def multiple_imputation(data, n_imputations=5, noise_scale=0.1):
    """Create multiple imputed datasets.

    Each missing value in a numeric column is replaced by the column's
    observed mean plus Gaussian noise, so the returned datasets differ from
    one another and reflect some uncertainty about the imputed values.

    Args:
        data: DataFrame to impute. It is never modified; every imputed
            dataset is built on a copy.
        n_imputations: Number of imputed datasets to generate.
        noise_scale: Standard deviation of the noise, expressed as a
            fraction of the column's observed standard deviation.

    Returns:
        A list of ``n_imputations`` DataFrames with the same shape, index and
        columns as ``data``. Non-numeric columns, and columns with no
        observed values at all, are left unchanged.
    """
    imputed_datasets = []

    for _ in range(n_imputations):
        df_imputed = data.copy()

        for col in df_imputed.columns:
            missing_mask = df_imputed[col].isnull()
            if not missing_mask.any():
                continue
            # mean/std are undefined for text columns, so leave them alone
            if not pd.api.types.is_numeric_dtype(df_imputed[col]):
                continue

            observed = df_imputed.loc[~missing_mask, col]
            if observed.empty:
                # nothing to estimate a mean from
                continue

            center = observed.mean()
            spread = observed.std()
            if np.isnan(spread):
                # a single observed value has no sample std; use no noise
                spread = 0.0

            # Assign mean + noise. Adding noise to the existing NaN entries
            # in place would leave them NaN, i.e. impute nothing.
            noise = np.random.normal(0, spread * noise_scale, missing_mask.sum())
            df_imputed.loc[missing_mask, col] = center + noise

        imputed_datasets.append(df_imputed)

    return imputed_datasets

# Pooling results: average the per-dataset income means into one estimate
imputed_dfs = multiple_imputation(df[['income', 'score']], n_imputations=5)
income_means = [imputed['income'].mean() for imputed in imputed_dfs]
pooled_income = np.mean(income_means)

Handling Missing Data in Time Series

import pandas as pd

# Create time series with missing values.
# The series is built as float from the start: NaN cannot be stored in an
# integer Series, so an int series would need an implicit dtype change on
# the assignments below.
dates = pd.date_range('2024-01-01', periods=50, freq='D')
ts = pd.Series(np.arange(50, dtype=float), index=dates)
# .iloc states explicitly that these are positions, not labels; the index
# holds dates, so a bare integer key is ambiguous.
ts.iloc[5:10] = np.nan
ts.iloc[20:22] = np.nan
ts.iloc[35] = np.nan

# Forward fill: carry the last observed value forward
ts_ffill = ts.ffill()

# Backward fill: pull the next observed value backward
ts_bfill = ts.bfill()

# Linear interpolation (treats the points as equally spaced)
ts_interp = ts.interpolate(method='linear')

# Time-aware interpolation (weights by the actual timestamps in the index)
ts_interp_time = ts.interpolate(method='time')

# Rolling mean imputation.
# With a 3-point trailing window only the first two points of a gap still
# have an observed value in range; later points of a longer gap stay NaN.
ts_rolling = ts.fillna(ts.rolling(window=3, min_periods=1).mean())

# Spline interpolation for smooth curves (requires SciPy)
ts_spline = ts.interpolate(method='spline', order=3)

Imputation with Machine Learning

# Using Random Forest for imputation
from sklearn.ensemble import RandomForestRegressor

def rf_impute(df, target_col):
    """Impute missing values in ``target_col`` using a Random Forest.

    A regressor is trained on the rows where ``target_col`` is present and
    then predicts the rows where it is missing. All other columns act as
    features: gaps in numeric features are filled with the column median and
    text/categorical features are one-hot encoded (with an extra indicator
    for missing categories), because the forest needs a fully numeric,
    gap-free feature matrix.

    Args:
        df: Input DataFrame. It is not modified.
        target_col: Name of the numeric column whose gaps should be filled.

    Returns:
        A copy of ``df`` with the missing ``target_col`` values replaced by
        model predictions. If nothing is missing the copy is returned as is.

    Raises:
        ValueError: If every value of ``target_col`` is missing, so there is
            nothing to train on.
    """
    df_copy = df.copy()

    missing_rows = df_copy[target_col].isna()
    if not missing_rows.any():
        return df_copy
    if missing_rows.all():
        raise ValueError(
            f"Cannot impute '{target_col}': no observed values to train on"
        )

    # Prepare features: everything except the target column
    features = df_copy.drop(columns=[target_col])
    numeric_cols = features.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        features[numeric_cols] = features[numeric_cols].fillna(
            features[numeric_cols].median()
        )
    # Encode on the full frame so training and prediction rows share columns
    features = pd.get_dummies(features, dummy_na=True)

    # Train model on the rows where the target is known
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(features[~missing_rows], df_copy.loc[~missing_rows, target_col])

    # Predict and fill in the missing values
    df_copy.loc[missing_rows, target_col] = rf.predict(features[missing_rows])

    return df_copy

# Apply to dataset: fill the gaps in 'income' and keep the result in a new
# frame (rf_impute works on a copy, so df itself is left untouched)
df_rf_imputed = rf_impute(df, 'income')

Key Takeaways

  1. Understand the missing data mechanism - MCAR, MAR, or MNAR
  2. Choose appropriate method - Simple vs. advanced imputation
  3. Consider the context - Time series needs different treatment
  4. Validate imputation - Compare before and after distributions

Imputation Method Selection Guide

Scenario | Recommended Method
MCAR, small dataset | Mean/Median imputation
MAR | KNN, Iterative imputer
MNAR | Multiple imputation, ML-based
Time series | Interpolation, rolling mean
Categorical | Mode, KNN

Advertisement

Advertisement

Need More Practice?

Get personalized data science help from ChatWhole's AI-powered platform.

Get Expert Help →