Handling Missing Data

Topic: Missing Data

Understanding Missing Data

Missing data is one of the most common problems in data science. Understanding the type and pattern of missing data is crucial for choosing the right imputation strategy.

Types of Missing Data

Type	Description	Example	Cause
MCAR	Missing Completely At Random	Random survey respondents skip questions	Technical errors, accidents
MAR	Missing At Random	Women less likely to answer income questions	Missingness depends on other observed variables (here, gender), not on the missing value itself
MNAR	Missing Not At Random	Sick patients drop out of health study	Related to the missing value itself

import pandas as pd
import numpy as np

# Create dataset with different missing patterns
np.random.seed(42)  # fixed seed so the generated data and masks are reproducible
n = 100

df = pd.DataFrame({
    'age': np.random.randint(20, 60, n),
    'income': np.random.normal(50000, 10000, n),
    'score': np.random.uniform(60, 100, n),
    'education': np.random.choice(['High School', 'Bachelor', 'Master'], n)
})

# MCAR - completely random missing: every income value is dropped with
# probability 0.1, independently of all columns
mask_mcar = np.random.random(n) < 0.1
df.loc[mask_mcar, 'income'] = np.nan

# MAR - missing based on another variable (e.g., age): only rows with
# age > 40 can lose their score, each with probability 0.2
mask_mar = (df['age'] > 40) & (np.random.random(n) < 0.2)
df.loc[mask_mar, 'score'] = np.nan

# MNAR - missing related to the value itself; simulated here by blanking
# education for high earners within the first 30 rows.
# The mask has to stay as long as df: .loc cannot align a boolean Series
# that was truncated to 30 entries, so the row limit is folded into the mask.
# Rows whose income was already removed above compare False and are skipped.
mask_mnar = (df['income'] > 60000) & (np.arange(n) < 30)
df.loc[mask_mnar, 'education'] = np.nan

print("Missing value analysis:")
print(df.isnull().sum())
print("\nMissing percentage:")
print((df.isnull().sum() / len(df) * 100).round(2))

Visualizing Missing Data

import matplotlib.pyplot as plt
import seaborn as sns

# Heatmap of the null mask: one cell per (row, column) entry,
# coloured by whether that entry is missing
plt.figure(figsize=(10, 6))
sns.heatmap(
    df.isnull(),
    cbar=False,
    cmap='viridis',
    yticklabels=False,
)
plt.title('Missing Data Heatmap')
plt.xlabel('Columns')
plt.ylabel('Rows')
plt.tight_layout()
plt.show()

# Horizontal bar chart of the share of missing values per column,
# sorted from the least to the most affected column
missing_counts = df.isnull().sum()
missing_pct = (missing_counts / len(df) * 100).sort_values()

plt.figure(figsize=(10, 5))
missing_pct.plot.barh(color='coral')
plt.xlabel('Percentage Missing')
plt.title('Missing Data by Column')
plt.tight_layout()
plt.show()

# Correlate the per-column missingness indicators to see which columns
# tend to be missing together
print("Missing value correlations:")
missing_corr = df.isnull().corr()
print(missing_corr)

Imputation Methods

# Simple imputation methods
from sklearn.impute import SimpleImputer

# Columns that receive numeric imputation
numeric_cols = ['income', 'score']

# Mean imputation: replace each gap with the column mean
imputer_mean = SimpleImputer(strategy='mean')
mean_filled = imputer_mean.fit_transform(df[numeric_cols])
df_mean = pd.DataFrame(mean_filled, columns=numeric_cols)

# Median imputation (better for skewed data)
imputer_median = SimpleImputer(strategy='median')
median_filled = imputer_median.fit_transform(df[numeric_cols])
df_median = pd.DataFrame(median_filled, columns=numeric_cols)

# Mode imputation (for categorical): replace gaps with the most frequent label
imputer_mode = SimpleImputer(strategy='most_frequent')
mode_filled = imputer_mode.fit_transform(df[['education']])
df_mode = pd.DataFrame(mode_filled, columns=['education'])

# Constant imputation: configured to fill gaps with 0; only constructed here,
# not fitted
imputer_constant = SimpleImputer(fill_value=0, strategy='constant')

Advanced Imputation Techniques

# KNN Imputation
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Numeric columns used both as neighbours' coordinates and as imputation targets
knn_cols = ['age', 'income', 'score']

# Standardize first so the columns are on a comparable scale
# before neighbours are searched
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[knn_cols])

# Fill each gap from the 5 nearest rows
knn_imputer = KNNImputer(n_neighbors=5)
df_imputed = knn_imputer.fit_transform(df_scaled)

# Undo the standardization so the filled values are back in original units
restored = scaler.inverse_transform(df_imputed)
df_filled = pd.DataFrame(restored, columns=knn_cols)

# Iterative Imputer (uses other features to predict missing)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# At most 10 imputation rounds; the fixed random_state keeps the result reproducible
iter_imputer = IterativeImputer(random_state=42, max_iter=10)
iterative_input = df[['age', 'income', 'score']]
df_iterative = iter_imputer.fit_transform(iterative_input)

Multiple Imputation

# Multiple imputation approach
def multiple_imputation(data, n_imputations=5, noise_scale=0.1):
    """Create several stochastically imputed copies of ``data``.

    Every missing cell of a numeric column is replaced by the column's
    observed mean plus Gaussian noise, so each returned dataset contains a
    slightly different plausible value. This is a simplified illustration of
    the idea behind multiple imputation, not a full model-based procedure.

    Args:
        data: DataFrame that may contain missing values. It is not modified.
        n_imputations: Number of imputed copies to generate.
        noise_scale: Fraction of the column's observed standard deviation
            used as the standard deviation of the added noise.

    Returns:
        A list of ``n_imputations`` DataFrames with the same shape, index and
        columns as ``data``. Non-numeric columns and columns without any
        observed value are returned unchanged.
    """
    imputed_datasets = []

    for _ in range(n_imputations):
        df_imputed = data.copy()

        # Mean and standard deviation only exist for numeric columns
        for col in df_imputed.select_dtypes(include='number').columns:
            missing_mask = df_imputed[col].isnull()
            n_missing = int(missing_mask.sum())
            if n_missing == 0:
                continue

            center = df_imputed[col].mean()
            if np.isnan(center):
                # No observed values at all: nothing to base an estimate on
                continue

            spread = df_imputed[col].std()
            if np.isnan(spread):
                # A single observed value has no sample standard deviation
                spread = 0.0

            # Assign the draws: adding noise to NaN in place would leave NaN
            noise = np.random.normal(0, spread * noise_scale, n_missing)
            df_imputed.loc[missing_mask, col] = center + noise

        imputed_datasets.append(df_imputed)

    return imputed_datasets

# Pooling results: average the income mean of every imputed dataset
# into one combined estimate
imputed_dfs = multiple_imputation(df[['income', 'score']], n_imputations=5)
income_means = [imputed['income'].mean() for imputed in imputed_dfs]
pooled_income = np.mean(income_means)

Handling Missing Data in Time Series

import pandas as pd

# Create time series with missing values.
# The values are floats from the start: NaN cannot be stored in an integer
# Series, so writing it into one relies on an implicit dtype change.
dates = pd.date_range('2024-01-01', periods=50, freq='D')
ts = pd.Series(np.arange(50, dtype=float), index=dates)

# .iloc states that these are positions, not labels of the DatetimeIndex
ts.iloc[5:10] = np.nan   # 5-day gap
ts.iloc[20:22] = np.nan  # 2-day gap
ts.iloc[35] = np.nan     # single missing day

# Forward fill: carry the last observed value forward
ts_ffill = ts.ffill()

# Backward fill: pull the next observed value backward
ts_bfill = ts.bfill()

# Linear interpolation
ts_interp = ts.interpolate(method='linear')

# Time-aware interpolation (weights by the actual timestamps in the index)
ts_interp_time = ts.interpolate(method='time')

# Rolling mean imputation.
# NOTE: the window spans 3 rows, so inside the 5-day gap the last three
# positions see only NaN and stay missing after this step.
ts_rolling = ts.fillna(ts.rolling(window=3, min_periods=1).mean())

# Spline interpolation for smooth curves
ts_spline = ts.interpolate(method='spline', order=3)

Imputation with Machine Learning

# Using Random Forest for imputation
from sklearn.ensemble import RandomForestRegressor

def rf_impute(df, target_col):
    """Impute missing values of ``target_col`` using a Random Forest.

    A ``RandomForestRegressor`` is trained on the rows where ``target_col``
    is observed and then predicts the rows where it is missing. Only the
    other numeric columns serve as features; gaps in those features are
    filled with the median of the training rows so the model always
    receives complete input.

    Args:
        df: DataFrame containing ``target_col`` and at least one other
            numeric column. It is not modified.
        target_col: Name of the numeric column to impute.

    Returns:
        A copy of ``df`` in which the missing entries of ``target_col`` are
        replaced by model predictions. If nothing is missing, the copy is
        returned unchanged.

    Raises:
        ValueError: If ``target_col`` has no observed values to train on, or
            if no numeric feature column with observed values is available.
    """
    df_copy = df.copy()

    # Separate complete and incomplete rows
    missing_rows = df_copy[target_col].isna()
    if not missing_rows.any():
        return df_copy

    complete = df_copy[~missing_rows]
    incomplete = df_copy[missing_rows]
    if complete.empty:
        raise ValueError(
            f"Cannot impute '{target_col}': no observed values to train on"
        )

    # Prepare features: text columns cannot be fed to the regressor, so only
    # numeric columns are candidates
    numeric_cols = df_copy.select_dtypes(include='number').columns
    candidates = [col for col in numeric_cols if col != target_col]

    # Median of the training rows fills feature gaps; a column without any
    # observed training value has no median and is dropped as a feature
    fill_values = complete[candidates].median().dropna()
    features = list(fill_values.index)
    if not features:
        raise ValueError(
            f"Cannot impute '{target_col}': no usable numeric feature columns"
        )

    X_train = complete[features].fillna(fill_values)
    y_train = complete[target_col]

    # Train model
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Predict missing values
    X_pred = incomplete[features].fillna(fill_values)
    predicted = rf.predict(X_pred)

    # Fill in missing values; the boolean mask selects exactly the rows that
    # were predicted, in the same order
    df_copy.loc[missing_rows, target_col] = predicted

    return df_copy

# Apply to dataset: impute the 'income' column. rf_impute works on a copy,
# so df itself keeps its missing values.
df_rf_imputed = rf_impute(df, 'income')

Key Takeaways

Understand the missing data mechanism - MCAR, MAR, or MNAR
Choose appropriate method - Simple vs. advanced imputation
Consider the context - Time series needs different treatment
Validate imputation - Compare before and after distributions

Imputation Method Selection Guide

Scenario	Recommended Method
MCAR, small dataset	Mean/Median imputation
MAR	KNN, Iterative imputer
MNAR	Multiple imputation, ML-based (combined with sensitivity analysis, since the missingness depends on unobserved values)
Time series	Interpolation, rolling mean
Categorical	Mode, KNN

Need More Practice?

Get personalized data science help from ChatWhole's AI-powered platform.

Get Expert Help →

All Topics