Understanding Missing Data
Missing data is one of the most common problems in data science. Understanding the type and pattern of missing data is crucial for choosing the right imputation strategy.
Types of Missing Data
| Type | Description | Example | Cause |
|---|---|---|---|
| MCAR | Missing Completely At Random | Random survey respondents skip questions | Technical errors, accidents |
| MAR | Missing At Random | Women less likely to answer income questions | Missingness depends on other observed variables (systematic but predictable) |
| MNAR | Missing Not At Random | Sick patients drop out of health study | Related to the missing value itself |
import pandas as pd
import numpy as np
# Create a synthetic dataset, then inject three different missingness patterns.
np.random.seed(42)
n = 100
df = pd.DataFrame({
    'age': np.random.randint(20, 60, n),
    'income': np.random.normal(50000, 10000, n),
    'score': np.random.uniform(60, 100, n),
    'education': np.random.choice(['High School', 'Bachelor', 'Master'], n),
})

# MCAR - completely random missing: each income value is dropped with
# probability 0.1, independent of any column.
mask_mcar = np.random.random(n) < 0.1
df.loc[mask_mcar, 'income'] = np.nan

# MAR - missing based on another variable (e.g., age): only rows with
# age > 40 can lose their score, each with probability 0.2.
mask_mar = (df['age'] > 40) & (np.random.random(n) < 0.2)
df.loc[mask_mar, 'score'] = np.nan

# MNAR - missing related to the value itself: education is hidden for high
# earners. Rows whose income is already NaN compare as False and are skipped.
mask_mnar = df['income'] > 60000
# Limit the effect to the first 30 rows with a full-length mask. Slicing the
# mask itself (mask_mnar[:30]) yields a 30-element boolean Series that .loc
# cannot align with the 100-row frame, which raises an IndexingError.
mask_mnar_first30 = mask_mnar & (np.arange(n) < 30)
df.loc[mask_mnar_first30, 'education'] = np.nan

print("Missing value analysis:")
print(df.isnull().sum())
print("\nMissing percentage:")
print((df.isnull().sum() / len(df) * 100).round(2))
Visualizing Missing Data
import matplotlib.pyplot as plt
import seaborn as sns
# Missing data heatmap: one cell per (row, column) of the boolean null mask.
null_mask = df.isnull()
plt.figure(figsize=(10, 6))
sns.heatmap(null_mask, yticklabels=False, cmap='viridis', cbar=False)
plt.title('Missing Data Heatmap')
plt.xlabel('Columns')
plt.ylabel('Rows')
plt.tight_layout()
plt.show()

# Missing data bar plot: percentage of missing values per column, sorted
# ascending so the column with the most gaps ends up at the top of the chart.
missing_counts = null_mask.sum()
missing_pct = (missing_counts / len(df) * 100).sort_values(ascending=True)
plt.figure(figsize=(10, 5))
missing_pct.plot(kind='barh', color='coral')
plt.xlabel('Percentage Missing')
plt.title('Missing Data by Column')
plt.tight_layout()
plt.show()

# Missing pattern analysis: correlate the null indicators of each column pair
# to see whether values tend to be missing together.
print("Missing value correlations:")
missing_corr = null_mask.corr()
print(missing_corr)
Imputation Methods
# Simple imputation methods
from sklearn.impute import SimpleImputer
numeric_cols = ['income', 'score']
categorical_cols = ['education']

# Mean imputation: replace each missing value with its column mean.
imputer_mean = SimpleImputer(strategy='mean')
mean_filled = imputer_mean.fit_transform(df[numeric_cols])
df_mean = pd.DataFrame(mean_filled, columns=numeric_cols)

# Median imputation (better for skewed data).
imputer_median = SimpleImputer(strategy='median')
median_filled = imputer_median.fit_transform(df[numeric_cols])
df_median = pd.DataFrame(median_filled, columns=numeric_cols)

# Mode imputation (for categorical): fill with the most frequent category.
imputer_mode = SimpleImputer(strategy='most_frequent')
mode_filled = imputer_mode.fit_transform(df[categorical_cols])
df_mode = pd.DataFrame(mode_filled, columns=categorical_cols)

# Constant imputation: every missing value becomes 0 (imputer is only
# constructed here, not fitted).
imputer_constant = SimpleImputer(strategy='constant', fill_value=0)
Advanced Imputation Techniques
# KNN Imputation
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
knn_cols = ['age', 'income', 'score']

# Scale data before KNN imputation so every column is standardized before
# neighbours are searched.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[knn_cols])

# Apply KNN imputation on the standardized values.
knn_imputer = KNNImputer(n_neighbors=5)
df_imputed = knn_imputer.fit_transform(df_scaled)

# Inverse transform to get the filled data back on the original scale.
restored = scaler.inverse_transform(df_imputed)
df_filled = pd.DataFrame(restored, columns=knn_cols)
# Iterative Imputer (uses other features to predict missing)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
iter_cols = ['age', 'income', 'score']
# At most 10 imputation rounds; fixed seed for reproducible results.
iter_imputer = IterativeImputer(random_state=42, max_iter=10)
# The result is a plain array with the columns in iter_cols order.
df_iterative = iter_imputer.fit_transform(df[iter_cols])
Multiple Imputation
# Multiple imputation approach
def multiple_imputation(data, n_imputations=5, noise_scale=0.1):
    """Create multiple imputed copies of ``data`` using noisy mean imputation.

    Each missing value is replaced by the observed mean of its column plus
    Gaussian noise, so the returned datasets differ from one another in the
    imputed cells only. Noise is drawn from NumPy's global random state, so
    call ``np.random.seed`` beforehand for reproducible results.

    Args:
        data: DataFrame with numeric columns that may contain NaN. It is not
            modified; every imputed dataset is built from a copy.
        n_imputations: Number of imputed datasets to generate.
        noise_scale: Standard deviation of the noise, expressed as a fraction
            of the column's observed standard deviation.

    Returns:
        A list of ``n_imputations`` DataFrames shaped like ``data``. A column
        with no observed values at all has nothing to impute from and stays
        NaN.
    """
    imputed_datasets = []
    for _ in range(n_imputations):
        df_imputed = data.copy()
        for col in df_imputed.columns:
            missing_mask = df_imputed[col].isnull()
            n_missing = int(missing_mask.sum())
            if n_missing == 0:
                continue
            center = df_imputed[col].mean()
            spread = df_imputed[col].std()
            # std is NaN when fewer than two values are observed; fall back
            # to a noise-free fill instead of propagating NaN.
            if np.isnan(spread):
                spread = 0.0
            noise = np.random.normal(0, spread * noise_scale, n_missing)
            # Assign rather than "+=": NaN plus noise would still be NaN and
            # leave the value unimputed.
            df_imputed.loc[missing_mask, col] = center + noise
        imputed_datasets.append(df_imputed)
    return imputed_datasets
# Pooling results: average the per-dataset income means across all imputations.
imputed_dfs = multiple_imputation(df[['income', 'score']], n_imputations=5)
income_means = [imputed['income'].mean() for imputed in imputed_dfs]
pooled_income = np.mean(income_means)
Handling Missing Data in Time Series
import pandas as pd
# Create time series with missing values.
dates = pd.date_range('2024-01-01', periods=50, freq='D')
# Build the series as float: NaN cannot be stored in an integer Series
# without an implicit dtype upcast.
ts = pd.Series(np.arange(50, dtype=float), index=dates)
# Use .iloc for positional assignment: plain integer keys on a
# DatetimeIndex-backed Series are ambiguous between labels and positions.
ts.iloc[5:10] = np.nan
ts.iloc[20:22] = np.nan
ts.iloc[35] = np.nan

# Forward fill: carry the last observed value forward.
# (fillna(method=...) is deprecated in favour of ffill()/bfill().)
ts_ffill = ts.ffill()

# Backward fill: pull the next observed value backward.
ts_bfill = ts.bfill()

# Linear interpolation: treats the points as equally spaced.
ts_interp = ts.interpolate(method='linear')

# Time-aware interpolation: weights by the actual timestamps in the index.
ts_interp_time = ts.interpolate(method='time')

# Rolling mean imputation: fill with the mean of the observed values in the
# trailing 3-step window. A gap longer than the window is only partly filled,
# because windows that contain no observed value produce NaN.
ts_rolling = ts.fillna(ts.rolling(window=3, min_periods=1).mean())

# Spline interpolation for smooth curves.
ts_spline = ts.interpolate(method='spline', order=3)
Imputation with Machine Learning
# Using Random Forest for imputation
from sklearn.ensemble import RandomForestRegressor
def rf_impute(df, target_col):
    """Impute missing values of ``target_col`` using a Random Forest.

    A regressor is trained on the rows where ``target_col`` is observed and
    then predicts the rows where it is missing. All other columns act as
    predictors: categorical columns are one-hot encoded (with a separate
    indicator for missing categories) and gaps in numeric predictors are
    filled with the median of the training rows, so the model only ever sees
    complete numeric input.

    Args:
        df: Input DataFrame. It is not modified.
        target_col: Name of the numeric column to impute.

    Returns:
        A copy of ``df`` in which the missing entries of ``target_col`` are
        replaced by model predictions. If nothing is missing, the copy is
        returned unchanged.

    Raises:
        ValueError: If ``target_col`` has no observed values to train on, or
            if ``df`` has no other column to use as a predictor.
    """
    df_copy = df.copy()
    missing = df_copy[target_col].isna()
    if not missing.any():
        return df_copy
    if missing.all():
        raise ValueError(
            f"Cannot impute '{target_col}': no observed values to train on"
        )

    features = [col for col in df_copy.columns if col != target_col]
    if not features:
        raise ValueError(
            f"Cannot impute '{target_col}': no predictor columns available"
        )

    # Encode predictors numerically: raw string columns make the forest's
    # fit() fail, and NaN in predictors is not accepted by every
    # scikit-learn version.
    X = pd.get_dummies(df_copy[features], dummy_na=True, dtype=float)
    # Medians come from the training rows only; a predictor with no observed
    # training value falls back to 0.
    X = X.fillna(X.loc[~missing].median()).fillna(0.0)

    # Train on the complete rows, predict the incomplete ones.
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X.loc[~missing], df_copy.loc[~missing, target_col])
    df_copy.loc[missing, target_col] = rf.predict(X.loc[missing])
    return df_copy
# Apply to dataset: fill the missing 'income' values of df. rf_impute works on
# a copy, so df itself is left unchanged and the result is a new DataFrame.
df_rf_imputed = rf_impute(df, 'income')
Key Takeaways
- Understand the missing data mechanism - MCAR, MAR, or MNAR
- Choose appropriate method - Simple vs. advanced imputation
- Consider the context - Time series needs different treatment
- Validate imputation - Compare before and after distributions
Imputation Method Selection Guide
| Scenario | Recommended Method |
|---|---|
| MCAR, small dataset | Mean/Median imputation |
| MAR | KNN, Iterative imputer |
| MNAR | Multiple imputation, ML-based |
| Time series | Interpolation, rolling mean |
| Categorical | Mode, KNN |