Introduction to Descriptive Statistics
Descriptive statistics summarize and describe the main features of a dataset. They provide simple summaries of a sample through measures of central tendency, dispersion, and the shape of the data distribution.
Measures of Central Tendency
import numpy as np
import pandas as pd
from scipy import stats
# Dataset used by the central-tendency examples below
data = np.array([
    23, 45, 67, 89, 12, 34, 56, 78, 90, 11, 45, 67, 23, 89, 45,
])
# One weight per observation, used for the weighted mean
weights = np.array([1, 2, 1, 3, 2, 1, 2, 1, 3, 1, 2, 1, 2, 3, 1])

# --- Compute every measure of center first ---
# Arithmetic average
mean_value = data.mean()
# Middle value of the sorted data (robust to outliers)
median_value = np.median(data)
# Most frequent value; keepdims=True keeps array-shaped results so the
# [0] indexing in the report below works
mode_value = stats.mode(data, keepdims=True)
# Average in which each observation counts in proportion to its weight
weighted_mean = np.average(data, weights=weights)
# Geometric and harmonic means are computed on strictly positive values only
positive_mask = data > 0
pos_data = data[positive_mask]
geometric_mean = stats.gmean(pos_data)  # for positive data
harmonic_mean = stats.hmean(pos_data)  # for rates and ratios

# --- Then report them in a fixed order ---
print(f"Mean: {mean_value:.2f}")
print(f"Median: {median_value:.2f}")
print(f"Mode: {mode_value.mode[0]}, Count: {mode_value.count[0]}")
print(f"Weighted Mean: {weighted_mean:.2f}")
print(f"Geometric Mean: {geometric_mean:.2f}")
print(f"Harmonic Mean: {harmonic_mean:.2f}")
Measures of Dispersion
# Variance - average squared deviation from the mean.
# ddof=0 divides by n (population); ddof=1 divides by n - 1 (sample).
variance = np.var(data, ddof=0)  # Population variance
sample_variance = np.var(data, ddof=1)  # Sample variance
print(f"Population Variance: {variance:.2f}")
print(f"Sample Variance: {sample_variance:.2f}")

# Standard deviation - square root of the variance, in the data's own units.
# Both versions are reported so the output parallels the variance lines.
std_dev = np.std(data, ddof=0)  # Population standard deviation
sample_std = np.std(data, ddof=1)  # Sample standard deviation
print(f"Population Standard Deviation: {std_dev:.2f}")
print(f"Sample Standard Deviation: {sample_std:.2f}")

# Range - difference between max and min
data_range = np.max(data) - np.min(data)
print(f"Range: {data_range}")

# Interquartile Range (IQR) - spread of the middle 50% of the data
Q1 = np.percentile(data, 25)
Q3 = np.percentile(data, 75)
IQR = Q3 - Q1
print(f"IQR: {IQR}")
print(f"Q1: {Q1}, Q3: {Q3}")

# Mean Absolute Deviation - average distance of each value from the mean
mad = np.mean(np.abs(data - np.mean(data)))
print(f"Mean Absolute Deviation: {mad:.2f}")

# Coefficient of Variation (relative measure), as a percentage of the mean.
# Uses the population standard deviation (std_dev) and the mean_value
# computed in the central-tendency section.
cv = (std_dev / mean_value) * 100
print(f"Coefficient of Variation: {cv:.2f}%")
Percentiles and Quantiles
# Calculate various percentiles
percentiles = [5, 10, 25, 50, 75, 90, 95, 99]
for p in percentiles:
    value = np.percentile(data, p)
    print(f"{p}th percentile: {value:.2f}")

# Five-number summary: minimum, quartiles, and maximum
five_num = {
    'Min': np.min(data),
    'Q1': np.percentile(data, 25),
    'Median': np.median(data),
    'Q3': np.percentile(data, 75),
    'Max': np.max(data),
}
print("\nFive-Number Summary:")
for k, v in five_num.items():
    print(f" {k}: {v}")

# Box plot calculations.
# Q1, Q3 and IQR come from the interquartile-range calculation earlier in
# the file. Each whisker ends at the most extreme observation that still
# lies within 1.5 * IQR of the nearest quartile.
box_plot_values = {
    'Lower whisker': np.min(data[data >= Q1 - 1.5 * IQR]),
    'Q1': Q1,
    'Median': np.median(data),
    'Q3': Q3,
    'Upper whisker': np.max(data[data <= Q3 + 1.5 * IQR]),
}
# Report the values so the calculation is not silently discarded
print("\nBox Plot Values:")
for k, v in box_plot_values.items():
    print(f" {k}: {v}")
Shape of Distribution
from scipy import stats
# Skewness - measure of asymmetry
# Positive skew: tail extends to right
# Negative skew: tail extends to left
skewness = stats.skew(data)
print(f"Skewness: {skewness:.2f}")

# Kurtosis - measure of tail heaviness
# Excess kurtosis (Fisher's definition): 0 = normal
kurtosis = stats.kurtosis(data)
print(f"Excess Kurtosis: {kurtosis:.2f}")

# Interpretation of skewness
if skewness > 1:
    print("Highly positively skewed")
elif skewness < -1:
    print("Highly negatively skewed")
else:
    print("Approximately symmetric")

# Interpretation of kurtosis.
# A floating-point statistic is almost never exactly 0, so an exact
# comparison would make the mesokurtic branch unreachable in practice.
# Values within this band around 0 are treated as normal-like; the width
# is a rule of thumb and can be tuned for the sample size.
KURTOSIS_TOLERANCE = 0.5
if kurtosis > KURTOSIS_TOLERANCE:
    print("Heavy-tailed (leptokurtic)")
elif kurtosis < -KURTOSIS_TOLERANCE:
    print("Light-tailed (platykurtic)")
else:
    print("Normal-like tails (mesokurtic)")
Comprehensive Statistics with Pandas
import pandas as pd
import numpy as np
# Create sample DataFrame with three numeric columns and one categorical.
# NOTE: no random seed is set in this snippet, so values differ per run.
df = pd.DataFrame({
    'age': np.random.randint(20, 70, 100),
    'salary': np.random.normal(50000, 15000, 100),
    'experience': np.random.exponential(5, 100),
    'department': np.random.choice(['IT', 'HR', 'Finance', 'Marketing'], 100),
})

# Basic statistics
print("Basic Statistics:")
print(df.describe())

# Extended statistics with additional percentiles
print("\nExtended Statistics:")
print(df.describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9]))


# Named quantile helpers: pandas labels each aggregation row with the
# function's name, so named functions give distinct 'q25' / 'q75' rows
# where two anonymous lambdas would both be labelled '<lambda>'.
def q25(series: pd.Series) -> float:
    """Return the first quartile (25th percentile) of *series*."""
    return series.quantile(0.25)


def q75(series: pd.Series) -> float:
    """Return the third quartile (75th percentile) of *series*."""
    return series.quantile(0.75)


# Numeric columns only
print("\nNumeric Summary:")
numeric_summary = df.select_dtypes(include=[np.number]).agg([
    'mean', 'median', 'std', 'min', 'max',
    q25,
    q75,
]).round(2)
print(numeric_summary)

# Grouped statistics: salary summary per department
print("\nGrouped Statistics:")
print(df.groupby('department')['salary'].agg(['mean', 'std', 'min', 'max']))
Visualization of Descriptive Statistics
import matplotlib.pyplot as plt
import seaborn as sns
# Create sample data: 1000 draws from a normal distribution (mean 50, std 10).
# This rebinds `data`, replacing the small array used in earlier sections.
data = np.random.normal(50, 10, 1000)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Compute the central values once and reuse them for the lines and labels
mean_val = np.mean(data)
median_val = np.median(data)

# 1. Histogram with KDE
axes[0, 0].hist(data, bins=30, density=True, alpha=0.7, color='blue')
# Overlay a Gaussian kernel density estimate; the histogram uses
# density=True, so both are on the same probability-density scale.
kde = stats.gaussian_kde(data)
x_grid = np.linspace(data.min(), data.max(), 200)
axes[0, 0].plot(x_grid, kde(x_grid), color='black', label='KDE')
axes[0, 0].set_title('Distribution with Statistics')
axes[0, 0].axvline(mean_val, color='red', linestyle='--', label=f'Mean: {mean_val:.1f}')
axes[0, 0].axvline(median_val, color='green', linestyle='--', label=f'Median: {median_val:.1f}')
axes[0, 0].legend()

# 2. Box plot (vertical is matplotlib's default orientation, so the
# `vert` flag, deprecated in recent matplotlib releases, is not passed)
axes[0, 1].boxplot(data)
axes[0, 1].set_title('Box Plot (Shows IQR, Median, Outliers)')

# 3. Violin plot
sns.violinplot(data=[data], ax=axes[1, 0])
axes[1, 0].set_title('Violin Plot (Shows Distribution Shape)')

# 4. Summary statistics table, rendered as monospace text on a blank axis
stats_text = f"""
Mean: {mean_val:.2f}
Median: {median_val:.2f}
Std: {np.std(data):.2f}
Min: {np.min(data):.2f}
Max: {np.max(data):.2f}
Q1: {np.percentile(data, 25):.2f}
Q3: {np.percentile(data, 75):.2f}
Skewness: {stats.skew(data):.2f}
Kurtosis: {stats.kurtosis(data):.2f}
"""
axes[1, 1].text(0.1, 0.5, stats_text, fontsize=12, family='monospace')
axes[1, 1].axis('off')
axes[1, 1].set_title('Summary Statistics')

plt.tight_layout()
plt.show()
Key Takeaways
- Use multiple measures - Mean, median, and mode together
- Check distribution shape - Skewness and kurtosis matter
- Consider outliers - Use median/IQR for robust analysis
- Visualize - Plots reveal what numbers miss
When to Use Each Measure
| Measure | Use Case |
|---|---|
| Mean | Symmetric distributions, no outliers |
| Median | Skewed distributions, outliers present |
| Mode | Categorical data, most frequent value |
| Std Dev | Normal distributions, comparing spread |
| IQR | Robust measure, outliers present |
| Range | Quick estimate of spread |