Descriptive Statistics for Data Science

Topic: Descriptive Statistics

Introduction to Descriptive Statistics

Descriptive statistics summarize and describe the main features of a dataset. They provide simple summaries of a sample through measures of central tendency, dispersion, and the shape of the data distribution.

Measures of Central Tendency

import numpy as np
import pandas as pd
from scipy import stats

# --- Inputs -----------------------------------------------------------------
# Small illustrative sample, plus one weight per observation for the
# weighted mean below.
data = np.array([23, 45, 67, 89, 12, 34, 56, 78, 90, 11, 45, 67, 23, 89, 45])
weights = np.array([1, 2, 1, 3, 2, 1, 2, 1, 3, 1, 2, 1, 2, 3, 1])

# Geometric and harmonic means are only defined for strictly positive values,
# so keep just the positive observations for those two.
pos_data = data[data > 0]

# --- Measures of central tendency --------------------------------------------
mean_value = data.mean()                            # arithmetic average
median_value = np.median(data)                      # middle value, robust to outliers
mode_value = stats.mode(data, keepdims=True)        # most frequent value and its count
weighted_mean = np.average(data, weights=weights)   # sum(w * x) / sum(w)
geometric_mean = stats.gmean(pos_data)              # n-th root of the product
harmonic_mean = stats.hmean(pos_data)               # suited to rates and ratios

# --- Report -------------------------------------------------------------------
print(f"Mean: {mean_value:.2f}")
print(f"Median: {median_value:.2f}")
# keepdims=True makes .mode and .count arrays, hence the [0] indexing.
print(f"Mode: {mode_value.mode[0]}, Count: {mode_value.count[0]}")
print(f"Weighted Mean: {weighted_mean:.2f}")
print(f"Geometric Mean: {geometric_mean:.2f}")
print(f"Harmonic Mean: {harmonic_mean:.2f}")

Measures of Dispersion

# Variance - average squared deviation from the mean.
# ddof=0 divides by n (population variance); ddof=1 divides by n - 1
# (sample variance).
variance = np.var(data, ddof=0)  # Population variance
sample_variance = np.var(data, ddof=1)  # Sample variance
print(f"Population Variance: {variance:.2f}")
print(f"Sample Variance: {sample_variance:.2f}")

# Standard deviation - square root of the variance, in the units of the data.
# Both flavours are reported, mirroring the variance output above, so the
# reader can see which divisor produced which number.
std_dev = np.std(data, ddof=0)
sample_std = np.std(data, ddof=1)
print(f"Population Standard Deviation: {std_dev:.2f}")
print(f"Sample Standard Deviation: {sample_std:.2f}")

# Range - difference between max and min
data_range = np.max(data) - np.min(data)
print(f"Range: {data_range}")

# Interquartile Range (IQR) - spread of the middle 50% of the data
Q1 = np.percentile(data, 25)
Q3 = np.percentile(data, 75)
IQR = Q3 - Q1
print(f"IQR: {IQR}")
print(f"Q1: {Q1}, Q3: {Q3}")

# Mean Absolute Deviation - average absolute distance from the mean
mad = np.mean(np.abs(data - np.mean(data)))
print(f"Mean Absolute Deviation: {mad:.2f}")

# Coefficient of Variation (relative measure) - population standard deviation
# expressed as a percentage of the mean. Only meaningful when the mean is
# non-zero.
cv = (std_dev / mean_value) * 100
print(f"Coefficient of Variation: {cv:.2f}%")

Percentiles and Quantiles

# Calculate various percentiles in a single vectorized call; np.percentile
# accepts a sequence of percentiles and returns one value per entry.
percentiles = [5, 10, 25, 50, 75, 90, 95, 99]
for p, value in zip(percentiles, np.percentile(data, percentiles)):
    print(f"{p}th percentile: {value:.2f}")

# Five-number summary
five_num = {
    'Min': np.min(data),
    'Q1': np.percentile(data, 25),
    'Median': np.median(data),
    'Q3': np.percentile(data, 75),
    'Max': np.max(data)
}
print("\nFive-Number Summary:")
for k, v in five_num.items():
    print(f"  {k}: {v}")

# Box plot calculations
# Whiskers extend to the most extreme observations that still lie within
# 1.5 * IQR of the quartiles; anything beyond these fences is an outlier.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
box_plot_values = {
    'Lower whisker': np.min(data[data >= lower_fence]),
    'Q1': Q1,
    'Median': np.median(data),
    'Q3': Q3,
    'Upper whisker': np.max(data[data <= upper_fence])
}
# Report the box plot values the same way as the five-number summary so the
# computation above is actually visible to the reader.
print("\nBox Plot Values:")
for k, v in box_plot_values.items():
    print(f"  {k}: {v}")

Shape of Distribution

from scipy import stats

# Skewness - measure of asymmetry
# Positive skew: tail extends to right
# Negative skew: tail extends to left
skewness = stats.skew(data)
print(f"Skewness: {skewness:.2f}")

# Kurtosis - measure of tail heaviness
# Excess kurtosis (Fisher's definition): 0 = normal
kurtosis = stats.kurtosis(data)
print(f"Excess Kurtosis: {kurtosis:.2f}")

# Interpretation
# Common rule of thumb for skewness magnitude:
#   |skew| > 1     -> highly skewed
#   |skew| > 0.5   -> moderately skewed
#   otherwise      -> approximately symmetric
skew_direction = "positively" if skewness > 0 else "negatively"
if abs(skewness) > 1:
    print(f"Highly {skew_direction} skewed")
elif abs(skewness) > 0.5:
    print(f"Moderately {skew_direction} skewed")
else:
    print("Approximately symmetric")

# A computed kurtosis is a float and will practically never equal 0 exactly,
# so treat values within a small band around 0 as "normal-like". The width of
# the band is a judgment call; adjust it to taste.
kurtosis_tolerance = 0.1
if kurtosis > kurtosis_tolerance:
    print("Heavy-tailed (leptokurtic)")
elif kurtosis < -kurtosis_tolerance:
    print("Light-tailed (platykurtic)")
else:
    print("Normal-like tails (mesokurtic)")

Comprehensive Statistics with Pandas

import pandas as pd
import numpy as np

# Create sample DataFrame
# A seeded Generator makes the synthetic data, and therefore every table
# printed below, reproducible from run to run.
rng = np.random.default_rng(42)
df = pd.DataFrame({
    'age': rng.integers(20, 70, 100),              # integers in [20, 70)
    'salary': rng.normal(50000, 15000, 100),       # mean 50000, std 15000
    'experience': rng.exponential(5, 100),         # right-skewed, scale 5
    'department': rng.choice(['IT', 'HR', 'Finance', 'Marketing'], 100),
})

# Basic statistics
print("Basic Statistics:")
print(df.describe())

# Extended statistics
print("\nExtended Statistics:")
print(df.describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9]))


# Named quantile helpers: agg() labels each result row with the function's
# name, so named functions give readable 'q25' / 'q75' rows where anonymous
# lambdas would both show up as '<lambda>'.
def q25(x):
    """Return the first quartile (25th percentile) of x."""
    return x.quantile(0.25)


def q75(x):
    """Return the third quartile (75th percentile) of x."""
    return x.quantile(0.75)


# Numeric columns only
print("\nNumeric Summary:")
numeric_summary = df.select_dtypes(include=[np.number]).agg([
    'mean', 'median', 'std', 'min', 'max', q25, q75,
]).round(2)
print(numeric_summary)

# Grouped statistics
print("\nGrouped Statistics:")
print(df.groupby('department')['salary'].agg(['mean', 'std', 'min', 'max']))

Visualization of Descriptive Statistics

import matplotlib.pyplot as plt
import seaborn as sns

# Create sample data: 1000 draws from a normal distribution (mean 50, std 10).
# Note that this rebinds `data`, replacing the small sample used earlier.
data = np.random.normal(50, 10, 1000)

# 2x2 grid: histogram, box plot, violin plot, and a text panel of statistics.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Histogram with KDE
# density=True normalizes the histogram to unit area so the KDE curve can be
# drawn on the same y-axis.
mean_val = np.mean(data)
median_val = np.median(data)
axes[0, 0].hist(data, bins=30, density=True, alpha=0.7, color='blue')
kde_x = np.linspace(np.min(data), np.max(data), 200)
axes[0, 0].plot(kde_x, stats.gaussian_kde(data)(kde_x), color='black', label='KDE')
axes[0, 0].set_title('Distribution with Statistics')
axes[0, 0].axvline(mean_val, color='red', linestyle='--', label=f'Mean: {mean_val:.1f}')
axes[0, 0].axvline(median_val, color='green', linestyle='--', label=f'Median: {median_val:.1f}')
axes[0, 0].legend()

# 2. Box plot
axes[0, 1].boxplot(data, vert=True)
axes[0, 1].set_title('Box Plot (Shows IQR, Median, Outliers)')

# 3. Violin plot
sns.violinplot(data=[data], ax=axes[1, 0])
axes[1, 0].set_title('Violin Plot (Shows Distribution Shape)')

# 4. Summary statistics table
# np.std uses its default ddof=0 here, i.e. the population standard deviation.
stats_text = f"""
Mean: {mean_val:.2f}
Median: {median_val:.2f}
Std: {np.std(data):.2f}
Min: {np.min(data):.2f}
Max: {np.max(data):.2f}
Q1: {np.percentile(data, 25):.2f}
Q3: {np.percentile(data, 75):.2f}
Skewness: {stats.skew(data):.2f}
Kurtosis: {stats.kurtosis(data):.2f}
"""
# The panel is used purely as a text canvas, so its axes are hidden.
axes[1, 1].text(0.1, 0.5, stats_text, fontsize=12, family='monospace')
axes[1, 1].axis('off')
axes[1, 1].set_title('Summary Statistics')

plt.tight_layout()
plt.show()

Key Takeaways

Use multiple measures - Mean, median, and mode together
Check distribution shape - Skewness and kurtosis matter
Consider outliers - Use median/IQR for robust analysis
Visualize - Plots reveal what numbers miss

When to Use Each Measure

Measure	Use Case
Mean	Symmetric distributions, no outliers
Median	Skewed distributions, outliers present
Mode	Categorical data, most frequent value
Std Dev	Normal distributions, comparing spread
IQR	Robust measure, outliers present
Range	Quick estimate of spread

Need More Practice?

Get personalized data science help from ChatWhole's AI-powered platform.

Get Expert Help →

All Topics