Introduction to Functions in Data Science
Functions are reusable blocks of code that are essential for building data science pipelines, implementing algorithms, and creating modular data processing workflows.
Defining and Calling Functions
# Basic function for data transformation
def calculate_mean(data):
    """Calculate the arithmetic mean of a list.

    Args:
        data: A non-empty, sized collection of numbers (for example a list).

    Returns:
        The sum of the values divided by how many there are.

    Raises:
        ValueError: If ``data`` is empty.
    """
    # len() rather than truthiness so array-like inputs are handled too.
    if len(data) == 0:
        raise ValueError("calculate_mean() requires at least one data point")
    return sum(data) / len(data)
# Sample dataset; the statistics and percentile examples below reuse it.
data = [23, 45, 67, 89, 12, 34, 56, 78, 90, 11]
# Call the function and keep its return value.
mean_value = calculate_mean(data)
print(f"Mean: {mean_value}") # 50.5
# Function with multiple return values
def calculate_stats(data):
    """Return multiple statistics.

    Args:
        data: A non-empty, sized collection of numbers.

    Returns:
        A ``(mean, median, variance, std)`` tuple. ``variance`` is the
        population variance (it divides by ``n``, not ``n - 1``) and ``std``
        is its square root.

    Raises:
        ValueError: If ``data`` is empty.
    """
    n = len(data)
    if n == 0:
        raise ValueError("calculate_stats() requires at least one data point")

    mean = sum(data) / n

    # Median: middle value, or the average of the two middle values when
    # the number of observations is even.
    ordered = sorted(data)
    middle = n // 2
    if n % 2 == 0:
        median = (ordered[middle - 1] + ordered[middle]) / 2
    else:
        median = ordered[middle]

    variance = sum((x - mean) ** 2 for x in data) / n
    std = variance ** 0.5
    return mean, median, variance, std
# The returned tuple is unpacked into four separate names in one statement.
mean, median, var, std = calculate_stats(data)
print(f"Mean: {mean}, Median: {median}, Std: {std}")
Function Parameters and Arguments
# Default parameters for flexibility
def calculate_percentile(data, percentile=50):
    """Calculate percentile of data.

    The data is sorted and the percentile is mapped onto a fractional
    position between index 0 and index ``len(data) - 1``; the result is
    linearly interpolated between the two neighbouring values.

    Args:
        data: A non-empty iterable of numbers.
        percentile: Requested percentile, from 0 to 100 inclusive
            (default 50).

    Returns:
        The interpolated value, or the largest element itself when the
        position falls on the last index.

    Raises:
        ValueError: If ``percentile`` is outside 0-100 or ``data`` is empty.
    """
    # Out-of-range percentiles would otherwise index from the wrong end of
    # the list (negative) or past its end (> 100).
    if not 0 <= percentile <= 100:
        raise ValueError(
            f"percentile must be between 0 and 100, got {percentile}"
        )

    sorted_data = sorted(data)
    if not sorted_data:
        raise ValueError(
            "calculate_percentile() requires at least one data point"
        )

    index = (len(sorted_data) - 1) * percentile / 100
    lower = int(index)
    upper = lower + 1
    if upper >= len(sorted_data):
        # Position is the last element: nothing to interpolate with.
        return sorted_data[lower]

    weight = index - lower
    return sorted_data[lower] * (1 - weight) + sorted_data[upper] * weight
# Keyword arguments
# First call: data is passed positionally and percentile by keyword.
result = calculate_percentile(data, percentile=75)
# Second call: both arguments are passed by keyword; result is rebound.
result = calculate_percentile(data=data, percentile=25)
# *args and **kwargs for variable arguments
def summarize_data(*args, **kwargs):
    """Print the positional and keyword arguments the call received.

    Args:
        *args: Any positional arguments; collected into a tuple.
        **kwargs: Any keyword arguments; collected into a dict.

    Returns:
        None. The function only prints.
    """
    print("Positional arguments:", args)
    print("Keyword arguments:", kwargs)
# Here args is (1, 2, 3) and kwargs is {'name': 'Alice', 'age': 25}.
summarize_data(1, 2, 3, name="Alice", age=25)
# *args for handling multiple data columns
def combine_features(*arrays):
    """Average several feature arrays element-wise into a single feature.

    Args:
        *arrays: Any number of numeric sequences, one per feature column.

    Returns:
        A list whose ``i``-th entry is the mean of the ``i``-th elements of
        all inputs. Because ``zip`` stops at the shortest input, the result
        is as long as the shortest array, and it is empty when no arrays
        are passed.
    """
    # zip(*arrays) regroups the columns into rows, one tuple per position.
    return [sum(row) / len(row) for row in zip(*arrays)]
# Two equal-length feature columns, passed as separate positional arguments.
feature1 = [1, 2, 3]
feature2 = [4, 5, 6]
# Each output value is the mean of the values at the same position.
combined = combine_features(feature1, feature2) # [2.5, 3.5, 4.5]
Lambda Functions for Data Transformation
# Lambda functions for quick transformations
# A lambda is a single-expression function; it can be bound to a name
square = lambda number: number ** 2
add = lambda left, right: left + right

# map + lambda: transform every element
data = [1, 2, 3, 4, 5]
squared_data = [*map(lambda value: value ** 2, data)]

# filter + lambda: keep only the elements for which the lambda is true
even_numbers = [*filter(lambda value: value % 2 == 0, data)]

# sorted + lambda: the key picks the score, reverse=True ranks highest first
students = [('Alice', 85), ('Bob', 92), ('Charlie', 78)]
sorted_students = sorted(
    students,
    key=lambda student: student[1],
    reverse=True,
)
# Lambda in Pandas for column operations
import pandas as pd
df = pd.DataFrame(
    data={
        'name': ['Alice', 'Bob', 'Charlie'],
        'salary': [50000, 60000, 75000],
    },
)
# Salaries above 55000 earn a 10% bonus; every other row gets 0
df['salary_bonus'] = df['salary'].apply(
    lambda salary: salary * 0.1 if salary > 55000 else 0
)
Higher-Order Functions
# Functions that return functions
def create_multiplier(factor):
    """Create a function that multiplies by factor.

    Args:
        factor: The value every input will be multiplied by.

    Returns:
        A one-argument function ``multiplier(x)`` returning ``x * factor``.
    """
    def multiplier(x):
        # ``factor`` is captured from the enclosing call (a closure).
        return x * factor

    return multiplier
# Each call returns a separate function with its own factor.
double = create_multiplier(2)
triple = create_multiplier(3)
print(double(5)) # 10
print(triple(5)) # 15
# Functions that accept functions
def apply_operation(data, operation):
    """Apply an operation to each element.

    Args:
        data: Iterable of input values.
        operation: One-argument callable applied to every element.

    Returns:
        A new list with ``operation(x)`` for each ``x`` in ``data``, in
        the same order.
    """
    return [operation(x) for x in data]
# The lambda is passed as the operation argument and applied to every element.
result = apply_operation([1, 2, 3, 4, 5], lambda x: x ** 2)
print(result) # [1, 4, 9, 16, 25]
# Map, Reduce, Filter patterns
from functools import reduce
data = [1, 2, 3, 4, 5]

# Map: produce one output per input
squared = [*map(lambda value: value ** 2, data)]

# Filter: keep the inputs for which the predicate is true
evens = [*filter(lambda value: value % 2 == 0, data)]

# Reduce: fold the sequence into one value, carrying a running result
product = reduce(lambda running, value: running * value, data) # 120
sum_all = reduce(lambda running, value: running + value, data) # 15
Decorators for Data Processing
import time
from functools import wraps
# Timing decorator
def timer(func):
    """Decorator that prints how long each call to ``func`` takes.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same name and docstring as ``func`` (via
        ``functools.wraps``) that calls it, prints the elapsed time in
        seconds, and returns the original result.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is the clock intended for measuring intervals;
        # time.time() can jump when the system clock is adjusted.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print(f"Execution time: {elapsed:.4f} seconds")
        return result

    return wrapper
@timer
def process_data(data):
    """Simulate data processing.

    The "processing" is simply the arithmetic mean of ``data``; because the
    function is wrapped by ``timer``, each call also prints its duration.

    Args:
        data: A non-empty, sized collection of numbers.

    Returns:
        The mean of the values.
    """
    return sum(data) / len(data)
# Build an input of 100000 integers for the timed function.
data = list(range(100000))
# The call goes through the timer wrapper: it prints the elapsed time
# and still returns the function's own result.
result = process_data(data)
# Decorator for caching (memoization)
def memoize(func):
    """Decorator that caches results of ``func`` per set of arguments.

    Args:
        func: The callable to wrap. It should be deterministic, because
            repeated calls with the same arguments return the stored result.

    Returns:
        A wrapper that calls ``func`` only the first time it sees a given
        combination of positional and keyword arguments. Calls whose
        arguments are unhashable (for example lists) are passed straight
        through without caching.
    """
    cache = {}

    @wraps(func)
    def wrapper(*args, **kwargs):
        # Keyword names are unique strings, so sorting the items never
        # compares the values themselves.
        key = (args, tuple(sorted(kwargs.items())))
        try:
            return cache[key]
        except KeyError:
            pass  # first call with these arguments
        except TypeError:
            key = None  # unhashable argument: compute without caching

        result = func(*args, **kwargs)
        if key is not None:
            cache[key] = result
        return result

    return wrapper
@memoize
def fibonacci(n):
    """Return the n-th Fibonacci number, with fibonacci(0) == 0.

    Any ``n`` below 2 is returned unchanged. The recursive calls go through
    the memoized name, so a value that has already been computed is looked
    up instead of recomputed.

    Args:
        n: Index in the sequence.

    Returns:
        ``fibonacci(n - 1) + fibonacci(n - 2)`` for ``n >= 2``, else ``n``.
    """
    if n < 2:
        return n
    return fibonacci(n - 1) + fibonacci(n - 2)
# Logging decorator
def log_call(func):
    """Decorator that prints each call to ``func`` and its return value.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that prints the function name with its positional and
        keyword arguments, calls ``func``, prints the result, and returns
        it unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        print(f"Calling {func.__name__} with args={args}, kwargs={kwargs}")
        result = func(*args, **kwargs)
        print(f"{func.__name__} returned {result}")
        return result

    return wrapper
Practice Exercise: Data Science Pipeline Functions
import pandas as pd
import numpy as np
def load_and_clean_data(filepath, **kwargs):
    """Load and clean data from file.

    Args:
        filepath: Path or file-like object handed to ``pandas.read_csv``.
        **kwargs: Extra keyword arguments forwarded to ``pandas.read_csv``.

    Returns:
        A DataFrame with duplicate rows dropped and missing values filled:
        numeric columns with the column median, all other columns with the
        most frequent value (or ``'Unknown'`` when the column has no
        non-missing value at all).
    """
    df = pd.read_csv(filepath, **kwargs)

    # Drop duplicates
    df = df.drop_duplicates()

    # Fill missing values
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue  # nothing to fill

        # Use the dtype helpers so every numeric width is covered, not
        # only int64/float64; booleans are treated as categorical.
        is_numeric = (
            pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
        )
        if is_numeric:
            fill_value = column.median()
        else:
            modes = column.mode()
            fill_value = modes.iloc[0] if len(modes) > 0 else 'Unknown'

        # Assign the result back: fillna(inplace=True) on a column
        # selection is chained assignment and does not update ``df``
        # under pandas Copy-on-Write.
        df[col] = column.fillna(fill_value)
    return df
def engineer_features(df, target_col=None):
    """Create new features from existing data.

    For every numeric column except ``target_col`` this adds a
    ``<col>_squared`` column and a ``<col>_log`` column (``log1p`` of the
    values with negatives clipped to 0). If at least two such columns
    exist, the product of the first two is added as ``<col1>_x_<col2>``.

    Args:
        df: Input DataFrame; it is copied, not modified.
        target_col: Optional name of the prediction target. It is excluded
            from all derived features, including the interaction term, so
            the target does not leak into the inputs.

    Returns:
        A new DataFrame containing the original and the derived columns.
    """
    df = df.copy()

    # Snapshot the source columns first so the derived columns added below
    # are not themselves fed back into the loop.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    feature_cols = [col for col in numeric_cols if col != target_col]

    # Create numerical features
    for col in feature_cols:
        df[f'{col}_squared'] = df[col] ** 2
        df[f'{col}_log'] = np.log1p(df[col].clip(lower=0))

    # Create interaction features
    if len(feature_cols) >= 2:
        col1, col2 = feature_cols[:2]
        df[f'{col1}_x_{col2}'] = df[col1] * df[col2]
    return df
def train_model(X, y, model):
    """Train machine learning model.

    Args:
        X: Training features, passed through to ``model.fit``.
        y: Training targets, passed through to ``model.fit``.
        model: Any object exposing a ``fit(X, y)`` method.

    Returns:
        The same ``model`` object, after ``fit`` has been called on it.
    """
    model.fit(X, y)
    return model
# Example usage in pipeline
# Each entry is a (step name, one-argument callable) pair; the value
# returned by one step is the input of the next.
pipeline = [
    ('load', lambda x: load_and_clean_data(x)),
    # Labelled 'features' because this step runs engineer_features;
    # cleaning already happens inside the 'load' step.
    ('features', lambda x: engineer_features(x)),
    # NOTE: train_model takes (X, y, model), so the value reaching this step
    # must be that 3-tuple. engineer_features returns a single DataFrame, so
    # a step that splits features/target and supplies a model has to be
    # inserted before 'train' for the pipeline to run end to end.
    ('train', lambda x: train_model(*x)),
]
def run_pipeline(data, pipeline):
    """Run ``data`` through every step of ``pipeline`` in order.

    Args:
        data: The input handed to the first step.
        pipeline: Iterable of ``(step_name, step_func)`` pairs, where each
            ``step_func`` takes one argument.

    Returns:
        The output of the last step, or ``data`` unchanged when the
        pipeline has no steps. A "Completed: <name>" line is printed after
        each step.
    """
    result = data
    for step_name, step_func in pipeline:
        # Each step receives the previous step's output.
        result = step_func(result)
        print(f"Completed: {step_name}")
    return result
Key Takeaways
- Modular code - Break code into reusable functions
- Lambda functions - Use for short, one-off transformations, such as the function passed to map, filter, sorted, or a Pandas apply
- Higher-order functions - Master map, filter, reduce
- Decorators - Add behavior such as timing, caching, or logging without modifying the decorated function's body
Function Best Practices
| Practice | Recommendation |
|---|---|
| Documentation | Add docstrings to all functions |
| Parameters | Use descriptive parameter names |
| Return values | Always return consistent types |
| Side effects | Minimize global state changes |
| Testing | Write unit tests for functions |