Python Functions for Data Science

Topic: Functions

Introduction to Functions in Data Science

Functions are reusable blocks of code that are essential for building data science pipelines, implementing algorithms, and creating modular data processing workflows.

Defining and Calling Functions

# Basic function for data transformation
def calculate_mean(data):
    """Return the arithmetic mean of the values in ``data``.

    ``data`` must work with both ``sum()`` and ``len()``. An empty sequence
    raises ``ZeroDivisionError`` because the count is zero.
    """
    total = sum(data)
    count = len(data)
    return total / count

# Sample dataset of ten values, reused by the calculate_stats and
# calculate_percentile examples that follow.
data = [23, 45, 67, 89, 12, 34, 56, 78, 90, 11]
mean_value = calculate_mean(data)
print(f"Mean: {mean_value}")  # 50.5

# Function with multiple return values
def calculate_stats(data):
    """Compute the mean, median, variance and standard deviation of ``data``.

    The variance divides by the number of values (population variance), and
    the standard deviation is its square root. For an even number of values
    the median is the average of the two middle values.

    Returns:
        A tuple ``(mean, median, variance, std)``.
    """
    count = len(data)
    ordered = sorted(data)
    mid = count // 2

    mean = sum(data) / count

    if count % 2 == 0:
        # Even count: average the two values around the centre.
        median = (ordered[mid - 1] + ordered[mid]) / 2
    else:
        median = ordered[mid]

    squared_deviations = [(value - mean) ** 2 for value in data]
    variance = sum(squared_deviations) / count
    std = variance ** 0.5

    return mean, median, variance, std

# Unpack the returned 4-tuple into separate names; the variance is bound to
# `var` but is not printed.
mean, median, var, std = calculate_stats(data)
print(f"Mean: {mean}, Median: {median}, Std: {std}")

Function Parameters and Arguments

# Default parameters for flexibility
def calculate_percentile(data, percentile=50):
    """Return the requested percentile of ``data`` using linear interpolation.

    The values are sorted and the percentile is located at the fractional
    position ``(n - 1) * percentile / 100``. When that position falls between
    two elements, the result is interpolated linearly between them.

    Args:
        data: Non-empty iterable of numbers.
        percentile: Percentile to compute, from 0 to 100 inclusive
            (default 50, i.e. the median).

    Returns:
        The value at the requested percentile.

    Raises:
        ValueError: If ``percentile`` is outside 0-100 or ``data`` is empty.
    """
    # Reject out-of-range requests up front: a negative percentile would
    # otherwise extrapolate silently, and one above 100 would return the
    # maximum or index past the end depending on the data size.
    if not 0 <= percentile <= 100:
        raise ValueError(f"percentile must be between 0 and 100, got {percentile!r}")

    sorted_data = sorted(data)
    if not sorted_data:
        raise ValueError("data must contain at least one value")

    index = (len(sorted_data) - 1) * percentile / 100
    lower = int(index)
    upper = lower + 1
    weight = index - lower

    # At the 100th percentile there is no upper neighbour to blend with.
    if upper >= len(sorted_data):
        return sorted_data[lower]
    return sorted_data[lower] * (1 - weight) + sorted_data[upper] * weight

# Keyword arguments: name the percentile explicitly (75th, then 25th).
# The second call also passes `data` by keyword and rebinds `result`.
result = calculate_percentile(data, percentile=75)
result = calculate_percentile(data=data, percentile=25)

# *args and **kwargs for variable arguments
def summarize_data(*args, **kwargs):
    """Print the positional arguments (a tuple) and keyword arguments (a dict)."""
    for label, received in (
        ("Positional arguments:", args),
        ("Keyword arguments:", kwargs),
    ):
        print(label, received)
    
# The three positional values arrive in `args` as a tuple; `name` and `age`
# arrive in `kwargs` as a dict.
summarize_data(1, 2, 3, name="Alice", age=25)

# *args for handling multiple data columns
def combine_features(*arrays):
    """Average the given feature arrays element-wise.

    Values at the same position across all arrays are averaged, giving one
    list with a single mean per position. Because the rows are built with
    ``zip``, the output is as long as the shortest input.
    """
    averaged = []
    for row in zip(*arrays):
        averaged.append(sum(row) / len(row))
    return averaged

# Two equal-length feature columns; each output value is the mean of the
# values found at that position.
feature1 = [1, 2, 3]
feature2 = [4, 5, 6]
combined = combine_features(feature1, feature2)  # [2.5, 3.5, 4.5]

Lambda Functions for Data Transformation

# Lambda functions: small anonymous functions for quick transformations
square = lambda x: x ** 2
add = lambda x, y: x + y

# map applies the lambda to every element
data = [1, 2, 3, 4, 5]
squared_data = list(map(lambda n: n ** 2, data))

# filter keeps only the elements for which the lambda returns True
even_numbers = list(filter(lambda n: n % 2 == 0, data))

# sorted uses the lambda as the sort key (score, highest first)
students = [('Alice', 85), ('Bob', 92), ('Charlie', 78)]
sorted_students = sorted(
    students,
    key=lambda student: student[1],
    reverse=True,
)

# Lambda in Pandas for column operations
import pandas as pd
df = pd.DataFrame(
    {
        'name': ['Alice', 'Bob', 'Charlie'],
        'salary': [50000, 60000, 75000],
    }
)
# A 10% bonus applies only to salaries above 55,000; everyone else gets 0.
df['salary_bonus'] = df['salary'].apply(
    lambda salary: salary * 0.1 if salary > 55000 else 0
)

Higher-Order Functions

# Functions that return functions
def create_multiplier(factor):
    """Build and return a one-argument function that multiplies by ``factor``."""

    def multiplier(x):
        # ``factor`` is captured from the enclosing call (a closure).
        scaled = x * factor
        return scaled

    return multiplier

# Each call returns a separate function that remembers its own factor.
double = create_multiplier(2)
triple = create_multiplier(3)
print(double(5))  # 10
print(triple(5))  # 15

# Functions that accept functions
def apply_operation(data, operation):
    """Return a new list holding ``operation(x)`` for every ``x`` in ``data``.

    ``operation`` is any one-argument callable; ``data`` is left unchanged.
    """
    return list(map(operation, data))

# Pass a lambda as the operation to square every element.
result = apply_operation([1, 2, 3, 4, 5], lambda x: x ** 2)
print(result)  # [1, 4, 9, 16, 25]

# Map, Reduce, Filter patterns
from functools import reduce
data = [1, 2, 3, 4, 5]

# Map: produce one transformed value per element
squared = list(map(lambda n: n ** 2, data))

# Filter: keep only the elements that pass the test
evens = list(filter(lambda n: n % 2 == 0, data))

# Reduce: fold all elements into a single value, left to right
product = reduce(lambda acc, n: acc * n, data)  # 120
sum_all = reduce(lambda acc, n: acc + n, data)  # 15

Decorators for Data Processing

import time
from functools import wraps

# Timing decorator
def timer(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged, and its
    name and docstring are preserved via ``functools.wraps``.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() follows the wall clock and can
        # jump if the system time is adjusted mid-call.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print(f"Execution time: {elapsed:.4f} seconds")
        return result
    return wrapper

@timer
def process_data(data):
    """Stand-in for a processing step: return the mean of ``data``."""
    total = sum(data)
    count = len(data)
    return total / count

# Run the timed function on 100,000 integers; the decorator prints the
# elapsed time and `result` receives the mean.
data = list(range(100000))
result = process_data(data)

# Decorator for caching (memoization)
def memoize(func):
    """Decorator that caches ``func``'s results, keyed by the call arguments.

    Repeated calls with the same arguments return the stored result without
    calling ``func`` again. Both positional and keyword arguments are
    supported; all argument values must be hashable. The cache lives as long
    as the decorated function and is never evicted.
    """
    cache = {}
    # Unique separator between positional and keyword parts of the key, so a
    # positional call can never collide with a keyword call.
    kwargs_mark = object()

    @wraps(func)
    def wrapper(*args, **kwargs):
        if kwargs:
            # Sort by name so f(a=1, b=2) and f(b=2, a=1) share one entry.
            key = args + (kwargs_mark,) + tuple(sorted(kwargs.items()))
        else:
            key = args
        if key not in cache:
            cache[key] = func(*args, **kwargs)
        return cache[key]
    return wrapper

@memoize
def fibonacci(n):
    """Return the n-th Fibonacci number, where fibonacci(0) is 0 and fibonacci(1) is 1.

    The recursive calls go through the memoized name, so each value of ``n``
    is computed only once.
    """
    return n if n < 2 else fibonacci(n - 1) + fibonacci(n - 2)

# Logging decorator
def log_call(func):
    """Decorator that prints each call's arguments and then its return value.

    The wrapped function's result is returned unchanged.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        # Announce the call before running it.
        print(f"Calling {func.__name__} with args={args}, kwargs={kwargs}")
        outcome = func(*args, **kwargs)
        # Report the result once the call has finished.
        print(f"{func.__name__} returned {outcome}")
        return outcome

    return wrapper

Practice Exercise: Data Science Pipeline Functions

import pandas as pd
import numpy as np

def load_and_clean_data(filepath, **kwargs):
    """Load a CSV file, drop duplicate rows and fill in missing values.

    Args:
        filepath: Path or file-like object accepted by ``pandas.read_csv``.
        **kwargs: Extra keyword arguments forwarded to ``pandas.read_csv``.

    Returns:
        A DataFrame with duplicate rows removed. Missing values in numeric
        columns are replaced by the column median. Missing values in other
        columns are replaced by the most frequent value, or by ``'Unknown'``
        when the column has no non-missing values at all.
    """
    df = pd.read_csv(filepath, **kwargs)
    df = df.drop_duplicates()

    # select_dtypes("number") recognises every numeric width (int32,
    # float32, ...), not only the int64/float64 defaults.
    numeric_cols = set(df.select_dtypes(include="number").columns)

    fill_values = {}
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue  # nothing to fill in this column
        if col in numeric_cols:
            median = column.median()
            # An all-missing numeric column has no median; leave it as is.
            if not pd.isna(median):
                fill_values[col] = median
        else:
            modes = column.mode()
            fill_values[col] = modes.iloc[0] if not modes.empty else 'Unknown'

    # Fill through the DataFrame itself: calling fillna(inplace=True) on
    # df[col] is chained assignment, which pandas deprecates and which stops
    # updating df once Copy-on-Write is enabled.
    if fill_values:
        df = df.fillna(value=fill_values)

    return df

def engineer_features(df, target_col=None):
    """Return a copy of ``df`` with derived numeric features added.

    For every numeric column except ``target_col`` two columns are added:
    ``<col>_squared`` and ``<col>_log``, where the log is ``log1p`` of the
    values clipped at 0. If at least two such columns exist, the product of
    the first two is added as ``<col1>_x_<col2>``.

    Args:
        df: Input DataFrame; it is not modified.
        target_col: Optional name of the target column. It is excluded from
            every derived feature, including the interaction term, so the
            target does not leak into the features.

    Returns:
        A new DataFrame holding the original and the derived columns.
    """
    df = df.copy()

    # Fix the list of source columns up front so the columns added below
    # never feed back into the loop.
    feature_cols = [
        col
        for col in df.select_dtypes(include=[np.number]).columns
        if col != target_col
    ]

    for col in feature_cols:
        df[f'{col}_squared'] = df[col] ** 2
        # Clip negatives to 0 so log1p never receives a value below 0.
        df[f'{col}_log'] = np.log1p(df[col].clip(lower=0))

    # Interaction of the first two feature columns (never the target).
    if len(feature_cols) >= 2:
        col1, col2 = feature_cols[:2]
        df[f'{col1}_x_{col2}'] = df[col1] * df[col2]

    return df

def train_model(X, y, model):
    """Fit ``model`` on features ``X`` and targets ``y``, then return it.

    ``model`` only needs a ``fit(X, y)`` method. The object that was passed
    in is the one returned, whatever ``fit`` itself returns.
    """
    model.fit(X, y)
    return model

# Example usage in pipeline: each step is a (name, function) pair, and
# run_pipeline passes every step the result of the step before it.
# The second step is named after what it runs (feature engineering);
# cleaning already happens inside the 'load' step.
# NOTE(review): the 'train' step unpacks its input as (X, y, model), but the
# step before it returns a DataFrame. A step that builds that tuple has to be
# added before this list can run end to end.
pipeline = [
    ('load', lambda x: load_and_clean_data(x)),
    ('engineer', lambda x: engineer_features(x)),
    ('train', lambda x: train_model(*x))
]

def run_pipeline(data, pipeline):
    """Feed ``data`` through every step of ``pipeline`` in order.

    ``pipeline`` is an iterable of ``(name, function)`` pairs. Each function
    receives the previous step's result, and a completion line is printed
    after each step. The last step's result is returned, or ``data`` itself
    when the pipeline is empty.
    """
    current = data
    for label, stage in pipeline:
        current = stage(current)
        print(f"Completed: {label}")
    return current

Key Takeaways

Modular code - Break code into reusable functions
Lambda functions - Use for quick, one-off transformations with map, filter, sorted, and Pandas apply
Higher-order functions - Master map, filter, reduce
Decorators - Add behavior such as timing, caching, or logging without changing a function's own code

Function Best Practices

Practice	Recommendation
Documentation	Add docstrings to all functions
Parameters	Use descriptive parameter names
Return values	Always return consistent types
Side effects	Minimize global state changes
Testing	Write unit tests for functions

Need More Practice?

Get personalized data science help from ChatWhole's AI-powered platform.

Get Expert Help →

All Topics