Python Control Flow for Data Science

Topic: Control Flow

Introduction to Control Flow

Control flow statements determine the order in which your code executes. Mastering these is essential for data manipulation, algorithm implementation, and building data pipelines.

Conditional Statements

# Basic if-elif-else: map a score onto a letter grade, lowest band first
score = 85
if score < 60:
    grade = 'F'
elif score < 70:
    grade = 'D'
elif score < 80:
    grade = 'C'
elif score < 90:
    grade = 'B'
else:
    grade = 'A'

# Logical operators: every condition joined by `and` must hold
age = 25
salary = 50000
if salary > 30000 and age >= 18:
    print("Eligible for credit card")

# Missing data: `is not None` is the identity check for a present value
value = None
if value is not None:
    print(f"Value: {value}")
else:
    print("Missing value")

# Conditional expression (ternary) for a two-way transformation
temperature = 75
weather = "moderate" if temperature <= 70 else "hot"

Loops for Data Processing

# Sample values used by every loop below
data = [23, 45, 67, 89, 12, 34, 56, 78, 90, 11]

# Accumulate the sum by hand
total = 0
for value in data:
    total = total + value
print(f"Sum: {total}")

# Track the largest value: start from the first element, compare the rest
max_val = data[0]
for val in data[1:]:
    if max_val < val:
        max_val = val

# enumerate() yields (position, element) pairs
for index, value in enumerate(data):
    print(f"Index {index}: {value}")

# While loop with break/continue: advance until the first value above 50
i = 0
while i < len(data):
    if data[i] <= 50:
        i += 1
        continue
    print(f"Found value > 50: {data[i]}")
    break

# continue jumps straight to the next iteration, skipping the print
for val in data:
    if val < 20:
        continue  # Skip small values
    print(f"Processing: {val}")

List Comprehensions for Data Transformation

# List comprehension: one output element per input element
data = [1, 2, 3, 4, 5]
squares = [n * n for n in data]  # [1, 4, 9, 16, 25]

# Trailing `if` keeps only the elements that pass the test
evens = [n for n in data if not n % 2]  # [2, 4]

# Two `for` clauses read left to right, like nested loops
matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
flattened = [cell for line in matrix for cell in line]  # [1, 2, 3, 4, 5, 6, 7, 8, 9]

# Dictionary comprehension: key: value pairs
names = ['Alice', 'Bob', 'Charlie']
name_lengths = {person: len(person) for person in names}  # {'Alice': 5, 'Bob': 3, 'Charlie': 7}

# Set comprehension: duplicates collapse automatically
data_with_dups = [1, 2, 2, 3, 3, 3]
unique_squares = {n * n for n in data_with_dups}  # {1, 4, 9}

# Generator expression: values are produced one at a time, no list is built
sum_of_squares = sum(n * n for n in range(1000000))

Working with DataFrames Using Control Flow

import pandas as pd

# Sample data
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'age': [25, 30, 35, 28],
    'salary': [50000, 60000, 75000, 55000],
    'department': ['IT', 'HR', 'Finance', 'IT']
})

# Conditional column creation: 15% bonus above 60000, otherwise 10%
df['bonus'] = df['salary'].apply(
    lambda x: x * 0.15 if x > 60000 else x * 0.10
)

# Row-wise operations with iterrows (shown for illustration; each row is
# handled by Python code, so this is slow on large frames)
for index, row in df.iterrows():
    if row['age'] > 28:
        df.at[index, 'category'] = 'Senior'
    else:
        df.at[index, 'category'] = 'Junior'

# Vectorized approach (preferred for large datasets): the comparison builds a
# boolean mask for the whole column at once, whereas .apply(lambda ...) still
# calls a Python function once per element.
df['category'] = 'Junior'
df.loc[df['age'] > 28, 'category'] = 'Senior'

# Filter with boolean indexing
high_salary = df[df['salary'] > 55000]
it_dept = df[df['department'] == 'IT']

Handling Exceptions in Data Processing

# Try-except-else: convert what parses, report what does not
data = ['10', '20', 'thirty', '40', '50']

valid_numbers = []
for item in data:
    try:
        number = float(item)
    except ValueError:
        # float() rejects text that is not a numeric literal
        print(f"Could not convert: {item}")
    else:
        # Runs only when the conversion succeeded
        valid_numbers.append(number)

print(valid_numbers)  # [10.0, 20.0, 40.0, 50.0]

# Multiple exception handling
def safe_divide(a, b):
    """Return ``a / b``, or ``None`` when the division cannot be performed.

    Both a zero divisor (``ZeroDivisionError``) and operands that do not
    support ``/`` (``TypeError``) produce ``None`` instead of propagating.
    """
    try:
        quotient = a / b
    except (ZeroDivisionError, TypeError):
        # One clause can name several exception types as a tuple
        return None
    return quotient

# Finally for cleanup: the finally clause runs whether or not an error occurred
# NOTE(review): some_function is not defined in the visible part of this file;
# it looks like a placeholder for the caller's own function - confirm.
try:
    result = some_function()
except Exception as exc:
    print(f"Error: {exc}")
finally:
    print("Process complete")

# Custom exceptions for data validation
class DataValidationError(Exception):
    """Raised when a data value fails a validation rule."""


def validate_age(age):
    """Return ``age`` unchanged, raising ``DataValidationError`` if it is below zero."""
    is_negative = age < 0
    if is_negative:
        raise DataValidationError("Age cannot be negative")
    return age

Practice Exercise: Data Processing Pipeline

import pandas as pd

# Sample dataset: 'value' and 'category' each contain one missing entry
data = dict(
    id=list(range(1, 9)),
    value=[100, 200, None, 400, 500, 600, 700, 800],
    category=['A', 'B', 'A', 'C', None, 'B', 'A', 'C'],
    score=[85, 90, 78, 92, 88, 95, 80, 87],
)
df = pd.DataFrame(data=data)

# Steps 1-3: clean, normalize and grade the dataset
def process_data(df):
    """Fill missing values, add a min-max normalized column and a letter grade.

    The frame is modified in place and also returned, so pass a copy when the
    original must stay untouched.

    Args:
        df: DataFrame with 'value', 'category' and 'score' columns.

    Returns:
        The same DataFrame with 'value' and 'category' filled and the
        'value_normalized' and 'grade' columns added.
    """
    # Step 1: Handle missing values
    # Fill numeric missing values with median
    df['value'] = df['value'].fillna(df['value'].median())

    # Fill categorical missing values with mode. mode() returns an empty
    # Series when every entry is missing; there is nothing to fill with then.
    category_modes = df['category'].mode()
    if not category_modes.empty:
        df['category'] = df['category'].fillna(category_modes.iloc[0])

    # Step 2: Add derived features (min-max scaling)
    value_min = df['value'].min()
    value_range = df['value'].max() - value_min
    if value_range == 0:
        # A constant column has no spread; 0/0 would turn every row into NaN
        df['value_normalized'] = 0.0
    else:
        df['value_normalized'] = (df['value'] - value_min) / value_range

    # Step 3: Create categories based on score, using boolean masks instead
    # of a per-element lambda. Later assignments override earlier ones, so
    # the bands are applied from lowest to highest.
    df['grade'] = 'C'
    df.loc[df['score'] >= 80, 'grade'] = 'B'
    df.loc[df['score'] >= 90, 'grade'] = 'A'

    return df

# process_data assigns columns on the frame it receives, so hand it a copy
processed_df = process_data(df.copy(deep=True))
print(processed_df)

Key Takeaways

Use vectorized operations - Pandas/NumPy are faster than loops
List comprehensions - More Pythonic than traditional loops
Exception handling - Essential for real-world data cleaning
Avoid nested loops - Prefer vectorized operations; use apply/map when no vectorized form exists

Control Flow Best Practices

Practice	Recommendation
Loops over data	Use list comprehension when possible
DataFrame operations	Use vectorized methods (column arithmetic, boolean masks); apply/map still run Python code per element
Conditionals	Use numpy.where for column operations
Error handling	Try-except for data cleaning
Performance	Avoid loops on large datasets

Need More Practice?

Get personalized data science help from ChatWhole's AI-powered platform.

Get Expert Help →

All Topics