Introduction to Control Flow
Control flow statements determine the order in which your code executes. Mastering these is essential for data manipulation, algorithm implementation, and building data pipelines.
Conditional Statements
# Basic if-else for data filtering
# Branches are tested top to bottom; the first condition that holds wins,
# so each `elif` only needs to state its lower bound.
score = 85
if score >= 90:
    grade = 'A'
elif score >= 80:
    grade = 'B'
elif score >= 70:
    grade = 'C'
elif score >= 60:
    grade = 'D'
else:
    grade = 'F'

# Multiple conditions with logical operators
age = 25
salary = 50000
if age >= 18 and salary > 30000:
    print("Eligible for credit card")

# Handling missing data with conditionals
# `is None` is an identity check; it does not treat 0 or '' as missing.
value = None
if value is None:
    print("Missing value")
else:
    print(f"Value: {value}")

# Ternary operator for data transformation
temperature = 75
weather = "hot" if temperature > 70 else "moderate"
Loops for Data Processing
# For loop over data
data = [23, 45, 67, 89, 12, 34, 56, 78, 90, 11]

# Calculate sum manually
# (the built-in sum(data) gives the same result; the loop shows the mechanics)
total = 0
for value in data:
    total += value
print(f"Sum: {total}")

# Find maximum value
# Seeding with data[0] assumes the list is non-empty.
max_val = data[0]
for val in data:
    if val > max_val:
        max_val = val

# Enumerate for index and value
for index, value in enumerate(data):
    print(f"Index {index}: {value}")

# While loop with break/continue
# `break` stops at the first match, so `i` is left at that element's index.
i = 0
while i < len(data):
    if data[i] > 50:
        print(f"Found value > 50: {data[i]}")
        break
    i += 1

# Continue to skip certain values
for val in data:
    if val < 20:
        continue  # Skip small values
    print(f"Processing: {val}")
List Comprehensions for Data Transformation
# Basic list comprehension: one output element per input element
data = [1, 2, 3, 4, 5]
squares = [n * n for n in data]  # [1, 4, 9, 16, 25]
# With condition: the trailing `if` filters which elements are kept
evens = [n for n in data if not n % 2]  # [2, 4]
# Nested comprehension for matrix operations (outer loop is written first)
matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
flattened = [cell for row in matrix for cell in row]  # [1, 2, 3, 4, 5, 6, 7, 8, 9]
# Dictionary comprehension: key/value pairs built in one expression
names = ['Alice', 'Bob', 'Charlie']
name_lengths = {person: len(person) for person in names}  # {'Alice': 5, 'Bob': 3, 'Charlie': 7}
# Set comprehension: duplicates collapse automatically
data_with_dups = [1, 2, 2, 3, 3, 3]
unique_squares = {n * n for n in data_with_dups}  # {1, 4, 9}
# Generator expression for memory efficiency: values are produced one at a
# time and consumed by sum(), so no million-element list is ever built
sum_of_squares = sum(n * n for n in range(1_000_000))
Working with DataFrames Using Control Flow
import pandas as pd
# Sample data
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'age': [25, 30, 35, 28],
    'salary': [50000, 60000, 75000, 55000],
    'department': ['IT', 'HR', 'Finance', 'IT']
})

# Conditional column creation: 10% bonus by default, 15% above 60000.
# A boolean mask updates every matching row in one operation instead of
# calling a Python lambda once per row.
df['bonus'] = df['salary'] * 0.10
df.loc[df['salary'] > 60000, 'bonus'] = df['salary'] * 0.15

# Row-wise operations with iterrows (shown for comparison; slow on large data)
# The labels are collected first and assigned afterwards, because a frame
# should not be modified while it is being iterated over.
categories = []
for index, row in df.iterrows():
    if row['age'] > 28:
        categories.append('Senior')
    else:
        categories.append('Junior')
df['category'] = categories

# Vectorized approach (preferred for large datasets)
# Start from the default label, then overwrite the rows matching the mask.
df['category'] = 'Junior'
df.loc[df['age'] > 28, 'category'] = 'Senior'

# Filter with boolean indexing
high_salary = df[df['salary'] > 55000]
it_dept = df[df['department'] == 'IT']
Handling Exceptions in Data Processing
# Try-except for robust data handling
data = ['10', '20', 'thirty', '40', '50']
valid_numbers = []
for item in data:
    # Keep the try body down to the one call that can fail; the success
    # path goes in `else` so an unrelated error is never caught by accident.
    try:
        number = float(item)
    except ValueError:
        print(f"Could not convert: {item}")
    else:
        valid_numbers.append(number)
print(valid_numbers)  # [10.0, 20.0, 40.0, 50.0]
# Multiple exception handling
def safe_divide(a, b):
    """Return ``a / b``, or ``None`` when the division cannot be performed.

    Args:
        a: Dividend.
        b: Divisor.

    Returns:
        The quotient, or ``None`` if ``b`` is zero or the operands do not
        support division with each other.
    """
    try:
        return a / b
    # Both failures get the same fallback, so one clause with a tuple
    # handles them instead of two identical handlers.
    except (ZeroDivisionError, TypeError):
        return None
# Finally for cleanup
# NOTE: `some_function` is a placeholder for the operation being attempted;
# it is not defined in the visible code, so substitute a real call here.
try:
    result = some_function()
except Exception as e:
    # Broad catch is acceptable only at a top-level boundary like this one,
    # where the error is reported rather than silently dropped.
    print(f"Error: {e}")
finally:
    # Runs whether or not the try body raised.
    print("Process complete")
# Custom exceptions for data validation
class DataValidationError(Exception):
    """Raised when a value fails a data validation rule."""


def validate_age(age):
    """Return ``age`` unchanged if it is valid.

    Args:
        age: Age value to check; must be comparable with ``0``.

    Returns:
        The same ``age`` value that was passed in.

    Raises:
        DataValidationError: If ``age`` is negative.
    """
    if age < 0:
        raise DataValidationError("Age cannot be negative")
    return age
Practice Exercise: Data Processing Pipeline
import pandas as pd
# Create sample dataset
# The None entries in 'value' and 'category' are deliberate gaps for the
# cleaning steps that follow.
data = dict(
    id=list(range(1, 9)),
    value=[100, 200, None, 400, 500, 600, 700, 800],
    category=['A', 'B', 'A', 'C', None, 'B', 'A', 'C'],
    score=[85, 90, 78, 92, 88, 95, 80, 87],
)
df = pd.DataFrame(data)
def process_data(df):
    """Clean the dataset and add derived columns.

    The frame is modified in place and also returned; pass ``df.copy()``
    to keep the caller's original untouched.

    Args:
        df: DataFrame with a numeric 'value' column, a 'category' column
            and a numeric 'score' column.

    Returns:
        The same DataFrame with missing values filled and the
        'value_normalized' and 'grade' columns added.
    """
    # Step 1: Handle missing values
    # Fill numeric missing values with median
    df['value'] = df['value'].fillna(df['value'].median())
    # Fill categorical missing values with mode. mode() is empty when every
    # category is missing, so there is nothing to fill with in that case.
    modes = df['category'].mode()
    if not modes.empty:
        df['category'] = df['category'].fillna(modes.iloc[0])

    # Step 2: Add derived features
    # Min-max scaling; when all values are equal the range is zero, so use
    # 0.0 instead of dividing by zero.
    value_min = df['value'].min()
    value_range = df['value'].max() - value_min
    if value_range == 0:
        df['value_normalized'] = 0.0
    else:
        df['value_normalized'] = (df['value'] - value_min) / value_range

    # Step 3: Create categories based on score
    # Start from the lowest grade and overwrite upwards with boolean masks,
    # so the later (stricter) threshold wins.
    df['grade'] = 'C'
    df.loc[df['score'] >= 80, 'grade'] = 'B'
    df.loc[df['score'] >= 90, 'grade'] = 'A'
    return df
# process_data assigns columns on the frame it receives, so hand it a copy
# to leave the original `df` unchanged.
processed_df = process_data(df.copy())
print(processed_df)
Key Takeaways
- Use vectorized operations - Pandas/NumPy are faster than loops
- List comprehensions - More Pythonic than traditional loops
- Exception handling - Essential for real-world data cleaning
- Avoid nested loops - Prefer vectorized column operations; apply/map still run Python code per element, so use them only when no vectorized form exists
Control Flow Best Practices
| Practice | Recommendation |
|---|---|
| Loops over data | Use list comprehension when possible |
| DataFrame operations | Use vectorized methods (column arithmetic, boolean masks); fall back to apply/map only when needed |
| Conditionals | Use numpy.where for column operations |
| Error handling | Try-except for data cleaning |
| Performance | Avoid loops on large datasets |