Introduction
Ensemble methods combine the predictions of multiple models to achieve better predictive performance than any single model. The main strategies are voting, bagging, and boosting.
Random Forest
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.datasets import make_classification

# Synthetic classification dataset: 100 samples, 4 features, fixed seed.
X, y = make_classification(n_samples=100, n_features=4, random_state=42)

# oob_score=True is required for the fitted forest to expose `oob_score_`;
# without it the attribute is never computed and reading it below raises
# AttributeError. The out-of-bag estimate relies on bootstrap sampling, which
# is the estimator's default.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)
rf.fit(X, y)

print(f"Feature importances: {rf.feature_importances_}")
print(f"OOB score: {rf.oob_score_}")
Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Hold out part of the data so models are fit and evaluated on disjoint
# samples. X and y come from the make_classification call in the Random
# Forest section; the train/test names defined here are the ones the
# following sections use as well.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gb.fit(X_train, y_train)

# Predicted class labels for the held-out samples.
pred = gb.predict(X_test)
AdaBoost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Weak learner for boosting: a depth-1 decision tree (a "stump").
base_estimator = DecisionTreeClassifier(max_depth=1)

# fit() returns the estimator itself, so construction and training are
# chained; `ada` is the fitted AdaBoost model.
ada = AdaBoostClassifier(
    estimator=base_estimator,
    random_state=42,
    learning_rate=1.0,
    n_estimators=50,
).fit(X_train, y_train)
Voting Classifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# (name, estimator) pairs that make up the ensemble. Soft voting averages
# predicted class probabilities, so SVC is created with probability=True to
# make it provide them.
members = [
    ("lr", LogisticRegression()),
    ("dt", DecisionTreeClassifier()),
    ("svm", SVC(probability=True)),
]

voting = VotingClassifier(estimators=members, voting="soft")
voting.fit(X_train, y_train)
Practice Problems
- Train random forest with different n_estimators
- Tune gradient boosting learning rate
- Compare hard vs soft voting
- Extract feature importances
- Use OOB score for validation (requires constructing the forest with oob_score=True)