Introduction
Feature selection identifies the most relevant features for prediction, reducing overfitting and improving model interpretability.
SelectKBest
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.datasets import make_classification
# Synthetic classification data: 100 samples with 10 features each.
X, y = make_classification(
    n_samples=100,
    n_features=10,
    random_state=42,
)
# Score every feature with f_classif and keep the k=5 best-scoring ones.
selector = SelectKBest(f_classif, k=5)
selector.fit(X, y)
X_selected = selector.transform(X)
# Per-feature scores and p-values computed during fit.
print(f"Scores: {selector.scores_}")
print(f"P-values: {selector.pvalues_}")
# Which of the original features were kept.
print(f"Selected features: {selector.get_support()}")
SelectPercentile
from sklearn.feature_selection import SelectPercentile, mutual_info_classif
# Keep the top 30 percent of features as scored by mutual_info_classif.
selector = SelectPercentile(mutual_info_classif, percentile=30)
selector.fit(X, y)
X_selected = selector.transform(X)
Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Base estimator used by RFE to rank the features
estimator = LogisticRegression()
# Eliminate one feature per iteration (step=1) until 5 remain;
# fit() returns the fitted selector, so it can be chained.
rfe = RFE(estimator, n_features_to_select=5, step=1).fit(X, y)
print(f"Ranking: {rfe.ranking_}")
print(f"Selected: {rfe.support_}")
Feature Importance
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
# Fit a random forest; its per-feature importances drive the selection below.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)
# Get importances
importances = rf.feature_importances_
# Feature indices ordered from most to least important
# (argsort is ascending, so the result is reversed).
indices = np.argsort(importances)[::-1]
# Select important features
# rf is already fitted above, so prefit=True tells SelectFromModel to reuse
# it as-is; without it the selector must be fitted before transform().
# threshold='mean' keeps features whose importance is at least the mean.
selector = SelectFromModel(rf, threshold='mean', prefit=True)
X_selected = selector.transform(X)
Practice Problems
- Use SelectKBest with f_classif
- Implement recursive feature elimination
- Extract feature importances from a tree-based model
- Use SelectFromModel for selection
- Compare univariate vs model-based selection