Introduction
Clustering algorithms group similar data points without using labels, relying instead on distance-based or density-based methods.
K-Means
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import numpy as np
# Toy dataset: 100 points generated around 3 blob centers.
# The second value returned by make_blobs (the generating labels) is discarded.
X, _ = make_blobs(n_samples=100, centers=3, random_state=42)

# Construct and fit in one chained call; fit() hands back the fitted estimator.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10).fit(X)

# Read the fitted attributes off the estimator in a single unpacking.
labels, centers, inertia = (
    kmeans.labels_,
    kmeans.cluster_centers_,
    kmeans.inertia_,
)
print(f"Inertia: {inertia:.2f}")
DBSCAN
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
# Two-moons dataset with a little noise; this rebinds X for the snippets below.
X, _ = make_moons(n_samples=100, noise=0.1, random_state=42)

# Fit first, then read the assignments from the fitted estimator.
dbscan = DBSCAN(eps=0.3, min_samples=5).fit(X)
labels = dbscan.labels_  # -1 label = noise points

# Name the values being reported before printing them.
found_labels = np.unique(labels)
first_core_samples = dbscan.core_sample_indices_[:10]
print(f"Cluster labels: {found_labels}")
print(f"Core samples: {first_core_samples}")
Agglomerative Clustering
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
import numpy as np
# Hierarchical clustering with Ward linkage, asking for 3 clusters.
# X is whatever dataset is currently bound to that name.
agg = AgglomerativeClustering(linkage='ward', n_clusters=3)
agg.fit(X)
labels = agg.labels_

# The same estimator configured with other linkage methods.
# These two are only constructed here (never fitted), and n_clusters is
# not passed, so it stays at the library default.
agg_complete, agg_average = (
    AgglomerativeClustering(linkage=criterion)
    for criterion in ('complete', 'average')
)
Finding Optimal Clusters
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
# Sweep the number of clusters and record two model-selection metrics per fit:
#   - inertia, the quantity an elbow plot is drawn from
#   - silhouette score, computed from the data and the fitted labels
# X is whatever dataset is currently bound to that name.
inertias = []
silhouettes = []
K = range(2, 11)  # candidate cluster counts: 2 through 10 inclusive
for k in K:
    # Fixed random_state so every k is fitted reproducibly.
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(X)
    inertias.append(km.inertia_)
    silhouettes.append(silhouette_score(X, km.labels_))

# Plot silhouette score against k (blue line with circle markers).
# `inertias` is collected above but not plotted in this snippet.
plt.plot(K, silhouettes, 'b-o')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
Practice Problems
- Implement K-Means clustering
- Use DBSCAN for non-convex clusters
- Visualize a dendrogram
- Find the optimal k using the silhouette score
- Compare different linkage methods