|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Benchmark: sklearnex HDBSCAN (oneDAL backend) vs stock sklearn HDBSCAN. |
| 4 | +
|
| 5 | +Measures wall-clock time, ARI vs ground truth, ARI between implementations, |
| 6 | +and cluster counts across multiple dataset sizes and algorithm choices. |
| 7 | +
|
| 8 | +Usage: |
| 9 | + # Ensure sklearnex is built against patched oneDAL: |
| 10 | + # conda activate build_sklearnex |
| 11 | + # source __release_lnx/daal/latest/env/vars.sh |
| 12 | + # cd ../scikit-learn-intelex && python setup.py develop --no-deps |
| 13 | + # |
| 14 | + # Then run: |
| 15 | + python scripts/benchmark_sklearnex_hdbscan.py |
| 16 | +""" |
| 17 | + |
| 18 | +import time |
| 19 | +import sys |
| 20 | +import warnings |
| 21 | + |
| 22 | +import numpy as np |
| 23 | +from sklearn.datasets import make_blobs |
| 24 | +from sklearn.metrics import adjusted_rand_score |
| 25 | +from sklearn.cluster import HDBSCAN as SklearnHDBSCAN |
| 26 | + |
| 27 | +try: |
| 28 | + from sklearnex.cluster import HDBSCAN as SklearnexHDBSCAN |
| 29 | +except ImportError: |
| 30 | + print("ERROR: sklearnex not available. Build it first:") |
| 31 | + print(" conda activate build_sklearnex") |
| 32 | + print(" cd scikit-learn-intelex && python setup.py develop --no-deps") |
| 33 | + sys.exit(1) |
| 34 | + |
# Silence all warnings process-wide. Note that this filter is global: it is
# not restricted to sklearn's convergence/future warnings.
warnings.filterwarnings("ignore")

# ---- Configuration ----
SIZES = [1000, 5000, 10000, 20000, 50000]  # n_samples of each generated dataset
N_FEATURES = 10  # n_features passed to make_blobs
N_CENTERS = 10  # number of blob centers in the benchmark datasets
MIN_CLUSTER_SIZE = 15  # min_cluster_size passed to both estimators
MIN_SAMPLES = 5  # min_samples passed to both estimators
N_RUNS = 3  # take median
ALGORITHMS = ["brute", "auto"]  # auto maps to kd_tree for euclidean in sklearnex
RANDOM_STATE = 42  # random_state passed to make_blobs for the benchmark data
| 47 | + |
| 48 | + |
def count_clusters(labels):
    """Return the number of distinct labels, excluding the noise label -1."""
    n_distinct = len(set(labels))
    # -1 marks noise rather than a cluster, so it must not be counted.
    if -1 in labels:
        n_distinct -= 1
    return n_distinct
| 51 | + |
| 52 | + |
def noise_fraction(labels):
    """Return the fraction of entries in ``labels`` equal to the noise label -1.

    Parameters
    ----------
    labels : array-like of int
        Cluster labels; -1 marks noise.

    Returns
    -------
    float
        Share of entries equal to -1, or 0.0 when ``labels`` is empty.
    """
    # Coerce first: on a plain list ``labels == -1`` evaluates to a single
    # False, which would silently report a noise fraction of 0.
    labels = np.asarray(labels)
    if labels.size == 0:
        # Avoid 0/0 (NaN) for empty input.
        return 0.0
    return float(np.count_nonzero(labels == -1) / labels.size)
| 55 | + |
| 56 | + |
def _timed_fits(estimator_cls, X, algorithm):
    """Construct and fit ``estimator_cls`` on X N_RUNS times.

    Returns a tuple ``(durations, labels)``: the wall-clock seconds of each
    run (construction plus ``fit``) and the ``labels_`` of the last run.
    """
    durations = []
    labels = None
    for _ in range(N_RUNS):
        start = time.perf_counter()
        model = estimator_cls(
            min_cluster_size=MIN_CLUSTER_SIZE,
            min_samples=MIN_SAMPLES,
            algorithm=algorithm,
        )
        model.fit(X)
        durations.append(time.perf_counter() - start)
        labels = model.labels_
    return durations, labels


def bench_one(X, y_true, algorithm):
    """Benchmark sklearn and sklearnex HDBSCAN on a single dataset.

    Parameters
    ----------
    X : array-like
        Data passed to ``fit`` of both estimators.
    y_true : array-like
        Ground-truth labels used for the ARI scores.
    algorithm : str
        Value forwarded as the ``algorithm`` argument of both estimators.

    Returns
    -------
    dict
        ``t_sk`` / ``t_sx``: median run time in seconds over N_RUNS runs;
        ``speedup``: ``t_sk / t_sx`` (``inf`` if ``t_sx`` is not positive);
        ``ari_sk_true`` / ``ari_sx_true``: ARI of each labelling vs ``y_true``;
        ``ari_cross``: ARI between the two labellings;
        ``nc_sk`` / ``nc_sx``: cluster counts;
        ``nf_sk`` / ``nf_sx``: noise fractions.
    """
    # Same timing procedure for both implementations; sklearn runs first.
    times_sk, sk_labels = _timed_fits(SklearnHDBSCAN, X, algorithm)
    times_sx, sx_labels = _timed_fits(SklearnexHDBSCAN, X, algorithm)

    t_sk = np.median(times_sk)
    t_sx = np.median(times_sx)
    # Guard against a zero median so the ratio never divides by zero.
    speedup = t_sk / t_sx if t_sx > 0 else float("inf")

    return {
        "t_sk": t_sk,
        "t_sx": t_sx,
        "speedup": speedup,
        "ari_sk_true": adjusted_rand_score(y_true, sk_labels),
        "ari_sx_true": adjusted_rand_score(y_true, sx_labels),
        "ari_cross": adjusted_rand_score(sk_labels, sx_labels),
        "nc_sk": count_clusters(sk_labels),
        "nc_sx": count_clusters(sx_labels),
        "nf_sk": noise_fraction(sk_labels),
        "nf_sx": noise_fraction(sx_labels),
    }
| 113 | + |
| 114 | + |
def main():
    """Run the benchmark for every algorithm and dataset size and print a table.

    For each entry of ALGORITHMS a header is printed, both estimators are
    warmed up on a small dataset, and then one result row is printed per
    entry of SIZES. A legend describing the columns is printed at the end.
    """
    print("=" * 110)
    print("HDBSCAN Benchmark: sklearnex (oneDAL) vs sklearn")
    print(f"Config: features={N_FEATURES}, centers={N_CENTERS}, "
          f"mcs={MIN_CLUSTER_SIZE}, ms={MIN_SAMPLES}, runs={N_RUNS}")
    print("=" * 110)

    for algorithm in ALGORITHMS:
        print(f"\n--- algorithm='{algorithm}' ---")
        # The time columns are 10 wide because each row prints a 9-character
        # number followed by an "s" suffix.
        print(f"{'N':>7s} | {'sklearn':>10s} {'sklearnex':>10s} {'speedup':>8s} | "
              f"{'ARI(sk)':>7s} {'ARI(sx)':>7s} {'ARI(x)':>7s} | "
              f"{'cl_sk':>5s} {'cl_sx':>5s} {'noise_sk':>8s} {'noise_sx':>8s}")
        print("-" * 110)

        # Warmup run, using the same algorithm that is about to be timed so
        # the benchmarked code path is the one that gets exercised.
        X_warm, _ = make_blobs(n_samples=200, n_features=N_FEATURES,
                               centers=3, random_state=0)
        SklearnHDBSCAN(min_cluster_size=5, algorithm=algorithm).fit(X_warm)
        SklearnexHDBSCAN(min_cluster_size=5, algorithm=algorithm).fit(X_warm)

        for n in SIZES:
            X, y_true = make_blobs(
                n_samples=n,
                n_features=N_FEATURES,
                centers=N_CENTERS,
                cluster_std=1.0,
                random_state=RANDOM_STATE,
            )

            r = bench_one(X, y_true, algorithm)

            print(
                f"{n:7d} | "
                f"{r['t_sk']:9.3f}s {r['t_sx']:9.3f}s {r['speedup']:7.1f}x | "
                f"{r['ari_sk_true']:7.4f} {r['ari_sx_true']:7.4f} {r['ari_cross']:7.4f} | "
                f"{r['nc_sk']:5d} {r['nc_sx']:5d} {r['nf_sk']:8.2%} {r['nf_sx']:8.2%}"
            )

    print("\n" + "=" * 110)
    print("Legend:")
    # Report the configured run count instead of a hard-coded "3".
    print(f" sklearn = stock sklearn.cluster.HDBSCAN (median of {N_RUNS} runs)")
    print(f" sklearnex = sklearnex.cluster.HDBSCAN backed by oneDAL (median of {N_RUNS} runs)")
    print(" speedup = sklearn_time / sklearnex_time")
    print(" ARI(sk) = Adjusted Rand Index of sklearn labels vs ground truth")
    print(" ARI(sx) = Adjusted Rand Index of sklearnex labels vs ground truth")
    print(" ARI(x) = Adjusted Rand Index between sklearn and sklearnex labels")
    print(" cl_sk/sx = number of clusters found")
    print(" noise_sk/sx = fraction of points labeled as noise")
    print("=" * 110)
| 164 | + |
| 165 | + |
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
0 commit comments