1+ import time
2+
3+ import pandas as pd
4+ import numpy as np
5+
6+ from sklearn .decomposition import PCA
7+ from sklearn .datasets import fetch_openml , load_digits
8+ from sklearn .base import ClusterMixin
9+
10+ from tdamapper .clustering import TrivialClustering
11+
12+ import tdamapper as tm
13+ import gtda .mapper as gm
14+ import kmapper as km
15+
16+
def _segment(cardinality, dimension, noise=0.1, start=None, end=None):
    """Sample noisy points along the straight segment from ``start`` to ``end``.

    Every point is ``start + c * (end - start)``, where ``c`` is drawn with
    ``np.random.rand``, plus Gaussian noise drawn with ``np.random.normal``
    (mean 0, standard deviation ``noise``).

    :param cardinality: number of points to generate.
    :param dimension: number of coordinates of each point.
    :param noise: standard deviation of the additive Gaussian noise.
    :param start: first endpoint; defaults to ``np.zeros(dimension)``.
    :param end: second endpoint; defaults to ``np.ones(dimension)``.
    :return: the generated points, one row per point.
    """
    origin = np.zeros(dimension) if start is None else start
    target = np.ones(dimension) if end is None else end
    # One coefficient per point; the (cardinality, 1) shape broadcasts it
    # over all coordinates of that point.
    weights = np.random.rand(cardinality, 1)
    jitter = np.random.normal(0, noise, size=(cardinality, dimension))
    return origin + weights * (target - origin) + jitter
26+
27+
def _load_openml(name):
    """Fetch the OpenML dataset called ``name`` and return its features.

    The second value returned by ``fetch_openml`` (the target) is discarded;
    the features are converted with ``to_numpy()``.
    """
    features, _target = fetch_openml(name=name, return_X_y=True)
    return features.to_numpy()
31+
32+
def line(k):
    """Return 100000 points generated by ``_segment`` in ``k`` dimensions.

    The noise level passed to ``_segment`` is 0.01; endpoints are left at
    their defaults.
    """
    return _segment(cardinality=100000, dimension=k, noise=0.01)
35+
36+
def digits(k):
    """Return the ``load_digits`` features reduced by PCA to ``k`` components."""
    features, _labels = load_digits(return_X_y=True)
    reducer = PCA(n_components=k)
    return reducer.fit_transform(features)
40+
41+
def mnist(k):
    """Return the OpenML ``mnist_784`` features reduced by PCA to ``k`` components."""
    features = _load_openml('mnist_784')
    reducer = PCA(n_components=k)
    return reducer.fit_transform(features)
45+
46+
def cifar10(k):
    """Return the OpenML ``CIFAR_10`` features reduced by PCA to ``k`` components."""
    features = _load_openml('CIFAR_10')
    reducer = PCA(n_components=k)
    return reducer.fit_transform(features)
50+
51+
def fashion_mnist(k):
    """Return the OpenML ``Fashion-MNIST`` features reduced by PCA to ``k`` components."""
    features = _load_openml('Fashion-MNIST')
    reducer = PCA(n_components=k)
    return reducer.fit_transform(features)
55+
56+
# Wrapper class written to supply trivial clustering to giotto-tda; the
# benchmark runners in this file pass it as the clusterer to every library.
class TrivialEstimator(ClusterMixin):
    """Parameter-free clusterer that delegates to ``TrivialClustering``."""

    def get_params(self, deep=True):
        """Return an empty dict: this estimator exposes no parameters."""
        return dict()

    def set_params(self, **parameters):
        """Ignore any parameters given and return ``self``."""
        return self

    def fit(self, X, y=None):
        """Fit a fresh ``TrivialClustering`` on ``X`` and keep its ``labels_``.

        :return: ``self``, with ``labels_`` set from the fitted clustering.
        """
        fitted = TrivialClustering().fit(X, y)
        self.labels_ = fitted.labels_
        return self
70+
71+
def run_gm(X, n, p):
    """Time one giotto-tda Mapper run on ``X`` and return the elapsed seconds.

    The timed region covers both building the pipeline and calling
    ``fit_transform``. ``X`` is used as its own lens (identity filter
    function).

    :param X: dataset handed to the pipeline.
    :param n: number of intervals of the cubical cover.
    :param p: overlap fraction of the cubical cover.
    :return: elapsed time in seconds, as a float.
    """
    # perf_counter is monotonic and high resolution; time.time() is a wall
    # clock that may jump if the system clock is adjusted mid-run.
    t0 = time.perf_counter()
    pipe = gm.make_mapper_pipeline(
        filter_func=lambda x: x,
        cover=gm.CubicalCover(
            n_intervals=n,
            overlap_frac=p,
        ),
        clusterer=TrivialEstimator(),
    )
    # Keep the result bound to a name so that it is released only after the
    # clock has been read; the graph itself is not used.
    _graph = pipe.fit_transform(X)
    t1 = time.perf_counter()
    return t1 - t0
84+
85+
def run_tm(X, n, p):
    """Time one tda-mapper run on ``X`` and return the elapsed seconds.

    The timed region covers both building the ``MapperAlgorithm`` and
    calling ``fit_transform``. ``X`` is passed as both arguments of
    ``fit_transform``.

    :param X: dataset handed to the algorithm.
    :param n: number of intervals of the cubical cover.
    :param p: overlap fraction of the cubical cover.
    :return: elapsed time in seconds, as a float.
    """
    # perf_counter is monotonic and high resolution; time.time() is a wall
    # clock that may jump if the system clock is adjusted mid-run.
    t0 = time.perf_counter()
    algorithm = tm.core.MapperAlgorithm(
        cover=tm.cover.CubicalCover(
            n_intervals=n,
            overlap_frac=p,
            # Alternative cover options left disabled by the original author:
            # leaf_capacity=1000,
            # leaf_radius=1.0 / (2.0 - 2.0 * p),
            # kind='hierarchical',
            # pivoting='random',
        ),
        clustering=TrivialEstimator(),
    )
    # Keep the result bound to a name so that it is released only after the
    # clock has been read; the graph itself is not used.
    _graph = algorithm.fit_transform(X, X)
    t1 = time.perf_counter()
    return t1 - t0
101+
102+
def run_km(X, n, p):
    """Time one KeplerMapper run on ``X`` and return the elapsed seconds.

    The timed region covers both creating the ``KeplerMapper`` and calling
    ``map``. ``X`` is passed as both the lens and the data.

    :param X: dataset handed to the mapper.
    :param n: number of cubes of the cover.
    :param p: overlap percentage of the cover.
    :return: elapsed time in seconds, as a float.
    """
    # perf_counter is monotonic and high resolution; time.time() is a wall
    # clock that may jump if the system clock is adjusted mid-run.
    t0 = time.perf_counter()
    mapper = km.KeplerMapper(verbose=0)
    # Keep the result bound to a name so that it is released only after the
    # clock has been read; the graph itself is not used.
    _graph = mapper.map(
        lens=X,
        X=X,
        cover=km.Cover(
            n_cubes=n,
            perc_overlap=p,
        ),
        clusterer=TrivialEstimator(),
    )
    t1 = time.perf_counter()
    return t1 - t0
117+
118+
def run_bench(benches, datasets, dimensions, overlaps, intervals):
    """Run every benchmark on every configuration and save the timings as CSV.

    For each ``(bench, dataset, k, p, n)`` combination the dataset is built
    with ``dataset(k)`` (once per ``k``), ``bench(X, n, p)`` is called, and
    the returned time is recorded. Each measurement is printed and the
    results file is rewritten after every measurement, so partial results
    are on disk if a long run is interrupted.

    :param benches: iterable of ``(name, callable)`` pairs; the callable is
        invoked as ``bench(X, n, p)`` and returns the elapsed time.
    :param datasets: iterable of ``(name, callable)`` pairs; the callable is
        invoked as ``dataset(k)`` and returns the data.
    :param dimensions: iterable of values for ``k``.
    :param overlaps: iterable of values for ``p``.
    :param intervals: iterable of values for ``n``.
    :return: None. Output goes to ``./benchmark_<launch time>.csv``.
    """
    columns = ['bench', 'dataset', 'p', 'n', 'k', 'time']
    # Wall-clock timestamp, taken once so every write targets the same file.
    launch_time = int(time.time())
    out_path = f'./benchmark_{launch_time}.csv'
    # Plain records are collected instead of concatenating onto an empty
    # DataFrame: that pattern is deprecated in pandas and makes the column
    # dtypes depend on the pandas version.
    rows = []
    for bench_name, bench in benches:
        for dataset_name, dataset in datasets:
            for k in dimensions:
                X = dataset(k)
                for p in overlaps:
                    for n in intervals:
                        t = bench(X, n, p)
                        row = {
                            'bench': bench_name,
                            'dataset': dataset_name,
                            'p': p,
                            'n': n,
                            'k': k,
                            'time': t,
                        }
                        rows.append(row)
                        print(pd.DataFrame(row, index=[0]))
                        results = pd.DataFrame(rows, columns=columns)
                        results.to_csv(out_path, index=False)
147+
148+
if __name__ == '__main__':
    # Warm-up: first run to JIT-compile the numba-decorated functions
    # before any measurement is taken.
    run_tm(line(1), 1, 0.5)

    bench_list = [
        ('tda-mapper', run_tm),
        ('kepler-mapper', run_km),
        ('giotto-tda', run_gm),
    ]
    dataset_list = [
        ('line', line),
        ('digits', digits),
        ('mnist', mnist),
        ('cifar10', cifar10),
        ('fashion_mnist', fashion_mnist),
    ]
    dimension_list = [1, 2, 3, 4, 5]
    overlap_list = [0.125, 0.25, 0.5]
    interval_list = [10]

    run_bench(
        benches=bench_list,
        datasets=dataset_list,
        dimensions=dimension_list,
        overlaps=overlap_list,
        intervals=interval_list,
    )
0 commit comments