From 753413804d592286e8262b0b946cff351c17829f Mon Sep 17 00:00:00 2001 From: Luca Simi Date: Mon, 26 May 2025 21:22:41 +0200 Subject: [PATCH] Improved code. Minor fixes --- .gitignore | 1 + Makefile | 23 +++++ app/streamlit_app.py | 110 ++++++++++++++++++++-- src/tdamapper/_plot_plotly.py | 2 +- src/tdamapper/clustering.py | 2 +- src/tdamapper/utils/heap.py | 58 ++++++------ src/tdamapper/utils/vptree_hier/common.py | 20 ++-- src/tdamapper/utils/vptree_hier/vptree.py | 28 +++--- tests/test_bench_cover.py | 3 + 9 files changed, 183 insertions(+), 64 deletions(-) create mode 100644 Makefile diff --git a/.gitignore b/.gitignore index 85900a39..d2d0e496 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,4 @@ .idea dist/ build/ +coverage.xml \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..ff140ee8 --- /dev/null +++ b/Makefile @@ -0,0 +1,23 @@ +PYTHON = python +PIP = pip + +.PHONY: all +all: install + +.PHONY: install +install: + $(PIP) install -e .[dev] + +.PHONY: test +test: + coverage run --source=src -m pytest tests/test_unit_*.py + coverage xml + +.PHONY: bench +bench: + $(PYTHON) -m pytest tests/test_bench_*.py -s -o log_cli=true --log-level=INFO + +.PHONY: clean +clean: + find . -type d -name "__pycache__" -exec rm -r {} + + find . -type f -name "*.pyc" -delete diff --git a/app/streamlit_app.py b/app/streamlit_app.py index 9637892a..39c778a7 100644 --- a/app/streamlit_app.py +++ b/app/streamlit_app.py @@ -23,8 +23,8 @@ from umap import UMAP from tdamapper.core import aggregate_graph -from tdamapper.cover import BallCover, CubicalCover -from tdamapper.learn import MapperAlgorithm +from tdamapper.cover import BallCover, CubicalCover, KNNCover +from tdamapper.learn import MapperAlgorithm, MapperClustering from tdamapper.plot import MapperPlot LIMITS_ENABLED = bool(os.environ.get("LIMITS_ENABLED", False)) @@ -63,8 +63,12 @@ V_COVER_CUBICAL = "Cubical" +V_COVER_KNN = "KNN" + V_CLUSTERING_TRIVIAL = "Trivial" +V_CLUSTERING_COVER = "Cover" + V_CLUSTERING_AGGLOMERATIVE = "Agglomerative" V_CLUSTERING_DBSCAN = "DBSCAN" @@ -198,7 +202,10 @@ def _get_data_summary(df_X, df_y): } ).T df_summary = pd.DataFrame( - {V_DATA_SUMMARY_FEAT: df.columns, V_DATA_SUMMARY_HIST: df_hist.values.tolist()} + { + V_DATA_SUMMARY_FEAT: df.columns, + V_DATA_SUMMARY_HIST: df_hist.values.tolist(), + } ) return df_summary @@ -316,9 +323,10 @@ def mapper_lens_input_section(X): if pca_n > n_feats: lens = X else: - lens = PCA(n_components=pca_n, random_state=pca_random_state).fit_transform( - X - ) + lens = PCA( + n_components=pca_n, + random_state=pca_random_state, + ).fit_transform(X) elif lens_type == V_LENS_UMAP: umap_n = st.number_input( "UMAP Components", @@ -343,7 +351,12 @@ def mapper_cover_input_section(): st.header("🌐 Cover") cover_type = st.selectbox( "Type", - options=[V_COVER_TRIVIAL, V_COVER_BALL, V_COVER_CUBICAL], + options=[ + V_COVER_TRIVIAL, + V_COVER_BALL, + V_COVER_CUBICAL, + V_COVER_KNN, + ], index=2, ) cover = None @@ -379,9 +392,79 @@ def mapper_cover_input_section(): "Overlap", value=0.25, min_value=0.0, max_value=1.0 ) cover = CubicalCover(n_intervals=cubical_n, overlap_frac=cubical_p) + elif cover_type == V_COVER_KNN: + knn_k = st.number_input("Neighbors", value=10, min_value=1) + cover = KNNCover(neighbors=knn_k) return cover +def mapper_clustering_cover(): + cover_type = st.selectbox( + "Type", + options=[ + V_COVER_TRIVIAL, + V_COVER_BALL, + V_COVER_CUBICAL, + V_COVER_KNN, + ], + index=2, + key="mapper_clustering_cover_type", + ) + cover = None + if cover_type == V_COVER_TRIVIAL: + cover = None + elif cover_type == V_COVER_BALL: + ball_r = st.number_input( + "Radius", + value=100.0, + min_value=0.0, + key="mapper_clustering_radius", + ) + metric = st.selectbox( + "Metric", + options=[ + "euclidean", + "chebyshev", + "manhattan", + "cosine", + ], + key="mapper_clustering_cover_metric", + ) + cover = BallCover(radius=ball_r, metric=metric) + elif cover_type == V_COVER_CUBICAL: + cubical_n = st.number_input( + "Intervals", + value=10, + min_value=0, + key="mapper_clustering_cover_intervals", + ) + cubical_overlap = st.checkbox( + "Set overlap", + value=False, + help="Uses a dimension-dependant default overlap when unchecked", + key="mapper_clustering_cover_set_overlap", + ) + cubical_p = None + if cubical_overlap: + cubical_p = st.number_input( + "Overlap", + value=0.25, + min_value=0.0, + max_value=1.0, + key="mapper_clustering_cover_overlap", + ) + cover = CubicalCover(n_intervals=cubical_n, overlap_frac=cubical_p) + elif cover_type == V_COVER_KNN: + knn_k = st.number_input( + "Neighbors", + value=10, + min_value=1, + key="mapper_clustering_knn_k", + ) + cover = KNNCover(neighbors=knn_k) + return MapperClustering(cover=cover, n_jobs=-2) + + def mapper_clustering_kmeans(): clust_num = st.number_input( "Clusters", @@ -485,17 +568,20 @@ def mapper_clustering_input_section(): "Type", options=[ V_CLUSTERING_TRIVIAL, + V_CLUSTERING_COVER, V_CLUSTERING_KMEANS, V_CLUSTERING_AGGLOMERATIVE, V_CLUSTERING_DBSCAN, V_CLUSTERING_HDBSCAN, V_CLUSTERING_AFFINITY_PROPAGATION, ], - index=1, + index=0, ) clustering = None if clustering_type == V_CLUSTERING_TRIVIAL: clustering = None + elif clustering_type == V_CLUSTERING_COVER: + clustering = mapper_clustering_cover() elif clustering_type == V_CLUSTERING_AGGLOMERATIVE: clustering = mapper_clustering_agglomerative() elif clustering_type == V_CLUSTERING_KMEANS: @@ -625,7 +711,13 @@ def compute_mapper_fig(mapper_plot, colors, node_size, cmap, _agg, agg_name): logger.info("Generating Mapper figure") mapper_fig = mapper_plot.plot_plotly( colors, - node_size=node_size, + node_size=[ + 0.0, + node_size / 2.0, + node_size, + node_size * 1.5, + node_size * 2.0, + ], agg=_agg, title=[f"{c}" for c in colors.columns], cmap=cmap, diff --git a/src/tdamapper/_plot_plotly.py b/src/tdamapper/_plot_plotly.py index 25654b88..77790a60 100644 --- a/src/tdamapper/_plot_plotly.py +++ b/src/tdamapper/_plot_plotly.py @@ -73,7 +73,7 @@ def plot_plotly( titles = [title for _ in range(colors_num)] elif isinstance(title, list) and len(title) == colors_num: titles = title - node_sizes = [node_size] if isinstance(node_size, int) else node_size + node_sizes = [node_size] if isinstance(node_size, (int, float)) else node_size fig = _figure(mapper_plot, width, height, node_sizes, colors, titles, agg, cmaps) _add_ui_to_layout(mapper_plot, fig, colors, titles, node_sizes, agg, cmaps) return fig diff --git a/src/tdamapper/clustering.py b/src/tdamapper/clustering.py index 72d10b02..1d63279c 100644 --- a/src/tdamapper/clustering.py +++ b/src/tdamapper/clustering.py @@ -43,6 +43,7 @@ def __init__(self, cover=None, clustering=None, n_jobs=1): self.n_jobs = n_jobs def fit(self, X, y=None): + y = X if y is None else y X, y = self._validate_X_y(X, y) cover = TrivialCover() if self.cover is None else self.cover cover = clone(cover) @@ -53,7 +54,6 @@ def fit(self, X, y=None): ) clustering = clone(clustering) n_jobs = self.n_jobs - y = X if y is None else y itm_lbls = mapper_connected_components( X, y, diff --git a/src/tdamapper/utils/heap.py b/src/tdamapper/utils/heap.py index a2b024ad..9b34eeeb 100644 --- a/src/tdamapper/utils/heap.py +++ b/src/tdamapper/utils/heap.py @@ -13,91 +13,91 @@ def _parent(i): class _HeapNode: def __init__(self, key, value): - self.__key = key - self.__value = value + self._key = key + self._value = value def get(self): - return self.__key, self.__value + return self._key, self._value def __lt__(self, other): - return self.__key < other + return self._key < other._key def __le__(self, other): - return self.__key <= other + return self._key <= other._key def __gt__(self, other): - return self.__key > other + return self._key > other._key def __ge__(self, other): - return self.__key >= other + return self._key >= other._key class MaxHeap: def __init__(self): - self.__heap = [] - self.__iter = None + self._heap = [] + self._iter = None def __iter__(self): - self.__iter = iter(self.__heap) + self._iter = iter(self._heap) return self def __next__(self): - node = next(self.__iter) + node = next(self._iter) return node.get() def __len__(self): - return len(self.__heap) + return len(self._heap) def top(self): - if not self.__heap: + if not self._heap: return (None, None) - return self.__heap[0].get() + return self._heap[0].get() def pop(self): - if not self.__heap: + if not self._heap: return - max_val = self.__heap[0] - self.__heap[0] = self.__heap[-1] - self.__heap.pop() + max_val = self._heap[0] + self._heap[0] = self._heap[-1] + self._heap.pop() self._bubble_down() return max_val.get() def add(self, key, val): - self.__heap.append(_HeapNode(key, val)) + self._heap.append(_HeapNode(key, val)) self._bubble_up() def _get_local_max(self, i): - heap_len = len(self.__heap) + heap_len = len(self._heap) left = _left(i) right = _right(i) if left >= heap_len: return i if right >= heap_len: - if self.__heap[i] < self.__heap[left]: + if self._heap[i] < self._heap[left]: return left return i max_child = left - if self.__heap[left] < self.__heap[right]: + if self._heap[left] < self._heap[right]: max_child = right - if self.__heap[i] < self.__heap[max_child]: + if self._heap[i] < self._heap[max_child]: return max_child return i def _fix_down(self, i): local_max = self._get_local_max(i) if i < local_max: - self.__heap[i], self.__heap[local_max] = ( - self.__heap[local_max], - self.__heap[i], + self._heap[i], self._heap[local_max] = ( + self._heap[local_max], + self._heap[i], ) return local_max return i def _fix_up(self, i): parent = _parent(i) - if self.__heap[parent] < self.__heap[i]: - self.__heap[i], self.__heap[parent] = self.__heap[parent], self.__heap[i] + if self._heap[parent] < self._heap[i]: + self._heap[i], self._heap[parent] = self._heap[parent], self._heap[i] return parent return i @@ -110,7 +110,7 @@ def _bubble_down(self): current = local_max def _bubble_up(self): - current = len(self.__heap) - 1 + current = len(self._heap) - 1 done = False while not done: local_max = self._fix_up(current) diff --git a/src/tdamapper/utils/vptree_hier/common.py b/src/tdamapper/utils/vptree_hier/common.py index 749c8fab..ec123731 100644 --- a/src/tdamapper/utils/vptree_hier/common.py +++ b/src/tdamapper/utils/vptree_hier/common.py @@ -38,32 +38,32 @@ def partition(self, s, e, k): class Node: def __init__(self, radius, center, left, right): - self.__radius = radius - self.__center = center - self.__left = left - self.__right = right + self._radius = radius + self._center = center + self._left = left + self._right = right def get_ball(self): - return self.__radius, self.__center + return self._radius, self._center def is_terminal(self): return False def get_left(self): - return self.__left + return self._left def get_right(self): - return self.__right + return self._right class Leaf: def __init__(self, start, end): - self.__start = start - self.__end = end + self._start = start + self._end = end def get_bounds(self): - return self.__start, self.__end + return self._start, self._end def is_terminal(self): return True diff --git a/src/tdamapper/utils/vptree_hier/vptree.py b/src/tdamapper/utils/vptree_hier/vptree.py index cd0f0172..68ce0aa5 100644 --- a/src/tdamapper/utils/vptree_hier/vptree.py +++ b/src/tdamapper/utils/vptree_hier/vptree.py @@ -15,37 +15,37 @@ def __init__( leaf_radius=0.0, pivoting=None, ): - self.__metric = metric - self.__metric_params = metric_params - self.__leaf_capacity = leaf_capacity - self.__leaf_radius = leaf_radius - self.__pivoting = pivoting - self.__tree, self._arr = Builder(self, X).build() + self._metric = metric + self._metric_params = metric_params + self._leaf_capacity = leaf_capacity + self._leaf_radius = leaf_radius + self._pivoting = pivoting + self._tree, self._arr = Builder(self, X).build() def get_metric(self): - return self.__metric + return self._metric def get_metric_params(self): - return self.__metric_params + return self._metric_params def get_leaf_capacity(self): - return self.__leaf_capacity + return self._leaf_capacity def get_leaf_radius(self): - return self.__leaf_radius + return self._leaf_radius def get_pivoting(self): - return self.__pivoting + return self._pivoting def _get_tree(self): - return self.__tree + return self._tree def _get_arr(self): return self._arr def _get_distance(self): - metric_params = self.__metric_params or {} - return get_metric(self.__metric, **metric_params) + metric_params = self._metric_params or {} + return get_metric(self._metric, **metric_params) def ball_search(self, point, eps, inclusive=True): return BallSearch(self, point, eps, inclusive).search() diff --git a/tests/test_bench_cover.py b/tests/test_bench_cover.py index ee3f4ac0..f150f3a9 100644 --- a/tests/test_bench_cover.py +++ b/tests/test_bench_cover.py @@ -42,6 +42,7 @@ def cover(vpt, X, r): def run_bench(X, r, dist, vp, **kwargs): XX = np.array([[i] + [xi for xi in x] for i, x in enumerate(X)]) + vpt = vp(XX, metric=dist_proj, **kwargs) # first run of jit-compiled functions t0 = time.time() vpt = vp(XX, metric=dist_proj, **kwargs) list(cover(vpt, XX, r)) @@ -57,9 +58,11 @@ def test_cover_random(): logger.info(f"[n: {n}, r: {r}]") X = dataset(num=n) logger.info(">>>>>>> HVPT >>>>>>") + run_bench(X, r, dist, HVPT, leaf_radius=r, pivoting=None) run_bench(X, r, dist, HVPT, leaf_radius=r, pivoting="random") run_bench(X, r, dist, HVPT, leaf_radius=r, pivoting="furthest") logger.info(">>>>>>> FVPT >>>>>>") + run_bench(X, r, dist, FVPT, leaf_radius=r, pivoting=None) run_bench(X, r, dist, FVPT, leaf_radius=r, pivoting="random") run_bench(X, r, dist, FVPT, leaf_radius=r, pivoting="furthest") logger.info(">>>>>> SKBT >>>>>>")