|
2 | 2 | Implements deduplication based on clustering string distance matrices. |
3 | 3 | """ |
4 | 4 |
|
| 5 | +import warnings |
| 6 | + |
5 | 7 | import numpy as np |
6 | 8 | import pandas as pd |
7 | 9 | from joblib import Parallel, delayed |
@@ -75,7 +77,7 @@ def _guess_clusters(Z, distance_mat, n_jobs=None): |
75 | 77 | number of clusters that maximize the silhouette score. |
76 | 78 | """ |
77 | 79 | max_clusters = Z.shape[0] |
78 | | - n_clusters = np.arange(2, max_clusters) |
| 80 | + n_clusters = np.arange(1, max_clusters) |
79 | 81 | # silhouette score needs a redundant distance matrix |
80 | 82 | redundant_dist = squareform(distance_mat) |
81 | 83 | silhouette_scores = Parallel(n_jobs=n_jobs, prefer="processes")( |
@@ -136,6 +138,7 @@ def deduplicate( |
136 | 138 | analyzer="char_wb", |
137 | 139 | linkage_method="average", |
138 | 140 | n_jobs=None, |
| 141 | + warn=False, |
139 | 142 | ): |
140 | 143 | """Deduplicate categorical data by hierarchically clustering similar strings. |
141 | 144 |
|
@@ -168,6 +171,9 @@ def deduplicate( |
168 | 171 | average distance between data points in the first and second cluster. |
169 | 172 | n_jobs : int, default=None |
170 | 173 | The number of jobs to run in parallel. |
| 174 | + warn : bool, default=False |
| 175 | + If True, emit a warning when clustering fails (e.g. too few or too |
| 176 | + similar entries) and the input is returned unchanged. |
171 | 177 |
|
172 | 178 | Returns |
173 | 179 | ------- |
@@ -260,14 +266,23 @@ def deduplicate( |
260 | 266 | 9 white 9 white |
261 | 267 | """ |
262 | 268 | unique_words, counts = np.unique(X, return_counts=True) |
263 | | - distance_mat = _compute_ngram_distance( |
264 | | - unique_words, ngram_range=ngram_range, analyzer=analyzer |
265 | | - ) |
266 | | - |
267 | | - Z = linkage(distance_mat, method=linkage_method, optimal_ordering=True) |
268 | | - if n_clusters is None: |
269 | | - n_clusters = _guess_clusters(Z, distance_mat, n_jobs) |
270 | | - clusters = fcluster(Z, n_clusters, criterion="maxclust") |
| 269 | + try: |
| 270 | + distance_mat = _compute_ngram_distance( |
| 271 | + unique_words, ngram_range=ngram_range, analyzer=analyzer |
| 272 | + ) |
| 273 | + Z = linkage(distance_mat, method=linkage_method, optimal_ordering=True) |
| 274 | + if n_clusters is None: |
| 275 | + n_clusters = _guess_clusters(Z, distance_mat, n_jobs) |
| 276 | + clusters = fcluster(Z, n_clusters, criterion="maxclust") |
| 277 | + except Exception: |
| 278 | + if warn: |
| 279 | + warnings.warn( |
| 280 | + "Deduplication could not cluster the data (too few or too similar" |
| 281 | + " entries). Returning the input unchanged.", |
| 282 | + UserWarning, |
| 283 | + stacklevel=2, |
| 284 | + ) |
| 285 | + return list(X) |
271 | 286 |
|
272 | 287 | translation_table = _create_spelling_correction(unique_words, counts, clusters) |
273 | 288 | unrolled_corrections = translation_table[X] |
|
0 commit comments