fix: improve the error message provided by deduplicate

MarieSacksick · MarieSacksick · commit 4508b7df4e9b · 2026-03-28T15:57:41.000+01:00
diff --git a/skrub/_deduplicate.py b/skrub/_deduplicate.py
@@ -2,6 +2,8 @@
 Implements deduplication based on clustering string distance matrices.
 """
 
+import warnings
+
 import numpy as np
 import pandas as pd
 from joblib import Parallel, delayed
@@ -75,7 +77,7 @@ def _guess_clusters(Z, distance_mat, n_jobs=None):
         number of clusters that maximize the silhouette score.
     """
     max_clusters = Z.shape[0]
-    n_clusters = np.arange(2, max_clusters)
+    n_clusters = np.arange(1, max_clusters)
     # silhouette score needs a redundant distance matrix
     redundant_dist = squareform(distance_mat)
     silhouette_scores = Parallel(n_jobs=n_jobs, prefer="processes")(
@@ -136,6 +138,7 @@ def deduplicate(
     analyzer="char_wb",
     linkage_method="average",
     n_jobs=None,
+    warn=False,
 ):
     """Deduplicate categorical data by hierarchically clustering similar strings.
 
@@ -168,6 +171,9 @@ def deduplicate(
         average distance between data points in the first and second cluster.
     n_jobs : int, default=None
         The number of jobs to run in parallel.
+    warn : bool, default=False
+        If True, emit a warning when clustering fails (e.g. too few or too
+        similar entries) and the input is returned unchanged.
 
     Returns
     -------
@@ -260,14 +266,23 @@ def deduplicate(
     9  white      9              white
     """
     unique_words, counts = np.unique(X, return_counts=True)
-    distance_mat = _compute_ngram_distance(
-        unique_words, ngram_range=ngram_range, analyzer=analyzer
-    )
-
-    Z = linkage(distance_mat, method=linkage_method, optimal_ordering=True)
-    if n_clusters is None:
-        n_clusters = _guess_clusters(Z, distance_mat, n_jobs)
-    clusters = fcluster(Z, n_clusters, criterion="maxclust")
+    try:
+        distance_mat = _compute_ngram_distance(
+            unique_words, ngram_range=ngram_range, analyzer=analyzer
+        )
+        Z = linkage(distance_mat, method=linkage_method, optimal_ordering=True)
+        if n_clusters is None:
+            n_clusters = _guess_clusters(Z, distance_mat, n_jobs)
+        clusters = fcluster(Z, n_clusters, criterion="maxclust")
+    except Exception:
+        if warn:
+            warnings.warn(
+                "Deduplication could not cluster the data (too few or too similar"
+                " entries). Returning the input unchanged.",
+                UserWarning,
+                stacklevel=2,
+            )
+        return list(X)
 
     translation_table = _create_spelling_correction(unique_words, counts, clusters)
     unrolled_corrections = translation_table[X]
diff --git a/skrub/tests/test_deduplicate.py b/skrub/tests/test_deduplicate.py
@@ -2,6 +2,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+import warnings
 from scipy.cluster.hierarchy import linkage
 from scipy.spatial.distance import squareform
 from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel
@@ -145,6 +146,39 @@ def start_call(self):
 joblib.register_parallel_backend("testing", DummyBackend)
 
 
+@pytest.mark.parametrize(
+    "X",
+    [
+        # Too few unique entries for silhouette score (only 2 unique values)
+        ["black", "black", "black", "blac"],
+        # Too few unique entries (3 unique values, only 1 cluster possible)
+        ["black", "white", "black", "black", "blac"],
+        # 4 unique values but still not enough for clustering to succeed
+        ["black", "black", "black", "black", "white", "white", "white", "red", "green"],
+    ],
+)
+def test_deduplicate_failure_returns_input(X):
+    result = deduplicate(X)
+    assert isinstance(result, list)
+    assert result == X
+
+
+@pytest.mark.parametrize(
+    "X",
+    [
+        ["black", "black", "black", "blac"],
+        ["black", "white", "black", "black", "blac"],
+    ],
+)
+def test_deduplicate_warn(X):
+    with pytest.warns(UserWarning, match="Returning the input unchanged"):
+        deduplicate(X, warn=True)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        deduplicate(X, warn=False)  # raises if any warning is emitted
+
+
 @skip_if_no_parallel
 def test_backend_respected():
     """