From 3492bdb2e63d92b08cb3ba2dd436c5840dae88d9 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Braun Date: Sun, 10 May 2026 00:49:38 +0200 Subject: [PATCH] Fix DiceGenetic.compute_proximity_loss for all-categorical datasets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the dataset has no continuous features, `continuous_feature_indexes` is empty, so `feature_weights` is an empty np.array and the original implementation hit either: * `proximity_loss / sum(feature_weights)` ⇒ ZeroDivisionError / RuntimeWarning + NaN losses (the symptom @kburchfiel reported in #276 with the original quoted snippet), or * `product.reshape(-1, product.shape[-1])` ⇒ ValueError on a 0-sized array, depending on input shape. Both paths poison `compute_loss` with NaN/exceptions and break the genetic search for legitimate all-categorical use cases. Proximity is conceptually undefined when there are no continuous distances to weigh, so short-circuit with a zero loss vector matching the population shape. The categorical penalty in `compute_loss` already accounts for categorical sparsity, so dropping the proximity contribution is the correct semantics — and it matches what users expect when they explicitly set up a categorical-only `dice_ml.Data`. Adds `TestComputeProximityLossNoContinuousFeatures` covering the all-categorical path. The test fails on `origin/main` with the ValueError reshape variant of this bug. Closes #276. 
Co-Authored-By: Claude Opus 4.7 --- dice_ml/explainer_interfaces/dice_genetic.py | 8 +++ .../test_dice_interface/test_dice_genetic.py | 61 +++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/dice_ml/explainer_interfaces/dice_genetic.py b/dice_ml/explainer_interfaces/dice_genetic.py index 0b1bc1b7..b37a901c 100644 --- a/dice_ml/explainer_interfaces/dice_genetic.py +++ b/dice_ml/explainer_interfaces/dice_genetic.py @@ -370,6 +370,14 @@ def compute_proximity_loss(self, x_hat_unnormalized, query_instance_normalized): x_hat = self.data_interface.normalize_data(x_hat_unnormalized) feature_weights = np.array( [self.feature_weights_list[0][i] for i in self.data_interface.continuous_feature_indexes]) + # When the dataset has no continuous features, feature_weights is an + # empty array and the original `proximity_loss / sum(feature_weights)` + # divided by zero, raising RuntimeWarning + producing NaN losses that + # poison the genetic search. Proximity is conceptually undefined in + # that case (there are no continuous distances to weigh), so return a + # zero loss vector matching the population shape — see issue #276. + if len(feature_weights) == 0: + return np.zeros(x_hat.shape[0]) product = np.multiply( (abs(x_hat - query_instance_normalized)[:, [self.data_interface.continuous_feature_indexes]]), feature_weights) diff --git a/tests/test_dice_interface/test_dice_genetic.py b/tests/test_dice_interface/test_dice_genetic.py index 897b42eb..ebfe7005 100644 --- a/tests/test_dice_interface/test_dice_genetic.py +++ b/tests/test_dice_interface/test_dice_genetic.py @@ -261,3 +261,64 @@ def test_maxiter(self, desired_range, sample_custom_query_2, total_CFs, initiali for cfs_example in ans.cf_examples_list: for i in cfs_example.final_cfs_df[self.exp.data_interface.outcome_name].values: assert desired_range[0] <= i <= desired_range[1] + + +class TestComputeProximityLossNoContinuousFeatures: + """Regression for issue #276. 
+ + `DiceGenetic.compute_proximity_loss` divides by `sum(feature_weights)`, + where `feature_weights` is restricted to *continuous* feature indexes. + For an all-categorical dataset the array is empty and the original + implementation hit RuntimeWarning + NaN losses (or ZeroDivisionError on + older numpy). The genetic search then propagated NaN through `compute_loss` + and produced unusable counterfactuals. + """ + + def _make_explainer_categorical_only(self): + import numpy as np + import pandas as pd + from sklearn.compose import ColumnTransformer + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder + from sklearn.linear_model import LogisticRegression + + rng = np.random.default_rng(0) + df = pd.DataFrame({ + "color": rng.choice(["red", "green", "blue"], size=60), + "shape": rng.choice(["circle", "square"], size=60), + "label": rng.integers(0, 2, size=60), + }) + cat = ["color", "shape"] + X = df[cat] + y = df["label"] + clf = Pipeline([ + ("ohe", ColumnTransformer([("o", OneHotEncoder(), cat)])), + ("lr", LogisticRegression(max_iter=200)), + ]).fit(X, y) + + d = dice_ml.Data(dataframe=df, continuous_features=[], + categorical_features=cat, outcome_name="label") + m = dice_ml.Model(model=clf, backend="sklearn") + return dice_ml.Dice(d, m, method="genetic") + + def test_compute_proximity_loss_returns_zero_when_no_continuous_features(self): + import numpy as np + exp = self._make_explainer_categorical_only() + # Mirror the setup the explainer would normally do before any + # call to compute_proximity_loss: set continuous_feature_indexes + # (empty here) and populate feature_weights_list. Driving the full + # generate_counterfactuals path would exercise many other + # categorical-only code paths that aren't this bug. 
+ exp.data_interface.continuous_feature_indexes = [] + exp.feature_weights_list = [np.ones(len(exp.data_interface.feature_names))] + n_features = len(exp.data_interface.feature_names) + # normalize_data() consumes a 2-D ndarray for non-DataFrame input. + x_hat = np.zeros((4, n_features), dtype=float) + query = np.zeros((1, n_features), dtype=float) + # Must not raise / warn / produce NaN — origin/main produced + # RuntimeWarning: invalid value encountered in scalar divide + # plus NaN losses (or ZeroDivisionError on older numpy). + loss = exp.compute_proximity_loss(x_hat, query) + assert loss.shape == (4,) + assert np.all(loss == 0.0) + assert not np.any(np.isnan(loss))