diff --git a/dice_ml/explainer_interfaces/dice_genetic.py b/dice_ml/explainer_interfaces/dice_genetic.py
index 0b1bc1b7..b37a901c 100644
--- a/dice_ml/explainer_interfaces/dice_genetic.py
+++ b/dice_ml/explainer_interfaces/dice_genetic.py
@@ -370,6 +370,14 @@ def compute_proximity_loss(self, x_hat_unnormalized, query_instance_normalized):
         x_hat = self.data_interface.normalize_data(x_hat_unnormalized)
         feature_weights = np.array(
             [self.feature_weights_list[0][i] for i in self.data_interface.continuous_feature_indexes])
+        # When the dataset has no continuous features, feature_weights is an
+        # empty array and the original `proximity_loss / sum(feature_weights)`
+        # divided by zero, raising RuntimeWarning + producing NaN losses that
+        # poison the genetic search. Proximity is conceptually undefined in
+        # that case (there are no continuous distances to weigh), so return a
+        # zero loss vector matching the population shape — see issue #276.
+        if len(feature_weights) == 0:
+            return np.zeros(x_hat.shape[0])
         product = np.multiply(
             (abs(x_hat - query_instance_normalized)[:, [self.data_interface.continuous_feature_indexes]]),
             feature_weights)
diff --git a/tests/test_dice_interface/test_dice_genetic.py b/tests/test_dice_interface/test_dice_genetic.py
index 897b42eb..ebfe7005 100644
--- a/tests/test_dice_interface/test_dice_genetic.py
+++ b/tests/test_dice_interface/test_dice_genetic.py
@@ -261,3 +261,64 @@ def test_maxiter(self, desired_range, sample_custom_query_2, total_CFs, initialize_CFs, maxiterations):
         for cfs_example in ans.cf_examples_list:
             for i in cfs_example.final_cfs_df[self.exp.data_interface.outcome_name].values:
                 assert desired_range[0] <= i <= desired_range[1]
+
+
+class TestComputeProximityLossNoContinuousFeatures:
+    """Regression for issue #276.
+
+    `DiceGenetic.compute_proximity_loss` divides by `sum(feature_weights)`,
+    where `feature_weights` is restricted to *continuous* feature indexes.
+    For an all-categorical dataset the array is empty and the original
+    implementation hit RuntimeWarning + NaN losses (or ZeroDivisionError on
+    older numpy). The genetic search then propagated NaN through `compute_loss`
+    and produced unusable counterfactuals.
+    """
+
+    def _make_explainer_categorical_only(self):
+        import numpy as np
+        import pandas as pd
+        from sklearn.compose import ColumnTransformer
+        from sklearn.pipeline import Pipeline
+        from sklearn.preprocessing import OneHotEncoder
+        from sklearn.linear_model import LogisticRegression
+
+        rng = np.random.default_rng(0)
+        df = pd.DataFrame({
+            "color": rng.choice(["red", "green", "blue"], size=60),
+            "shape": rng.choice(["circle", "square"], size=60),
+            "label": rng.integers(0, 2, size=60),
+        })
+        cat = ["color", "shape"]
+        X = df[cat]
+        y = df["label"]
+        clf = Pipeline([
+            ("ohe", ColumnTransformer([("o", OneHotEncoder(), cat)])),
+            ("lr", LogisticRegression(max_iter=200)),
+        ]).fit(X, y)
+
+        d = dice_ml.Data(dataframe=df, continuous_features=[],
+                         categorical_features=cat, outcome_name="label")
+        m = dice_ml.Model(model=clf, backend="sklearn")
+        return dice_ml.Dice(d, m, method="genetic")
+
+    def test_compute_proximity_loss_returns_zero_when_no_continuous_features(self):
+        import numpy as np
+        exp = self._make_explainer_categorical_only()
+        # Mirror the setup the explainer would normally do before any
+        # call to compute_proximity_loss: set continuous_feature_indexes
+        # (empty here) and populate feature_weights_list. Driving the full
+        # generate_counterfactuals path would exercise many other
+        # categorical-only code paths that aren't this bug.
+        exp.data_interface.continuous_feature_indexes = []
+        exp.feature_weights_list = [np.ones(len(exp.data_interface.feature_names))]
+        n_features = len(exp.data_interface.feature_names)
+        # normalize_data() consumes a 2-D ndarray for non-DataFrame input.
+        x_hat = np.zeros((4, n_features), dtype=float)
+        query = np.zeros((1, n_features), dtype=float)
+        # Must not raise / warn / produce NaN — origin/main produced
+        # RuntimeWarning: invalid value encountered in scalar divide
+        # plus NaN losses (or ZeroDivisionError on older numpy).
+        loss = exp.compute_proximity_loss(x_hat, query)
+        assert loss.shape == (4,)
+        assert np.all(loss == 0.0)
+        assert not np.any(np.isnan(loss))