From 3492bdb2e63d92b08cb3ba2dd436c5840dae88d9 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Braun Date: Sun, 10 May 2026 00:49:38 +0200 Subject: [PATCH] Fix DiceGenetic.compute_proximity_loss for all-categorical datasets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the dataset has no continuous features, `continuous_feature_indexes` is empty, so `feature_weights` is an empty np.array and the original implementation hit either: * `proximity_loss / sum(feature_weights)` ⇒ ZeroDivisionError / RuntimeWarning + NaN losses (the symptom @kburchfiel reported in #276 with the original quoted snippet), or * `product.reshape(-1, product.shape[-1])` ⇒ ValueError on a 0-sized array, depending on input shape. Both paths poison `compute_loss` with NaN/exceptions and break the genetic search for legitimate all-categorical use cases. Proximity is conceptually undefined when there are no continuous distances to weigh, so short-circuit with a zero loss vector matching the population shape. The categorical penalty in `compute_loss` already accounts for categorical sparsity, so dropping the proximity contribution is the correct semantics — and it matches what users expect when they explicitly set up a categorical-only `dice_ml.Data`. Adds `TestComputeProximityLossNoContinuousFeatures` covering the all-categorical path. The test fails on `origin/main` with the ValueError reshape variant of this bug. Closes #276. 
Co-Authored-By: Claude Opus 4.7 --- dice_ml/explainer_interfaces/dice_genetic.py | 8 +++ .../test_dice_interface/test_dice_genetic.py | 61 +++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/dice_ml/explainer_interfaces/dice_genetic.py b/dice_ml/explainer_interfaces/dice_genetic.py index 0b1bc1b7..b37a901c 100644 --- a/dice_ml/explainer_interfaces/dice_genetic.py +++ b/dice_ml/explainer_interfaces/dice_genetic.py @@ -370,6 +370,14 @@ def compute_proximity_loss(self, x_hat_unnormalized, query_instance_normalized): x_hat = self.data_interface.normalize_data(x_hat_unnormalized) feature_weights = np.array( [self.feature_weights_list[0][i] for i in self.data_interface.continuous_feature_indexes]) + # When the dataset has no continuous features, feature_weights is an + # empty array and the original `proximity_loss / sum(feature_weights)` + # divided by zero, raising RuntimeWarning + producing NaN losses that + # poison the genetic search. Proximity is conceptually undefined in + # that case (there are no continuous distances to weigh), so return a + # zero loss vector matching the population shape — see issue #276. + if len(feature_weights) == 0: + return np.zeros(x_hat.shape[0]) product = np.multiply( (abs(x_hat - query_instance_normalized)[:, [self.data_interface.continuous_feature_indexes]]), feature_weights) diff --git a/tests/test_dice_interface/test_dice_genetic.py b/tests/test_dice_interface/test_dice_genetic.py index 897b42eb..ebfe7005 100644 --- a/tests/test_dice_interface/test_dice_genetic.py +++ b/tests/test_dice_interface/test_dice_genetic.py @@ -261,3 +261,64 @@ def test_maxiter(self, desired_range, sample_custom_query_2, total_CFs, initiali for cfs_example in ans.cf_examples_list: for i in cfs_example.final_cfs_df[self.exp.data_interface.outcome_name].values: assert desired_range[0] <= i <= desired_range[1] + + +class TestComputeProximityLossNoContinuousFeatures: + """Regression for issue #276. 
+ + `DiceGenetic.compute_proximity_loss` divides by `sum(feature_weights)`, + where `feature_weights` is restricted to *continuous* feature indexes. + For an all-categorical dataset the array is empty and the original + implementation hit RuntimeWarning + NaN losses (or ZeroDivisionError on + older numpy). The genetic search then propagated NaN through `compute_loss` + and produced unusable counterfactuals. + """ + + def _make_explainer_categorical_only(self): + import numpy as np + import pandas as pd + from sklearn.compose import ColumnTransformer + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder + from sklearn.linear_model import LogisticRegression + + rng = np.random.default_rng(0) + df = pd.DataFrame({ + "color": rng.choice(["red", "green", "blue"], size=60), + "shape": rng.choice(["circle", "square"], size=60), + "label": rng.integers(0, 2, size=60), + }) + cat = ["color", "shape"] + X = df[cat] + y = df["label"] + clf = Pipeline([ + ("ohe", ColumnTransformer([("o", OneHotEncoder(), cat)])), + ("lr", LogisticRegression(max_iter=200)), + ]).fit(X, y) + + d = dice_ml.Data(dataframe=df, continuous_features=[], + categorical_features=cat, outcome_name="label") + m = dice_ml.Model(model=clf, backend="sklearn") + return dice_ml.Dice(d, m, method="genetic") + + def test_compute_proximity_loss_returns_zero_when_no_continuous_features(self): + import numpy as np + exp = self._make_explainer_categorical_only() + # Mirror the setup the explainer would normally do before any + # call to compute_proximity_loss: set continuous_feature_indexes + # (empty here) and populate feature_weights_list. Driving the full + # generate_counterfactuals path would exercise many other + # categorical-only code paths that aren't this bug. 
+ exp.data_interface.continuous_feature_indexes = [] + exp.feature_weights_list = [np.ones(len(exp.data_interface.feature_names))] + n_features = len(exp.data_interface.feature_names) + # normalize_data() consumes a 2-D ndarray for non-DataFrame input. + x_hat = np.zeros((4, n_features), dtype=float) + query = np.zeros((1, n_features), dtype=float) + # Must not raise / warn / produce NaN — origin/main produced + # RuntimeWarning: invalid value encountered in scalar divide + # plus NaN losses (or ZeroDivisionError on older numpy). + loss = exp.compute_proximity_loss(x_hat, query) + assert loss.shape == (4,) + assert np.all(loss == 0.0) + assert not np.any(np.isnan(loss))