Merge pull request #411 from StochasticTree/fix-bart-categorical-mean-only-weights

andrewherren · web-flow · commit 7c75236b628a · 2026-06-17T00:16:52.000-05:00
Fix IndexError in mean-only Python BART with categorical covariates
diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,7 @@
 
 ## Bug Fixes
 
+* Fix `IndexError` when sampling a mean-only Python BART model with categorical covariates; the excluded-variable weight zero-out is now guarded by the forest-inclusion flags, matching the R implementation [#411](https://github.com/StochasticTree/stochtree/pull/411)
 * Fix ordinal class prediction bug in R BART [#399](https://github.com/StochasticTree/stochtree/issues/399)
 
 # stochtree 0.4.4
diff --git a/stochtree/bart.py b/stochtree/bart.py
@@ -990,13 +990,19 @@ def sample(
                     * variable_weights_adj
                 )
 
-        # Zero out weights for excluded variables
-        variable_weights_mean[
-            [variable_subset_mean.count(i) == 0 for i in original_var_indices]
-        ] = 0
-        variable_weights_variance[
-            [variable_subset_variance.count(i) == 0 for i in original_var_indices]
-        ] = 0
+        # Zero out weights for excluded variables. The weight arrays are only
+        # expanded to processed (post-preprocessing) length inside the
+        # include_*_forest guards above, so the zero-out must be guarded the same
+        # way -- otherwise a mean-only (or variance-only) model with categorical
+        # covariates indexes an unexpanded array and raises. (Matches R's logic.)
+        if self.include_mean_forest:
+            variable_weights_mean[
+                [variable_subset_mean.count(i) == 0 for i in original_var_indices]
+            ] = 0
+        if self.include_variance_forest:
+            variable_weights_variance[
+                [variable_subset_variance.count(i) == 0 for i in original_var_indices]
+            ] = 0
 
         # Set num_features_subsample to default, ncol(X_train), if not already set
         if num_features_subsample_mean is None:
diff --git a/test/python/test_bart.py b/test/python/test_bart.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 import pytest
 from sklearn.model_selection import train_test_split
 
@@ -1578,6 +1579,29 @@ def test_cloglog_ordinal_bart_with_gfr(self):
         assert bart_model.y_hat_test.shape == (n_test, num_mcmc)
         assert bart_model.cloglog_cutpoint_samples.shape == (2, num_mcmc)
 
+    def test_categorical_covariates_mean_only(self):
+        """A mean-only BART model with categorical (one-hot expanded) covariates
+        must sample and predict without error.
+
+        Regression test: the "zero out excluded variable weights" step ran
+        outside the include_*_forest guards, so for a model without a variance
+        forest it indexed variable_weights_variance, which had never been
+        expanded to the processed (one-hot) length, and raised an IndexError.
+        """
+        rng = np.random.default_rng(0)
+        n = 100
+        X_num = rng.uniform(0, 1, (n, 3))
+        X = pd.DataFrame(X_num, columns=["a", "b", "c"])
+        X["cat"] = pd.Categorical(rng.choice(["x", "y", "z"], size=n))
+        y = X_num[:, 0] + rng.normal(scale=0.5, size=n)
+
+        model = BARTModel()
+        # Mean forest only (no variance forest) is the configuration that used to fail.
+        model.sample(X_train=X, y_train=y, num_gfr=0, num_burnin=0, num_mcmc=5)
+
+        preds = model.predict(X)
+        assert preds["y_hat"].shape[0] == n
+
 
 class TestBARTFloat32:
     """Tests that float32 inputs are accepted and produce valid results (GH #389)."""