Address Gemini and Copilot review comments on PR #923

lujiazho · lujiazho · commit 401d96d329d3 · 2026-05-06T20:05:49.000-07:00
Fixes the medium-severity comments raised on the differentiable_input
regressor path:

1. Feature instances per column: replace
   `[Feature(...)] * n_features` with a list comprehension so each
   column has its own dataclass and a later in-place update on one
   column does not leak across all columns.

2. y stats numerical robustness: replace `y_float.std() + 1e-20`
   (`torch.std` defaults to `correction=1`, which differs from `np.std`
   and returns NaN for N=1) with
   `torch.clamp(y_float.std(correction=0), min=1e-20)`. This matches
   the standard `fit()` path's `np.std` semantics and stays finite for
   single-sample input.

3. Constant-target guard: a constant y collapses the bardist borders
   to a single point and trips
   `FullSupportBarDistribution`'s strictly-increasing assertion.
   `fit()` short-circuits this with `is_constant_target_`; the
   differentiable path has no analogue, so reject up front with a
   clear ValueError pointing users at `fit()`.

4. Sequential preprocessing for differentiable input: force
   `n_preprocessing_jobs=1` inside `fit_with_differentiable_input`,
   overriding the estimator's own `n_preprocessing_jobs` setting on
   this path. When X carries an autograd graph, joblib's
   process-boundary pickling breaks the graph; sequential execution
   preserves it.

The detach-then-`.item()` of `y_train_mean_/std_` is intentional and
not changed: `raw_space_bardist_` is a frozen lookup buffer that
should not hold a y-grad graph; users wanting fully differentiable
target scaling should z-normalise y externally so mean/std become
constants here. Documented inline.

New tests (each named `test__fit_with_differentiable_input__<suffix>`):
- feature_schema_columns_are_independent: catches the shared-instance
  (aliasing) bug from item 1.
- std_matches_population_definition: locks in `np.std` semantics.
- constant_target_rejected: locks in the explicit guard.
- single_sample_y_does_not_nan: confirms N=1 hits the constant-target
  guard with a ValueError rather than producing NaN deep in the
  bardist.

All 9 differentiable_input tests pass on CPU and CUDA.
diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py
@@ -667,7 +667,24 @@ def _refresh_targets_for_differentiable_input(
             y, dtype=torch.float32
         )
         y_mean = y_float.mean()
-        y_std = y_float.std() + 1e-20
+        # Match the standard fit path's np.std (population std, ddof=0).
+        # torch.std defaults to correction=1 (sample std), which differs from
+        # numpy and returns NaN for N=1; clamp keeps the divisor non-zero.
+        y_std = torch.clamp(y_float.std(correction=0), min=1e-20)
+        # Constant targets would collapse the bardist borders to a single
+        # point; the differentiable path has no analogue of fit()'s
+        # is_constant_target_ short-circuit, so reject up front.
+        if y_std.detach().item() <= 1e-12:
+            raise ValueError(
+                "Constant or near-constant target (std≈0) is not supported "
+                "by fit_with_differentiable_input; there is no signal to "
+                "predict differentiably. Use fit() for constant-target data."
+            )
+        # Detach when storing as Python floats — raw_space_bardist_ is a
+        # frozen lookup table and must not hold a y-grad graph. Users who
+        # need fully differentiable target scaling should z-normalise y
+        # themselves before calling fit_with_differentiable_input so the
+        # mean/std are constants here.
         self.y_train_mean_ = y_mean.detach().item()
         self.y_train_std_ = y_std.detach().item()
         y_normalized = (y_float - y_mean) / y_std
@@ -705,7 +722,13 @@ def _initialize_for_differentiable_input(
                 "Categorical features are not supported for differentiable input."
             )
         n_features = X.shape[1]
-        features = [Feature(name=None, modality=FeatureModality.NUMERICAL)] * n_features
+        # One Feature instance per column — list multiplication would share
+        # the same dataclass and any later in-place update would leak across
+        # columns.
+        features = [
+            Feature(name=None, modality=FeatureModality.NUMERICAL)
+            for _ in range(n_features)
+        ]
         self.inferred_feature_schema_ = FeatureSchema(features=features)
         self.n_features_in_ = n_features
 
@@ -920,12 +943,15 @@ def fit_with_differentiable_input(
             # targets. The model load and ensemble configs stay cached.
             X, y = self._refresh_targets_for_differentiable_input(X, y)
 
+        # Force sequential preprocessing: with differentiable input, X carries
+        # an autograd graph that does not survive joblib's process-boundary
+        # pickling. Sequential execution preserves the graph in-process.
         self.ensemble_preprocessor_ = TabPFNEnsemblePreprocessor(
             configs=ensemble_configs,
             n_samples=X.shape[0],
             feature_schema=self.inferred_feature_schema_,
             random_state=static_seed,
-            n_preprocessing_jobs=self.n_preprocessing_jobs,
+            n_preprocessing_jobs=1,
             feature_subsampling_method=FeatureSubsamplingMethod(
                 self.inference_config_.FEATURE_SUBSAMPLING_METHOD
             ),
diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py
@@ -1052,6 +1052,79 @@ def test__fit_with_differentiable_input__categorical_features_rejected() -> None
         reg.fit_with_differentiable_input(X, y)
 
 
+def test__fit_with_differentiable_input__constant_target_rejected() -> None:
+    """A constant-target y has no signal to predict differentiably and would
+    collapse the bardist borders; reject with a clear error."""
+    reg = TabPFNRegressor(
+        n_estimators=1,
+        ignore_pretraining_limits=True,
+        device="cpu",
+        differentiable_input=True,
+    )
+    X = torch.randn(5, 4)
+    y = torch.full((5,), 3.14)
+    with pytest.raises(ValueError, match="Constant or near-constant target"):
+        reg.fit_with_differentiable_input(X, y)
+
+
+def test__fit_with_differentiable_input__single_sample_y_does_not_nan() -> None:
+    """torch.std defaults to sample std (correction=1) which returns NaN for
+    N=1. Our path uses correction=0 (population std) so std is well defined
+    even for a single sample (it just collapses to 0, which then trips the
+    constant-target guard — what we want). Verify the failure mode is the
+    explicit ValueError, not a downstream NaN."""
+    reg = TabPFNRegressor(
+        n_estimators=1,
+        ignore_pretraining_limits=True,
+        device="cpu",
+        differentiable_input=True,
+    )
+    X = torch.randn(1, 4)
+    y = torch.tensor([2.0])
+    with pytest.raises(ValueError, match="Constant or near-constant target"):
+        reg.fit_with_differentiable_input(X, y)
+
+
+def test__fit_with_differentiable_input__std_matches_population_definition() -> None:
+    """The differentiable path's y_train_std_ should match np.std (population
+    std, ddof=0), not torch's default sample std (correction=1), so it lines
+    up with the standard fit() path."""
+    reg = TabPFNRegressor(
+        n_estimators=1,
+        ignore_pretraining_limits=True,
+        device="cpu",
+        differentiable_input=True,
+    )
+    X = torch.randn(20, 4)
+    y_np = np.random.default_rng(0).standard_normal(20).astype(np.float32)
+    y = torch.from_numpy(y_np)
+    reg.fit_with_differentiable_input(X, y)
+    expected = float(np.std(y_np))  # ddof=0
+    assert abs(reg.y_train_std_ - expected) < 1e-5, (
+        f"y_train_std_ should equal np.std(y) (population std); "
+        f"got {reg.y_train_std_}, expected {expected}"
+    )
+
+
+def test__fit_with_differentiable_input__feature_schema_columns_are_independent() -> None:
+    """Each column's Feature must be a distinct instance — list multiplication
+    `[Feature(...)] * n` would alias all columns to one mutable dataclass."""
+    reg = TabPFNRegressor(
+        n_estimators=1,
+        ignore_pretraining_limits=True,
+        device="cpu",
+        differentiable_input=True,
+    )
+    X = torch.randn(10, 4)
+    y = torch.randn(10)
+    reg.fit_with_differentiable_input(X, y)
+    feats = reg.inferred_feature_schema_.features
+    assert len(feats) == 4
+    # Distinct instances, not aliases.
+    ids = {id(f) for f in feats}
+    assert len(ids) == 4, "feature columns share the same Feature instance"
+
+
 def test__fit_with_differentiable_input__second_call_refreshes_target_stats() -> None:
     """A second call with different y must update y_train_mean_/std_ and the
     raw_space_bardist_; only the model load and ensemble configs are cached."""