Refresh target stats on every fit_with_differentiable_input call

lujiazho · lujiazho · commit 2b91390cbd9c · 2026-05-06T19:39:12.000-07:00
Address gemini-code-assist review on PR #923: the second fit call previously skipped re-normalising y, leaving y_train_mean_, y_train_std_, raw_space_bardist_ stuck on the first fit's stats — silently mis-scaling predictions when the new target distribution differed. Split _initialize_for_differentiable_input into: - _initialize_for_differentiable_input: first-call-only setup (categorical check, feature schema, ensemble configs). Cached in self.ensemble_configs_. - _refresh_targets_for_differentiable_input: per-call setup (validate_dataset_size, z-normalise y, rebuild raw_space_bardist_, update n_train_samples_). Runs on every fit. fit_with_differentiable_input's else branch now calls the per-call helper so subsequent fits track the current target distribution while still reusing the loaded model and ensemble configs. Add test__fit_with_differentiable_input__second_call_refreshes_target_stats that fits twice with very different y distributions and checks y_train_mean_, y_train_std_, and raw_space_bardist_.borders all move.
diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py
@@ -640,33 +640,61 @@ def _initialize_model_variables(self) -> int:
         """
         return initialize_model_variables_helper(self, self.estimator_type)
 
+    def _refresh_targets_for_differentiable_input(
+        self, X: torch.Tensor, y: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Re-derive target-dependent state; runs on every differentiable fit.
+
+        Validates dataset size, z-normalises ``y`` with torch ops (so grads
+        can flow through it), stores detached float stats in
+        ``y_train_mean_`` / ``y_train_std_``, sets ``n_train_samples_``, and
+        rebuilds ``raw_space_bardist_`` from ``znorm_space_bardist_`` borders
+        rescaled to those stats. Feature schema and ensemble configs are not
+        touched here. Returns ``(X, y_normalized)`` with ``X`` passed through
+        unchanged; non-tensor ``y`` is first converted to a float32 tensor.
+        """
+        validate_dataset_size(
+            X=X,
+            y=y,
+            max_num_samples=self.inference_config_.MAX_NUMBER_OF_SAMPLES,
+            max_num_features=self.inference_config_.MAX_NUMBER_OF_FEATURES,
+            devices=self.devices_,
+            ignore_pretraining_limits=self.ignore_pretraining_limits,
+        )
+        self.n_train_samples_ = int(X.shape[0])
+
+        y_float = y.float() if isinstance(y, torch.Tensor) else torch.as_tensor(
+            y, dtype=torch.float32
+        )
+        y_mean = y_float.mean()
+        y_std = y_float.std() + 1e-20  # epsilon avoids division by zero
+        self.y_train_mean_ = y_mean.detach().item()
+        self.y_train_std_ = y_std.detach().item()
+        y_normalized = (y_float - y_mean) / y_std
+
+        # Stats are plain Python floats here (via .item()) and the borders are
+        # detached, so the rebuilt distribution carries no autograd history.
+        borders = self.znorm_space_bardist_.borders.detach()
+        self.raw_space_bardist_ = FullSupportBarDistribution(
+            borders * self.y_train_std_ + self.y_train_mean_,
+        ).float()
+        return X, y_normalized
+
     def _initialize_for_differentiable_input(
         self,
         X: torch.Tensor,
         y: torch.Tensor,
         rng: np.random.Generator,
     ) -> tuple[list[RegressorEnsembleConfig], torch.Tensor, torch.Tensor]:
-        """Initialize the model for differentiable input.
+        """First-call setup for the differentiable path.
 
         Mirrors the classifier-side helper so that gradients can flow from a
         loss back to upstream torch modules feeding ``X`` (and optionally
         ``y``). Skips the standard numpy preprocessing path and uses a
-        differentiable identity preprocessor.
-
-        Returns the ensemble configs together with ``X`` and the
-        z-normalised ``y``. The standardisation parameters are stored on
-        ``self`` so ``raw_space_bardist_`` reflects the caller's target
-        scale.
+        differentiable identity preprocessor. Subsequent calls reuse the
+        feature schema and ensemble configs but re-run target normalization
+        via ``_refresh_targets_for_differentiable_input``.
         """
-        validate_dataset_size(
-            X=X,
-            y=y,
-            max_num_samples=self.inference_config_.MAX_NUMBER_OF_SAMPLES,
-            max_num_features=self.inference_config_.MAX_NUMBER_OF_FEATURES,
-            devices=self.devices_,
-            ignore_pretraining_limits=self.ignore_pretraining_limits,
-        )
-
         # Minimal preprocessing for prompt tuning: no categorical features,
         # all-numerical schema, identity preprocessor that preserves grads.
         if (
@@ -680,24 +708,8 @@ def _initialize_for_differentiable_input(
         features = [Feature(name=None, modality=FeatureModality.NUMERICAL)] * n_features
         self.inferred_feature_schema_ = FeatureSchema(features=features)
         self.n_features_in_ = n_features
-        self.n_train_samples_ = int(X.shape[0])
 
-        # z-normalise y as a torch op so that gradients flow if y has them.
-        y_float = y.float() if isinstance(y, torch.Tensor) else torch.as_tensor(
-            y, dtype=torch.float32
-        )
-        y_mean = y_float.mean()
-        y_std = y_float.std() + 1e-20
-        self.y_train_mean_ = y_mean.detach().item()
-        self.y_train_std_ = y_std.detach().item()
-        y_normalized = (y_float - y_mean) / y_std
-
-        # raw_space_bardist_ is a constant lookup in caller's target scale; we
-        # detach so the buffer does not accidentally hold onto y's grad graph.
-        borders = self.znorm_space_bardist_.borders.detach()
-        self.raw_space_bardist_ = FullSupportBarDistribution(
-            borders * self.y_train_std_ + self.y_train_mean_,
-        ).float()
+        X, y_normalized = self._refresh_targets_for_differentiable_input(X, y)
 
         preprocessor_configs = [PreprocessorConfig("none", differentiable=True)]
         # Polynomial features go through sklearn StandardScaler on numpy and
@@ -903,6 +915,10 @@ def fit_with_differentiable_input(
                 self.inference_precision, self.devices_
             )
             ensemble_configs = self.ensemble_configs_  # Reuse from first fit
+            # Re-validate and re-normalise y for the new fit data so that
+            # raw_space_bardist_ and y_train_mean_/std_ track the current
+            # targets. The model load and ensemble configs stay cached.
+            X, y = self._refresh_targets_for_differentiable_input(X, y)
 
         self.ensemble_preprocessor_ = TabPFNEnsemblePreprocessor(
             configs=ensemble_configs,
diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py
@@ -1050,3 +1050,36 @@ def test__fit_with_differentiable_input__categorical_features_rejected() -> None
     y = torch.randn(20)
     with pytest.raises(ValueError, match="Categorical features"):
         reg.fit_with_differentiable_input(X, y)
+
+
+def test__fit_with_differentiable_input__second_call_refreshes_target_stats() -> None:
+    """Refitting on a different ``y`` must update ``y_train_mean_``, ``y_train_std_``
+    and ``raw_space_bardist_.borders`` instead of keeping the first fit's values."""
+    torch.manual_seed(0)  # fixed seed so the random X/y draws are reproducible
+    reg = TabPFNRegressor(
+        n_estimators=1,
+        ignore_pretraining_limits=True,
+        device="cpu",
+        differentiable_input=True,
+    )
+    X1 = torch.randn(20, 4)
+    y1 = torch.randn(20) * 10.0 + 100.0  # mean ~100, std ~10
+    reg.fit_with_differentiable_input(X1, y1)
+    mean1, std1 = reg.y_train_mean_, reg.y_train_std_
+    bardist_borders1 = reg.raw_space_bardist_.borders.clone()  # first-fit snapshot
+
+    X2 = torch.randn(20, 4)
+    y2 = torch.randn(20) * 0.5 - 5.0  # mean ~-5, std ~0.5
+    reg.fit_with_differentiable_input(X2, y2)
+    mean2, std2 = reg.y_train_mean_, reg.y_train_std_
+
+    assert abs(mean2 - mean1) > 1.0, (
+        f"y_train_mean_ should reflect new y; got {mean1} -> {mean2}"
+    )
+    assert abs(std2 - std1) > 1.0, (
+        f"y_train_std_ should reflect new y; got {std1} -> {std2}"
+    )
+    # Borders are rebuilt from the refreshed y stats, so they must differ too.
+    assert not torch.allclose(reg.raw_space_bardist_.borders, bardist_borders1), (
+        "raw_space_bardist_ must be rebuilt to the new target scale"
+    )