Hide the code of the collect_cv_predictions hack in the helpers

ogrisel · ogrisel · commit d7f0e7cd533d · 2025-07-10T07:39:33.000+02:00
diff --git a/content/python_files/feature_engineering.py b/content/python_files/feature_engineering.py
@@ -67,6 +67,7 @@
     plot_residuals_vs_predicted,
     plot_binned_residuals,
     plot_horizon_forecast,
+    collect_cv_predictions,
 )
 
 # Ignore warnings from pkg_resources triggered by Python 3.13's multiprocessing.
@@ -773,42 +774,6 @@ def build_targets(prediction_time, electricity, horizons):
 # We further analyze our cross-validated model by collecting the predictions on each
 # split.
 
-# %%
-def collect_cv_predictions(
-    pipelines,
-    cv_splitter,
-    predictions,
-    prediction_time,
-):
-    index_generator = cv_splitter.split(prediction_time.skb.eval())
-
-    def splitter(X, y, index_generator):
-        """Workaround to transform a scikit-learn splitter into a function understood
-        by `skrub.train_test_split`."""
-        train_idx, test_idx = next(index_generator)
-        return X[train_idx], X[test_idx], y[train_idx], y[test_idx]
-
-    results = []
-
-    for (_, test_idx), pipeline in zip(
-        cv_splitter.split(prediction_time.skb.eval()), pipelines
-    ):
-        split = predictions.skb.train_test_split(
-            predictions.skb.get_data(),
-            splitter=splitter,
-            index_generator=index_generator,
-        )
-        results.append(
-            pl.DataFrame(
-                {
-                    "prediction_time": prediction_time.skb.eval()[test_idx],
-                    "load_mw": split["y_test"],
-                    "predicted_load_mw": pipeline.predict(split["test"]),
-                }
-            )
-        )
-    return results
-
 
 # %%
 hgbr_cv_predictions = collect_cv_predictions(
diff --git a/content/python_files/tutorial_helpers.py b/content/python_files/tutorial_helpers.py
@@ -689,3 +689,39 @@ def binned_coverage(y_true_folds, y_quantile_low, y_quantile_high, n_bins=10):
             )
 
     return pd.DataFrame(results)
+
+
+def collect_cv_predictions(
+    pipelines,  # zipped with the CV folds below; .predict() is called on each
+    cv_splitter,  # must expose .split(); it is called twice below
+    predictions,  # object exposing .skb.train_test_split / .skb.get_data
+    prediction_time,  # evaluated with .skb.eval(), then indexed by test_idx
+):
+    index_generator = cv_splitter.split(prediction_time.skb.eval())
+
+    def splitter(X, y, index_generator):
+        """Workaround turning a scikit-learn splitter into a function understood by
+        `skrub.train_test_split`; each call consumes one fold of `index_generator`."""
+        train_idx, test_idx = next(index_generator)
+        return X[train_idx], X[test_idx], y[train_idx], y[test_idx]
+
+    results = []  # one DataFrame per (fold, pipeline) pair
+
+    for (_, test_idx), pipeline in zip(
+        cv_splitter.split(prediction_time.skb.eval()), pipelines
+    ):  # NOTE(review): assumes this 2nd split() yields the same folds — confirm
+        split = predictions.skb.train_test_split(
+            predictions.skb.get_data(),
+            splitter=splitter,
+            index_generator=index_generator,
+        )
+        results.append(
+            pl.DataFrame(  # NOTE(review): `pl` must be in scope in this module
+                {
+                    "prediction_time": prediction_time.skb.eval()[test_idx],
+                    "load_mw": split["y_test"],
+                    "predicted_load_mw": pipeline.predict(split["test"]),
+                }
+            )
+        )
+    return results