Add skrub `%pip install` magic and reformat long lines

ArturoAmorQ · ArturoAmorQ · commit a48c98629c09 · 2026-05-05T15:15:38.000+02:00
diff --git a/notebooks/dimred_components.ipynb b/notebooks/dimred_components.ipynb
@@ -58,6 +58,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "%pip install skrub\n",
     "from skrub import TableReport\n",
     "\n",
     "TableReport(X)"
@@ -171,7 +172,9 @@
    "source": [
     "pipe_90 = make_pipeline(StandardScaler(), PCA(n_components=0.90))\n",
     "pipe_90.fit(X)\n",
-    "print(f\"n_components_ for 90% threshold: {pipe_90.named_steps['pca'].n_components_}\")"
+    "print(\n",
+    "    f\"n_components_ for 90% threshold: {pipe_90.named_steps['pca'].n_components_}\"\n",
+    ")"
    ]
   },
   {
@@ -204,7 +207,9 @@
     "    X_split, _ = train_test_split(X, train_size=0.5, random_state=random_state)\n",
     "    pipe_split = make_pipeline(StandardScaler(), PCA())\n",
     "    pipe_split.fit(X_split)\n",
-    "    split_explained.append(pipe_split.named_steps[\"pca\"].explained_variance_ratio_)"
+    "    split_explained.append(\n",
+    "        pipe_split.named_steps[\"pca\"].explained_variance_ratio_\n",
+    "    )"
    ]
   },
   {
@@ -216,9 +221,13 @@
     "fig, ax = plt.subplots(figsize=(8, 4))\n",
     "\n",
     "for ev in split_explained:\n",
-    "    ax.plot(np.arange(1, len(ev) + 1), np.cumsum(ev), color=\"tab:blue\", alpha=0.2)\n",
+    "    ax.plot(\n",
+    "        np.arange(1, len(ev) + 1), np.cumsum(ev), color=\"tab:blue\", alpha=0.2\n",
+    "    )\n",
     "\n",
-    "ax.plot(components, cumulative, color=\"tab:blue\", linewidth=2, label=\"Full dataset\")\n",
+    "ax.plot(\n",
+    "    components, cumulative, color=\"tab:blue\", linewidth=2, label=\"Full dataset\"\n",
+    ")\n",
     "ax.axhline(0.90, color=\"tab:orange\", linestyle=\"--\", label=\"90%\")\n",
     "ax.axhline(0.95, color=\"tab:red\", linestyle=\"--\", label=\"95%\")\n",
     "ax.set_xlabel(\"Number of components\")\n",
@@ -318,7 +327,9 @@
     "for ev in split_explained:\n",
     "    ax.plot(np.arange(1, len(ev) + 1), ev, color=\"tab:blue\", alpha=0.2)\n",
     "\n",
-    "ax.plot(components, explained, color=\"tab:blue\", linewidth=2, label=\"Full dataset\")\n",
+    "ax.plot(\n",
+    "    components, explained, color=\"tab:blue\", linewidth=2, label=\"Full dataset\"\n",
+    ")\n",
     "ax.axhline(\n",
     "    kaiser_threshold,\n",
     "    color=\"tab:red\",\n",
@@ -424,15 +435,20 @@
     "for ax, n_components, label in zip(\n",
     "    axes,\n",
     "    [kaiser_n, threshold_90],\n",
-    "    [f\"Kaiser ({kaiser_n} components)\", f\"90% threshold ({threshold_90} components)\"],\n",
+    "    [\n",
+    "        f\"Kaiser ({kaiser_n} components)\",\n",
+    "        f\"90% threshold ({threshold_90} components)\",\n",
+    "    ],\n",
     "):\n",
     "    pipe_km = make_pipeline(\n",
     "        StandardScaler(),\n",
     "        PCA(n_components=n_components),\n",
     "        KMeans(random_state=0),\n",
     "    )\n",
     "    for random_state in range(1, 11):\n",
-    "        X_sub, _ = train_test_split(X, train_size=0.5, random_state=random_state)\n",
+    "        X_sub, _ = train_test_split(\n",
+    "            X, train_size=0.5, random_state=random_state\n",
+    "        )\n",
     "        scores = []\n",
     "        for k in n_clusters_range:\n",
     "            pipe_km[-1].set_params(n_clusters=k)\n",
diff --git a/notebooks/dimred_ex_01.ipynb b/notebooks/dimred_ex_01.ipynb
@@ -87,6 +87,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "%pip install skrub\n",
     "from sklearn.decomposition import PCA\n",
     "from sklearn.linear_model import Ridge\n",
     "from sklearn.pipeline import make_pipeline\n",
@@ -140,7 +141,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Write your code here."
+    "# Write your code here.\n",
+    "\n",
+    "grid_search_results = [...]  # complete this code"
    ]
   },
   {
@@ -164,7 +167,9 @@
     "    \"mean_fit_time\": \"CV fit time (s)\",\n",
     "    \"mean_test_error\": \"CV score (MAE)\",\n",
     "}\n",
-    "grid_search_results[\"n_components\"] = grid_search_results[\"n_components\"].fillna(\"None\")\n",
+    "grid_search_results[\"n_components\"] = grid_search_results[\n",
+    "    \"n_components\"\n",
+    "].fillna(\"None\")\n",
     "fig = px.scatter(\n",
     "    grid_search_results,\n",
     "    x=\"mean_fit_time\",\n",
diff --git a/notebooks/dimred_intuitions.ipynb b/notebooks/dimred_intuitions.ipynb
@@ -100,7 +100,9 @@
    "source": [
     "feature_names = penguins.columns.tolist()\n",
     "for i, component in enumerate(pca.components_):\n",
-    "    terms = \" + \".join(f\"{w:.1f} * {f}\" for w, f in zip(component, feature_names))\n",
+    "    terms = \" + \".join(\n",
+    "        f\"{w:.1f} * {f}\" for w, f in zip(component, feature_names)\n",
+    "    )\n",
     "    print(f\"PC{i + 1} = {terms}\")"
    ]
   },
@@ -319,7 +321,9 @@
     "ax1.axis(\"equal\")\n",
     "\n",
     "ax2.scatter(\n",
-    "    penguins_transformed.ravel(), np.zeros(len(penguins_transformed)), alpha=0.6\n",
+    "    penguins_transformed.ravel(),\n",
+    "    np.zeros(len(penguins_transformed)),\n",
+    "    alpha=0.6,\n",
     ")\n",
     "ax2.set_xlabel(\"First Principal Component\")\n",
     "ax2.set_title(\n",
@@ -403,7 +407,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "reconstruction_error = np.mean(np.sum((penguins - penguins_reconstructed) ** 2, axis=1))\n",
+    "reconstruction_error = np.mean(\n",
+    "    np.sum((penguins - penguins_reconstructed) ** 2, axis=1)\n",
+    ")\n",
     "print(f\"Mean squared reconstruction error: {reconstruction_error:.4f}\")"
    ]
   },
diff --git a/notebooks/dimred_preprocessing.ipynb b/notebooks/dimred_preprocessing.ipynb
@@ -87,11 +87,15 @@
     "    ax.set_zlabel(features_3d[2], labelpad=8)\n",
     "\n",
     "\n",
-    "fig, axes = plt.subplots(1, 2, figsize=(15, 5), subplot_kw={\"projection\": \"3d\"})\n",
+    "fig, axes = plt.subplots(\n",
+    "    1, 2, figsize=(15, 5), subplot_kw={\"projection\": \"3d\"}\n",
+    ")\n",
     "\n",
     "for ax, title in zip(axes, (\"Raw\", \"StandardScaler\")):\n",
     "    X_t = X_3D if title == \"Raw\" else standard_scaler.fit_transform(X_3D)\n",
-    "    ax.scatter(X_t[\"GrLivArea\"], X_t[\"OverallQual\"], X_t[\"YearBuilt\"], alpha=0.2, s=5)\n",
+    "    ax.scatter(\n",
+    "        X_t[\"GrLivArea\"], X_t[\"OverallQual\"], X_t[\"YearBuilt\"], alpha=0.2, s=5\n",
+    "    )\n",
     "    set_equal_3d_axes(ax, X_t)\n",
     "    ax.set_title(title)\n",
     "    ax.view_init(elev=20, azim=30)\n",
@@ -205,7 +209,10 @@
     "    ax.set_xticklabels(feature_names, rotation=45, ha=\"right\", fontsize=14)\n",
     "    ax.set_yticks(range(len(components)))\n",
     "    ax.set_yticklabels(\n",
-    "        [f\"PC{i + 1}\\n({v:.1%})\" for i, v in enumerate(pca.explained_variance_ratio_)],\n",
+    "        [\n",
+    "            f\"PC{i + 1}\\n({v:.1%})\"\n",
+    "            for i, v in enumerate(pca.explained_variance_ratio_)\n",
+    "        ],\n",
     "        fontsize=14,\n",
     "    )\n",
     "    return im\n",
@@ -280,6 +287,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "%pip install skrub\n",
     "from sklearn.preprocessing import RobustScaler\n",
     "from skrub import SquashingScaler\n",
     "\n",
@@ -642,15 +650,19 @@
    "source": [
     "from sklearn.preprocessing import OneHotEncoder\n",
     "\n",
-    "pipe_ohe = make_pipeline(OneHotEncoder(sparse_output=False), PCA(n_components=8))\n",
+    "pipe_ohe = make_pipeline(\n",
+    "    OneHotEncoder(sparse_output=False), PCA(n_components=8)\n",
+    ")\n",
     "pipe_ohe.fit(data[[\"Neighborhood\"]])\n",
     "categories = pipe_ohe[0].categories_[0]\n",
     "\n",
     "freq_order = data[\"Neighborhood\"].value_counts(normalize=True)\n",
     "sorted_idx = np.searchsorted(categories, freq_order.index)\n",
     "\n",
     "fig, ax = plt.subplots(figsize=(18, 6))\n",
-    "im = plot_sq_loadings(ax, pipe_ohe[-1], categories, col_order=sorted_idx, decimals=1)\n",
+    "im = plot_sq_loadings(\n",
+    "    ax, pipe_ohe[-1], categories, col_order=sorted_idx, decimals=1\n",
+    ")\n",
     "fig.colorbar(im, ax=ax)\n",
     "plt.show()"
    ]
@@ -691,7 +703,9 @@
     "    [\"A\", \"B\", \"C\", \"D\"], size=n, p=[0.60, 0.19, 0.18, 0.03]\n",
     ").reshape(-1, 1)\n",
     "\n",
-    "pipe_ohe = make_pipeline(OneHotEncoder(sparse_output=False), PCA(n_components=3))\n",
+    "pipe_ohe = make_pipeline(\n",
+    "    OneHotEncoder(sparse_output=False), PCA(n_components=3)\n",
+    ")\n",
     "pipe_ohe.fit(categories)\n",
     "\n",
     "fig, ax = plt.subplots(figsize=(5, 3))\n",
diff --git a/notebooks/dimred_sol_01.ipynb b/notebooks/dimred_sol_01.ipynb
@@ -106,6 +106,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "%pip install skrub\n",
     "from sklearn.decomposition import PCA\n",
     "from sklearn.linear_model import Ridge\n",
     "from sklearn.pipeline import make_pipeline\n",
@@ -170,7 +171,9 @@
    "source": [
     "# solution\n",
     "grid_search_results = pd.DataFrame(grid_search.cv_results_)[results_columns]\n",
-    "grid_search_results[\"mean_test_error\"] = -grid_search_results[\"mean_test_score\"]\n",
+    "grid_search_results[\"mean_test_error\"] = -grid_search_results[\n",
+    "    \"mean_test_score\"\n",
+    "]\n",
     "grid_search_results = (\n",
     "    grid_search_results.drop(columns=[\"mean_test_score\"])\n",
     "    .rename(columns={\"param_\" + param_name: \"n_components\"})\n",
@@ -200,7 +203,9 @@
     "    \"mean_fit_time\": \"CV fit time (s)\",\n",
     "    \"mean_test_error\": \"CV score (MAE)\",\n",
     "}\n",
-    "grid_search_results[\"n_components\"] = grid_search_results[\"n_components\"].fillna(\"None\")\n",
+    "grid_search_results[\"n_components\"] = grid_search_results[\n",
+    "    \"n_components\"\n",
+    "].fillna(\"None\")\n",
     "fig = px.scatter(\n",
     "    grid_search_results,\n",
     "    x=\"mean_fit_time\",\n",
diff --git a/python_scripts/dimred_components.py b/python_scripts/dimred_components.py
@@ -44,6 +44,7 @@
 # real data where the features have very different scales and units.
 
 # %%
+# %pip install skrub
 from skrub import TableReport
 
 TableReport(X)
diff --git a/python_scripts/dimred_ex_01.py b/python_scripts/dimred_ex_01.py
@@ -87,6 +87,7 @@
 # `n_components` using the grid defined below. Fit it on the full dataset.
 
 # %%
+# %pip install skrub
 from sklearn.decomposition import PCA
 from sklearn.linear_model import Ridge
 from sklearn.pipeline import make_pipeline
diff --git a/python_scripts/dimred_preprocessing.py b/python_scripts/dimred_preprocessing.py
@@ -242,6 +242,7 @@ def plot_sq_loadings(ax, pca, feature_names, col_order=None, decimals=2):
 # fair description of what PCA computed in that space.
 
 # %%
+# %pip install skrub
 from sklearn.preprocessing import RobustScaler
 from skrub import SquashingScaler
 
diff --git a/python_scripts/dimred_sol_01.py b/python_scripts/dimred_sol_01.py
@@ -92,6 +92,7 @@
 # `n_components` using the grid defined below. Fit it on the full dataset.
 
 # %%
+# %pip install skrub
 from sklearn.decomposition import PCA
 from sklearn.linear_model import Ridge
 from sklearn.pipeline import make_pipeline