Skip to content

Commit a48c986

Browse files
author
ArturoAmorQ
committed
Skrub magic and format
1 parent 94c45b2 commit a48c986

9 files changed

Lines changed: 70 additions & 20 deletions

notebooks/dimred_components.ipynb

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
"metadata": {},
5959
"outputs": [],
6060
"source": [
61+
"%pip install skrub\n",
6162
"from skrub import TableReport\n",
6263
"\n",
6364
"TableReport(X)"
@@ -171,7 +172,9 @@
171172
"source": [
172173
"pipe_90 = make_pipeline(StandardScaler(), PCA(n_components=0.90))\n",
173174
"pipe_90.fit(X)\n",
174-
"print(f\"n_components_ for 90% threshold: {pipe_90.named_steps['pca'].n_components_}\")"
175+
"print(\n",
176+
" f\"n_components_ for 90% threshold: {pipe_90.named_steps['pca'].n_components_}\"\n",
177+
")"
175178
]
176179
},
177180
{
@@ -204,7 +207,9 @@
204207
" X_split, _ = train_test_split(X, train_size=0.5, random_state=random_state)\n",
205208
" pipe_split = make_pipeline(StandardScaler(), PCA())\n",
206209
" pipe_split.fit(X_split)\n",
207-
" split_explained.append(pipe_split.named_steps[\"pca\"].explained_variance_ratio_)"
210+
" split_explained.append(\n",
211+
" pipe_split.named_steps[\"pca\"].explained_variance_ratio_\n",
212+
" )"
208213
]
209214
},
210215
{
@@ -216,9 +221,13 @@
216221
"fig, ax = plt.subplots(figsize=(8, 4))\n",
217222
"\n",
218223
"for ev in split_explained:\n",
219-
" ax.plot(np.arange(1, len(ev) + 1), np.cumsum(ev), color=\"tab:blue\", alpha=0.2)\n",
224+
" ax.plot(\n",
225+
" np.arange(1, len(ev) + 1), np.cumsum(ev), color=\"tab:blue\", alpha=0.2\n",
226+
" )\n",
220227
"\n",
221-
"ax.plot(components, cumulative, color=\"tab:blue\", linewidth=2, label=\"Full dataset\")\n",
228+
"ax.plot(\n",
229+
" components, cumulative, color=\"tab:blue\", linewidth=2, label=\"Full dataset\"\n",
230+
")\n",
222231
"ax.axhline(0.90, color=\"tab:orange\", linestyle=\"--\", label=\"90%\")\n",
223232
"ax.axhline(0.95, color=\"tab:red\", linestyle=\"--\", label=\"95%\")\n",
224233
"ax.set_xlabel(\"Number of components\")\n",
@@ -318,7 +327,9 @@
318327
"for ev in split_explained:\n",
319328
" ax.plot(np.arange(1, len(ev) + 1), ev, color=\"tab:blue\", alpha=0.2)\n",
320329
"\n",
321-
"ax.plot(components, explained, color=\"tab:blue\", linewidth=2, label=\"Full dataset\")\n",
330+
"ax.plot(\n",
331+
" components, explained, color=\"tab:blue\", linewidth=2, label=\"Full dataset\"\n",
332+
")\n",
322333
"ax.axhline(\n",
323334
" kaiser_threshold,\n",
324335
" color=\"tab:red\",\n",
@@ -424,15 +435,20 @@
424435
"for ax, n_components, label in zip(\n",
425436
" axes,\n",
426437
" [kaiser_n, threshold_90],\n",
427-
" [f\"Kaiser ({kaiser_n} components)\", f\"90% threshold ({threshold_90} components)\"],\n",
438+
" [\n",
439+
" f\"Kaiser ({kaiser_n} components)\",\n",
440+
" f\"90% threshold ({threshold_90} components)\",\n",
441+
" ],\n",
428442
"):\n",
429443
" pipe_km = make_pipeline(\n",
430444
" StandardScaler(),\n",
431445
" PCA(n_components=n_components),\n",
432446
" KMeans(random_state=0),\n",
433447
" )\n",
434448
" for random_state in range(1, 11):\n",
435-
" X_sub, _ = train_test_split(X, train_size=0.5, random_state=random_state)\n",
449+
" X_sub, _ = train_test_split(\n",
450+
" X, train_size=0.5, random_state=random_state\n",
451+
" )\n",
436452
" scores = []\n",
437453
" for k in n_clusters_range:\n",
438454
" pipe_km[-1].set_params(n_clusters=k)\n",

notebooks/dimred_ex_01.ipynb

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@
8787
"metadata": {},
8888
"outputs": [],
8989
"source": [
90+
"%pip install skrub\n",
9091
"from sklearn.decomposition import PCA\n",
9192
"from sklearn.linear_model import Ridge\n",
9293
"from sklearn.pipeline import make_pipeline\n",
@@ -140,7 +141,9 @@
140141
"metadata": {},
141142
"outputs": [],
142143
"source": [
143-
"# Write your code here."
144+
"# Write your code here.\n",
145+
"\n",
146+
"grid_search_results = [...] # complete this code"
144147
]
145148
},
146149
{
@@ -164,7 +167,9 @@
164167
" \"mean_fit_time\": \"CV fit time (s)\",\n",
165168
" \"mean_test_error\": \"CV score (MAE)\",\n",
166169
"}\n",
167-
"grid_search_results[\"n_components\"] = grid_search_results[\"n_components\"].fillna(\"None\")\n",
170+
"grid_search_results[\"n_components\"] = grid_search_results[\n",
171+
" \"n_components\"\n",
172+
"].fillna(\"None\")\n",
168173
"fig = px.scatter(\n",
169174
" grid_search_results,\n",
170175
" x=\"mean_fit_time\",\n",

notebooks/dimred_intuitions.ipynb

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,9 @@
100100
"source": [
101101
"feature_names = penguins.columns.tolist()\n",
102102
"for i, component in enumerate(pca.components_):\n",
103-
" terms = \" + \".join(f\"{w:.1f} * {f}\" for w, f in zip(component, feature_names))\n",
103+
" terms = \" + \".join(\n",
104+
" f\"{w:.1f} * {f}\" for w, f in zip(component, feature_names)\n",
105+
" )\n",
104106
" print(f\"PC{i + 1} = {terms}\")"
105107
]
106108
},
@@ -319,7 +321,9 @@
319321
"ax1.axis(\"equal\")\n",
320322
"\n",
321323
"ax2.scatter(\n",
322-
" penguins_transformed.ravel(), np.zeros(len(penguins_transformed)), alpha=0.6\n",
324+
" penguins_transformed.ravel(),\n",
325+
" np.zeros(len(penguins_transformed)),\n",
326+
" alpha=0.6,\n",
323327
")\n",
324328
"ax2.set_xlabel(\"First Principal Component\")\n",
325329
"ax2.set_title(\n",
@@ -403,7 +407,9 @@
403407
"metadata": {},
404408
"outputs": [],
405409
"source": [
406-
"reconstruction_error = np.mean(np.sum((penguins - penguins_reconstructed) ** 2, axis=1))\n",
410+
"reconstruction_error = np.mean(\n",
411+
" np.sum((penguins - penguins_reconstructed) ** 2, axis=1)\n",
412+
")\n",
407413
"print(f\"Mean squared reconstruction error: {reconstruction_error:.4f}\")"
408414
]
409415
},

notebooks/dimred_preprocessing.ipynb

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -87,11 +87,15 @@
8787
" ax.set_zlabel(features_3d[2], labelpad=8)\n",
8888
"\n",
8989
"\n",
90-
"fig, axes = plt.subplots(1, 2, figsize=(15, 5), subplot_kw={\"projection\": \"3d\"})\n",
90+
"fig, axes = plt.subplots(\n",
91+
" 1, 2, figsize=(15, 5), subplot_kw={\"projection\": \"3d\"}\n",
92+
")\n",
9193
"\n",
9294
"for ax, title in zip(axes, (\"Raw\", \"StandardScaler\")):\n",
9395
" X_t = X_3D if title == \"Raw\" else standard_scaler.fit_transform(X_3D)\n",
94-
" ax.scatter(X_t[\"GrLivArea\"], X_t[\"OverallQual\"], X_t[\"YearBuilt\"], alpha=0.2, s=5)\n",
96+
" ax.scatter(\n",
97+
" X_t[\"GrLivArea\"], X_t[\"OverallQual\"], X_t[\"YearBuilt\"], alpha=0.2, s=5\n",
98+
" )\n",
9599
" set_equal_3d_axes(ax, X_t)\n",
96100
" ax.set_title(title)\n",
97101
" ax.view_init(elev=20, azim=30)\n",
@@ -205,7 +209,10 @@
205209
" ax.set_xticklabels(feature_names, rotation=45, ha=\"right\", fontsize=14)\n",
206210
" ax.set_yticks(range(len(components)))\n",
207211
" ax.set_yticklabels(\n",
208-
" [f\"PC{i + 1}\\n({v:.1%})\" for i, v in enumerate(pca.explained_variance_ratio_)],\n",
212+
" [\n",
213+
" f\"PC{i + 1}\\n({v:.1%})\"\n",
214+
" for i, v in enumerate(pca.explained_variance_ratio_)\n",
215+
" ],\n",
209216
" fontsize=14,\n",
210217
" )\n",
211218
" return im\n",
@@ -280,6 +287,7 @@
280287
"metadata": {},
281288
"outputs": [],
282289
"source": [
290+
"%pip install skrub\n",
283291
"from sklearn.preprocessing import RobustScaler\n",
284292
"from skrub import SquashingScaler\n",
285293
"\n",
@@ -642,15 +650,19 @@
642650
"source": [
643651
"from sklearn.preprocessing import OneHotEncoder\n",
644652
"\n",
645-
"pipe_ohe = make_pipeline(OneHotEncoder(sparse_output=False), PCA(n_components=8))\n",
653+
"pipe_ohe = make_pipeline(\n",
654+
" OneHotEncoder(sparse_output=False), PCA(n_components=8)\n",
655+
")\n",
646656
"pipe_ohe.fit(data[[\"Neighborhood\"]])\n",
647657
"categories = pipe_ohe[0].categories_[0]\n",
648658
"\n",
649659
"freq_order = data[\"Neighborhood\"].value_counts(normalize=True)\n",
650660
"sorted_idx = np.searchsorted(categories, freq_order.index)\n",
651661
"\n",
652662
"fig, ax = plt.subplots(figsize=(18, 6))\n",
653-
"im = plot_sq_loadings(ax, pipe_ohe[-1], categories, col_order=sorted_idx, decimals=1)\n",
663+
"im = plot_sq_loadings(\n",
664+
" ax, pipe_ohe[-1], categories, col_order=sorted_idx, decimals=1\n",
665+
")\n",
654666
"fig.colorbar(im, ax=ax)\n",
655667
"plt.show()"
656668
]
@@ -691,7 +703,9 @@
691703
" [\"A\", \"B\", \"C\", \"D\"], size=n, p=[0.60, 0.19, 0.18, 0.03]\n",
692704
").reshape(-1, 1)\n",
693705
"\n",
694-
"pipe_ohe = make_pipeline(OneHotEncoder(sparse_output=False), PCA(n_components=3))\n",
706+
"pipe_ohe = make_pipeline(\n",
707+
" OneHotEncoder(sparse_output=False), PCA(n_components=3)\n",
708+
")\n",
695709
"pipe_ohe.fit(categories)\n",
696710
"\n",
697711
"fig, ax = plt.subplots(figsize=(5, 3))\n",

notebooks/dimred_sol_01.ipynb

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@
106106
"metadata": {},
107107
"outputs": [],
108108
"source": [
109+
"%pip install skrub\n",
109110
"from sklearn.decomposition import PCA\n",
110111
"from sklearn.linear_model import Ridge\n",
111112
"from sklearn.pipeline import make_pipeline\n",
@@ -170,7 +171,9 @@
170171
"source": [
171172
"# solution\n",
172173
"grid_search_results = pd.DataFrame(grid_search.cv_results_)[results_columns]\n",
173-
"grid_search_results[\"mean_test_error\"] = -grid_search_results[\"mean_test_score\"]\n",
174+
"grid_search_results[\"mean_test_error\"] = -grid_search_results[\n",
175+
" \"mean_test_score\"\n",
176+
"]\n",
174177
"grid_search_results = (\n",
175178
" grid_search_results.drop(columns=[\"mean_test_score\"])\n",
176179
" .rename(columns={\"param_\" + param_name: \"n_components\"})\n",
@@ -200,7 +203,9 @@
200203
" \"mean_fit_time\": \"CV fit time (s)\",\n",
201204
" \"mean_test_error\": \"CV score (MAE)\",\n",
202205
"}\n",
203-
"grid_search_results[\"n_components\"] = grid_search_results[\"n_components\"].fillna(\"None\")\n",
206+
"grid_search_results[\"n_components\"] = grid_search_results[\n",
207+
" \"n_components\"\n",
208+
"].fillna(\"None\")\n",
204209
"fig = px.scatter(\n",
205210
" grid_search_results,\n",
206211
" x=\"mean_fit_time\",\n",

python_scripts/dimred_components.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
# real data where the features have very different scales and units.
4545

4646
# %%
47+
# %pip install skrub
4748
from skrub import TableReport
4849

4950
TableReport(X)

python_scripts/dimred_ex_01.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@
8787
# `n_components` using the grid defined below. Fit it on the full dataset.
8888

8989
# %%
90+
# %pip install skrub
9091
from sklearn.decomposition import PCA
9192
from sklearn.linear_model import Ridge
9293
from sklearn.pipeline import make_pipeline

python_scripts/dimred_preprocessing.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ def plot_sq_loadings(ax, pca, feature_names, col_order=None, decimals=2):
242242
# fair description of what PCA computed in that space.
243243

244244
# %%
245+
# %pip install skrub
245246
from sklearn.preprocessing import RobustScaler
246247
from skrub import SquashingScaler
247248

python_scripts/dimred_sol_01.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@
9292
# `n_components` using the grid defined below. Fit it on the full dataset.
9393

9494
# %%
95+
# %pip install skrub
9596
from sklearn.decomposition import PCA
9697
from sklearn.linear_model import Ridge
9798
from sklearn.pipeline import make_pipeline

0 commit comments

Comments
 (0)