Skip to content

Commit bf96872

Browse files
committed
add exercise with ridge model
1 parent f051922 commit bf96872

2 files changed

Lines changed: 111 additions & 26 deletions

File tree

content/python_files/feature_engineering.py

Lines changed: 107 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,7 @@ def iqr(col, *, window_size: int):
392392
"prediction_end_time", historical_data_end_time.skb.eval() - pl.duration(hours=24)
393393
)
394394

395+
395396
@skrub.deferred
396397
def define_prediction_time_range(prediction_start_time, prediction_end_time):
397398
return pl.DataFrame().with_columns(
@@ -403,7 +404,10 @@ def define_prediction_time_range(prediction_start_time, prediction_end_time):
403404
).alias("prediction_time"),
404405
)
405406

406-
prediction_time = define_prediction_time_range(prediction_start_time, prediction_end_time)
407+
408+
prediction_time = define_prediction_time_range(
409+
prediction_start_time, prediction_end_time
410+
)
407411
prediction_time
408412

409413

@@ -488,7 +492,7 @@ def build_features(
488492
import skrub.selectors as s
489493

490494

491-
predictions = features.skb.apply(
495+
features_with_dropped_cols = features.skb.apply(
492496
skrub.DropCols(
493497
cols=skrub.choose_from(
494498
{
@@ -508,7 +512,9 @@ def build_features(
508512
name="dropped_cols",
509513
)
510514
)
511-
).skb.apply(
515+
)
516+
517+
predictions = features_with_dropped_cols.skb.apply(
512518
HistGradientBoostingRegressor(
513519
random_state=0,
514520
loss=skrub.choose_from(["squared_error", "poisson", "gamma"], name="loss"),
@@ -720,6 +726,22 @@ def plot_lorenz_curve(cv_predictions, n_samples=1_000):
720726
)
721727
)
722728

729+
model_chart = (
730+
altair.Chart(results)
731+
.mark_line(strokeDash=[4, 2, 4, 2], opacity=0.8, tooltip=True)
732+
.encode(
733+
x=altair.X(
734+
"cum_population:Q",
735+
title="Fraction of observations sorted by predicted label",
736+
),
737+
y=altair.Y("cum_observed:Q", title="Cumulative observed load proportion"),
738+
color=altair.Color(
739+
"model_label:N", legend=altair.Legend(title="Models"), sort=None
740+
),
741+
detail="cv_idx:N",
742+
)
743+
)
744+
723745
diagonal_chart = (
724746
altair.Chart(
725747
pl.DataFrame(
@@ -737,21 +759,9 @@ def plot_lorenz_curve(cv_predictions, n_samples=1_000):
737759
title="Fraction of observations sorted by predicted label",
738760
),
739761
y=altair.Y("cum_observed:Q", title="Cumulative observed load proportion"),
740-
color=altair.Color("model_label:N", legend=altair.Legend(title="Models")),
741-
)
742-
)
743-
744-
model_chart = (
745-
altair.Chart(results)
746-
.mark_line(opacity=0.3, tooltip=True)
747-
.encode(
748-
x=altair.X(
749-
"cum_population:Q",
750-
title="Fraction of observations sorted by predicted label",
762+
color=altair.Color(
763+
"model_label:N", legend=altair.Legend(title="Models"), sort=None
751764
),
752-
y=altair.Y("cum_observed:Q", title="Cumulative observed load proportion"),
753-
color=altair.Color("model_label:N", legend=altair.Legend(title="Models")),
754-
detail="cv_idx:N",
755765
)
756766
)
757767

@@ -992,10 +1002,6 @@ def plot_residuals_by_month(cv_predictions):
9921002

9931003
plot_residuals_by_month(cv_predictions).interactive()
9941004

995-
996-
# %%
997-
998-
9991005
# %%
10001006
ts_cv_2 = TimeSeriesSplit(
10011007
n_splits=2, test_size=test_size, max_train_size=max_train_size, gap=24
@@ -1038,12 +1044,87 @@ def plot_residuals_by_month(cv_predictions):
10381044
# )
10391045

10401046
# %%
1041-
# from joblib import Parallel, delayed
1047+
# TODO: Exercise applying a a linear model with some additional feature engineering
1048+
from sklearn.linear_model import Ridge
1049+
from sklearn.kernel_approximation import Nystroem
10421050

1043-
# cv_predictions = []
1044-
# for ts_cv_train_idx, ts_cv_test_idx in ts_cv_5.split(prediction_time.skb.eval()):
1045-
# features[ts_cv_train_idx].fit
1051+
model = skrub.tabular_learner(
1052+
estimator=Ridge(alpha=skrub.choose_float(1e-6, 1e6, log=True, default=1e-3))
1053+
)
1054+
model.steps.insert(
1055+
-1,
1056+
(
1057+
"nystroem",
1058+
Nystroem(n_components=skrub.choose_int(10, 200, log=True, default=150)),
1059+
),
1060+
)
10461061

1062+
predictions_ridge = features_with_dropped_cols.skb.apply(model, y=target)
1063+
predictions_ridge
1064+
1065+
# %%
1066+
altair.Chart(
1067+
pl.concat(
1068+
[
1069+
targets.skb.eval(),
1070+
predictions_ridge.rename(
1071+
{target_column_name: predicted_target_column_name}
1072+
).skb.eval(),
1073+
],
1074+
how="horizontal",
1075+
).tail(24 * 7)
1076+
).transform_fold(
1077+
[target_column_name, predicted_target_column_name],
1078+
).mark_line(
1079+
tooltip=True
1080+
).encode(
1081+
x="prediction_time:T", y="value:Q", color="key:N"
1082+
).interactive()
1083+
1084+
# %%
1085+
ts_cv_2 = TimeSeriesSplit(
1086+
n_splits=2, test_size=test_size, max_train_size=max_train_size, gap=24
1087+
)
1088+
randomized_search = predictions_ridge.skb.get_randomized_search(
1089+
cv=ts_cv_2,
1090+
scoring="r2",
1091+
n_iter=100,
1092+
fitted=True,
1093+
verbose=1,
1094+
n_jobs=-1,
1095+
)
1096+
1097+
# %%
1098+
randomized_search.plot_results().update_layout(margin=dict(l=200))
1099+
1100+
# %%
1101+
nested_cv_results = skrub.cross_validate(
1102+
environment=predictions_ridge.skb.get_data(),
1103+
pipeline=randomized_search,
1104+
cv=ts_cv_5,
1105+
scoring={
1106+
"r2": get_scorer("r2"),
1107+
"mape": mape_scorer,
1108+
},
1109+
n_jobs=-1,
1110+
return_pipeline=True,
1111+
).round(3)
1112+
1113+
# %%
1114+
nested_cv_results.round(3)
1115+
1116+
# %%
1117+
cv_predictions_ridge = collect_cv_predictions(
1118+
nested_cv_results["pipeline"], ts_cv_5, predictions_ridge, prediction_time
1119+
)
1120+
1121+
# %%
1122+
plot_lorenz_curve(cv_predictions_ridge, n_samples=500).interactive()
1123+
1124+
# %%
1125+
plot_reliability_diagram(cv_predictions_ridge).interactive().properties(
1126+
title="Reliability diagram from cross-validation predictions"
1127+
)
10471128

10481129
# %%
10491130
from sklearn.multioutput import MultiOutputRegressor
@@ -1053,7 +1134,7 @@ def plot_residuals_by_month(cv_predictions):
10531134
)
10541135

10551136
# %%
1056-
multioutput_predictions = features.skb.apply(
1137+
multioutput_predictions = features_with_dropped_cols.skb.apply(
10571138
model, y=targets.skb.drop(cols=["prediction_time", "load_mw"]).skb.mark_as_y()
10581139
).skb.set_name("multioutput_gbdt")
10591140

objectives.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@
7070

7171
## Exercises
7272

73+
- Exercise: pipeline with missing value support: SimpleImputer with
74+
indicator, Spline, Nystroem, RidgeCV or TableVectorizer
75+
- hyper tuning + per analysis of the CV results of the best model.
76+
7377
- Adapt the main skrub pipeline to treat weather data as past covariates
7478
instead of future covariates.
7579

0 commit comments

Comments
 (0)