@@ -392,6 +392,7 @@ def iqr(col, *, window_size: int):
392392 "prediction_end_time" , historical_data_end_time .skb .eval () - pl .duration (hours = 24 )
393393)
394394
395+
395396@skrub .deferred
396397def define_prediction_time_range (prediction_start_time , prediction_end_time ):
397398 return pl .DataFrame ().with_columns (
@@ -403,7 +404,10 @@ def define_prediction_time_range(prediction_start_time, prediction_end_time):
403404 ).alias ("prediction_time" ),
404405 )
405406
406- prediction_time = define_prediction_time_range (prediction_start_time , prediction_end_time )
407+
408+ prediction_time = define_prediction_time_range (
409+ prediction_start_time , prediction_end_time
410+ )
407411prediction_time
408412
409413
@@ -488,7 +492,7 @@ def build_features(
488492import skrub .selectors as s
489493
490494
491- predictions = features .skb .apply (
495+ features_with_dropped_cols = features .skb .apply (
492496 skrub .DropCols (
493497 cols = skrub .choose_from (
494498 {
@@ -508,7 +512,9 @@ def build_features(
508512 name = "dropped_cols" ,
509513 )
510514 )
511- ).skb .apply (
515+ )
516+
517+ predictions = features_with_dropped_cols .skb .apply (
512518 HistGradientBoostingRegressor (
513519 random_state = 0 ,
514520 loss = skrub .choose_from (["squared_error" , "poisson" , "gamma" ], name = "loss" ),
@@ -720,6 +726,22 @@ def plot_lorenz_curve(cv_predictions, n_samples=1_000):
720726 )
721727 )
722728
729+ model_chart = (
730+ altair .Chart (results )
731+ .mark_line (strokeDash = [4 , 2 , 4 , 2 ], opacity = 0.8 , tooltip = True )
732+ .encode (
733+ x = altair .X (
734+ "cum_population:Q" ,
735+ title = "Fraction of observations sorted by predicted label" ,
736+ ),
737+ y = altair .Y ("cum_observed:Q" , title = "Cumulative observed load proportion" ),
738+ color = altair .Color (
739+ "model_label:N" , legend = altair .Legend (title = "Models" ), sort = None
740+ ),
741+ detail = "cv_idx:N" ,
742+ )
743+ )
744+
723745 diagonal_chart = (
724746 altair .Chart (
725747 pl .DataFrame (
@@ -737,21 +759,9 @@ def plot_lorenz_curve(cv_predictions, n_samples=1_000):
737759 title = "Fraction of observations sorted by predicted label" ,
738760 ),
739761 y = altair .Y ("cum_observed:Q" , title = "Cumulative observed load proportion" ),
740- color = altair .Color ("model_label:N" , legend = altair .Legend (title = "Models" )),
741- )
742- )
743-
744- model_chart = (
745- altair .Chart (results )
746- .mark_line (opacity = 0.3 , tooltip = True )
747- .encode (
748- x = altair .X (
749- "cum_population:Q" ,
750- title = "Fraction of observations sorted by predicted label" ,
762+ color = altair .Color (
763+ "model_label:N" , legend = altair .Legend (title = "Models" ), sort = None
751764 ),
752- y = altair .Y ("cum_observed:Q" , title = "Cumulative observed load proportion" ),
753- color = altair .Color ("model_label:N" , legend = altair .Legend (title = "Models" )),
754- detail = "cv_idx:N" ,
755765 )
756766 )
757767
@@ -992,10 +1002,6 @@ def plot_residuals_by_month(cv_predictions):
9921002
9931003plot_residuals_by_month (cv_predictions ).interactive ()
9941004
995-
996- # %%
997-
998-
9991005# %%
10001006ts_cv_2 = TimeSeriesSplit (
10011007 n_splits = 2 , test_size = test_size , max_train_size = max_train_size , gap = 24
@@ -1038,12 +1044,87 @@ def plot_residuals_by_month(cv_predictions):
10381044# )
10391045
10401046# %%
1041- # from joblib import Parallel, delayed
1047+ # TODO: Exercise applying a a linear model with some additional feature engineering
1048+ from sklearn .linear_model import Ridge
1049+ from sklearn .kernel_approximation import Nystroem
10421050
1043- # cv_predictions = []
1044- # for ts_cv_train_idx, ts_cv_test_idx in ts_cv_5.split(prediction_time.skb.eval()):
1045- # features[ts_cv_train_idx].fit
1051+ model = skrub .tabular_learner (
1052+ estimator = Ridge (alpha = skrub .choose_float (1e-6 , 1e6 , log = True , default = 1e-3 ))
1053+ )
1054+ model .steps .insert (
1055+ - 1 ,
1056+ (
1057+ "nystroem" ,
1058+ Nystroem (n_components = skrub .choose_int (10 , 200 , log = True , default = 150 )),
1059+ ),
1060+ )
10461061
1062+ predictions_ridge = features_with_dropped_cols .skb .apply (model , y = target )
1063+ predictions_ridge
1064+
1065+ # %%
1066+ altair .Chart (
1067+ pl .concat (
1068+ [
1069+ targets .skb .eval (),
1070+ predictions_ridge .rename (
1071+ {target_column_name : predicted_target_column_name }
1072+ ).skb .eval (),
1073+ ],
1074+ how = "horizontal" ,
1075+ ).tail (24 * 7 )
1076+ ).transform_fold (
1077+ [target_column_name , predicted_target_column_name ],
1078+ ).mark_line (
1079+ tooltip = True
1080+ ).encode (
1081+ x = "prediction_time:T" , y = "value:Q" , color = "key:N"
1082+ ).interactive ()
1083+
1084+ # %%
1085+ ts_cv_2 = TimeSeriesSplit (
1086+ n_splits = 2 , test_size = test_size , max_train_size = max_train_size , gap = 24
1087+ )
1088+ randomized_search = predictions_ridge .skb .get_randomized_search (
1089+ cv = ts_cv_2 ,
1090+ scoring = "r2" ,
1091+ n_iter = 100 ,
1092+ fitted = True ,
1093+ verbose = 1 ,
1094+ n_jobs = - 1 ,
1095+ )
1096+
1097+ # %%
1098+ randomized_search .plot_results ().update_layout (margin = dict (l = 200 ))
1099+
1100+ # %%
1101+ nested_cv_results = skrub .cross_validate (
1102+ environment = predictions_ridge .skb .get_data (),
1103+ pipeline = randomized_search ,
1104+ cv = ts_cv_5 ,
1105+ scoring = {
1106+ "r2" : get_scorer ("r2" ),
1107+ "mape" : mape_scorer ,
1108+ },
1109+ n_jobs = - 1 ,
1110+ return_pipeline = True ,
1111+ ).round (3 )
1112+
1113+ # %%
1114+ nested_cv_results .round (3 )
1115+
1116+ # %%
1117+ cv_predictions_ridge = collect_cv_predictions (
1118+ nested_cv_results ["pipeline" ], ts_cv_5 , predictions_ridge , prediction_time
1119+ )
1120+
1121+ # %%
1122+ plot_lorenz_curve (cv_predictions_ridge , n_samples = 500 ).interactive ()
1123+
1124+ # %%
1125+ plot_reliability_diagram (cv_predictions_ridge ).interactive ().properties (
1126+ title = "Reliability diagram from cross-validation predictions"
1127+ )
10471128
10481129# %%
10491130from sklearn .multioutput import MultiOutputRegressor
@@ -1053,7 +1134,7 @@ def plot_residuals_by_month(cv_predictions):
10531134)
10541135
10551136# %%
1056- multioutput_predictions = features .skb .apply (
1137+ multioutput_predictions = features_with_dropped_cols .skb .apply (
10571138 model , y = targets .skb .drop (cols = ["prediction_time" , "load_mw" ]).skb .mark_as_y ()
10581139).skb .set_name ("multioutput_gbdt" )
10591140
0 commit comments