@@ -750,8 +750,6 @@ def collect_cv_predictions(
750750 cv_splitter ,
751751 predictions ,
752752 prediction_time ,
753- method_name = "predict" ,
754- method_kwargs = None ,
755753):
756754 index_generator = cv_splitter .split (prediction_time .skb .eval ())
757755
@@ -762,13 +760,10 @@ def splitter(X, y, index_generator):
762760 return X [train_idx ], X [test_idx ], y [train_idx ], y [test_idx ]
763761
764762 results = []
765- if method_kwargs is None :
766- method_kwargs = {}
767763
768764 for (_ , test_idx ), pipeline in zip (
769765 cv_splitter .split (prediction_time .skb .eval ()), pipelines
770766 ):
771- method = getattr (pipeline , method_name )
772767 split = predictions .skb .train_test_split (
773768 predictions .skb .get_data (),
774769 splitter = splitter ,
@@ -779,7 +774,7 @@ def splitter(X, y, index_generator):
779774 {
780775 "prediction_time" : prediction_time .skb .eval ()[test_idx ],
781776 "load_mw" : split ["y_test" ],
782- "predicted_load_mw" : method (split ["test" ], ** method_kwargs ),
777+ "predicted_load_mw" : pipeline . predict (split ["test" ]),
783778 }
784779 )
785780 )
@@ -1306,9 +1301,9 @@ def scoring(regressor, X, y):
13061301scoring = {
13071302 "r2" : get_scorer ("r2" ),
13081303 "mape" : make_scorer (mean_absolute_percentage_error ),
1309- "d2_pinball_05_loss " : make_scorer (d2_pinball_score , alpha = 0.05 ),
1310- "d2_pinball_50_loss " : make_scorer (d2_pinball_score , alpha = 0.5 ),
1311- "d2_pinball_95_loss " : make_scorer (d2_pinball_score , alpha = 0.95 ),
1304+ "d2_pinball_05 " : make_scorer (d2_pinball_score , alpha = 0.05 ),
1305+ "d2_pinball_50 " : make_scorer (d2_pinball_score , alpha = 0.50 ),
1306+ "d2_pinball_95 " : make_scorer (d2_pinball_score , alpha = 0.95 ),
13121307}
13131308
13141309# %%
@@ -1492,7 +1487,6 @@ def mean_width(y_true, y_quantile_low, y_quantile_high):
14921487 return float (np .abs (y_quantile_high - y_quantile_low ).mean ().round (1 ))
14931488
14941489
1495-
14961490# %%
14971491coverage (
14981492 cv_predictions_hgbr_50_concat ["load_mw" ].to_numpy (),
@@ -1573,6 +1567,24 @@ def mean_width(y_true, y_quantile_low, y_quantile_high):
15731567# Ideally, the classifier should be efficient when trained on a large number of
15741568# classes (induced by the number of bins). Therefore we use a Random Forest
15751569# classifier as the default base estimator.
1570+ #
1571+ # There are several advantages to this approach:
1572+ # - a single model is trained and can jointly estimate quantiles for all
1573+ # quantile levels (assuming a well tuned number of bins);
1574+ # - the quantile levels can be chosen at prediction time, which allows for a
1575+ # flexible quantile regression model;
1576+ # - in practice, the resulting predictions are often reasonably well calibrated
1577+ # as we will see in the reliability diagrams below.
1578+ #
1579+ # One possible drawback is that current implementations of gradient boosting
1580+ # models tend to be very slow to train with a large number of classes. Random
1581+ # Forests are much more efficient in this case, but they do not always provide
1582+ # the best predictive performance. It could be the case that combining this
1583+ # approach with tabular neural networks can lead to competitive results.
1584+ #
1585+ # However, the current scikit-learn API is not expressive enough to to handle
1586+ # the output shape of the quantile prediction function. We therefore cannot
1587+ # make it fit into a skrub pipeline.
15761588
15771589# %%
15781590from scipy .interpolate import interp1d
@@ -1650,40 +1662,107 @@ def predict_quantiles(self, X, quantiles=(0.05, 0.5, 0.95)):
16501662 return np .asarray ([interp1d (y_cdf_i , edges )(quantiles ) for y_cdf_i in y_cdf ])
16511663
16521664 def predict (self , X ):
1653- return self .predict_quantiles (X , self .quantile ).ravel ()
1665+ return self .predict_quantiles (X , quantiles = ( self .quantile ,) ).ravel ()
16541666
16551667
16561668# %%
1657- from sklearn .ensemble import HistGradientBoostingClassifier
1658- from threadpoolctl import threadpool_limits
1659-
1660-
1661- # with threadpool_limits(1):
1662- if True :
1663- predictions_bqr = features_with_dropped_cols .skb .apply (
1664- BinnedQuantileRegressor (
1665- RandomForestClassifier (
1666- n_jobs = - 1 , n_estimators = 200 , min_samples_leaf = 5 , random_state = 0
1667- ),
1668- # HistGradientBoostingClassifier(random_state=0),
1669- n_bins = 30 ,
1670- ),
1671- y = target ,
1672- )
1669+ quantiles = (0.05 , 0.5 , 0.95 )
1670+ bqr = BinnedQuantileRegressor (
1671+ RandomForestClassifier (
1672+ n_estimators = 300 ,
1673+ min_samples_leaf = 5 ,
1674+ max_features = 0.2 ,
1675+ n_jobs = - 1 ,
1676+ random_state = 0 ,
1677+ ),
1678+ n_bins = 30 ,
1679+ )
1680+ bqr
16731681
16741682# %%
1675- predictions_bqr
1683+ from sklearn . model_selection import cross_validate
16761684
1677- # %%
1678- cv_results_bqr = predictions_bqr .skb .cross_validate (
1685+ X , y = features_with_dropped_cols .skb .eval (), target .skb .eval ()
1686+
1687+ cv_results_bqr = cross_validate (
1688+ bqr ,
1689+ X ,
1690+ y ,
16791691 cv = ts_cv_5 ,
16801692 scoring = {
1681- "d2_pinball" : make_scorer (d2_pinball_score , alpha = 0.5 ),
1682- "MAPE" : make_scorer (mean_absolute_percentage_error ),
1693+ "d2_pinball_50" : make_scorer (d2_pinball_score , alpha = 0.5 ),
16831694 },
1684- return_pipeline = True ,
1695+ return_estimator = True ,
1696+ return_indices = True ,
16851697 verbose = 1 ,
16861698 n_jobs = - 1 ,
16871699)
1688- cv_results_bqr
1700+
16891701# %%
1702+ cv_predictions_bqr_all = [
1703+ cv_predictions_bqr_05 := [],
1704+ cv_predictions_bqr_50 := [],
1705+ cv_predictions_bqr_95 := [],
1706+ ]
1707+ for fold_ix , (qreg , test_idx ) in enumerate (
1708+ zip (cv_results_bqr ["estimator" ], cv_results_bqr ["indices" ]["test" ])
1709+ ):
1710+ print (f"CV iteration #{ fold_ix } " )
1711+ print (f"Test set size: { test_idx .shape [0 ]} rows" )
1712+ print (
1713+ f"Test time range: { prediction_time .skb .eval ()[test_idx ][0 , 0 ]} to "
1714+ f"{ prediction_time .skb .eval ()[test_idx ][- 1 , 0 ]} "
1715+ )
1716+ y_pred_all_quantiles = qreg .predict_quantiles (X [test_idx ], quantiles = quantiles )
1717+
1718+ coverage_score = coverage (
1719+ y [test_idx ],
1720+ y_pred_all_quantiles [:, 0 ],
1721+ y_pred_all_quantiles [:, 2 ],
1722+ )
1723+ print (f"Coverage: { coverage_score :.3f} " )
1724+
1725+ mean_width_score = mean_width (
1726+ y [test_idx ],
1727+ y_pred_all_quantiles [:, 0 ],
1728+ y_pred_all_quantiles [:, 2 ],
1729+ )
1730+ print (f"Mean prediction interval width: " f"{ mean_width_score :.1f} MW" )
1731+
1732+ for q_idx , (quantile , predictions ) in enumerate (
1733+ zip (quantiles , cv_predictions_bqr_all )
1734+ ):
1735+ observed = y [test_idx ]
1736+ predicted = y_pred_all_quantiles [:, q_idx ]
1737+ predictions .append (
1738+ pl .DataFrame (
1739+ {
1740+ "prediction_time" : prediction_time .skb .eval ()[test_idx ],
1741+ "load_mw" : observed ,
1742+ "predicted_load_mw" : predicted ,
1743+ }
1744+ )
1745+ )
1746+ print (f"d2_pinball score: { d2_pinball_score (observed , predicted ):.3f} " )
1747+ print ()
1748+
1749+ # %%
1750+ plot_reliability_diagram (
1751+ cv_predictions_bqr_50 , kind = "quantile" , quantile_level = 0.50
1752+ ).interactive ().properties (
1753+ title = "Reliability diagram for quantile 0.50 from cross-validation predictions"
1754+ )
1755+
1756+ # %%
1757+ plot_reliability_diagram (
1758+ cv_predictions_bqr_05 , kind = "quantile" , quantile_level = 0.05
1759+ ).interactive ().properties (
1760+ title = "Reliability diagram for quantile 0.05 from cross-validation predictions"
1761+ )
1762+
1763+ # %%
1764+ plot_reliability_diagram (
1765+ cv_predictions_bqr_95 , kind = "quantile" , quantile_level = 0.95
1766+ ).interactive ().properties (
1767+ title = "Reliability diagram for quantile 0.95 from cross-validation predictions"
1768+ )
0 commit comments