Skip to content

Commit b467781

Browse files
committed
Quantile regression as classification
1 parent 40e33ec commit b467781

1 file changed

Lines changed: 113 additions & 34 deletions

File tree

content/python_files/feature_engineering.py

Lines changed: 113 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -750,8 +750,6 @@ def collect_cv_predictions(
750750
cv_splitter,
751751
predictions,
752752
prediction_time,
753-
method_name="predict",
754-
method_kwargs=None,
755753
):
756754
index_generator = cv_splitter.split(prediction_time.skb.eval())
757755

@@ -762,13 +760,10 @@ def splitter(X, y, index_generator):
762760
return X[train_idx], X[test_idx], y[train_idx], y[test_idx]
763761

764762
results = []
765-
if method_kwargs is None:
766-
method_kwargs = {}
767763

768764
for (_, test_idx), pipeline in zip(
769765
cv_splitter.split(prediction_time.skb.eval()), pipelines
770766
):
771-
method = getattr(pipeline, method_name)
772767
split = predictions.skb.train_test_split(
773768
predictions.skb.get_data(),
774769
splitter=splitter,
@@ -779,7 +774,7 @@ def splitter(X, y, index_generator):
779774
{
780775
"prediction_time": prediction_time.skb.eval()[test_idx],
781776
"load_mw": split["y_test"],
782-
"predicted_load_mw": method(split["test"], **method_kwargs),
777+
"predicted_load_mw": pipeline.predict(split["test"]),
783778
}
784779
)
785780
)
@@ -1306,9 +1301,9 @@ def scoring(regressor, X, y):
13061301
scoring = {
13071302
"r2": get_scorer("r2"),
13081303
"mape": make_scorer(mean_absolute_percentage_error),
1309-
"d2_pinball_05_loss": make_scorer(d2_pinball_score, alpha=0.05),
1310-
"d2_pinball_50_loss": make_scorer(d2_pinball_score, alpha=0.5),
1311-
"d2_pinball_95_loss": make_scorer(d2_pinball_score, alpha=0.95),
1304+
"d2_pinball_05": make_scorer(d2_pinball_score, alpha=0.05),
1305+
"d2_pinball_50": make_scorer(d2_pinball_score, alpha=0.50),
1306+
"d2_pinball_95": make_scorer(d2_pinball_score, alpha=0.95),
13121307
}
13131308

13141309
# %%
@@ -1492,7 +1487,6 @@ def mean_width(y_true, y_quantile_low, y_quantile_high):
14921487
return float(np.abs(y_quantile_high - y_quantile_low).mean().round(1))
14931488

14941489

1495-
14961490
# %%
14971491
coverage(
14981492
cv_predictions_hgbr_50_concat["load_mw"].to_numpy(),
@@ -1573,6 +1567,24 @@ def mean_width(y_true, y_quantile_low, y_quantile_high):
15731567
# Ideally, the classifier should be efficient when trained on a large number of
15741568
# classes (induced by the number of bins). Therefore we use a Random Forest
15751569
# classifier as the default base estimator.
1570+
#
1571+
# There are several advantages to this approach:
1572+
# - a single model is trained and can jointly estimate quantiles for all
1573+
# quantile levels (assuming a well tuned number of bins);
1574+
# - the quantile levels can be chosen at prediction time, which allows for a
1575+
# flexible quantile regression model;
1576+
# - in practice, the resulting predictions are often reasonably well calibrated
1577+
# as we will see in the reliability diagrams below.
1578+
#
1579+
# One possible drawback is that current implementations of gradient boosting
1580+
# models tend to be very slow to train with a large number of classes. Random
1581+
# Forests are much more efficient in this case, but they do not always provide
1582+
# the best predictive performance. It could be the case that combining this
1583+
# approach with tabular neural networks can lead to competitive results.
1584+
#
1585+
# However, the current scikit-learn API is not expressive enough to to handle
1586+
# the output shape of the quantile prediction function. We therefore cannot
1587+
# make it fit into a skrub pipeline.
15761588

15771589
# %%
15781590
from scipy.interpolate import interp1d
@@ -1650,40 +1662,107 @@ def predict_quantiles(self, X, quantiles=(0.05, 0.5, 0.95)):
16501662
return np.asarray([interp1d(y_cdf_i, edges)(quantiles) for y_cdf_i in y_cdf])
16511663

16521664
def predict(self, X):
1653-
return self.predict_quantiles(X, self.quantile).ravel()
1665+
return self.predict_quantiles(X, quantiles=(self.quantile,)).ravel()
16541666

16551667

16561668
# %%
1657-
from sklearn.ensemble import HistGradientBoostingClassifier
1658-
from threadpoolctl import threadpool_limits
1659-
1660-
1661-
# with threadpool_limits(1):
1662-
if True:
1663-
predictions_bqr = features_with_dropped_cols.skb.apply(
1664-
BinnedQuantileRegressor(
1665-
RandomForestClassifier(
1666-
n_jobs=-1, n_estimators=200, min_samples_leaf=5, random_state=0
1667-
),
1668-
# HistGradientBoostingClassifier(random_state=0),
1669-
n_bins=30,
1670-
),
1671-
y=target,
1672-
)
1669+
quantiles = (0.05, 0.5, 0.95)
1670+
bqr = BinnedQuantileRegressor(
1671+
RandomForestClassifier(
1672+
n_estimators=300,
1673+
min_samples_leaf=5,
1674+
max_features=0.2,
1675+
n_jobs=-1,
1676+
random_state=0,
1677+
),
1678+
n_bins=30,
1679+
)
1680+
bqr
16731681

16741682
# %%
1675-
predictions_bqr
1683+
from sklearn.model_selection import cross_validate
16761684

1677-
# %%
1678-
cv_results_bqr = predictions_bqr.skb.cross_validate(
1685+
X, y = features_with_dropped_cols.skb.eval(), target.skb.eval()
1686+
1687+
cv_results_bqr = cross_validate(
1688+
bqr,
1689+
X,
1690+
y,
16791691
cv=ts_cv_5,
16801692
scoring={
1681-
"d2_pinball": make_scorer(d2_pinball_score, alpha=0.5),
1682-
"MAPE": make_scorer(mean_absolute_percentage_error),
1693+
"d2_pinball_50": make_scorer(d2_pinball_score, alpha=0.5),
16831694
},
1684-
return_pipeline=True,
1695+
return_estimator=True,
1696+
return_indices=True,
16851697
verbose=1,
16861698
n_jobs=-1,
16871699
)
1688-
cv_results_bqr
1700+
16891701
# %%
1702+
cv_predictions_bqr_all = [
1703+
cv_predictions_bqr_05 := [],
1704+
cv_predictions_bqr_50 := [],
1705+
cv_predictions_bqr_95 := [],
1706+
]
1707+
for fold_ix, (qreg, test_idx) in enumerate(
1708+
zip(cv_results_bqr["estimator"], cv_results_bqr["indices"]["test"])
1709+
):
1710+
print(f"CV iteration #{fold_ix}")
1711+
print(f"Test set size: {test_idx.shape[0]} rows")
1712+
print(
1713+
f"Test time range: {prediction_time.skb.eval()[test_idx][0, 0]} to "
1714+
f"{prediction_time.skb.eval()[test_idx][-1, 0]} "
1715+
)
1716+
y_pred_all_quantiles = qreg.predict_quantiles(X[test_idx], quantiles=quantiles)
1717+
1718+
coverage_score = coverage(
1719+
y[test_idx],
1720+
y_pred_all_quantiles[:, 0],
1721+
y_pred_all_quantiles[:, 2],
1722+
)
1723+
print(f"Coverage: {coverage_score:.3f}")
1724+
1725+
mean_width_score = mean_width(
1726+
y[test_idx],
1727+
y_pred_all_quantiles[:, 0],
1728+
y_pred_all_quantiles[:, 2],
1729+
)
1730+
print(f"Mean prediction interval width: " f"{mean_width_score:.1f} MW")
1731+
1732+
for q_idx, (quantile, predictions) in enumerate(
1733+
zip(quantiles, cv_predictions_bqr_all)
1734+
):
1735+
observed = y[test_idx]
1736+
predicted = y_pred_all_quantiles[:, q_idx]
1737+
predictions.append(
1738+
pl.DataFrame(
1739+
{
1740+
"prediction_time": prediction_time.skb.eval()[test_idx],
1741+
"load_mw": observed,
1742+
"predicted_load_mw": predicted,
1743+
}
1744+
)
1745+
)
1746+
print(f"d2_pinball score: {d2_pinball_score(observed, predicted):.3f}")
1747+
print()
1748+
1749+
# %%
1750+
plot_reliability_diagram(
1751+
cv_predictions_bqr_50, kind="quantile", quantile_level=0.50
1752+
).interactive().properties(
1753+
title="Reliability diagram for quantile 0.50 from cross-validation predictions"
1754+
)
1755+
1756+
# %%
1757+
plot_reliability_diagram(
1758+
cv_predictions_bqr_05, kind="quantile", quantile_level=0.05
1759+
).interactive().properties(
1760+
title="Reliability diagram for quantile 0.05 from cross-validation predictions"
1761+
)
1762+
1763+
# %%
1764+
plot_reliability_diagram(
1765+
cv_predictions_bqr_95, kind="quantile", quantile_level=0.95
1766+
).interactive().properties(
1767+
title="Reliability diagram for quantile 0.95 from cross-validation predictions"
1768+
)

0 commit comments

Comments
 (0)