-
-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathquantileregression.py
More file actions
82 lines (61 loc) · 3.04 KB
/
quantileregression.py
File metadata and controls
82 lines (61 loc) · 3.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import nnetsauce as ns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import load_diabetes, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
scoring = ["conformal", "residuals", "predictions", "studentized", "conformal-studentized"]
datasets = [load_diabetes, fetch_california_housing]
dataset_names = ["diabetes", "california_housing"]
regrs = [RandomForestRegressor(), RidgeCV(), KNeighborsRegressor()]
for dataset, dataset_name in zip(datasets, dataset_names):
print("\n dataset", dataset_name)
X, y = dataset(return_X_y=True)
if dataset_name == "california_housing":
X, y = X[:1000], y[:1000]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)
for score in tqdm(scoring):
print("\n score", score)
for regr in regrs:
print("\n regr", regr.__class__.__name__)
regressor = ns.QuantileRegressor(
obj=regr,
scoring = score
)
regressor.fit(X_train, y_train)
predictions = regressor.predict(X_test, return_pi=True)
# Check ordering
lower_bound, median, upper_bound = predictions.lower, predictions.median, predictions.upper
is_ordered = np.all(np.logical_and(lower_bound < median, median < upper_bound))
print(f"Are the predictions ordered correctly? {is_ordered}")
# Calculate coverage
coverage = np.mean((lower_bound <= y_test)*(upper_bound >= y_test))
print(f"Coverage: {coverage:.2f}")
# Plot
plt.figure(figsize=(10, 6))
# Plot the actual values
plt.plot(y_test, label='Actual', color='black', alpha=0.5)
# Plot the predictions and confidence interval
plt.plot(predictions.median, label='Median prediction', color='blue', linewidth=2)
plt.plot(predictions.mean, label='Mean prediction', color='orange', linestyle='--', linewidth=2)
plt.fill_between(range(len(y_test)),
lower_bound, upper_bound,
alpha=0.3, color='gray',
edgecolor='gray',
label='Prediction interval')
plt.title(f'{regr.__class__.__name__} - {score} scoring')
plt.xlabel('Sample index')
plt.ylabel('Value')
plt.legend()
plt.show()