Skip to content

Commit f21d1ad

Browse files
committed
multiregression: Start of example
1 parent c94b64f commit f21d1ad

1 file changed

Lines changed: 116 additions & 0 deletions

File tree

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
import pandas as pd
2+
import numpy as np
3+
from pathlib import Path
4+
from sklearn.ensemble import RandomForestRegressor
5+
from sklearn.model_selection import train_test_split
6+
from sklearn.preprocessing import MinMaxScaler
7+
from sklearn.pipeline import Pipeline
8+
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
9+
import urllib.request
10+
import zipfile
11+
import os.path
12+
13+
from sklearn.multioutput import MultiOutputRegressor
14+
15+
16+
def airquality_download(data_dir='data'):
    """
    Fetch the UCI Air Quality dataset and return the path to its CSV file.

    https://archive.ics.uci.edu/dataset/360/air+quality

    The archive is only downloaded and unpacked when ``AirQualityUCI.csv``
    is not already present in ``data_dir``, so repeated calls reuse the
    existing copy.

    Parameters
    ----------
    data_dir : str or path-like
        Directory that holds (or will hold) the dataset. It is created,
        including any missing parent directories, when it does not exist.

    Returns
    -------
    pathlib.Path
        Location of ``AirQualityUCI.csv`` inside ``data_dir``.

    Raises
    ------
    FileNotFoundError
        If the CSV file is still missing after the archive was extracted.
    """
    data_path = Path(data_dir)
    # parents=True so that nested locations such as 'cache/airquality' work.
    data_path.mkdir(parents=True, exist_ok=True)

    csv_file = data_path / 'AirQualityUCI.csv'
    if csv_file.exists():
        return csv_file

    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip"
    zip_path = data_path / 'AirQualityUCI.zip'

    try:
        urllib.request.urlretrieve(url, zip_path)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(data_path)
    finally:
        # Remove the archive even when the download or extraction failed, so
        # a partial or corrupt zip is never left behind in the data directory.
        if zip_path.exists():
            zip_path.unlink()

    if not csv_file.exists():
        raise FileNotFoundError(
            f"{csv_file.name} was not found in the archive downloaded from {url}"
        )

    return csv_file
39+
40+
41+
def airquality_load(csv_file):
    """
    Read the Air Quality CSV and split it into features and targets.

    The file is parsed with ';' as field separator and ',' as decimal mark.
    Readings equal to -200 are treated as missing; columns that are entirely
    missing are dropped first, then every row that still has a missing value.

    Parameters
    ----------
    csv_file : str or path-like
        Path to the semicolon-separated dataset file.

    Returns
    -------
    X : pandas.DataFrame
        All remaining columns except the targets and the timestamp.
    y : pandas.DataFrame
        The target columns 'CO(GT)', 'NOx(GT)', 'NO2(GT)', 'NMHC(GT)' and
        'C6H6(GT)', in that order.
    """
    target_cols = ['CO(GT)', 'NOx(GT)', 'NO2(GT)', 'NMHC(GT)', 'C6H6(GT)']
    exclude_cols = target_cols + ['datetime', 'Unnamed: 15', 'Unnamed: 16']

    raw = pd.read_csv(csv_file, sep=';', decimal=',')

    # Remove missing values: empty columns first, then incomplete rows
    cleaned = raw.replace(-200, np.nan)
    cleaned = cleaned.dropna(axis=1, how='all')
    cleaned = cleaned.dropna()

    # Combine the separate date and time columns into a single timestamp
    timestamps = cleaned['Date'] + ' ' + cleaned['Time']
    cleaned['datetime'] = pd.to_datetime(timestamps, format='%d/%m/%Y %H.%M.%S')
    cleaned = cleaned.drop(columns=['Date', 'Time'])

    # Everything that is neither a target nor bookkeeping is a feature;
    # errors='ignore' because some excluded columns may already be gone.
    X = cleaned.drop(columns=exclude_cols, errors='ignore')
    y = cleaned[target_cols]

    return X, y
59+
60+
61+
from emlearn.preprocessing import Quantizer
62+
import emlearn
63+
64+
def convert_multiregressor(multi, out_dir, format=None, prefix='regressor', **kwargs):
    """
    Export every sub-estimator of a multi-output regressor to its own file.

    Each entry of ``multi.estimators_`` is converted with ``emlearn.convert``
    and saved as ``<prefix><index><ext>`` inside ``out_dir``, where ``<ext>``
    is ``.h`` when ``format`` is None and ``.<format>`` otherwise.

    Parameters
    ----------
    multi : object
        Must expose an ``estimators_`` sequence (presumably a fitted
        ``MultiOutputRegressor`` - this function only reads that attribute).
    out_dir : str or path-like
        Output directory. Created, including missing parents, if needed.
    format : str, optional
        When given, it is forwarded to ``save`` as ``format=...`` and also
        used as the file extension. The name shadows the builtin but is kept
        because it is part of the public signature.
    prefix : str
        Leading part of each output file name.
    **kwargs
        Passed through unchanged to the converted model's ``save`` method.
    """
    out_dir = Path(out_dir)
    # parents=True so nested output locations such as 'build/models' work.
    out_dir.mkdir(parents=True, exist_ok=True)

    if format is not None:
        kwargs['format'] = format

    # The extension does not depend on the estimator, so compute it once.
    ext = '.h' if format is None else '.' + format

    for i, estimator in enumerate(multi.estimators_):
        p = out_dir / f'{prefix}{i}{ext}'
        converted = emlearn.convert(estimator)
        converted.save(file=p, **kwargs)
77+
78+
79+
80+
def main():
    """
    Run the example end to end: load data, train, export and evaluate.

    Downloads and loads the Air Quality dataset, holds out 20% of the rows
    for testing, fits a quantizer + multi-output random forest pipeline,
    writes one converted model per target to 'models/' in CSV format, and
    prints RMSE, MAPE and R² for every target on the held-out rows.
    """
    print('Load dataset...')
    X, y = airquality_load(airquality_download())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print('Training...')
    forest = RandomForestRegressor(n_estimators=10, random_state=42, n_jobs=-1)
    steps = [
        ('scaler', Quantizer()),  # convert data to int16 range
        ('regressor', MultiOutputRegressor(estimator=forest)),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)

    model_dir = 'models/'
    convert_multiregressor(pipeline.named_steps['regressor'], out_dir=model_dir, format='csv')
    print('Models exported to:', model_dir)

    print("Performance Metrics:")
    print("-" * 60)
    # Label the prediction columns with the target names so they can be
    # looked up per target below.
    predictions = pipeline.predict(X_test)
    y_pred = pd.DataFrame(predictions, columns=y_train.columns)
    for target in y.columns:
        actual, predicted = y_test[target], y_pred[target]
        rmse = np.sqrt(mean_squared_error(actual, predicted))
        r2 = r2_score(actual, predicted)
        mape = mean_absolute_percentage_error(actual, predicted) * 100

        print(f"{target:12} | RMSE: {rmse:8.3f} | MAPE: {mape:6.2f}% | R²: {r2:6.3f}")
112+
113+
# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
115+
116+

0 commit comments

Comments
 (0)