Skip to content

Commit 7d6fd7d

Browse files
committed
plsr: Standardize the airquality test
1 parent 38e5eac commit 7d6fd7d

10 files changed

Lines changed: 166 additions & 162 deletions

File tree

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ emlearn_arrayutils_SRC = src/emlearn_arrayutils
6363
emlearn_linreg_SRC = src/emlearn_linreg
6464
emlearn_logreg_SRC = src/emlearn_logreg
6565
emlearn_extratrees_SRC = src/emlearn_extratrees
66+
emlearn_plsr_SRC = src/emlearn_plsr
6667

6768
# Dependencies for each .mpy file: .c, .h, .py files, and Makefile
6869
$(foreach mod,$(MODULES),\

airquality_check.py

Lines changed: 0 additions & 76 deletions
This file was deleted.

airquality_download.py

Lines changed: 0 additions & 86 deletions
This file was deleted.
8.55 KB
Binary file not shown.
33.7 KB
Binary file not shown.
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#!/usr/bin/env python3
2+
"""Download and preprocess the Air Quality UCI dataset for PLS regression.
3+
4+
Also computes sklearn PLSR reference results for comparison.
5+
Run with CPython: python3 examples/datasets/airquality/prepare.py
6+
"""
7+
8+
from pathlib import Path
9+
import os
10+
import urllib.request
11+
import zipfile
12+
13+
import numpy as np
14+
import pandas as pd
15+
from sklearn.model_selection import train_test_split
16+
from sklearn.preprocessing import StandardScaler
17+
from sklearn.cross_decomposition import PLSRegression
18+
from sklearn.metrics import r2_score, mean_squared_error
19+
20+
21+
def main():
    """Prepare the Air Quality UCI dataset and print sklearn PLSR reference scores.

    Steps:
      1. Download ``AirQualityUCI.zip`` next to this script (skipped when the
         archive is already present) and extract it when the CSV is missing.
      2. Load the CSV, drop the two trailing empty columns, treat ``-200`` as
         a missing value and drop every row that contains one.
      3. Standardize the feature matrix, split 80/20 with a fixed seed and
         save ``X_train``/``X_test``/``y_train``/``y_test`` as float32 ``.npy``.
      4. Fit sklearn ``PLSRegression`` with 3 and 5 components and print the
         test MSE and R^2 as reference values.
    """
    here = os.path.dirname(__file__)
    output_dir = Path(here)
    output_dir.mkdir(exist_ok=True)

    # Download
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip"
    zip_path = output_dir / "AirQualityUCI.zip"
    csv_file = output_dir / "AirQualityUCI.csv"
    if not zip_path.exists():
        print("Downloading Air Quality UCI dataset...")
        # Fetch to a temporary name and rename afterwards, so an interrupted
        # transfer never leaves a truncated archive that later runs would
        # mistake for a finished download.
        partial_path = output_dir / "AirQualityUCI.zip.part"
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, zip_path)
    if not csv_file.exists():
        # Extract whenever the CSV is absent, including the case where the
        # archive was downloaded by an earlier run.
        with zipfile.ZipFile(zip_path, 'r') as zf:
            zf.extractall(output_dir)

    # Load and preprocess
    df = pd.read_csv(csv_file, sep=';', decimal=',')
    df = df.iloc[:, :-2]  # drop last two empty columns
    # -200 marks a missing reading; build a new frame instead of mutating the
    # iloc slice in place.
    df = df.replace(-200, np.nan).dropna()

    # NOTE(review): iloc[:, 2:] presumably skips the Date/Time columns, but it
    # then appears to still contain the "CO(GT)" column that is used as the
    # target below (target leakage) - confirm against the CSV header before
    # relying on the reported scores.
    X = df.iloc[:, 2:].values.astype(np.float32)  # sensor columns
    y = df["CO(GT)"].values.astype(np.float32)

    # NOTE(review): the scaler is fit on all rows before the train/test split,
    # so test-set statistics influence the scaling.
    scaler_X = StandardScaler()
    X = scaler_X.fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    filenames = {
        'X_train': output_dir / 'X_train.npy',
        'X_test': output_dir / 'X_test.npy',
        'y_train': output_dir / 'y_train.npy',
        'y_test': output_dir / 'y_test.npy',
    }

    np.save(filenames['X_train'], X_train)
    np.save(filenames['X_test'], X_test)
    np.save(filenames['y_train'], y_train)
    np.save(filenames['y_test'], y_test)

    print('Saved datasets:')
    print(f" X_train: {X_train.shape} -> {filenames['X_train']}")
    print(f" X_test : {X_test.shape} -> {filenames['X_test']}")
    print(f" y_train: {y_train.shape} -> {filenames['y_train']}")
    print(f" y_test : {y_test.shape} -> {filenames['y_test']}")

    # Sklearn PLSR reference results
    print('\nSklearn PLSR reference:')
    for nc in [3, 5]:
        pls = PLSRegression(n_components=nc)
        pls.fit(X_train, y_train)
        y_pred = pls.predict(X_test).ravel()
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f" n_components={nc}: MSE={mse:.5f}, R^2={r2:.5f}")


if __name__ == '__main__':
    main()
792 Bytes
Binary file not shown.
2.71 KB
Binary file not shown.

tests/test_all.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
'test_extratrees_xor',
3838
'test_extratrees_cancer',
3939
'test_extratrees_wine',
40+
'test_plsr_airquality',
4041
]
4142

4243
def main():

tests/test_plsr_airquality.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env python3
2+
"""MicroPython test for PLSR on the Air Quality UCI dataset."""
3+
4+
import array
5+
import emlearn_plsr
6+
import npyfile
7+
8+
9+
# Directory holding the X_train/X_test/y_train/y_test .npy files loaded by the test.
# Relative path - presumably resolved against the repository root; confirm how
# the test runner sets the working directory.
DATA_DIR = 'examples/datasets/airquality/'
10+
11+
12+
def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared differences between two sequences.

    Args:
        y_true: sequence of reference values.
        y_pred: sequence of predicted values, same length as ``y_true``.

    Returns:
        ``sum((y_true[i] - y_pred[i]) ** 2) / len(y_true)``.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    n = len(y_true)
    if n == 0:
        raise ValueError("y_true must not be empty")
    # zip() would silently stop at the shorter input while the divisor stays
    # len(y_true), producing a wrong value - reject the mismatch instead.
    if len(y_pred) != n:
        raise ValueError("y_true and y_pred must have the same length")
    return sum((yi - yi_hat) ** 2 for yi, yi_hat in zip(y_true, y_pred)) / n
15+
16+
17+
def r2_score(y_true, y_pred):
    """Return the coefficient of determination (R^2) of the predictions.

    Computed as ``1 - ss_res / ss_tot``, where ``ss_res`` is the sum of
    squared residuals and ``ss_tot`` is the sum of squared deviations of
    ``y_true`` from its mean.

    Args:
        y_true: sequence of reference values.
        y_pred: sequence of predicted values, same length as ``y_true``.

    Returns:
        The R^2 score, or ``0.0`` when ``y_true`` has zero variance
        (``ss_tot == 0``), where the ratio is undefined.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    n = len(y_true)
    if n == 0:
        raise ValueError("y_true must not be empty")
    # zip() would silently ignore the surplus elements of the longer input.
    if len(y_pred) != n:
        raise ValueError("y_true and y_pred must have the same length")

    y_mean = sum(y_true) / n
    ss_tot = sum((yi - y_mean) ** 2 for yi in y_true)
    ss_res = sum((yi - yi_hat) ** 2 for yi, yi_hat in zip(y_true, y_pred))
    if ss_tot == 0:
        return 0.0
    return 1 - ss_res / ss_tot
23+
24+
25+
def test_plsr_airquality():
    """Test PLSR on Air Quality UCI dataset (regression with 13 features).

    Loads the train/test arrays from ``DATA_DIR``, fits a 3-component
    ``emlearn_plsr`` model, predicts the test set one row at a time and
    asserts that the resulting R^2 is above 0.90.
    """
    print("\n=== Air Quality PLSR Test ===")

    # Load data; each load yields a (shape, values) pair
    shape_X_train, X_train = npyfile.load(DATA_DIR + 'X_train.npy')
    shape_y_train, y_train = npyfile.load(DATA_DIR + 'y_train.npy')
    shape_X_test, X_test = npyfile.load(DATA_DIR + 'X_test.npy')
    shape_y_test, y_test = npyfile.load(DATA_DIR + 'y_test.npy')

    n_train = shape_X_train[0]
    n_features = shape_X_train[1]
    n_test = shape_X_test[0]

    # The prediction loop slices X_test as a flat buffer of n_features-wide
    # rows, so the files must agree on feature and sample counts.
    assert shape_X_test[1] == n_features, "Train/test feature count match"
    assert len(y_train) == n_train, "One training target per sample"
    assert len(y_test) == n_test, "One test target per sample"

    print(f"Loaded: {n_train} train, {n_test} test samples")
    print(f"Features: {n_features}")

    n_components = 3

    # Create and train model
    model = emlearn_plsr.new(n_train, n_features, n_components)
    total_iter, final_metric = emlearn_plsr.fit(
        model, X_train, y_train,
        max_iterations=2000,
        tolerance=1e-5,
        verbose=0,
    )

    assert total_iter > 0, "Some iterations performed"
    assert model.is_complete(), "Training complete"
    print(f"Trained: {total_iter} iterations")

    # Predict on test set, one row (n_features consecutive values) at a time
    y_pred = array.array('f')
    for i in range(n_test):
        row = X_test[i * n_features:(i + 1) * n_features]
        y_pred.append(model.predict(row))

    # Compute metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.5f}")
    print(f"Test R^2: {r2:.5f}")
    print("Target (sklearn PLSR): ~0.97")

    # emlearn PLSR should be close to sklearn (which gets ~0.977)
    r2_threshold = 0.90
    passed = r2 > r2_threshold

    # Print the verdict before asserting: with the assert first, the failure
    # message could never be reached.
    if passed:
        print("✅ GOOD: Solid regression performance on real data!")
    else:
        print("❌ POOR: R^2 below threshold")

    assert passed, "R^2 above 0.90"


if __name__ == '__main__':
    test_plsr_airquality()

0 commit comments

Comments
 (0)