Skip to content

Commit 4f8b5b8

Browse files
committed
plsr: Some examples on real-world data
1 parent f1a4af6 commit 4f8b5b8

3 files changed

Lines changed: 295 additions & 0 deletions

File tree

airquality_check.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
2+
import array
3+
import emlearn_plsr
4+
import npyfile
5+
import os
6+
import os.path
7+
8+
def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared differences between two sequences.

    Args:
        y_true: Sized sequence of reference values.
        y_pred: Sized sequence of predicted values, same length as ``y_true``.

    Returns:
        Sum of squared element-wise differences divided by the number of
        elements.

    Raises:
        ValueError: If ``y_true`` is empty or the two lengths differ.
    """
    n = len(y_true)
    if n == 0:
        raise ValueError("y_true must not be empty")
    if len(y_pred) != n:
        # zip() would silently stop at the shorter input while the sum is
        # still divided by len(y_true), producing a wrong (too small) error.
        raise ValueError("y_true and y_pred must have the same length")
    return sum((yi - yi_hat) ** 2 for yi, yi_hat in zip(y_true, y_pred)) / n
11+
12+
13+
def r2_score(y_true, y_pred):
    """Return the coefficient of determination (R^2) of the predictions.

    Args:
        y_true: Sized sequence of reference values.
        y_pred: Sized sequence of predicted values, same length as ``y_true``.

    Returns:
        ``1 - ss_res / ss_tot``, or ``0.0`` when ``y_true`` has zero variance
        (``ss_tot == 0``), where the ratio is undefined.

    Raises:
        ValueError: If ``y_true`` is empty or the two lengths differ.
    """
    n = len(y_true)
    if n == 0:
        raise ValueError("y_true must not be empty")
    if len(y_pred) != n:
        # zip() would silently ignore the surplus elements of the longer input.
        raise ValueError("y_true and y_pred must have the same length")

    y_mean = sum(y_true) / n
    ss_tot = sum((yi - y_mean) ** 2 for yi in y_true)
    if ss_tot == 0:
        return 0.0
    ss_res = sum((yi - yi_hat) ** 2 for yi, yi_hat in zip(y_true, y_pred))
    return 1 - ss_res / ss_tot
19+
20+
21+
def load_data(data_dir):
    """Load the feature and target arrays stored in ``data_dir``.

    Reads ``X.npy`` and ``y.npy`` from ``data_dir`` with ``npyfile.load``,
    which yields a ``(shape, values)`` pair per file.

    Returns:
        Tuple ``(shape_X, X_array, shape_y, y_array)``.
    """
    # Per the original notes, the loaded values are array.array('f') buffers.
    shape_X, X_array = npyfile.load(os.path.join(data_dir, "X.npy"))
    shape_y, y_array = npyfile.load(os.path.join(data_dir, "y.npy"))
    return shape_X, X_array, shape_y, y_array
27+
28+
29+
def run_plsr_reference(data_dir, n_components=5):
    """Fit an emlearn_plsr model on the data in ``data_dir`` and print metrics.

    The model is fitted on every sample from ``X.npy`` / ``y.npy`` and then
    evaluated on those same samples; MSE and R^2 are printed, nothing is
    returned.

    Args:
        data_dir: Directory containing ``X.npy`` and ``y.npy``.
        n_components: Number of PLS components passed to ``emlearn_plsr.new``.
    """
    # --- Load data -------------------------------------------------------
    shape_X, X_array, shape_y, y_array = load_data(data_dir)
    n_features = shape_X[1]
    n_samples = shape_y[0]

    # --- Create and train model -----------------------------------------
    model = emlearn_plsr.new(n_samples, n_features, n_components)
    success = emlearn_plsr.fit(model, X_array, y_array,
                               max_iterations=1000, tolerance=1e-5, verbose=0)

    # Report the fit status alongside the model's own completion flag.
    print(success, model.is_complete())

    # --- Compute predictions --------------------------------------------
    # X_array is indexed as a flat buffer: sample `row` occupies the
    # n_features consecutive values starting at row * n_features.
    y_pred = array.array('f')
    for row in range(n_samples):
        start = row * n_features
        y_pred.append(model.predict(X_array[start:start + n_features]))

    # --- Compute metrics -------------------------------------------------
    mse = mean_squared_error(y_array, y_pred)
    r2 = r2_score(y_array, y_pred)

    print(f"PLSR Reference Results (n_components={n_components}):")
    print(f" MSE: {mse:.5f}")
    print(f" R^2 score: {r2:.5f}")
68+
69+
70+
if __name__ == "__main__":
    # Example runs: each entry is (directory with X.npy / y.npy, component
    # count). Adjust the directories to wherever the data was exported.
    for example_dir, example_components in (
        ("data", 3),
        ("my_spectrofood_data_L1", 10),
    ):
        run_plsr_reference(data_dir=example_dir, n_components=example_components)
75+
76+

airquality_download.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
2+
import os
3+
import urllib.request
4+
import zipfile
5+
import pandas as pd
6+
import numpy as np
7+
from sklearn.model_selection import train_test_split
8+
from sklearn.preprocessing import StandardScaler
9+
from sklearn.cross_decomposition import PLSRegression
10+
from sklearn.metrics import mean_squared_error, r2_score
11+
12+
13+
def download_dataset(url="https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip",
                     data_dir="data",
                     zip_name="AirQualityUCI.zip"):
    """Download the dataset archive (if not cached) and extract it.

    Args:
        url: Location of the zip archive.
        data_dir: Directory that receives the archive and its contents;
            created if missing.
        zip_name: File name used for the cached archive inside ``data_dir``.

    Returns:
        ``data_dir``, unchanged.

    Raises:
        urllib.error.URLError: If the download fails.
        zipfile.BadZipFile: If the cached archive is not a valid zip file.
    """
    os.makedirs(data_dir, exist_ok=True)
    zip_file = os.path.join(data_dir, zip_name)

    if not os.path.exists(zip_file):
        print("Downloading dataset...")
        # Download to a temporary name and rename on success, so an
        # interrupted transfer is never mistaken for a complete cached archive.
        partial_file = zip_file + ".part"
        try:
            urllib.request.urlretrieve(url, partial_file)
            os.replace(partial_file, zip_file)
        finally:
            if os.path.exists(partial_file):
                os.remove(partial_file)

    # Always (re-)extract: this is idempotent and restores extracted files
    # that were deleted or never written while the archive is still cached.
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(data_dir)
    return data_dir
24+
25+
26+
def load_and_preprocess(csv_file=None, data_dir="data", feature_cols=None, target_col="CO(GT)"):
    """Load the air-quality CSV, clean it, standardize it and save X/y.

    Rows containing the value -200 (treated here as a missing-value marker)
    or NaN in any column are discarded. Features and target are standardized
    separately and written to ``X.npy`` / ``y.npy`` inside ``data_dir``.

    Args:
        csv_file: Path of the CSV; defaults to ``AirQualityUCI.csv`` inside
            ``data_dir``.
        data_dir: Directory that receives ``X.npy`` and ``y.npy``.
        feature_cols: Column names to use as features. When ``None``, every
            column from the third one onwards is used, except ``target_col``.
        target_col: Name of the column to predict.

    Returns:
        Tuple ``(X_scaled, y_scaled, scaler_X, scaler_y)``.
    """
    if csv_file is None:
        csv_file = os.path.join(data_dir, "AirQualityUCI.csv")
    df = pd.read_csv(csv_file, sep=';', decimal=',')
    df = df.iloc[:, :-2]  # drop last two empty columns
    # Non-inplace operations: avoids mutating a slice of the parsed frame.
    df = df.replace(-200, np.nan).dropna()

    if feature_cols is None:
        # The default slice must not contain the target itself, otherwise the
        # value to predict leaks into the features and the fit is trivial.
        feature_frame = df.iloc[:, 2:].drop(columns=[target_col], errors="ignore")
    else:
        feature_frame = df[feature_cols]
    X = feature_frame.values.astype(np.float32)

    y = df[target_col].values.astype(np.float32).reshape(-1, 1)

    scaler_X = StandardScaler()
    X_scaled = np.ascontiguousarray(scaler_X.fit_transform(X))

    scaler_y = StandardScaler()
    y_scaled = np.ascontiguousarray(scaler_y.fit_transform(y))

    # data_dir may not exist yet when an explicit csv_file was supplied.
    os.makedirs(data_dir, exist_ok=True)
    np.save(os.path.join(data_dir, "X.npy"), X_scaled, allow_pickle=False)
    np.save(os.path.join(data_dir, "y.npy"), y_scaled, allow_pickle=False)
    return X_scaled, y_scaled, scaler_X, scaler_y
50+
51+
52+
def train_and_evaluate(X, y, n_components=5, test_size=0.2, random_state=42, data_dir="data"):
    """Fit a scikit-learn PLSRegression on a train split and score the rest.

    The fitted coefficients are also written to ``pls_coef.npy`` inside
    ``data_dir``.

    Args:
        X: Feature matrix.
        y: Target values.
        n_components: Number of PLS components.
        test_size: Fraction of samples held out for evaluation.
        random_state: Seed forwarded to ``train_test_split``.
        data_dir: Directory that receives ``pls_coef.npy``.

    Returns:
        Tuple ``(mse, r2, fitted_model)`` computed on the held-out split.
    """
    split = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_fit, X_holdout, y_fit, y_holdout = split

    regressor = PLSRegression(n_components=n_components)
    regressor.fit(X_fit, y_fit)
    holdout_pred = regressor.predict(X_holdout)

    holdout_mse = mean_squared_error(y_holdout, holdout_pred)
    holdout_r2 = r2_score(y_holdout, holdout_pred)

    coef_path = os.path.join(data_dir, "pls_coef.npy")
    np.save(coef_path, regressor.coef_)

    return holdout_mse, holdout_r2, regressor
67+
68+
69+
def load_numpy_data(data_dir="data"):
    """Read back ``X.npy`` and ``y.npy`` from ``data_dir``.

    Returns:
        Tuple ``(X, y)`` of the arrays loaded with ``np.load``.
    """
    features_path = os.path.join(data_dir, "X.npy")
    targets_path = os.path.join(data_dir, "y.npy")
    features = np.load(features_path)
    targets = np.load(targets_path)
    return features, targets
73+
74+
75+
def main():
    """Download the air-quality data, export X/y and print reference metrics."""
    n_components = 3

    data_dir = download_dataset()
    # The fitted scalers are returned as well but are not needed here.
    X, y, _scaler_X, _scaler_y = load_and_preprocess(data_dir=data_dir)
    mse, r2, _model = train_and_evaluate(
        X, y, n_components=n_components, data_dir=data_dir
    )

    report_lines = (
        f"PLSR Reference Results (n_components={n_components}):",
        f" MSE: {mse:.5f}",
        f" R^2: {r2:.5f}",
    )
    for line in report_lines:
        print(line)


if __name__ == "__main__":
    main()

spectrofood_download.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import os
2+
import pandas as pd
3+
import numpy as np
4+
import urllib.request
5+
from io import StringIO
6+
7+
import pandas as pd
8+
import numpy as np
9+
from sklearn.model_selection import train_test_split
10+
from sklearn.preprocessing import StandardScaler
11+
from sklearn.cross_decomposition import PLSRegression
12+
from sklearn.metrics import mean_squared_error, r2_score
13+
14+
15+
DATA_URL = "https://zenodo.org/records/8362947/files/SpectroFood_dataset.csv?download=1"
16+
17+
def download_dataset(data_dir):
    """Download the SpectroFood CSV into ``data_dir`` unless already cached.

    Args:
        data_dir: Directory that receives ``SpectroFood_dataset.csv``;
            created if missing.

    Returns:
        Path of the CSV file inside ``data_dir``.

    Raises:
        urllib.error.URLError: If the download fails.
    """
    os.makedirs(data_dir, exist_ok=True)
    csv_file = os.path.join(data_dir, "SpectroFood_dataset.csv")
    if not os.path.exists(csv_file):
        print("Downloading SpectroFood CSV...")
        # Download to a temporary name and rename on success, so an
        # interrupted transfer is never mistaken for a complete cached file.
        partial_file = csv_file + ".part"
        try:
            urllib.request.urlretrieve(DATA_URL, partial_file)
            os.replace(partial_file, csv_file)
        finally:
            if os.path.exists(partial_file):
                os.remove(partial_file)
    return csv_file
24+
25+
26+
def load_spectrofood_chunks(csv_file, target_col="dry_matter", food_col="food"):
    """
    Splits CSV into chunks using blank lines as separators.
    Each chunk is loaded with pandas.read_csv separately.

    A line counts as blank when it is empty or holds nothing but whitespace
    and commas (e.g. the ",,," row a spreadsheet export writes for an empty
    row). Chunks that contain a header but no data rows are skipped.

    Args:
        csv_file: Path of the CSV file to read.
        target_col: Currently unused; kept for interface compatibility.
        food_col: Column whose first value names the chunk; when the column
            is absent, the first column of the chunk is used instead.

    Returns list of tuples: (food_name, DataFrame), where every DataFrame
    column has been coerced to numeric (non-numeric cells become NaN).
    """
    with open(csv_file, 'r') as f:
        content = f.read()

    raw_chunks = _split_on_blank_lines(content)
    # NOTE(review): the original FIXME reported that only one chunk was found
    # when splitting on an exact "\n\n"; separator detection is now more
    # tolerant, but the result should be verified against the real file.
    print(len(raw_chunks))

    chunks = []
    for chunk_text in raw_chunks:
        try:
            df_chunk = pd.read_csv(StringIO(chunk_text), dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            continue  # skip empty chunks
        if df_chunk.empty:
            continue  # header only: there is no first row to take a name from

        # Food name: first value of food_col, else of the first column
        if food_col in df_chunk.columns:
            food_name = df_chunk[food_col].iloc[0].strip().replace(" ", "_")
        else:
            food_name = str(df_chunk.iloc[0, 0]).strip().replace(" ", "_")

        # Convert every column to numeric; unparsable cells become NaN
        df_chunk = df_chunk.apply(pd.to_numeric, errors='coerce')
        chunks.append((food_name, df_chunk))

    return chunks


def _split_on_blank_lines(content):
    """Split text into blocks of consecutive non-blank lines."""
    blocks = []
    current = []
    for line in content.split("\n"):
        # Nothing left after removing whitespace and commas => separator line
        if line.strip(" \t\r,"):
            current.append(line)
        elif current:
            blocks.append("\n".join(current).strip())
            current = []
    if current:
        blocks.append("\n".join(current).strip())
    return blocks
60+
61+
def preprocess_chunk(df_chunk, target_col="DRY MATTER"):
    """
    Turns one chunk DataFrame into C-contiguous float32 arrays.

    Returns (X, y): X holds the standardized feature columns, y the target
    column as a single-column array. The target is left unscaled.
    """
    # Keep only the rows whose target value parses as a number
    target_is_numeric = pd.to_numeric(df_chunk[target_col], errors='coerce').notna()
    cleaned = df_chunk[target_is_numeric].copy()

    # Remove columns with no values at all, then every row that still has
    # at least one missing value
    cleaned = cleaned.dropna(axis=1, how='all').dropna(axis=0, how='any')

    # Features are all remaining columns except the target and any column
    # that still has object dtype
    non_feature_cols = [
        name for name in cleaned.columns
        if name == target_col or cleaned[name].dtype == object
    ]
    features = cleaned.drop(columns=non_feature_cols).values.astype(np.float32)
    target = cleaned[target_col].values.astype(np.float32).reshape(-1, 1)

    # Standardize the features only
    feature_scaler = StandardScaler()
    features = feature_scaler.fit_transform(features)

    return np.ascontiguousarray(features), np.ascontiguousarray(target)
90+
91+
def save_all_chunks(chunks, data_dir):
    """
    Preprocesses every chunk and stores it as X.npy / y.npy.

    Each chunk gets its own directory named "<data_dir>_<food_name>",
    created when missing.
    """
    for food_name, frame in chunks:
        X, y = preprocess_chunk(frame)
        dataset_dir = data_dir + f'_{food_name}'
        os.makedirs(dataset_dir, exist_ok=True)
        for file_name, values in (("X.npy", X), ("y.npy", y)):
            np.save(os.path.join(dataset_dir, file_name), values)
        print(f"Saved chunk for {food_name}: {dataset_dir}")
102+
103+
def train_pls_for_chunks(chunks, n_components=10):
    """
    Trains a scikit-learn PLSRegression model for each chunk
    and prints RMSE and R2 measured on a held-out 20% split.

    Args:
        chunks: Iterable of (food_name, DataFrame) tuples.
        n_components: Number of PLS components for every model.
    """
    for food_name, df in chunks:
        X, y = preprocess_chunk(df)
        # Split 80/20
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

        # Train PLS
        pls = PLSRegression(n_components=n_components)
        pls.fit(X_train, y_train)

        # Predict on the held-out samples
        y_pred = pls.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        # The printed value is the square root of the MSE, so label it RMSE.
        rmse = np.sqrt(mse)
        print(f"{food_name}: PLSRegression n_components={n_components} | RMSE={rmse:.4f} | R2={r2:.4f}")
123+
124+
def main(data_dir="spectrofood_data"):
    """Download SpectroFood, export each chunk as .npy and fit reference models.

    Args:
        data_dir: Download directory; also the prefix of the per-chunk
            output directories.
    """
    chunks = load_spectrofood_chunks(download_dataset(data_dir))
    chunk_count = len(chunks)
    print(f"Found {chunk_count} chunks (food types)")

    save_all_chunks(chunks, data_dir)
    train_pls_for_chunks(chunks, n_components=5)


if __name__ == "__main__":
    main(data_dir="my_spectrofood_data")
133+

0 commit comments

Comments
 (0)