plsr: Some examples on real-world data

jonnor · jonnor · commit 4f8b5b830074 · 2025-11-30T22:36:05.000+01:00
diff --git a/airquality_check.py b/airquality_check.py
@@ -0,0 +1,76 @@
+
+import array
+import emlearn_plsr
+import npyfile
+import os
+import os.path
+
+def mean_squared_error(y_true, y_pred):
+    """
+    Mean of the squared differences between paired true and predicted values.
+
+    Pairs are formed with zip(), so pairing stops at the shorter input,
+    while the divisor is always len(y_true).
+    """
+    count = len(y_true)
+    squared_error_sum = 0
+    for actual, predicted in zip(y_true, y_pred):
+        squared_error_sum += (actual - predicted) ** 2
+    return squared_error_sum / count
+
+
+def r2_score(y_true, y_pred):
+    """
+    Coefficient of determination, 1 - SS_res / SS_tot.
+
+    Returns 0.0 when y_true has no variance (SS_tot == 0) instead of
+    dividing by zero.
+    """
+    count = len(y_true)
+    mean_true = sum(y_true) / count
+
+    # Total sum of squares around the mean of the true values
+    total_sum_squares = 0
+    for actual in y_true:
+        total_sum_squares += (actual - mean_true) ** 2
+
+    # Residual sum of squares between true and predicted values
+    residual_sum_squares = 0
+    for actual, predicted in zip(y_true, y_pred):
+        residual_sum_squares += (actual - predicted) ** 2
+
+    if total_sum_squares == 0:
+        return 0.0
+    return 1 - residual_sum_squares / total_sum_squares
+
+
+def load_data(data_dir):
+    """
+    Load the X.npy / y.npy pair stored in data_dir using npyfile.
+
+    Returns (shape_X, X_array, shape_y, y_array): for each file, the shape
+    and the data reported by npyfile.load().
+    """
+    # Both data arrays are expected to come back as array.array('f')
+    x_shape, x_values = npyfile.load(os.path.join(data_dir, "X.npy"))
+    y_shape, y_values = npyfile.load(os.path.join(data_dir, "y.npy"))
+    return x_shape, x_values, y_shape, y_values
+
+
+def run_plsr_reference(data_dir, n_components=5):
+    """
+    Fit an emlearn_plsr model on the data in data_dir and report its fit quality.
+
+    Loads X.npy / y.npy via load_data(), trains a PLSR model with
+    n_components components, predicts every training row again and prints
+    the resulting MSE and R^2. The metrics are computed on the training
+    data itself (there is no train/test split here).
+
+    Returns the tuple (mse, r2).
+    Raises ValueError if X and y do not describe the same number of samples.
+    """
+    # -----------------------------
+    # Load data
+    # -----------------------------
+    shape_X, X_array, shape_y, y_array = load_data(data_dir)
+    n_samples = shape_y[0]
+    n_features = shape_X[1]
+    if shape_X[0] != n_samples:
+        # The row slicing below relies on X holding exactly n_samples rows;
+        # a mismatch would otherwise yield short or empty rows silently.
+        raise ValueError(
+            f"X has {shape_X[0]} samples but y has {n_samples}"
+        )
+
+    # -----------------------------
+    # Create and train model
+    # -----------------------------
+    model = emlearn_plsr.new(n_samples, n_features, n_components)
+    success = emlearn_plsr.fit(
+        model, X_array, y_array,
+        max_iterations=1000,
+        tolerance=1e-5,
+        verbose=0
+    )
+
+    # Fit status as returned by fit(), plus the model's own completeness flag
+    print(success, model.is_complete())
+
+    # -----------------------------
+    # Compute predictions
+    # -----------------------------
+    y_pred = array.array('f')
+    for i in range(n_samples):
+        # X_array is flat and row-major: row i spans n_features consecutive values
+        x_row = X_array[i * n_features:(i + 1) * n_features]
+        y_val = model.predict(x_row)
+        y_pred.append(y_val)
+
+    # -----------------------------
+    # Compute metrics
+    # -----------------------------
+    mse = mean_squared_error(y_array, y_pred)
+    r2 = r2_score(y_array, y_pred)
+
+    print(f"PLSR Reference Results (n_components={n_components}):")
+    print(f"  MSE: {mse:.5f}")
+    print(f"  R^2 score: {r2:.5f}")
+
+    return mse, r2
+
+
+if __name__ == "__main__":
+    # Example usage: each entry is (data_dir, n_components); adjust as needed
+    example_runs = (
+        ("data", 3),
+        ("my_spectrofood_data_L1", 10),
+    )
+    for example_dir, example_components in example_runs:
+        run_plsr_reference(data_dir=example_dir, n_components=example_components)
+
+
diff --git a/airquality_download.py b/airquality_download.py
@@ -0,0 +1,86 @@
+
+import os
+import urllib.request
+import zipfile
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.cross_decomposition import PLSRegression
+from sklearn.metrics import mean_squared_error, r2_score
+
+
+def download_dataset(url="https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip",
+                     data_dir="data",
+                     zip_name="AirQualityUCI.zip"):
+    """
+    Download the dataset archive (unless already present) and extract it.
+
+    The archive is stored as <data_dir>/<zip_name> and extracted into
+    data_dir on every call. data_dir is created if it does not exist.
+
+    Returns data_dir.
+    """
+    os.makedirs(data_dir, exist_ok=True)
+    zip_file = os.path.join(data_dir, zip_name)
+    if not os.path.exists(zip_file):
+        print("Downloading dataset...")
+        # Download under a temporary name and rename afterwards, so that an
+        # interrupted transfer never leaves a truncated archive at zip_file
+        # that later runs would mistake for a complete download.
+        partial_file = zip_file + ".part"
+        try:
+            urllib.request.urlretrieve(url, partial_file)
+            os.replace(partial_file, zip_file)
+        finally:
+            if os.path.exists(partial_file):
+                os.remove(partial_file)
+    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
+        zip_ref.extractall(data_dir)
+    return data_dir
+
+
+def load_and_preprocess(csv_file=None, data_dir="data", feature_cols=None, target_col="CO(GT)"):
+    """
+    Load the air quality CSV, clean it, standardize it and save X.npy / y.npy.
+
+    csv_file: path to the CSV; defaults to <data_dir>/AirQualityUCI.csv.
+    data_dir: directory that receives X.npy and y.npy (created if missing).
+    feature_cols: columns to use as features. When None, every column after
+        the first two is used, except target_col itself.
+    target_col: column to predict.
+
+    Rows containing the value -200 or NaN in any column are dropped.
+    X and y are each standardized with their own StandardScaler.
+
+    Returns (X_scaled, y_scaled, scaler_X, scaler_y); y_scaled has shape (n, 1).
+    """
+    if csv_file is None:
+        csv_file = os.path.join(data_dir, "AirQualityUCI.csv")
+    df = pd.read_csv(csv_file, sep=';', decimal=',')
+    df = df.iloc[:, :-2]  # drop last two empty columns
+    # -200 is turned into NaN so that the affected rows are removed by dropna()
+    df = df.replace(-200, np.nan).dropna()
+
+    if feature_cols is None:
+        # The target must not be part of the default features: leaving it in
+        # leaks the answer into X and makes every score meaningless.
+        default_features = df.iloc[:, 2:].drop(columns=[target_col], errors="ignore")
+        X = default_features.values.astype(np.float32)
+    else:
+        X = df[feature_cols].values.astype(np.float32)
+
+    y = df[target_col].values.astype(np.float32).reshape(-1, 1)
+
+    scaler_X = StandardScaler()
+    X_scaled = np.ascontiguousarray(scaler_X.fit_transform(X))
+
+    scaler_y = StandardScaler()
+    y_scaled = np.ascontiguousarray(scaler_y.fit_transform(y))
+
+    # data_dir may not exist yet when csv_file points somewhere else
+    os.makedirs(data_dir, exist_ok=True)
+    np.save(os.path.join(data_dir, "X.npy"), X_scaled, allow_pickle=False)
+    np.save(os.path.join(data_dir, "y.npy"), y_scaled, allow_pickle=False)
+    return X_scaled, y_scaled, scaler_X, scaler_y
+
+
+def train_and_evaluate(X, y, n_components=5, test_size=0.2, random_state=42, data_dir="data"):
+    """
+    Fit a scikit-learn PLSRegression on a train split and score it on the held-out split.
+
+    The fitted coefficients (pls.coef_) are written to <data_dir>/pls_coef.npy.
+
+    Returns (mse, r2, pls), where pls is the fitted model.
+    """
+    split = train_test_split(X, y, test_size=test_size, random_state=random_state)
+    X_train, X_test, y_train, y_test = split
+
+    pls = PLSRegression(n_components=n_components)
+    pls.fit(X_train, y_train)
+    predictions = pls.predict(X_test)
+
+    # Score on the held-out part only
+    scores = (
+        mean_squared_error(y_test, predictions),
+        r2_score(y_test, predictions),
+    )
+
+    coef_path = os.path.join(data_dir, "pls_coef.npy")
+    np.save(coef_path, pls.coef_)
+
+    return scores[0], scores[1], pls
+
+
+def load_numpy_data(data_dir="data"):
+    """
+    Read back the X.npy and y.npy arrays stored in data_dir.
+
+    Returns (X, y) as loaded by np.load().
+    """
+    return tuple(
+        np.load(os.path.join(data_dir, file_name))
+        for file_name in ("X.npy", "y.npy")
+    )
+
+
+def main():
+    """
+    Download the dataset, preprocess it, fit the reference PLSR model and print its scores.
+    """
+    n_components = 3
+    data_dir = download_dataset()
+
+    # The fitted scalers are returned as well but not needed here
+    X, y, _scaler_X, _scaler_y = load_and_preprocess(data_dir=data_dir)
+    mse, r2, _model = train_and_evaluate(X, y, n_components=n_components, data_dir=data_dir)
+
+    report_lines = [
+        f"PLSR Reference Results (n_components={n_components}):",
+        f"  MSE: {mse:.5f}",
+        f"  R^2: {r2:.5f}",
+    ]
+    print("\n".join(report_lines))
+
+
+# Run the full pipeline in main() only when this file is executed as a script,
+# not when it is imported.
+if __name__ == "__main__":
+    main()
diff --git a/spectrofood_download.py b/spectrofood_download.py
@@ -0,0 +1,133 @@
+import os
+import pandas as pd
+import numpy as np
+import urllib.request
+from io import StringIO
+
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.cross_decomposition import PLSRegression
+from sklearn.metrics import mean_squared_error, r2_score
+
+
+# Direct-download URL of SpectroFood_dataset.csv (Zenodo record 8362947)
+DATA_URL = "https://zenodo.org/records/8362947/files/SpectroFood_dataset.csv?download=1"
+
+def download_dataset(data_dir):
+    """
+    Make sure the SpectroFood CSV exists in data_dir, downloading it if needed.
+
+    data_dir is created if it does not exist. An existing
+    <data_dir>/SpectroFood_dataset.csv is reused without any network access.
+
+    Returns the path to the CSV file.
+    """
+    os.makedirs(data_dir, exist_ok=True)
+    csv_file = os.path.join(data_dir, "SpectroFood_dataset.csv")
+    if not os.path.exists(csv_file):
+        print("Downloading SpectroFood CSV...")
+        # Download under a temporary name and rename afterwards, so that an
+        # interrupted transfer never leaves a truncated file at csv_file
+        # that later runs would mistake for a complete download.
+        partial_file = csv_file + ".part"
+        try:
+            urllib.request.urlretrieve(DATA_URL, partial_file)
+            os.replace(partial_file, csv_file)
+        finally:
+            if os.path.exists(partial_file):
+                os.remove(partial_file)
+    return csv_file
+
+
+def load_spectrofood_chunks(csv_file, target_col="dry_matter", food_col="food"):
+    """
+    Splits CSV into chunks using blank rows as separators.
+    Each chunk is loaded with pandas.read_csv separately.
+
+    A row counts as blank when it is empty or holds nothing but commas and
+    whitespace. Chunks without any data row (e.g. a lone header) are skipped.
+
+    The food name of a chunk is the first value of the food_col column when
+    that column exists, otherwise the first cell of the first data row;
+    spaces are replaced by underscores. All columns are then converted to
+    numbers, with non-numeric values becoming NaN.
+
+    target_col is currently unused and only kept for interface compatibility.
+
+    Returns list of tuples: (food_name, DataFrame)
+    """
+    with open(csv_file, 'r') as f:
+        content = f.read()
+
+    # Group consecutive non-blank rows into raw text blocks.
+    # NOTE(review): splitting on truly empty lines only returned a single chunk
+    # for the downloaded file; delimiter-only rows are now separators as well.
+    # Confirm against the real file that this yields one chunk per food type.
+    raw_chunks = []
+    current_rows = []
+    for row in content.split("\n"):
+        if row.strip(" \t\r,"):
+            current_rows.append(row)
+        elif current_rows:
+            raw_chunks.append("\n".join(current_rows).strip())
+            current_rows = []
+    if current_rows:
+        raw_chunks.append("\n".join(current_rows).strip())
+    print(len(raw_chunks))
+
+    chunks = []
+    for chunk_text in raw_chunks:
+        # Use StringIO to read the chunk as CSV; everything stays a string for now
+        try:
+            df_chunk = pd.read_csv(StringIO(chunk_text), dtype=str, keep_default_na=False)
+        except pd.errors.EmptyDataError:
+            continue  # skip empty chunks
+        if df_chunk.empty:
+            continue  # header without data rows: there is no first row to name it by
+
+        # Determine food name before the numeric conversion wipes text columns
+        if food_col in df_chunk.columns:
+            food_name = df_chunk[food_col].iloc[0].strip().replace(" ", "_")
+        else:
+            food_name = str(df_chunk.iloc[0, 0]).strip().replace(" ", "_")
+
+        # Convert every column to numbers; values that do not parse become NaN
+        df_chunk = df_chunk.apply(pd.to_numeric, errors='coerce')
+        chunks.append((food_name, df_chunk))
+
+    return chunks
+
+def preprocess_chunk(df_chunk, target_col="DRY MATTER"):
+    """
+    Converts one chunk DataFrame to C-contiguous X and y numpy arrays.
+
+    Rows with a non-numeric target, columns without any value and rows that
+    still contain a NaN are removed. X holds every remaining non-object
+    column except the target and is standardized; y is the target as a
+    float32 column vector of shape (n, 1) and is not scaled.
+
+    Returns (X, y).
+    """
+    # Rows whose target cannot be parsed as a number are discarded first
+    target_is_numeric = pd.to_numeric(df_chunk[target_col], errors='coerce').notna()
+    cleaned = df_chunk[target_is_numeric].copy()
+
+    # Remove columns that are entirely NaN, then rows with any NaN left
+    cleaned = cleaned.dropna(axis=1, how='all')
+    cleaned = cleaned.dropna(axis=0, how='any')
+
+    # The target and any object-typed column are not features
+    excluded = []
+    for column in cleaned.columns:
+        if column == target_col or cleaned[column].dtype == object:
+            excluded.append(column)
+
+    features = cleaned.drop(columns=excluded).values.astype(np.float32)
+    targets = cleaned[target_col].values.astype(np.float32).reshape(-1, 1)
+
+    # Standardize the features only
+    features = StandardScaler().fit_transform(features)
+
+    return np.ascontiguousarray(features), np.ascontiguousarray(targets)
+
+def save_all_chunks(chunks, data_dir):
+    """
+    Preprocesses every chunk and saves it as X.npy / y.npy.
+
+    Each chunk gets its own directory named "<data_dir>_<food_name>",
+    i.e. a sibling of data_dir rather than a subdirectory of it.
+    """
+    for food_name, chunk_frame in chunks:
+        features, targets = preprocess_chunk(chunk_frame)
+        dataset_dir = data_dir + "_" + str(food_name)
+        os.makedirs(dataset_dir, exist_ok=True)
+        for file_name, values in (("X.npy", features), ("y.npy", targets)):
+            np.save(os.path.join(dataset_dir, file_name), values)
+        print(f"Saved chunk for {food_name}: {dataset_dir}")
+
+def train_pls_for_chunks(chunks, n_components=10):
+    """
+    Trains a scikit-learn PLSRegression model for each chunk
+    and prints RMSE and R2.
+
+    chunks: iterable of (food_name, DataFrame) tuples.
+    n_components: number of PLS components for every model.
+
+    Each chunk is preprocessed with preprocess_chunk() and split 80/20 with a
+    fixed random_state; both scores are computed on the 20% test part.
+    """
+    for food_name, df in chunks:
+        X, y = preprocess_chunk(df)
+        # Split 80/20
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+
+        # Train PLS
+        pls = PLSRegression(n_components=n_components)
+        pls.fit(X_train, y_train)
+
+        # preprocess_chunk() leaves y unscaled, so no inverse scaling is needed
+        y_pred = pls.predict(X_test)
+
+        # The printed error is the square root of the MSE, so label it RMSE
+        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
+        r2 = r2_score(y_test, y_pred)
+        print(f"{food_name}: PLSRegression n_components={n_components} | RMSE={rmse:.4f} | R2={r2:.4f}")
+
+def main(data_dir="spectrofood_data"):
+    """
+    Downloads the SpectroFood CSV, splits it into chunks, saves each chunk
+    as .npy files and trains a PLS model per chunk.
+    """
+    chunks = load_spectrofood_chunks(download_dataset(data_dir))
+    chunk_count = len(chunks)
+    print(f"Found {chunk_count} chunks (food types)")
+
+    save_all_chunks(chunks, data_dir)
+    train_pls_for_chunks(chunks, n_components=5)
+
+# Script entry point: data_dir is the download location and also the prefix
+# of the per-chunk output directories.
+if __name__ == "__main__":
+    main(data_dir="my_spectrofood_data")
+