|
| 1 | +import os |
| 2 | +import pandas as pd |
| 3 | +import numpy as np |
| 4 | +import urllib.request |
| 5 | +from io import StringIO |
| 6 | + |
| 7 | +import pandas as pd |
| 8 | +import numpy as np |
| 9 | +from sklearn.model_selection import train_test_split |
| 10 | +from sklearn.preprocessing import StandardScaler |
| 11 | +from sklearn.cross_decomposition import PLSRegression |
| 12 | +from sklearn.metrics import mean_squared_error, r2_score |
| 13 | + |
| 14 | + |
# Direct-download link for the SpectroFood CSV (Zenodo record 8362947).
DATA_URL = "https://zenodo.org/records/8362947/files/SpectroFood_dataset.csv?download=1"


def download_dataset(data_dir):
    """Return the path of the SpectroFood CSV, downloading it on first use.

    The file is fetched into a temporary ``.part`` file and only moved to
    its final name once the download has finished, so an interrupted
    transfer never leaves a truncated CSV that later runs would mistake for
    a valid cached copy.

    Args:
        data_dir: Directory that holds (or will hold) the CSV. It is created
            when it does not exist yet.

    Returns:
        Path of ``SpectroFood_dataset.csv`` inside ``data_dir``.

    Raises:
        OSError: If the directory cannot be created or the download fails
            (``urllib.error.URLError`` is a subclass of ``OSError``).
    """
    os.makedirs(data_dir, exist_ok=True)
    csv_file = os.path.join(data_dir, "SpectroFood_dataset.csv")
    if not os.path.exists(csv_file):
        print("Downloading SpectroFood CSV...")
        partial_file = csv_file + ".part"
        try:
            urllib.request.urlretrieve(DATA_URL, partial_file)
            # Atomic on the same filesystem: the final name appears only
            # once the content is complete.
            os.replace(partial_file, csv_file)
        finally:
            # Clean up whatever a failed download left behind.
            if os.path.exists(partial_file):
                os.remove(partial_file)
    return csv_file
| 24 | + |
| 25 | + |
def load_spectrofood_chunks(csv_file, target_col="dry_matter", food_col="food"):
    """Load a stacked multi-table CSV as one DataFrame per food type.

    The file is read as text and cut into chunks at separator lines. Each
    chunk is parsed on its own with ``pandas.read_csv`` (so each chunk is
    expected to start with its own header row) and every column is then
    coerced to numeric; cells that are not numbers become NaN.

    Args:
        csv_file: Path of the CSV file to read.
        target_col: Not used by this function; kept so existing callers
            that pass it keep working.
        food_col: Name of the column holding the food name. When a chunk
            has no such column, the first cell of its first data row is
            used instead.

    Returns:
        List of ``(food_name, DataFrame)`` tuples in file order. Spaces in
        the food name are replaced by underscores. Chunks without any data
        row are skipped.
    """
    with open(csv_file, 'r') as f:
        content = f.read()

    # A separator is a line that is empty or holds nothing but delimiters,
    # quotes and whitespace (e.g. ",,,,"), which is how spreadsheet exports
    # write an empty row. Splitting on the literal "\n\n" misses those rows
    # and yields a single chunk.
    # NOTE(review): presumably this is the cause of the old "only returns 1
    # chunk" FIXME - confirm against the real file.
    separator_chars = ' \t\r,;"'
    raw_chunks = []
    pending_lines = []
    for line in content.split("\n"):
        if line.strip(separator_chars):
            pending_lines.append(line)
        elif pending_lines:
            raw_chunks.append("\n".join(pending_lines))
            pending_lines = []
    if pending_lines:
        raw_chunks.append("\n".join(pending_lines))

    chunks = []
    for chunk_text in raw_chunks:
        try:
            # Read everything as text first so the food name survives;
            # numeric conversion happens afterwards.
            df_chunk = pd.read_csv(StringIO(chunk_text), dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            continue  # nothing parseable in this chunk

        # A header-only chunk has no first row to take the food name from.
        if df_chunk.empty:
            continue

        if food_col in df_chunk.columns:
            raw_name = df_chunk[food_col].iloc[0]
        else:
            raw_name = str(df_chunk.iloc[0, 0])
        food_name = raw_name.strip().replace(" ", "_")

        # Non-numeric cells (including the food name column) become NaN.
        df_chunk = df_chunk.apply(pd.to_numeric, errors='coerce')
        chunks.append((food_name, df_chunk))

    return chunks
| 60 | + |
def preprocess_chunk(df_chunk, target_col="DRY MATTER"):
    """Turn one chunk into a standardized feature matrix and a target vector.

    Filtering happens in three steps: rows whose target is not numeric are
    removed, then columns without a single value, then every row that still
    contains at least one NaN. All remaining non-object columns except the
    target become features.

    Args:
        df_chunk: DataFrame holding one chunk.
        target_col: Name of the column to use as the regression target.

    Returns:
        Tuple ``(X, y)`` of C-contiguous arrays. ``X`` holds the features
        (cast to float32, then standardized with ``StandardScaler``);
        ``y`` is the unscaled target as a float32 column of shape (n, 1).

    Raises:
        KeyError: If ``target_col`` is not a column of ``df_chunk``.
    """
    target_as_number = pd.to_numeric(df_chunk[target_col], errors='coerce')
    usable = df_chunk[target_as_number.notna()].copy()

    # Empty columns go first; afterwards any row that still has a NaN is
    # removed (how='any', i.e. a single gap is enough to drop the row).
    usable = usable.dropna(axis=1, how='all').dropna(axis=0, how='any')

    non_feature_cols = [
        name
        for name in usable.columns
        if name == target_col or usable[name].dtype == object
    ]
    features = usable.drop(columns=non_feature_cols).values.astype(np.float32)
    y = usable[target_col].values.astype(np.float32).reshape(-1, 1)

    # Only the features are standardized; the target keeps its original scale.
    X = np.ascontiguousarray(StandardScaler().fit_transform(features))
    y = np.ascontiguousarray(y)
    return X, y
| 90 | + |
def save_all_chunks(chunks, data_dir):
    """Preprocess every chunk and write its arrays to disk as ``.npy`` files.

    Each food gets its own directory whose name is ``data_dir`` with
    ``_<food_name>`` appended (a sibling of ``data_dir``, not a
    sub-directory). The directory receives ``X.npy`` and ``y.npy``.

    Args:
        chunks: Iterable of ``(food_name, DataFrame)`` tuples.
        data_dir: Base path used as the prefix of every output directory.
    """
    for food_name, frame in chunks:
        features, target = preprocess_chunk(frame)
        dataset_dir = data_dir + f"_{food_name}"
        os.makedirs(dataset_dir, exist_ok=True)
        for file_name, array in (("X.npy", features), ("y.npy", target)):
            np.save(os.path.join(dataset_dir, file_name), array)
        print(f"Saved chunk for {food_name}: {dataset_dir}")
| 102 | + |
def train_pls_for_chunks(chunks, n_components=10):
    """Fit and evaluate one scikit-learn PLSRegression model per chunk.

    Every chunk is preprocessed, split 80/20 with a fixed seed, fitted on
    the training part and scored on the held-out part. One line per chunk
    is printed with the RMSE and R2 of the test predictions.

    Args:
        chunks: Iterable of ``(food_name, DataFrame)`` tuples.
        n_components: Requested number of PLS components. It is lowered for
            a chunk when the training split has fewer samples or features
            than requested; the printed line shows the value actually used.
    """
    # Smallest chunk that still leaves two training samples after the
    # 80/20 split, which the PLS fit needs.
    # NOTE(review): relies on scikit-learn's minimum-sample check - confirm
    # against the installed version.
    min_samples = 3

    for food_name, df in chunks:
        X, y = preprocess_chunk(df)
        if X.shape[0] < min_samples:
            print(f"{food_name}: skipped, only {X.shape[0]} usable sample(s)")
            continue

        # Split 80/20
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=0
        )

        # PLSRegression rejects more components than the training data can
        # support, so cap the request instead of crashing on small chunks.
        used_components = min(n_components, X_train.shape[0], X_train.shape[1])
        pls = PLSRegression(n_components=used_components)
        pls.fit(X_train, y_train)

        # The target was never scaled, so predictions are already in the
        # target's own units.
        y_pred = pls.predict(X_test)

        # The reported error is the square root of the MSE, so label it RMSE.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        print(
            f"{food_name}: PLSRegression n_components={used_components}"
            f" | RMSE={rmse:.4f} | R2={r2:.4f}"
        )
| 123 | + |
def main(data_dir="spectrofood_data"):
    """Run the full pipeline for the SpectroFood dataset.

    Fetches the CSV into ``data_dir`` (if not already there), splits it into
    per-food chunks, saves the preprocessed arrays and finally trains a PLS
    model per chunk with 5 components.

    Args:
        data_dir: Directory for the CSV; also the prefix of the per-food
            output directories.
    """
    food_chunks = load_spectrofood_chunks(download_dataset(data_dir))
    print(f"Found {len(food_chunks)} chunks (food types)")
    save_all_chunks(food_chunks, data_dir)
    train_pls_for_chunks(food_chunks, n_components=5)


if __name__ == "__main__":
    main(data_dir="my_spectrofood_data")
| 133 | + |
0 commit comments