Skip to content

Commit 2291498

Browse files
committed
linreg: Combine reference code
1 parent 3870650 commit 2291498

2 files changed

Lines changed: 253 additions & 268 deletions

File tree

prepare_california.py

Lines changed: 253 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,30 @@
55
Saves scaled train/test splits as .npy files.
66
"""
77

8+
import numpy as np
9+
from sklearn.linear_model import ElasticNet
10+
from sklearn.metrics import mean_squared_error, r2_score
11+
import time
12+
13+
814
import numpy as np
915
from sklearn.datasets import fetch_california_housing
1016
from sklearn.model_selection import train_test_split
1117
from sklearn.preprocessing import StandardScaler
1218

13-
def prepare_california_housing_data():
19+
def prepare_california_housing_data(sample=None):
1420
"""Download, preprocess and save California housing dataset."""
1521

1622
print("Downloading California housing dataset...")
1723
# Load the dataset
1824
housing = fetch_california_housing()
1925
X, y = housing.data, housing.target
2026

27+
if sample is not None:
28+
indices = np.random.choice(X.shape[0], size=sample, replace=False)
29+
X = X[indices]
30+
y = y[indices]
31+
2132
print(f"Dataset shape: X={X.shape}, y={y.shape}")
2233
print(f"Features: {housing.feature_names}")
2334
print(f"Target: median house value in hundreds of thousands of dollars")
@@ -67,10 +78,246 @@ def prepare_california_housing_data():
6778

6879

6980

70-
if __name__ == "__main__":
81+
def load_data(data_dir="."):
    """Load the preprocessed California housing splits from ``.npy`` files.

    Reads ``X_train.npy``, ``X_test.npy``, ``y_train.npy`` and ``y_test.npy``
    and prints the shape of every split plus the dtypes of the training
    arrays.

    Args:
        data_dir: Directory that contains the four ``.npy`` files. Defaults
            to the current working directory, which is where the files were
            looked up before this parameter existed.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.
    """
    import os  # function-scope import keeps this block self-contained

    print("Loading data...")
    # Same load order as before: X_train, X_test, y_train, y_test.
    X_train, X_test, y_train, y_test = (
        np.load(os.path.join(data_dir, f"{name}.npy"))
        for name in ("X_train", "X_test", "y_train", "y_test")
    )

    print(f"Train set: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set: X={X_test.shape}, y={y_test.shape}")
    print(f"Data types: X={X_train.dtype}, y={y_train.dtype}")

    return X_train, X_test, y_train, y_test
94+
95+
def test_elasticnet_configurations():
    """Fit a fixed set of regularization settings and tabulate their metrics.

    Loads the saved splits via ``load_data()``, fits one model per entry in
    the configuration table, and prints one row per configuration with train
    MSE, test MSE, test R² and the elapsed time (fit + predict + scoring).

    Returns:
        List of dicts, one per configuration, with keys ``config``,
        ``alpha``, ``l1_ratio``, ``train_mse``, ``test_mse``, ``r2``,
        ``time`` and ``model`` (the fitted estimator).
    """
    # Imported once here instead of on every loop iteration.
    from sklearn.linear_model import LinearRegression

    X_train, X_test, y_train, y_test = load_data()

    # Test configurations: (alpha, l1_ratio, description)
    configs = [
        (0.0, 0.0, "No regularization (OLS)"),
        (0.01, 0.0, "Ridge (alpha=0.01)"),
        (0.01, 1.0, "LASSO (alpha=0.01)"),
        (0.01, 0.5, "ElasticNet (alpha=0.01, l1_ratio=0.5)"),
        (0.001, 0.5, "ElasticNet (alpha=0.001, l1_ratio=0.5)"),
        (0.1, 0.5, "ElasticNet (alpha=0.1, l1_ratio=0.5)"),
    ]

    print("\n" + "="*70)
    print("ElasticNet Configuration Comparison")
    print("="*70)
    print(f"{'Configuration':<35} {'Train MSE':<12} {'Test MSE':<12} {'R²':<8} {'Time':<8}")
    print("-"*70)

    results = []

    for alpha, l1_ratio, description in configs:
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by wall-clock adjustments the way time.time() can.
        start_time = time.perf_counter()

        # Create and train model
        if alpha == 0.0:
            # alpha == 0 means no penalty: use plain least squares instead
            # of ElasticNet for the unregularized baseline.
            model = LinearRegression()
        else:
            model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=2000, random_state=42)

        model.fit(X_train, y_train)

        # Make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Calculate metrics
        train_mse = mean_squared_error(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        elapsed_time = time.perf_counter() - start_time

        print(f"{description:<35} {train_mse:<12.6f} {test_mse:<12.6f} {test_r2:<8.3f} {elapsed_time:<8.3f}")

        results.append({
            'config': description,
            'alpha': alpha,
            'l1_ratio': l1_ratio,
            'train_mse': train_mse,
            'test_mse': test_mse,
            'r2': test_r2,
            'time': elapsed_time,
            'model': model,
        })

    return results
156+
157+
def detailed_analysis_best_model(results):
    """Print a detailed report for the configuration with the lowest test MSE.

    Args:
        results: Non-empty list of result dicts as produced by
            ``test_elasticnet_configurations()``. Each dict must provide the
            keys ``config``, ``alpha``, ``l1_ratio``, ``test_mse``, ``r2``
            and ``model``.

    Returns:
        The dict from ``results`` with the smallest ``test_mse``.

    Raises:
        ValueError: If ``results`` is empty.
    """
    if not results:
        # min() would also raise ValueError, but with a message that does
        # not say which argument was at fault.
        raise ValueError("results must contain at least one configuration")

    # Find best model by test MSE
    best_result = min(results, key=lambda entry: entry['test_mse'])
    print("\n" + "="*50)
    print("Detailed Analysis - Best Model")
    print("="*50)
    print(f"Best configuration: {best_result['config']}")
    print(f"Alpha: {best_result['alpha']}, L1 ratio: {best_result['l1_ratio']}")
    print(f"Test MSE: {best_result['test_mse']:.6f}")
    print(f"Test RMSE: {np.sqrt(best_result['test_mse']):.6f}")
    print(f"Test R²: {best_result['r2']:.6f}")

    model = best_result['model']

    # Load data again for detailed analysis; only the test split is used.
    _, X_test, _, y_test = load_data()

    # Show coefficients (if available)
    if hasattr(model, 'coef_'):
        print("\nModel coefficients:")
        feature_names = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
                         'Population', 'AveOccup', 'Latitude', 'Longitude']
        for name, coef in zip(feature_names, model.coef_):
            print(f" {name:12}: {coef:8.4f}")

        if hasattr(model, 'intercept_'):
            print(f" {'Intercept':12}: {model.intercept_:8.4f}")

        # Count non-zero coefficients. Kept under the coef_ guard because
        # the computation reads model.coef_.
        non_zero = np.sum(np.abs(model.coef_) > 1e-6)
        print(f"\nSparsity: {non_zero}/{len(model.coef_)} non-zero coefficients")

    # Sample predictions
    print("\nSample predictions (first 10 test samples):")
    y_test_pred = model.predict(X_test)
    print(f"{'Actual':<10} {'Predicted':<10} {'Error':<10}")
    print("-"*30)
    # zip() over the sliced arrays stops early when fewer than 10 samples exist.
    for actual, predicted in zip(y_test[:10], y_test_pred[:10]):
        error = abs(actual - predicted)
        print(f"{actual:<10.3f} {predicted:<10.3f} {error:<10.3f}")

    return best_result
203+
204+
def compare_with_micropython_format(n_small=100):
    """Print ElasticNet metrics in a layout meant for side-by-side comparison.

    For each of three fixed ``(alpha, l1_ratio)`` settings, fits an
    ``ElasticNet`` on the training split from ``load_data()`` and prints the
    MSE on a leading subset of the training data, on the full training data
    and on the test split, followed by the first test prediction and a
    summary of the learned parameters.

    Args:
        n_small: Number of leading training rows used for the "small subset"
            MSE. Defaults to 100, the value that was previously hard-coded.
    """
    print("\n" + "="*50)
    print("MicroPython Module Comparison Format")
    print("="*50)

    X_train, X_test, y_train, y_test = load_data()

    # Test with parameters similar to what MicroPython module might use
    configs_mp = [
        (0.01, 0.5, "emlearn_linreg equivalent 1"),
        (0.001, 0.5, "emlearn_linreg equivalent 2"),
        (0.1, 0.5, "emlearn_linreg equivalent 3"),
    ]

    # The subset does not depend on the configuration, so slice it once.
    X_small = X_train[:n_small]
    y_small = y_train[:n_small]

    for alpha, l1_ratio, description in configs_mp:
        print(f"\nTesting: {description}")
        print(f"Parameters: alpha={alpha}, l1_ratio={l1_ratio}")

        # Train model
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=2000, random_state=42)
        model.fit(X_train, y_train)

        # Test on small subset (like MicroPython test)
        small_mse = mean_squared_error(y_small, model.predict(X_small))
        full_train_mse = mean_squared_error(y_train, model.predict(X_train))
        test_mse = mean_squared_error(y_test, model.predict(X_test))

        # Report the real subset size: the slice is shorter than n_small
        # whenever the training split has fewer rows than requested.
        print(f" Small subset MSE ({len(X_small)} samples): {small_mse:.6f}")
        print(f" Full training MSE: {full_train_mse:.6f}")
        print(f" Test MSE: {test_mse:.6f}")

        # Show first sample prediction for debugging
        first_pred = model.predict(X_test[:1])[0]
        first_actual = y_test[0]
        print(f" First test sample: actual={first_actual:.3f}, predicted={first_pred:.3f}")

        # Show learned parameters
        print(f" Learned bias: {model.intercept_:.6f}")
        print(f" Weight range: [{model.coef_.min():.6f}, {model.coef_.max():.6f}]")
        print(f" Non-zero weights: {np.sum(np.abs(model.coef_) > 1e-6)}/{len(model.coef_)}")
250+
251+
def create_reference_outputs():
    """Fit the reference ElasticNet and persist its parameters and metrics.

    Trains ``ElasticNet(alpha=0.01, l1_ratio=0.5)`` on the training split
    returned by ``load_data()``, prints the learned intercept, coefficients,
    train/test MSE and the first test prediction, and stores the same values
    in ``reference_results.npz``.

    Returns:
        Dict with keys ``alpha``, ``l1_ratio``, ``intercept``,
        ``coefficients``, ``train_mse``, ``test_mse``,
        ``first_test_prediction`` and ``first_test_actual``.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Reference Outputs for MicroPython Validation")
    print(banner)

    X_train, X_test, y_train, y_test = load_data()

    # Use specific parameters for reference
    alpha = 0.01
    l1_ratio = 0.5

    print(f"Reference model: alpha={alpha}, l1_ratio={l1_ratio}")

    regressor = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=2000, random_state=42)
    regressor.fit(X_train, y_train)

    # Evaluate in the same order in which the values are stored below.
    train_error = mean_squared_error(y_train, regressor.predict(X_train))
    test_error = mean_squared_error(y_test, regressor.predict(X_test))
    first_prediction = regressor.predict(X_test[:1])[0]

    # Save reference results
    reference_data = dict(
        alpha=alpha,
        l1_ratio=l1_ratio,
        intercept=regressor.intercept_,
        coefficients=regressor.coef_,
        train_mse=train_error,
        test_mse=test_error,
        first_test_prediction=first_prediction,
        first_test_actual=y_test[0],
    )

    print(f"Intercept: {reference_data['intercept']:.8f}")
    print(f"Coefficients: {reference_data['coefficients']}")
    print(f"Training MSE: {reference_data['train_mse']:.8f}")
    print(f"Test MSE: {reference_data['test_mse']:.8f}")
    print(f"First test prediction: {reference_data['first_test_prediction']:.8f}")
    print(f"First test actual: {reference_data['first_test_actual']:.8f}")

    # Save to file for MicroPython comparison
    np.savez('reference_results.npz', **reference_data)
    print("\nReference results saved to 'reference_results.npz'")

    return reference_data
292+
293+
294+
def main():
    """Run the whole reference workflow and print a closing summary.

    Steps, in order: prepare a 4000-row sample of the dataset, compare the
    regularization configurations, analyse the best one in detail, print the
    MicroPython-oriented comparison, and write the reference outputs.
    """
    # Prepare the data. The return value is not needed here because the
    # later steps obtain their arrays through load_data().
    prepare_california_housing_data(sample=4000)

    # Test different configurations
    results = test_elasticnet_configurations()

    # Detailed analysis of best model
    best_result = detailed_analysis_best_model(results)

    # Compare with MicroPython format
    compare_with_micropython_format()

    # Create reference outputs (called for its printing and file-writing
    # side effects; the returned dict is not used in the summary).
    create_reference_outputs()

    print("\n" + "="*60)
    print("Summary")
    print("="*60)
    print(f"Best overall performance: {best_result['config']}")
    print(f"Best test MSE: {best_result['test_mse']:.6f}")
    print(f"Target for MicroPython module: MSE < {best_result['test_mse']:.3f}")
    print("\nFiles created:")
    print("- reference_results.npz (for MicroPython validation)")


if __name__ == "__main__":
    main()
323+

0 commit comments

Comments
 (0)