Skip to content

Commit 9015978

Browse files
committed
logreg: Cleanup cancer dataset example/test
1 parent 988108e commit 9015978

6 files changed

Lines changed: 92 additions & 10 deletions

File tree

16.9 KB
Binary file not shown.
50 KB
Binary file not shown.
Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,28 @@
11
#!/usr/bin/env python3
2-
"""Download and preprocess the Breast Cancer Wisconsin dataset for logreg tests."""
2+
"""Download and preprocess the Breast Cancer Wisconsin dataset."""
33

44
from pathlib import Path
5+
import os
56

67
import numpy as np
78
from sklearn.datasets import load_breast_cancer
89
from sklearn.model_selection import train_test_split
910
from sklearn.preprocessing import StandardScaler
1011

11-
OUTPUT_DIR = Path('data')
12-
OUTPUT_DIR.mkdir(exist_ok=True)
12+
def main():
1313

14-
FILENAMES = {
15-
'X_train': OUTPUT_DIR / 'logreg_bc_X_train.npy',
16-
'X_test': OUTPUT_DIR / 'logreg_bc_X_test.npy',
17-
'y_train': OUTPUT_DIR / 'logreg_bc_y_train.npy',
18-
'y_test': OUTPUT_DIR / 'logreg_bc_y_test.npy',
19-
}
14+
here = os.path.dirname(__file__)
2015

16+
OUTPUT_DIR = Path(here)
17+
OUTPUT_DIR.mkdir(exist_ok=True)
18+
19+
FILENAMES = {
20+
'X_train': OUTPUT_DIR / 'X_train.npy',
21+
'X_test': OUTPUT_DIR / 'X_test.npy',
22+
'y_train': OUTPUT_DIR / 'y_train.npy',
23+
'y_test': OUTPUT_DIR / 'y_test.npy',
24+
}
2125

22-
def main():
2326
X, y = load_breast_cancer(return_X_y=True)
2427
scaler = StandardScaler()
2528
X = scaler.fit_transform(X).astype('float32')
700 Bytes
Binary file not shown.
1.79 KB
Binary file not shown.

tests/test_logreg_cancer.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import array
2+
import gc
3+
import emlearn_logreg
4+
import npyfile
5+
6+
7+
def load_flattened(path):
    """Load a .npy file and return its shape with the data as a float array.

    `npyfile.load(path)` is unpacked into a shape and a data buffer; the
    buffer is then converted into an `array.array` with typecode 'f'.

    Returns:
        A `(shape, values)` tuple, where `values` is the 'f'-typed array.
    """
    dims, raw = npyfile.load(path)
    flat = array.array('f', raw)
    return dims, flat
10+
11+
12+
def predict_class_from_proba(model, features, threshold=0.5):
    """Turn the model's prediction for one sample into a 0/1 class label.

    Args:
        model: object exposing `predict(features)`; the result is presumably
            a probability-like score (TODO confirm against emlearn_logreg).
        features: feature values for a single sample, passed to `predict`.
        threshold: cut-off; a prediction at or above it maps to class 1.

    Returns:
        1 when `model.predict(features) >= threshold`, otherwise 0.
    """
    if model.predict(features) >= threshold:
        return 1
    return 0
15+
16+
17+
def accuracy_on_dataset(model, X, y, n_features, threshold=0.5):
    """Return the fraction of samples whose predicted class matches the label.

    Args:
        model: model handed to `predict_class_from_proba`.
        X: flattened feature buffer; sample `i` occupies the slice
            `X[i * n_features:(i + 1) * n_features]`.
        y: per-sample labels; each one is compared after `int()` conversion.
        n_features: number of feature values per sample in `X`.
        threshold: decision threshold forwarded to `predict_class_from_proba`.

    Returns:
        Number of correct predictions divided by `len(y)`.

    Raises:
        ZeroDivisionError: if `y` is empty.
    """
    total = len(y)
    hits = 0
    for row, label in enumerate(y):
        begin = row * n_features
        # Copy this sample's feature slice into its own float array.
        sample = array.array('f', X[begin:begin + n_features])
        if predict_class_from_proba(model, sample, threshold) == int(label):
            hits += 1
    return hits / total
27+
28+
29+
def test_logreg_real_dataset_binary_classification():
    """Train a logreg model on the cancer dataset files and check its quality.

    Loads the four preprocessed .npy files from 'examples/datasets/cancer/',
    trains a model created with `emlearn_logreg.new`, and asserts upper
    bounds on train/test log-loss and a lower bound on test accuracy.
    """
    data_dir = 'examples/datasets/cancer/'
    DATA_FILES = {
        key: data_dir + key + '.npy'
        for key in ('X_train', 'X_test', 'y_train', 'y_test')
    }

    # Collect garbage before loading the dataset buffers.
    gc.collect()
    train_x_shape, train_x = load_flattened(DATA_FILES['X_train'])
    train_y_shape, train_y = load_flattened(DATA_FILES['y_train'])
    _, test_x = load_flattened(DATA_FILES['X_test'])
    test_y_shape, test_y = load_flattened(DATA_FILES['y_test'])

    n_features = train_x_shape[1]
    train_count = train_y_shape[0]
    test_count = test_y_shape[0]

    # The flattened feature buffers must hold exactly one row per label.
    assert len(train_x) == train_count * n_features
    assert len(test_x) == test_count * n_features

    # NOTE(review): the meaning of the three positional hyperparameters is
    # defined by emlearn_logreg.new and is not visible here - confirm.
    model = emlearn_logreg.new(n_features, 0.05, 0.001, 0.0005)

    stop_iter, stop_loss = emlearn_logreg.train(
        model,
        train_x,
        train_y,
        max_iterations=1500,
        tolerance=1e-5,
        check_interval=25,
        batch_size=64,
        score_limit=0.28,
    )

    assert stop_iter > 0
    assert stop_loss == stop_loss  # not NaN

    train_loss = model.score_logloss(train_x, train_y)
    test_loss = model.score_logloss(test_x, test_y)

    assert train_loss < 0.35, train_loss
    assert test_loss < 0.4, test_loss

    accuracy = accuracy_on_dataset(model, test_x, test_y, n_features)
    assert accuracy > 0.9, accuracy
76+
77+
78+
# Allow running this test file directly as a script, without a test runner.
if __name__ == '__main__':
    test_logreg_real_dataset_binary_classification()

0 commit comments

Comments
 (0)