emlearn
diff --git a/examples/datasets/cancer/X_test.npy b/examples/datasets/cancer/X_test.npy
16.9 KB
diff --git a/examples/datasets/cancer/X_train.npy b/examples/datasets/cancer/X_train.npy
50 KB
diff --git a/tools/generate_logreg_bc_dataset.py b/examples/datasets/cancer/prepare.py
tools/generate_logreg_bc_dataset.py renamed to examples/datasets/cancer/prepare.py
Lines changed: 13 additions & 10 deletions
diff --git a/examples/datasets/cancer/y_test.npy b/examples/datasets/cancer/y_test.npy
700 Bytes
diff --git a/examples/datasets/cancer/y_train.npy b/examples/datasets/cancer/y_train.npy
1.79 KB
diff --git a/tests/test_logreg_cancer.py b/tests/test_logreg_cancer.py
Lines changed: 79 additions & 0 deletions
@@ -1,25 +1,28 @@
 #!/usr/bin/env python3
-"""Download and preprocess the Breast Cancer Wisconsin dataset for logreg tests."""
+"""Download and preprocess the Breast Cancer Wisconsin dataset."""
 
 from pathlib import Path
+import os
 
 import numpy as np
 from sklearn.datasets import load_breast_cancer
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler
 
-OUTPUT_DIR = Path('data')
-OUTPUT_DIR.mkdir(exist_ok=True)
+def main():
 
-FILENAMES = {
-    'X_train': OUTPUT_DIR / 'logreg_bc_X_train.npy',
-    'X_test': OUTPUT_DIR / 'logreg_bc_X_test.npy',
-    'y_train': OUTPUT_DIR / 'logreg_bc_y_train.npy',
-    'y_test': OUTPUT_DIR / 'logreg_bc_y_test.npy',
-}
+    here = os.path.dirname(__file__)
 
+    OUTPUT_DIR = Path(here)
+    OUTPUT_DIR.mkdir(exist_ok=True)
+
+    FILENAMES = {
+        'X_train': OUTPUT_DIR / 'X_train.npy',
+        'X_test': OUTPUT_DIR / 'X_test.npy',
+        'y_train': OUTPUT_DIR / 'y_train.npy',
+        'y_test': OUTPUT_DIR / 'y_test.npy',
+    }
 
-def main():
     X, y = load_breast_cancer(return_X_y=True)
     scaler = StandardScaler()
     X = scaler.fit_transform(X).astype('float32')
 
@@ -0,0 +1,79 @@
+import array
+import gc
+import emlearn_logreg
+import npyfile
+
+
+def load_flattened(path):
+    """Load the .npy file at `path` as a flat array of floats.
+
+    Returns a ``(shape, values)`` tuple: the shape reported by
+    ``npyfile.load`` and the loaded data wrapped in an ``array.array``
+    with typecode 'f'.
+    """
+    dims, raw = npyfile.load(path)
+    values = array.array('f', raw)
+    return dims, values
+
+
+def predict_class_from_proba(model, features, threshold=0.5):
+    """Turn the output of ``model.predict`` into a hard 0/1 label.
+
+    Returns 1 when the predicted value for `features` is at or above
+    `threshold`, otherwise 0.
+    """
+    if model.predict(features) >= threshold:
+        return 1
+    return 0
+
+
+def accuracy_on_dataset(model, X, y, n_features, threshold=0.5):
+    """Return the fraction of samples in (X, y) that `model` labels correctly.
+
+    `X` is a flat sequence holding `n_features` consecutive values per
+    sample, and `y` holds one label per sample. Each sample is classified
+    with ``predict_class_from_proba`` using `threshold`.
+
+    Raises ValueError when `y` is empty: accuracy is undefined without
+    samples, and the bare division would otherwise fail with an opaque
+    ZeroDivisionError.
+    """
+    n_samples = len(y)
+    if n_samples == 0:
+        raise ValueError('cannot compute accuracy on an empty dataset')
+
+    correct = 0
+    for idx, label in enumerate(y):
+        start = idx * n_features
+        # Copy this sample's row out of the flat feature buffer
+        features = array.array('f', X[start:start + n_features])
+        pred = predict_class_from_proba(model, features, threshold)
+        # Labels may arrive as floats (load_flattened uses typecode 'f'),
+        # so compare them as integers
+        if pred == int(label):
+            correct += 1
+    return correct / n_samples
+
+
+def test_logreg_real_dataset_binary_classification():
+    """Train emlearn_logreg on the stored cancer dataset and check quality.
+
+    Loads the four .npy files under examples/datasets/cancer/, trains a
+    model on the training split, then asserts limits on train/test
+    log-loss and on test accuracy.
+    """
+
+    data_dir = 'examples/datasets/cancer/'
+    paths = {
+        'X_train': data_dir+'X_train.npy',
+        'X_test': data_dir+'X_test.npy',
+        'y_train': data_dir+'y_train.npy',
+        'y_test': data_dir+'y_test.npy',
+    }
+
+    # Run a garbage collection before loading the datasets
+    gc.collect()
+    train_x_shape, train_x = load_flattened(paths['X_train'])
+    train_y_shape, train_y = load_flattened(paths['y_train'])
+    _, test_x = load_flattened(paths['X_test'])
+    test_y_shape, test_y = load_flattened(paths['y_test'])
+
+    feature_count = train_x_shape[1]
+    train_count = train_y_shape[0]
+    test_count = test_y_shape[0]
+
+    # The flat buffers must hold exactly one row of features per label
+    assert len(train_x) == train_count * feature_count
+    assert len(test_x) == test_count * feature_count
+
+    model = emlearn_logreg.new(feature_count, 0.05, 0.001, 0.0005)
+
+    final_iter, final_loss = emlearn_logreg.train(
+        model,
+        train_x,
+        train_y,
+        max_iterations=1500,
+        tolerance=1e-5,
+        check_interval=25,
+        batch_size=64,
+        score_limit=0.28,
+    )
+
+    assert final_iter > 0
+    # NaN is the only value that compares unequal to itself
+    assert final_loss == final_loss  # not NaN
+
+    train_loss = model.score_logloss(train_x, train_y)
+    test_loss = model.score_logloss(test_x, test_y)
+
+    # The measured value is the assert message, so a failure reports it
+    assert train_loss < 0.35, train_loss
+    assert test_loss < 0.4, test_loss
+
+    accuracy = accuracy_on_dataset(model, test_x, test_y, feature_count)
+    assert accuracy > 0.9, accuracy
+
+
+# Allow running this file directly as a script, without a test runner
+if __name__ == '__main__':
+    test_logreg_real_dataset_binary_classification()