|
| 1 | +import array |
| 2 | +import gc |
| 3 | +import emlearn_logreg |
| 4 | +import npyfile |
| 5 | + |
| 6 | + |
def load_flattened(path):
    """Load the array stored at *path* and return ``(shape, values)``.

    ``npyfile.load`` is unpacked as a ``(shape, buffer)`` pair; the shape is
    passed through unchanged and the buffer is copied into a new
    ``array.array`` with typecode ``'f'``.
    """
    dims, raw = npyfile.load(path)
    # NOTE(review): presumably the buffer is already one-dimensional in
    # storage order -- confirm against the npyfile documentation.
    values = array.array('f', raw)
    return dims, values
| 10 | + |
| 11 | + |
def predict_class_from_proba(model, features, threshold=0.5):
    """Turn the output of ``model.predict(features)`` into a 0/1 label.

    Returns 1 when the predicted value is greater than or equal to
    *threshold*, otherwise 0.
    """
    if model.predict(features) >= threshold:
        return 1
    return 0
| 15 | + |
| 16 | + |
def accuracy_on_dataset(model, X, y, n_features, threshold=0.5):
    """Return the fraction of samples whose predicted class matches ``y``.

    *X* is treated as a flat sequence in which sample ``i`` occupies
    ``X[i * n_features:(i + 1) * n_features]``. *y* supplies one label per
    sample, and each label is converted with ``int()`` before comparison.
    Predictions come from ``predict_class_from_proba`` using *threshold*.

    Raises ZeroDivisionError when *y* is empty.
    """
    total = len(y)
    hits = 0
    offset = 0
    for label in y:
        # Copy this sample's slice into its own float array for the model.
        row = array.array('f', X[offset:offset + n_features])
        if predict_class_from_proba(model, row, threshold) == int(label):
            hits += 1
        offset += n_features
    return hits / total
| 27 | + |
| 28 | + |
def test_logreg_real_dataset_binary_classification():
    """Train a logistic-regression model on the cancer dataset and check it.

    Loads the train/test splits from ``examples/datasets/cancer/``, trains a
    model created with ``emlearn_logreg.new`` and asserts that training ran,
    that the reported loss is not NaN, that the log-loss on both splits is
    below fixed limits, and that test accuracy exceeds 0.9.
    """
    data_dir = 'examples/datasets/cancer/'
    paths = {
        'X_train': data_dir + 'X_train.npy',
        'X_test': data_dir + 'X_test.npy',
        'y_train': data_dir + 'y_train.npy',
        'y_test': data_dir + 'y_test.npy',
    }

    # Collect garbage before the four dataset buffers are allocated.
    gc.collect()
    train_x_shape, train_x = load_flattened(paths['X_train'])
    train_y_shape, train_y = load_flattened(paths['y_train'])
    _, test_x = load_flattened(paths['X_test'])
    test_y_shape, test_y = load_flattened(paths['y_test'])

    n_features = train_x_shape[1]
    n_train = train_y_shape[0]
    n_test = test_y_shape[0]

    # The flattened feature buffers must hold exactly one row per label.
    assert len(train_x) == n_train * n_features
    assert len(test_x) == n_test * n_features

    # NOTE(review): the three positional floats are presumably the learning
    # rate and regularisation strengths -- confirm against emlearn_logreg.new.
    model = emlearn_logreg.new(n_features, 0.05, 0.001, 0.0005)

    stop_iter, stop_loss = emlearn_logreg.train(
        model,
        train_x,
        train_y,
        max_iterations=1500,
        tolerance=1e-5,
        check_interval=25,
        batch_size=64,
        score_limit=0.28,
    )

    assert stop_iter > 0
    # NaN is the only float that compares unequal to itself.
    assert stop_loss == stop_loss

    loss_on_train = model.score_logloss(train_x, train_y)
    loss_on_test = model.score_logloss(test_x, test_y)

    assert loss_on_train < 0.35, loss_on_train
    assert loss_on_test < 0.4, loss_on_test

    test_accuracy = accuracy_on_dataset(model, test_x, test_y, n_features)
    assert test_accuracy > 0.9, test_accuracy
| 76 | + |
| 77 | + |
# Run the test directly when this file is executed as a script.
if __name__ == '__main__':
    test_logreg_real_dataset_binary_classification()
0 commit comments