Commit e2510f8

logreg: Support mini-batch training with generator
1 parent 184b1dc commit e2510f8

5 files changed: 388 additions & 67 deletions

src/emlearn_logreg/logreg.py

Lines changed: 126 additions & 58 deletions
@@ -3,26 +3,10 @@
 log_prefix = 'emlearn_logreg:'
 
 
-def _make_workspace_triplet(n_classes):
-    return (
-        array.array('f', [0.0] * n_classes),
-        array.array('f', [0.0] * n_classes),
-        array.array('f', [0.0] * n_classes),
-    )
+def _make_buffer(n):
+    return array.array('f', [0.0] * n)
 
 
-def _make_workspace_pair(n_classes):
-    return (
-        array.array('f', [0.0] * n_classes),
-        array.array('f', [0.0] * n_classes),
-    )
-
-
-def _make_predict_buffers(n_classes):
-    return (
-        array.array('f', [0.0] * n_classes),
-        array.array('f', [0.0] * n_classes),
-    )
 
 def train(model, X_train, y_train,
           max_iterations=200,
@@ -31,12 +15,13 @@ def train(model, X_train, y_train,
           divergence_factor=2.0,
           score_limit=None,
           verbose=0,
-          batch_size=None,
           ):
-    """Mini-batch training loop for logistic regression.
+    """Full-dataset training loop for logistic regression."""
+    if max_iterations <= 0:
+        raise ValueError('max_iterations must be positive')
+    if check_interval <= 0:
+        raise ValueError('check_interval must be positive')
 
-    Copies data into a reusable buffer when mini-batching to limit peak memory.
-    """
     n_features = model.get_n_features()
     n_classes = model.get_n_classes()
     if len(X_train) % n_features != 0:
@@ -48,53 +33,27 @@ def train(model, X_train, y_train,
     if n_samples == 0:
         raise ValueError('y_train is empty')
 
-    if batch_size is None or batch_size <= 0 or batch_size > n_samples:
-        batch_size = n_samples
-
-    logits_buf, probs_buf, bias_buf = _make_workspace_triplet(n_classes)
-    score_logits, score_probs = _make_workspace_pair(n_classes)
-    predict_logits, predict_probs = _make_predict_buffers(n_classes)
+    logits_buf = _make_buffer(n_classes)
+    probs_buf = _make_buffer(n_classes)
+    bias_buf = _make_buffer(n_classes)
+    score_logits = _make_buffer(n_classes)
+    score_probs = _make_buffer(n_classes)
 
     prev_loss = None
     final_loss = float('inf')
     iterations_completed = 0
 
-    use_batches = batch_size < n_samples
-    full_X_view = memoryview(X_train)
-    full_y_view = memoryview(y_train)
-    if use_batches:
-        batch_X = array.array('f', [0.0] * (batch_size * n_features))
-        batch_y = array.array('f', [0.0] * (batch_size * n_classes))
-        batch_X_view = memoryview(batch_X)
-        batch_y_view = memoryview(batch_y)
-    else:
-        batch_X_view = full_X_view
-        batch_y_view = full_y_view
+    X_view = memoryview(X_train)
+    y_view = memoryview(y_train)
 
     for _ in range(max_iterations):
         iterations_completed += 1
-        if use_batches:
-            for start in range(0, n_samples, batch_size):
-                count = min(batch_size, n_samples - start)
-                base_feature = start * n_features
-                base_target = start * n_classes
-                # Copy features for current batch
-                end_f = base_feature + count * n_features
-                batch_X[:count * n_features] = X_train[base_feature:end_f]
-                # Copy targets
-                end_t = base_target + count * n_classes
-                batch_y[:count * n_classes] = y_train[base_target:end_t]
-
-                X_slice = batch_X_view[:count * n_features]
-                y_slice = batch_y_view[:count * n_classes]
-                model.step(X_slice, y_slice, logits_buf, probs_buf, bias_buf)
-        else:
-            model.step(batch_X_view, batch_y_view, logits_buf, probs_buf, bias_buf)
+        model.step(X_view, y_view, logits_buf, probs_buf, bias_buf)
 
         if iterations_completed % check_interval != 0:
             continue
 
-        current_loss = model.score_logloss(full_X_view, full_y_view, score_logits, score_probs)
+        current_loss = model.score_logloss(X_view, y_view, score_logits, score_probs)
         final_loss = current_loss
         change = float('inf') if prev_loss is None else abs(prev_loss - current_loss)
 
@@ -123,7 +82,116 @@ def train(model, X_train, y_train,
         prev_loss = current_loss
 
     if final_loss == float('inf'):
-        final_loss = model.score_logloss(full_X_view, full_y_view, score_logits, score_probs)
+        final_loss = model.score_logloss(X_view, y_view, score_logits, score_probs)
+
+    return iterations_completed, final_loss
+
+
+def train_batches(model,
+                  batch_iter_factory,
+                  max_iterations=200,
+                  tolerance=1e-4,
+                  check_interval=5,
+                  divergence_factor=2.0,
+                  score_limit=None,
+                  verbose=0,
+                  score_batches=None,
+                  ):
+    """Train logistic regression model using externally provided batches.
+
+    batch_iter_factory must be a callable that returns a fresh iterator for each
+    epoch. Each iterator should yield tuples of (X_batch, y_batch) where both are
+    float32 arrays compatible with model.step(). y_batch must be one-hot encoded.
+
+    score_batches is an optional callable taking the model and returning the
+    average log-loss over the data (computed however the caller prefers). When
+    provided, it is used for convergence checking.
+    """
+    if not callable(batch_iter_factory):
+        raise ValueError('batch_iter_factory must be callable')
+    if max_iterations <= 0:
+        raise ValueError('max_iterations must be positive')
+    if check_interval <= 0:
+        raise ValueError('check_interval must be positive')
+    if score_batches is not None and not callable(score_batches):
+        raise ValueError('score_batches must be callable')
+
+    n_features = model.get_n_features()
+    n_classes = model.get_n_classes()
+
+    logits_buf = _make_buffer(n_classes)
+    probs_buf = _make_buffer(n_classes)
+    bias_buf = _make_buffer(n_classes)
+
+    prev_loss = None
+    final_loss = float('inf')
+    iterations_completed = 0
+
+    for _ in range(max_iterations):
+        iterations_completed += 1
+        batches = batch_iter_factory()
+        try:
+            batch_iter = iter(batches)
+        except TypeError:
+            raise ValueError('batch iterator must be iterable')
+
+        batches_processed = 0
+
+        for batch in batch_iter:
+            batches_processed += 1
+            try:
+                X_batch, y_batch = batch
+            except Exception as exc:
+                raise ValueError('each batch must unpack into (X_batch, y_batch)') from exc
+
+            if len(X_batch) == 0:
+                continue
+            if len(X_batch) % n_features != 0:
+                raise ValueError('X_batch size mismatch with n_features')
+            n_samples = len(X_batch) // n_features
+            if len(y_batch) != n_samples * n_classes:
+                raise ValueError('y_batch must be one-hot encoded (len = n_samples * n_classes)')
+
+            model.step(X_batch, y_batch, logits_buf, probs_buf, bias_buf)
+
+        if batches_processed == 0:
+            raise ValueError('batch iterator produced no batches')
+
+        if iterations_completed % check_interval != 0:
+            continue
+        if score_batches is None:
+            continue
+
+        current_loss = float(score_batches(model))
+        final_loss = current_loss
+        change = float('inf') if prev_loss is None else abs(prev_loss - current_loss)
+
+        if verbose >= 2:
+            print(log_prefix, f'Iteration {iterations_completed} loss={current_loss}')
+
+        converged = change < tolerance and iterations_completed > check_interval * 2
+
+        if score_limit is not None and current_loss <= score_limit:
+            converged = True
+
+        diverged = not (current_loss == current_loss)
+        if not diverged and prev_loss is not None:
+            diverged = current_loss > prev_loss * divergence_factor
+
+        if converged:
+            if verbose >= 1:
+                print(log_prefix, f"Converged at iteration {iterations_completed}")
+            break
+
+        if diverged:
+            if verbose >= 1:
+                print(log_prefix, f"Diverged at iteration {iterations_completed}")
+            break
+
+        prev_loss = current_loss
+
+    if score_batches is not None and final_loss == float('inf'):
+        final_loss = float(score_batches(model))
 
     return iterations_completed, final_loss
tests/test_logreg.py

Lines changed: 59 additions & 8 deletions
@@ -160,12 +160,27 @@ def assert_raises_value_error(func, message='Expected ValueError'):
     raise AssertionError(message)
 
 
+def make_batch_factory(X, y, n_features, n_classes, batch_size):
+    n_samples = len(X) // n_features
+
+    def factory():
+        for start in range(0, n_samples, batch_size):
+            count = min(batch_size, n_samples - start)
+            feat_start = start * n_features
+            feat_end = feat_start + count * n_features
+            target_start = start * n_classes
+            target_end = target_start + count * n_classes
+            yield array.array('f', X[feat_start:feat_end]), array.array('f', y[target_start:target_end])
+
+    return factory
+
+
 def test_logreg_train_and_predict():
     X, y = make_dataset()
     model = emlearn_logreg.new(2, 2, 0.5, 0.0, 0.0)
 
     emlearn_logreg.train(model, X, y, max_iterations=400, tolerance=1e-5,
-                         check_interval=5, batch_size=2)
+                         check_interval=5)
 
     logits, probs = alloc_predict_buffers(model)
     run_predict(model, array.array('f', [1, 1]), logits, probs)
@@ -207,7 +222,7 @@ def test_logreg_train_minibatch_reduces_loss():
     initial_loss = model.score_logloss(X, y, logits, probs)
 
     emlearn_logreg.train(model, X, y, max_iterations=600, tolerance=1e-6,
-                         check_interval=10, batch_size=1)
+                         check_interval=10)
 
     final_loss = model.score_logloss(X, y, logits, probs)
     assert final_loss < initial_loss * 0.7, (initial_loss, final_loss)
@@ -270,7 +285,7 @@ def test_logreg_handles_ill_conditioned_features():
     initial_loss = model.score_logloss(X, y, logits, probs)
 
     emlearn_logreg.train(model, X, y, max_iterations=2000, tolerance=1e-6,
-                         check_interval=100, batch_size=4)
+                         check_interval=100)
 
     final_loss = model.score_logloss(X, y, logits, probs)
     assert final_loss < initial_loss, (initial_loss, final_loss)
@@ -289,7 +304,7 @@ def test_logreg_high_dimensional_sparse_case():
     initial_loss = model.score_logloss(X, y, logits, probs)
 
     emlearn_logreg.train(model, X, y, max_iterations=600, tolerance=1e-6,
-                         check_interval=30, batch_size=4)
+                         check_interval=30)
 
     final_loss = model.score_logloss(X, y, logits, probs)
 
@@ -312,6 +327,46 @@ def test_logreg_train_requires_targets():
     assert_raises_value_error(lambda: emlearn_logreg.train(model, X, y))
 
 
+def test_logreg_train_batches_matches_dense_training():
+    X, y = make_linearly_separable_dataset()
+    n_features = 2
+    n_classes = 2
+    batch_size = 2
+    model = emlearn_logreg.new(n_features, n_classes, 0.4, 0.01, 0.0)
+
+    def batch_factory():
+        return make_batch_factory(X, y, n_features, n_classes, batch_size)()
+
+    def scorer(m):
+        logits, probs = alloc_predict_buffers(m)
+        return m.score_logloss(X, y, logits, probs)
+
+    emlearn_logreg.train_batches(
+        model,
+        batch_factory,
+        max_iterations=200,
+        tolerance=1e-6,
+        check_interval=10,
+        score_batches=scorer,
+        score_limit=0.1,
+    )
+
+    logits, probs = alloc_predict_buffers(model)
+    final_loss = model.score_logloss(X, y, logits, probs)
+    assert final_loss < 0.1, final_loss
+
+
+def test_logreg_train_batches_validates_batches():
+    n_features = 2
+    n_classes = 2
+    model = emlearn_logreg.new(n_features, n_classes, 0.2, 0.0, 0.0)
+
+    def bad_factory_missing_batches():
+        return iter(())
+
+    assert_raises_value_error(lambda: emlearn_logreg.train_batches(model, bad_factory_missing_batches))
+
+
 def test_logreg_warm_start_sets_new_weights_and_bias():
     X, y = make_dataset()
     model = emlearn_logreg.new(2, 2, 0.3, 0.0, 0.0)
@@ -346,7 +401,6 @@ def test_logreg_multiclass_softmax_train_set_accuracy():
         max_iterations=1200,
         tolerance=1e-6,
         check_interval=60,
-        batch_size=3,
     )
 
     for idx in range(len(y) // n_classes):
@@ -373,7 +427,6 @@ def test_logreg_multiclass_softmax_generalization():
         max_iterations=1200,
        tolerance=1e-6,
         check_interval=60,
-        batch_size=3,
     )
 
     test_points = [
@@ -399,5 +452,3 @@ def test_logreg_multiclass_softmax_generalization():
     test_logreg_train_validates_dimensions()
     test_logreg_train_requires_targets()
     test_logreg_warm_start_sets_new_weights_and_bias()
-    test_logreg_one_vs_rest_classifies_training_samples()
-    test_logreg_one_vs_rest_generalizes_new_points()

tests/test_logreg_cancer.py

Lines changed: 0 additions & 1 deletion
@@ -77,7 +77,6 @@ def test_logreg_real_dataset_binary_classification():
         max_iterations=1500,
         tolerance=1e-5,
         check_interval=25,
-        batch_size=64,
         score_limit=0.28,
     )
 
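
Callers that relied on the removed batch_size argument, as this test did with
batch_size=64, can keep mini-batch updates by switching to train_batches(). A
hedged migration sketch for the call above, assuming the X, y, model, n_features
and n_classes objects this test already sets up (names are assumptions, not part
of the commit):

    def batch_factory():
        # Fresh generator per epoch, 64 samples per batch as before.
        n_samples = len(X) // n_features
        for start in range(0, n_samples, 64):
            count = min(64, n_samples - start)
            yield (X[start * n_features:(start + count) * n_features],
                   y[start * n_classes:(start + count) * n_classes])

    def scorer(m):
        logits = array.array('f', [0.0] * n_classes)
        probs = array.array('f', [0.0] * n_classes)
        return m.score_logloss(X, y, logits, probs)

    emlearn_logreg.train_batches(model, batch_factory,
                                 max_iterations=1500, tolerance=1e-5,
                                 check_interval=25, score_limit=0.28,
                                 score_batches=scorer)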
