Skip to content

Commit 377a740

Browse files
committed
fix(memory): bound RAM in recalculate_backtests and migrate_backtests; export migrate_backtests
- recalculate_backtests: workers now return only the computed metrics + summary instead of the full Backtest (with snapshots/trades/timeseries), and in-flight tasks are bounded to n_workers so memory scales with the number of workers, not len(backtests).
- migrate_backtests: build index.parquet from rows returned by workers instead of re-loading every freshly written bundle into the parent process; bound in-flight tasks to n_workers (replaces ex.map, which buffered the full plan in the executor feeder).
- Export migrate_backtests from the package root.
- Bump version to v8.7.1.
1 parent b7f9ea4 commit 377a740

4 files changed

Lines changed: 138 additions & 65 deletions

File tree

investing_algorithm_framework/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
APPLICATION_DIRECTORY, DataSource, OrderExecutor, PortfolioProvider, \
2525
SnapshotInterval, AWS_S3_STATE_BUCKET_NAME, BacktestEvaluationFocus, \
2626
save_backtests_to_directory, BacktestMetrics, DATA_DIRECTORY, \
27-
retag_backtests, \
27+
retag_backtests, migrate_backtests, \
2828
Blotter, DefaultBlotter, SimulationBlotter, Transaction, \
2929
SlippageModel, NoSlippage, PercentageSlippage, FixedSlippage, \
3030
VolumeImpactSlippage, \
@@ -222,6 +222,7 @@
222222
"load_backtests_from_directory",
223223
"save_backtests_to_directory",
224224
"retag_backtests",
225+
"migrate_backtests",
225226
"DataError",
226227
"create_backtest_metrics_for_backtest",
227228
"recalculate_backtests",

investing_algorithm_framework/domain/backtesting/backtest_utils.py

Lines changed: 74 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
import os
3-
from concurrent.futures import ProcessPoolExecutor, as_completed
3+
from concurrent.futures import ProcessPoolExecutor, as_completed, \
4+
wait, FIRST_COMPLETED
45
from logging import getLogger
56
from pathlib import Path
67
from random import Random
@@ -570,13 +571,14 @@ def load_backtests(
570571
def _migrate_one(args):
571572
"""Worker entry point: open *src* (legacy dir or bundle), write
572573
*dst* as a bundle, optionally delete *src*, return the
573-
destination path.
574+
destination path together with a flat index row.
574575
575576
Doing load+save (and optionally delete) in one worker call keeps
576577
each backtest's data in a single process — avoiding the cost of
577578
pickling fully-decoded Backtest objects back to the parent
578-
process. This roughly halves peak memory usage for large
579-
migrations and is faster end-to-end.
579+
process. We also build the index row here, while the backtest is
580+
still in memory, so the parent never has to re-open the migrated
581+
bundles just to build ``index.parquet``.
580582
"""
581583
src, dst, include_ohlcv, ohlcv_store, delete_source = args
582584
bt = _open_bundle(src) if is_bundle_file(src) else Backtest.open(src)
@@ -585,6 +587,11 @@ def _migrate_one(args):
585587
include_ohlcv=include_ohlcv,
586588
ohlcv_store=ohlcv_store,
587589
))
590+
rel = os.path.basename(out)
591+
row = _backtest_to_index_row(bt, bundle_path=rel)
592+
# Drop the heavy backtest before returning so the worker process's
593+
# RSS can be reclaimed before it picks up the next task.
594+
del bt
588595
if delete_source and os.path.abspath(src) != os.path.abspath(out):
589596
import shutil
590597
if os.path.isdir(src):
@@ -594,7 +601,7 @@ def _migrate_one(args):
594601
os.remove(src)
595602
except OSError:
596603
pass
597-
return out
604+
return out, row
598605

599606

600607
def migrate_backtests(
@@ -681,33 +688,68 @@ def migrate_backtests(
681688
n = len(plan)
682689
resolved_workers = min(_resolve_workers(workers), n)
683690

684-
iterator: object
685-
if resolved_workers > 1:
686-
with ProcessPoolExecutor(max_workers=resolved_workers) as ex:
687-
results = ex.map(_migrate_one, plan)
688-
iterator = tqdm(
689-
results,
690-
total=n,
691-
desc="Migrating backtests",
692-
disable=not show_progress,
693-
)
694-
for _ in iterator:
695-
pass
696-
else:
697-
for args in tqdm(
698-
plan,
699-
total=n,
700-
desc="Migrating backtests",
701-
disable=not show_progress,
702-
):
703-
_migrate_one(args)
704-
705-
if write_index:
706-
# Re-open the freshly written bundles (cheap header reads only
707-
# for the index) to build the parquet manifest.
708-
migrated = load_backtests_from_directory(
709-
dst_dir, workers=workers, show_progress=False,
691+
rows: List[dict] = []
692+
pbar = tqdm(
693+
total=n,
694+
desc="Migrating backtests",
695+
disable=not show_progress,
696+
)
697+
try:
698+
if resolved_workers > 1:
699+
# Bound in-flight tasks to ``resolved_workers`` so we don't
700+
# buffer the full plan inside the executor's feeder, and
701+
# consume results as they finish to keep memory flat.
702+
plan_iter = iter(plan)
703+
with ProcessPoolExecutor(max_workers=resolved_workers) as ex:
704+
inflight = {}
705+
for _ in range(resolved_workers):
706+
try:
707+
args = next(plan_iter)
708+
except StopIteration:
709+
break
710+
inflight[ex.submit(_migrate_one, args)] = args
711+
712+
while inflight:
713+
done_set, _unused = wait(
714+
inflight.keys(), return_when=FIRST_COMPLETED
715+
)
716+
for fut in done_set:
717+
args = inflight.pop(fut)
718+
try:
719+
_out, row = fut.result()
720+
if write_index:
721+
rows.append(row)
722+
except Exception as e:
723+
logger.error(
724+
f"Failed to migrate {args[0]}: {e}"
725+
)
726+
finally:
727+
pbar.update(1)
728+
try:
729+
nxt = next(plan_iter)
730+
except StopIteration:
731+
continue
732+
inflight[ex.submit(_migrate_one, nxt)] = nxt
733+
else:
734+
for args in plan:
735+
try:
736+
_out, row = _migrate_one(args)
737+
if write_index:
738+
rows.append(row)
739+
except Exception as e:
740+
logger.error(f"Failed to migrate {args[0]}: {e}")
741+
finally:
742+
pbar.update(1)
743+
finally:
744+
pbar.close()
745+
746+
if write_index and rows:
747+
import pandas as pd # local import keeps top of module light
748+
df = pd.DataFrame(rows)
749+
df.to_parquet(
750+
Path(dst_dir) / "index.parquet",
751+
index=False,
752+
compression="zstd",
710753
)
711-
_write_index(dst_dir, migrated)
712754

713755
return n

investing_algorithm_framework/services/metrics/generate.py

Lines changed: 61 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import List, Optional
22
from logging import getLogger
33
import os
4-
from concurrent.futures import ProcessPoolExecutor, as_completed
4+
from concurrent.futures import ProcessPoolExecutor, wait, FIRST_COMPLETED
55

66
from investing_algorithm_framework.domain import BacktestMetrics, \
77
BacktestRun, OperationalException, Backtest, BacktestDateRange
@@ -89,23 +89,30 @@ def create_backtest_metrics_for_backtest(
8989
def _recalculate_one(args):
9090
"""Process-pool worker for :func:`recalculate_backtests`.
9191
92-
Must be a module-level function so it pickles. Returns the mutated
93-
backtest so the parent process replaces its in-list reference.
92+
Must be a module-level function so it pickles. Returns only the
93+
freshly computed per-run metrics and summary so the parent can
94+
merge them into the existing backtest object without round-tripping
95+
the full snapshots/trades back through pickle.
9496
"""
9597
backtest, risk_free_rate, metrics = args
9698
rfr = risk_free_rate if risk_free_rate is not None \
9799
else (backtest.risk_free_rate or 0.0)
98100

99-
for run in backtest.get_all_backtest_runs():
100-
run.backtest_metrics = create_backtest_metrics(run, rfr, metrics)
101-
102-
all_metrics = [
103-
run.backtest_metrics
101+
run_metrics = [
102+
create_backtest_metrics(run, rfr, metrics)
104103
for run in backtest.get_all_backtest_runs()
105-
if run.backtest_metrics is not None
106104
]
107-
backtest.backtest_summary = generate_backtest_summary_metrics(all_metrics)
108-
return backtest
105+
summary = generate_backtest_summary_metrics(
106+
[m for m in run_metrics if m is not None]
107+
)
108+
return run_metrics, summary
109+
110+
111+
def _apply_recalc_result(backtest, run_metrics, summary):
112+
runs = backtest.get_all_backtest_runs()
113+
for run, bm in zip(runs, run_metrics):
114+
run.backtest_metrics = bm
115+
backtest.backtest_summary = summary
109116

110117

111118
def recalculate_backtests(
@@ -144,29 +151,52 @@ def recalculate_backtests(
144151

145152
if n_workers <= 1 or len(backtests) <= 1:
146153
for backtest in backtests:
147-
_recalculate_one((backtest, risk_free_rate, metrics))
154+
run_metrics, summary = _recalculate_one(
155+
(backtest, risk_free_rate, metrics)
156+
)
157+
_apply_recalc_result(backtest, run_metrics, summary)
148158
return backtests
149159

150-
# Parallel: each worker mutates and returns the backtest. Replace
151-
# the originals in-place so callers holding the list reference see
152-
# the updated objects.
153-
tasks = [(bt, risk_free_rate, metrics) for bt in backtests]
154-
index_by_id = {id(bt): i for i, bt in enumerate(backtests)}
160+
# Parallel: only keep ``n_workers`` tasks in flight at a time so we
161+
# don't pickle every backtest up-front (which can blow memory for
162+
# large batches with heavy snapshots/trades). Workers return only
163+
# the lightweight metrics + summary, which we merge back into the
164+
# caller's existing backtest objects.
165+
pending = iter(enumerate(backtests))
166+
inflight = {}
167+
168+
def _submit_next(executor):
169+
try:
170+
idx, bt = next(pending)
171+
except StopIteration:
172+
return False
173+
fut = executor.submit(
174+
_recalculate_one, (bt, risk_free_rate, metrics)
175+
)
176+
inflight[fut] = idx
177+
return True
178+
155179
with ProcessPoolExecutor(max_workers=n_workers) as ex:
156-
future_to_task = {
157-
ex.submit(_recalculate_one, t): t for t in tasks
158-
}
159-
for fut in as_completed(future_to_task):
160-
original_bt = future_to_task[fut][0]
161-
try:
162-
updated = fut.result()
163-
except Exception as e: # pragma: no cover - defensive
164-
logger.error(
165-
"Failed to recalculate backtest "
166-
f"{getattr(original_bt, 'algorithm_id', '?')}: {e}"
167-
)
168-
continue
169-
backtests[index_by_id[id(original_bt)]] = updated
180+
for _ in range(n_workers):
181+
if not _submit_next(ex):
182+
break
183+
184+
while inflight:
185+
done_set, _unused = wait(
186+
inflight.keys(), return_when=FIRST_COMPLETED
187+
)
188+
for fut in done_set:
189+
idx = inflight.pop(fut)
190+
bt = backtests[idx]
191+
try:
192+
run_metrics, summary = fut.result()
193+
_apply_recalc_result(bt, run_metrics, summary)
194+
except Exception as e: # pragma: no cover - defensive
195+
logger.error(
196+
"Failed to recalculate backtest "
197+
f"{getattr(bt, 'algorithm_id', '?')}: {e}"
198+
)
199+
_submit_next(ex)
170200

171201
return backtests
172202

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "investing-algorithm-framework"
3-
version = "v8.7.0"
3+
version = "v8.7.1"
44
description = "A framework for creating trading bots"
55
authors = ["MDUYN"]
66
readme = "README.md"

0 commit comments

Comments (0)