feat(pipeline): add VectorPipelineEngine (#502 phase 2a)

MDUYN · MDUYN · commit 47776c10a6e0 · 2026-05-02T11:29:49.000+02:00
- Add VectorPipelineEngine: evaluates the full long-form panel and
  every declared factor once over the entire backtest window.
- Add per-evaluation factor result cache via a contextvar consulted
  by Factor.evaluate(). Route _Rank/_TopN/_BottomN through it so a
  factor reused as both a column and a universe filter is computed
  only once per panel.
- Wire the same cache into PipelineEngine for parity.
- Equivalence tests: per-bar slice of VectorPipelineEngine matches
  PipelineEngine.evaluate(...) for the same dataset and as_of.

Public API (Pipeline/Factor/Filter) unchanged; strategies that work
in event mode work unchanged in vector mode.
diff --git a/investing_algorithm_framework/domain/pipeline/factor.py b/investing_algorithm_framework/domain/pipeline/factor.py
@@ -14,14 +14,25 @@
 """
 from __future__ import annotations
 
-from typing import List, Optional, TYPE_CHECKING
+from contextvars import ContextVar
+from typing import Dict, List, Optional, TYPE_CHECKING
 
 import polars as pl
 
 if TYPE_CHECKING:  # pragma: no cover - avoid runtime cycle
     from .filter import Filter
 
 
+# Per-evaluation memoisation cache (Phase 2 / #502). The pipeline
+# engines push a fresh dict here while evaluating a panel; nested
+# factors (``_Rank._base``, ``_TopN._base``, …) consult it via
+# :meth:`Factor.evaluate` so that a factor instance shared between a
+# pipeline column and a universe filter is computed only once.
+_EVAL_CACHE: ContextVar[Optional[Dict[tuple, pl.Series]]] = ContextVar(
+    "_pipeline_factor_eval_cache", default=None
+)
+
+
 class Factor:
     """Base class for all factor expressions.
 
@@ -64,6 +75,30 @@ def compute_panel(self, panel: pl.DataFrame) -> pl.Series:
         """
         raise NotImplementedError
 
+    # ------------------------------------------------------------------ #
+    # Cached evaluation (Phase 2 / #502)
+    # ------------------------------------------------------------------ #
+    def evaluate(self, panel: pl.DataFrame) -> pl.Series:
+        """Compute and cache this factor's values on ``panel``.
+
+        Identical to :meth:`compute_panel` when called outside an
+        engine context. When a pipeline engine has installed an
+        evaluation cache (via the ``_EVAL_CACHE`` context var), the
+        result is memoised by ``(id(panel), id(self))`` so that the
+        same factor instance reused as both a column and a filter is
+        only computed once per panel.
+        """
+        cache = _EVAL_CACHE.get()
+        if cache is None:
+            return self.compute_panel(panel)
+        key = (id(panel), id(self))
+        cached = cache.get(key)
+        if cached is not None:
+            return cached
+        values = self.compute_panel(panel)
+        cache[key] = values
+        return values
+
     # ------------------------------------------------------------------ #
     # Cross-sectional ops (Phase 1 surface)
     # ------------------------------------------------------------------ #
@@ -118,12 +153,12 @@ def required_window(self) -> int:
         return int(self.window)
 
     def compute_panel(self, panel: pl.DataFrame) -> pl.Series:
-        values = self._base.compute_panel(panel)
+        values = self._base.evaluate(panel)
         df = panel.select(["datetime", "symbol"]).with_columns(
             values.alias("__rank_input__")
         )
         if self._mask is not None:
-            mask_values = self._mask.compute_panel(panel)
+            mask_values = self._mask.evaluate(panel)
             df = df.with_columns(
                 pl.when(mask_values)
                 .then(pl.col("__rank_input__"))
diff --git a/investing_algorithm_framework/domain/pipeline/filter.py b/investing_algorithm_framework/domain/pipeline/filter.py
@@ -46,7 +46,7 @@ def required_window(self) -> int:
         return int(self.window)
 
     def compute_panel(self, panel: pl.DataFrame) -> pl.Series:
-        values = self._base.compute_panel(panel)
+        values = self._base.evaluate(panel)
         df = panel.select(["datetime", "symbol"]).with_columns(
             values.alias("__topn_input__")
         )
@@ -82,7 +82,7 @@ def required_window(self) -> int:
         return int(self.window)
 
     def compute_panel(self, panel: pl.DataFrame) -> pl.Series:
-        values = self._base.compute_panel(panel)
+        values = self._base.evaluate(panel)
         df = panel.select(["datetime", "symbol"]).with_columns(
             values.alias("__bottomn_input__")
         )
diff --git a/investing_algorithm_framework/services/pipeline/__init__.py b/investing_algorithm_framework/services/pipeline/__init__.py
@@ -1,4 +1,5 @@
 """Pipeline service package."""
 from .pipeline_engine import PipelineEngine
+from .vector_pipeline_engine import VectorPipelineEngine
 
-__all__ = ["PipelineEngine"]
+__all__ = ["PipelineEngine", "VectorPipelineEngine"]
diff --git a/investing_algorithm_framework/services/pipeline/pipeline_engine.py b/investing_algorithm_framework/services/pipeline/pipeline_engine.py
@@ -16,6 +16,7 @@
 import pandas as pd
 import polars as pl
 
+from investing_algorithm_framework.domain.pipeline.factor import _EVAL_CACHE
 from investing_algorithm_framework.domain.pipeline.pipeline import Pipeline
 
 
@@ -108,17 +109,22 @@ def evaluate_at(
         if panel.is_empty():
             return self._empty_output(pipeline_cls)
 
-        result = panel.select(["datetime", "symbol"])
-        for name, factor in pipeline_cls.get_columns().items():
-            values = factor.compute_panel(panel)
-            result = result.with_columns(values.alias(name))
-
-        universe = pipeline_cls.get_universe()
-        if universe is not None:
-            mask = universe.compute_panel(panel)
-            result = result.with_columns(mask.alias("__universe__"))
-            result = result.filter(pl.col("__universe__"))
-            result = result.drop("__universe__")
+        cache: Dict[tuple, pl.Series] = {}
+        token = _EVAL_CACHE.set(cache)
+        try:
+            result = panel.select(["datetime", "symbol"])
+            for name, factor in pipeline_cls.get_columns().items():
+                values = factor.evaluate(panel)
+                result = result.with_columns(values.alias(name))
+
+            universe = pipeline_cls.get_universe()
+            if universe is not None:
+                mask = universe.evaluate(panel)
+                result = result.with_columns(mask.alias("__universe__"))
+                result = result.filter(pl.col("__universe__"))
+                result = result.drop("__universe__")
+        finally:
+            _EVAL_CACHE.reset(token)
 
         # Slice to as_of bar
         result = result.filter(pl.col("datetime") == pl.lit(as_of))
diff --git a/investing_algorithm_framework/services/pipeline/vector_pipeline_engine.py b/investing_algorithm_framework/services/pipeline/vector_pipeline_engine.py
@@ -0,0 +1,166 @@
+"""VectorPipelineEngine — Phase 2 (#502).
+
+Materialises the full long-form OHLCV panel and every declared factor
+**once** over the entire backtest window, instead of rebuilding the
+panel on each event-loop iteration. The resulting long frame can then
+be sliced per bar by callers (e.g. the vector backtest service).
+
+Compared to :class:`PipelineEngine`:
+
+- ``evaluate_window`` returns the full ``(datetime, symbol, *factors)``
+  long frame for every bar in the panel — not just ``as_of``.
+- A per-evaluation factor-result cache keyed by ``(id(panel),
+  id(factor))`` ensures that shared sub-expressions (e.g. a ``Returns``
+  reused inside ``Returns(...).rank(mask=universe)``) are computed only
+  once per evaluation.
+
+The public Pipeline / Factor / Filter surface from Phase 1 is reused
+unchanged. Strategies that work in event mode also work here.
+"""
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Any, Dict, Mapping, Optional, Type
+
+import polars as pl
+
+from investing_algorithm_framework.domain.pipeline.factor import _EVAL_CACHE
+from investing_algorithm_framework.domain.pipeline.pipeline import Pipeline
+
+from .pipeline_engine import PANEL_COLUMNS, PipelineEngine
+
+
+class VectorPipelineEngine:
+    """Vector-mode pipeline executor (#502).
+
+    Build the panel and evaluate every declared factor over the full
+    window in one shot. The output is a long-form
+    ``pl.DataFrame`` with columns
+    ``(datetime, symbol, <factor_1>, ..., <factor_n>)``, sorted by
+    ``(datetime, symbol)``.
+
+    Universe filtering is applied as in event mode — symbols failing
+    the universe mask at a given bar are dropped from that bar's
+    output, and the universe column itself is not exposed.
+    """
+
+    # ------------------------------------------------------------------ #
+    # Panel construction (delegates to event engine for parity)
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def build_panel(
+        data_object: Mapping[str, Any],
+        symbol_to_identifier: Mapping[str, str],
+        end: Optional[datetime] = None,
+    ) -> pl.DataFrame:
+        """Stack per-symbol OHLCV frames into a long-form panel.
+
+        ``end`` is an optional inclusive upper bound (no look-ahead).
+        There is intentionally no ``start`` parameter here — the panel
+        keeps all earlier bars so factors get full warmup. Callers that
+        want to restrict the *output* range should pass ``start`` to
+        :meth:`evaluate_window` instead.
+        """
+        return PipelineEngine.build_panel(
+            data_object=data_object,
+            symbol_to_identifier=symbol_to_identifier,
+            as_of=end,
+        )
+
+    # ------------------------------------------------------------------ #
+    # Evaluation
+    # ------------------------------------------------------------------ #
+    def evaluate_window(
+        self,
+        pipeline_cls: Type[Pipeline],
+        data_object: Mapping[str, Any],
+        symbol_to_identifier: Mapping[str, str],
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
+    ) -> pl.DataFrame:
+        """Evaluate ``pipeline_cls`` over the full window.
+
+        Returns a long-form frame with one row per ``(datetime, symbol)``
+        pair that survives the universe mask, plus one column per
+        declared pipeline output factor.
+
+        ``end`` truncates the input panel (no look-ahead). ``start``
+        only restricts the *output* — earlier bars remain in the panel
+        as warmup so rolling factors compute correctly at ``start``.
+        """
+        panel = self.build_panel(
+            data_object=data_object,
+            symbol_to_identifier=symbol_to_identifier,
+            end=end,
+        )
+        result = self.evaluate_panel(pipeline_cls, panel)
+        if start is not None and not result.is_empty():
+            result = result.filter(pl.col("datetime") >= pl.lit(start))
+        return result
+
+    def evaluate_panel(
+        self,
+        pipeline_cls: Type[Pipeline],
+        panel: pl.DataFrame,
+    ) -> pl.DataFrame:
+        """Evaluate ``pipeline_cls`` over an already-built ``panel``.
+
+        Exposed separately so callers (e.g. the vector backtest service)
+        that already maintain a panel can reuse it without paying the
+        rebuild cost.
+        """
+        if panel.is_empty():
+            return self._empty_long_output(pipeline_cls)
+
+        cache: Dict[tuple, pl.Series] = {}
+        token = _EVAL_CACHE.set(cache)
+        try:
+            result = panel.select(["datetime", "symbol"])
+            for name, factor in pipeline_cls.get_columns().items():
+                values = factor.evaluate(panel)
+                result = result.with_columns(values.alias(name))
+
+            universe = pipeline_cls.get_universe()
+            if universe is not None:
+                mask = universe.evaluate(panel)
+                result = result.with_columns(mask.alias("__universe__"))
+                result = result.filter(pl.col("__universe__"))
+                result = result.drop("__universe__")
+        finally:
+            _EVAL_CACHE.reset(token)
+
+        return result.sort(["datetime", "symbol"])
+
+    # ------------------------------------------------------------------ #
+    # Slicing helpers
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def slice_at(long_result: pl.DataFrame, as_of: datetime) -> pl.DataFrame:
+        """Return the wide ``(symbol, *factors)`` frame for one bar.
+
+        Equivalent to what :meth:`PipelineEngine.evaluate` returns. The
+        ``datetime`` column is dropped so the shape matches event mode.
+        """
+        if long_result.is_empty():
+            return long_result.drop("datetime") \
+                if "datetime" in long_result.columns else long_result
+        sliced = long_result.filter(pl.col("datetime") == pl.lit(as_of))
+        if "datetime" in sliced.columns:
+            sliced = sliced.drop("datetime")
+        return sliced
+
+    # ------------------------------------------------------------------ #
+    # Helpers
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def _empty_long_output(pipeline_cls: Type[Pipeline]) -> pl.DataFrame:
+        schema: Dict[str, Any] = {
+            "datetime": pl.Datetime,
+            "symbol": pl.Utf8,
+        }
+        for name in pipeline_cls.get_columns():
+            schema[name] = pl.Float64
+        return pl.DataFrame(schema=schema)
+
+
+__all__ = ["VectorPipelineEngine", "PANEL_COLUMNS"]
diff --git a/tests/services/pipeline/test_vector_pipeline_engine.py b/tests/services/pipeline/test_vector_pipeline_engine.py