|
| 1 | +"""VectorPipelineEngine — Phase 2 (#502). |
| 2 | +
|
| 3 | +Materialises the full long-form OHLCV panel and every declared factor |
| 4 | +**once** over the entire backtest window, instead of rebuilding the |
| 5 | +panel on each event-loop iteration. The resulting long frame can then |
| 6 | +be sliced per bar by callers (e.g. the vector backtest service). |
| 7 | +
|
| 8 | +Compared to :class:`PipelineEngine`: |
| 9 | +
|
| 10 | +- ``evaluate_window`` returns the full ``(datetime, symbol, *factors)`` |
| 11 | + long frame for every bar in the panel — not just ``as_of``. |
| 12 | +- A per-evaluation factor-result cache keyed by ``id(factor)`` ensures that |
| 13 | + shared sub-expressions (e.g. a ``Returns`` reused inside |
| 14 | + ``Returns(...).rank(mask=universe)``) are computed only once per |
| 15 | + evaluation. |
| 16 | +
|
| 17 | +The public Pipeline / Factor / Filter surface from Phase 1 is reused |
| 18 | +unchanged. Strategies that work in event mode also work here. |
| 19 | +""" |
| 20 | +from __future__ import annotations |
| 21 | + |
| 22 | +from datetime import datetime |
| 23 | +from typing import Any, Dict, Mapping, Optional, Type |
| 24 | + |
| 25 | +import polars as pl |
| 26 | + |
| 27 | +from investing_algorithm_framework.domain.pipeline.factor import _EVAL_CACHE |
| 28 | +from investing_algorithm_framework.domain.pipeline.pipeline import Pipeline |
| 29 | + |
| 30 | +from .pipeline_engine import PANEL_COLUMNS, PipelineEngine |
| 31 | + |
| 32 | + |
class VectorPipelineEngine:
    """Pipeline executor for vector-mode backtests (#502).

    The panel is built once and every declared pipeline column is
    evaluated across all of its bars in a single pass. Results come
    back as a long-form ``pl.DataFrame`` laid out as
    ``(datetime, symbol, <factor_1>, ..., <factor_n>)`` and ordered by
    ``(datetime, symbol)``.

    When the pipeline declares a universe, rows whose universe mask is
    not true are removed from the output, and the mask column itself is
    never returned.
    """

    # -- panel construction -------------------------------------------- #
    @staticmethod
    def build_panel(
        data_object: Mapping[str, Any],
        symbol_to_identifier: Mapping[str, str],
        end: Optional[datetime] = None,
    ) -> pl.DataFrame:
        """Build the long-form OHLCV panel via the event-mode engine.

        This is a thin wrapper around :meth:`PipelineEngine.build_panel`
        so both engines construct the panel the same way.

        Args:
            data_object: Per-symbol market data, forwarded unchanged.
            symbol_to_identifier: Symbol-to-identifier mapping,
                forwarded unchanged.
            end: Optional upper bound, forwarded as ``as_of`` (intended
                as an inclusive, no-look-ahead cut-off). ``None`` is
                forwarded as-is.

        Returns:
            The panel produced by :meth:`PipelineEngine.build_panel`.

        There is deliberately no ``start`` argument: earlier bars stay
        in the panel so factors have warmup history. To limit the
        *output* range, pass ``start`` to :meth:`evaluate_window`.
        """
        panel = PipelineEngine.build_panel(
            data_object=data_object,
            symbol_to_identifier=symbol_to_identifier,
            as_of=end,
        )
        return panel

    # -- evaluation ---------------------------------------------------- #
    def evaluate_window(
        self,
        pipeline_cls: Type[Pipeline],
        data_object: Mapping[str, Any],
        symbol_to_identifier: Mapping[str, str],
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
    ) -> pl.DataFrame:
        """Build the panel and evaluate ``pipeline_cls`` over it.

        Args:
            pipeline_cls: Pipeline class whose columns (and optional
                universe) are evaluated.
            data_object: Per-symbol market data used to build the panel.
            symbol_to_identifier: Symbol-to-identifier mapping used to
                build the panel.
            start: Optional lower bound applied to the *output* only;
                rows with ``datetime < start`` are removed after
                evaluation, so earlier bars still serve as warmup.
            end: Optional upper bound applied to the *input* panel.

        Returns:
            Long-form frame with ``datetime``, ``symbol`` and one column
            per declared pipeline column, restricted to rows that pass
            the universe mask (if any) and the ``start`` bound.
        """
        panel = self.build_panel(
            data_object=data_object,
            symbol_to_identifier=symbol_to_identifier,
            end=end,
        )
        long_frame = self.evaluate_panel(pipeline_cls, panel)

        # Nothing to trim when no lower bound was requested or when the
        # evaluation produced no rows at all.
        if start is None or long_frame.is_empty():
            return long_frame

        on_or_after_start = pl.col("datetime") >= pl.lit(start)
        return long_frame.filter(on_or_after_start)

    def evaluate_panel(
        self,
        pipeline_cls: Type[Pipeline],
        panel: pl.DataFrame,
    ) -> pl.DataFrame:
        """Evaluate ``pipeline_cls`` against a panel that already exists.

        Kept separate from :meth:`evaluate_window` so a caller that
        already holds a panel can skip rebuilding it.

        Args:
            pipeline_cls: Pipeline class providing ``get_columns()`` and
                ``get_universe()``.
            panel: Long-form panel; must contain ``datetime`` and
                ``symbol`` columns.

        Returns:
            Long-form frame sorted by ``(datetime, symbol)``. For an
            empty panel, an empty frame from
            :meth:`_empty_long_output` is returned instead.
        """
        if panel.is_empty():
            return self._empty_long_output(pipeline_cls)

        # A fresh result cache is installed for the duration of this
        # call only and is always removed again in ``finally``.
        memo: Dict[tuple, pl.Series] = {}
        token = _EVAL_CACHE.set(memo)
        try:
            long_frame = panel.select(["datetime", "symbol"])

            factor_columns = [
                factor.evaluate(panel).alias(name)
                for name, factor in pipeline_cls.get_columns().items()
            ]
            if factor_columns:
                long_frame = long_frame.with_columns(factor_columns)

            universe = pipeline_cls.get_universe()
            if universe is not None:
                # Attach the mask under a temporary name, keep the rows
                # where it holds, then discard the helper column.
                membership = universe.evaluate(panel).alias("__universe__")
                long_frame = (
                    long_frame.with_columns(membership)
                    .filter(pl.col("__universe__"))
                    .drop("__universe__")
                )
        finally:
            _EVAL_CACHE.reset(token)

        return long_frame.sort(["datetime", "symbol"])

    # -- slicing helpers ----------------------------------------------- #
    @staticmethod
    def slice_at(long_result: pl.DataFrame, as_of: datetime) -> pl.DataFrame:
        """Extract the rows for a single bar, without ``datetime``.

        The result has the ``(symbol, *factors)`` layout and is meant to
        line up with the per-bar output of
        :meth:`PipelineEngine.evaluate`.

        Args:
            long_result: Long-form frame as produced by
                :meth:`evaluate_panel` / :meth:`evaluate_window`.
            as_of: Bar timestamp to select (exact match).

        Returns:
            Rows whose ``datetime`` equals ``as_of``, with the
            ``datetime`` column removed. An empty input is returned
            as-is apart from dropping ``datetime`` when present.
        """
        if long_result.is_empty():
            if "datetime" not in long_result.columns:
                return long_result
            return long_result.drop("datetime")

        at_bar = long_result.filter(pl.col("datetime") == pl.lit(as_of))
        # The filter above already requires a ``datetime`` column, so it
        # is always there to drop at this point.
        return at_bar.drop("datetime")

    # -- helpers ------------------------------------------------------- #
    @staticmethod
    def _empty_long_output(pipeline_cls: Type[Pipeline]) -> pl.DataFrame:
        """Return a zero-row frame with the long-output column layout.

        ``datetime`` is typed ``pl.Datetime``, ``symbol`` is ``pl.Utf8``
        and every name from ``pipeline_cls.get_columns()`` is typed
        ``pl.Float64``.
        """
        schema: Dict[str, Any] = {
            "datetime": pl.Datetime,
            "symbol": pl.Utf8,
            **{name: pl.Float64 for name in pipeline_cls.get_columns()},
        }
        return pl.DataFrame(schema=schema)
| 164 | + |
| 165 | + |
# Public names of this module. ``PANEL_COLUMNS`` is imported from
# ``.pipeline_engine`` and listed here so that it is re-exported.
__all__ = ["VectorPipelineEngine", "PANEL_COLUMNS"]
0 commit comments