miranov25
diff --git a/UTILS/dfextensions/AliasDataFrame/AliasDataFrame.py b/UTILS/dfextensions/AliasDataFrame/AliasDataFrame.py
Lines changed: 105 additions & 8 deletions
diff --git a/UTILS/dfextensions/AliasDataFrame/LazyChainReader.py b/UTILS/dfextensions/AliasDataFrame/LazyChainReader.py
Lines changed: 2 additions & 1 deletion
diff --git a/UTILS/dfextensions/AliasDataFrame/LazyTreeReader.py b/UTILS/dfextensions/AliasDataFrame/LazyTreeReader.py
Lines changed: 68 additions & 1 deletion
diff --git a/UTILS/dfextensions/AliasDataFrame/examples/time_series/time_series_draw.py b/UTILS/dfextensions/AliasDataFrame/examples/time_series/time_series_draw.py
Lines changed: 86 additions & 4 deletions
@@ -3803,6 +3803,13 @@ def _analyze_expression(self, expr):
         known_funcs = set(ArrowComputeMapper.FUNC_MAP.keys()) if ArrowComputeMapper else set()
         known_funcs.update(['np', 'numpy', 'math', 'abs', 'int', 'float', 'round', 
                            'min', 'max', 'sum', 'len', 'range', 'True', 'False', 'None'])
+        # Phase 13.58.ADF (D1): registered functions are added at runtime via
+        # register_function(); query the live registry at each parse so a call such as
+        # corr(xM, driftM) treats `corr` as a function, not a column. Querying the registry
+        # (rather than extending a static literal) is what makes runtime-registered names
+        # resolve correctly.
+        if hasattr(self, '_registered_functions'):
+            known_funcs.update(self._registered_functions.keys())
 
         for node in ast.walk(tree):
             if isinstance(node, ast.Name):
@@ -6834,11 +6841,43 @@ def _resolve_to_base_branches(self, columns: set, _visited: set = None) -> set:
 
         return base_branches
 
+    @staticmethod
+    def _split_top_level_colon(expr):
+        """Split a draw expression on top-level ':' separators (dfdraw 'y:x' grammar).
+
+        A ':' is a separator only when it is outside every (), [], {} pair AND outside
+        every quoted string literal. Characters inside a string literal are copied
+        verbatim and never affect the bracket depth, so neither ``tag == 'a:b'`` nor
+        ``f(x, "(")`` confuses the split.
+
+        Phase 13.58.ADF (D1): each part is fed individually to _analyze_expression, which
+        parses it as a Python expression; a raw 'y:x' is not valid Python, so the colon
+        split must happen first.
+
+        Parameters
+        ----------
+        expr : str
+            Draw expression, e.g. ``'corr(xM, driftM):c'``.
+
+        Returns
+        -------
+        list of str
+            The parts in order, without the separating ':' characters. Whitespace is
+            preserved; an expression with no top-level ':' yields a one-element list
+            (``['']`` for the empty string).
+        """
+        parts = []
+        current = []
+        depth = 0
+        quote = None      # active string delimiter (' or "), None when outside a literal
+        escaped = False   # True when the previous character inside a literal was a backslash
+        for ch in expr:
+            if quote is not None:
+                # Inside a string literal: copy verbatim, only watch for its end.
+                current.append(ch)
+                if escaped:
+                    escaped = False
+                elif ch == '\\':
+                    escaped = True
+                elif ch == quote:
+                    quote = None
+                continue
+            if ch == ':' and depth == 0:
+                parts.append(''.join(current))
+                current = []
+                continue
+            if ch in '"\'':
+                quote = ch
+            elif ch in '([{':
+                depth += 1
+            elif ch in ')]}':
+                # Clamp at zero so a stray closer cannot hide later separators.
+                depth = max(0, depth - 1)
+            current.append(ch)
+        parts.append(''.join(current))
+        return parts
+
+
     def get_required_branches(self,
                               expr: str = None,
                               selection: str = None,
                               group_by: str = None,
                               color: str = None,
+                              facet_by=None,
+                              weights=None,
+                              weights_vector=None,
+                              selection_vector=None,
                               aliases: list = None,
                               validate: bool = False) -> set:
         """
@@ -6883,11 +6922,29 @@ def get_required_branches(self,
         """
         all_columns = set()
 
-        # 1. Parse main expression (e.g., 'dEdx:p' → {'dEdx', 'p'})
-        #    Reuse logic from _parse_expr_aliases but get ALL columns (GPT tweak #1)
+        # 1. Parse main expression into column references.
+        #    Phase 13.58.ADF (D1): route expr parsing through the AST analyzer instead of a
+        #    raw ':'-split, so function calls and compound math resolve to their real column
+        #    dependencies (e.g. 'corr(xM, driftM):c' -> {xM, driftM, c}) and registered
+        #    function names are not mistaken for columns. The dfdraw 'y:x' form is split on
+        #    the top-level ':' first (each side is its own Python expression). Falls back to
+        #    the literal token when a part is not parseable or yields no refs, preserving the
+        #    prior behaviour for bare names and aliases (AC-4 regression set).
         if expr:
-            parts = expr.replace(' ', '').split(':')
-            all_columns.update(parts)
+            for part in self._split_top_level_colon(expr):
+                part = part.strip()
+                if not part:
+                    continue
+                analysis = self._analyze_expression(part)
+                refs = set(analysis.get('column_refs', set()))
+                for sf_name, sf_col in analysis.get('subframe_refs', []):
+                    refs.add(f"{sf_name}.{sf_col}")
+                if refs:
+                    all_columns.update(refs)
+                else:
+                    # Not parseable as a Python expression, or a bare literal: keep the
+                    # token so downstream alias/branch resolution still sees it.
+                    all_columns.add(part.replace(' ', ''))
 
         # 2. Parse selection string
         if selection:
@@ -6899,7 +6956,35 @@ def get_required_branches(self,
             all_columns.add(group_by)
         if color and isinstance(color, str):
             all_columns.add(color)
-        
+
+        # 3b. Phase 13.58.ADF (D2): column-name-bearing draw kwargs. Any kwarg whose string
+        #     value is interpreted as a column name must contribute to the required-branch
+        #     set, so a branch referenced ONLY via facet_by/weights/weights_vector/
+        #     selection_vector pre-loads in lazy mode (the silent-empty-figure class). Each
+        #     may be a string or a per-Y list of strings. Integer count kwargs
+        #     (facet_by_bins/_quantiles, group_by_bins/_quantiles) are deliberately NOT
+        #     included — they are bin counts, not column names.
+        def _add_colname_kwarg(value, as_selection=False):
+            """Add the columns referenced by one draw kwarg to ``all_columns``.
+
+            ``value`` may be None (ignored), a single string, or a list/tuple of
+            entries; entries that are not non-empty strings are ignored. With
+            ``as_selection`` the string is handed to ``_parse_selection_columns``;
+            otherwise it goes through ``_analyze_expression`` and, when that yields no
+            references, the string itself is added.
+            """
+            if value is None:
+                return
+            candidates = value if isinstance(value, (list, tuple)) else [value]
+            for name in candidates:
+                if not (isinstance(name, str) and name):
+                    continue
+                if as_selection:
+                    all_columns.update(self._parse_selection_columns(name))
+                else:
+                    parsed = self._analyze_expression(name)
+                    found = set(parsed.get('column_refs', set()))
+                    found.update(
+                        f"{sf_name}.{sf_col}"
+                        for sf_name, sf_col in parsed.get('subframe_refs', [])
+                    )
+                    all_columns.update(found or {name})
+
+        _add_colname_kwarg(facet_by)
+        _add_colname_kwarg(weights)
+        _add_colname_kwarg(weights_vector)
+        _add_colname_kwarg(selection_vector, as_selection=True)
+
         # 4. Add explicit aliases
         if aliases:
             all_columns.update(aliases)
@@ -11339,7 +11424,11 @@ def draw(self,
                 expr=expr,
                 selection=kwargs.get('selection'),
                 group_by=kwargs.get('group_by'),
-                color=kwargs.get('color')
+                color=kwargs.get('color'),
+                facet_by=kwargs.get('facet_by'),
+                weights=kwargs.get('weights'),
+                weights_vector=kwargs.get('weights_vector'),
+                selection_vector=kwargs.get('selection_vector')
             )
             # Load any branches not already loaded
             branches_to_load = required_branches - self._lazy_reader.loaded_branches
@@ -12419,7 +12508,11 @@ def draw_batch(self,
                     expr=merged_spec.get('expr', name),
                     selection=merged_spec.get('selection'),
                     group_by=merged_spec.get('group_by'),
-                    color=merged_spec.get('color')
+                    color=merged_spec.get('color'),
+                    facet_by=merged_spec.get('facet_by'),
+                    weights=merged_spec.get('weights'),
+                    weights_vector=merged_spec.get('weights_vector'),
+                    selection_vector=merged_spec.get('selection_vector')
                 )
                 all_required.update(required)
 
@@ -12754,7 +12847,11 @@ def draw_figures(
                         expr=merged_plot.get('expr', ''),
                         selection=merged_plot.get('selection'),
                         group_by=merged_plot.get('group_by'),
-                        color=merged_plot.get('color')
+                        color=merged_plot.get('color'),
+                        facet_by=merged_plot.get('facet_by'),
+                        weights=merged_plot.get('weights'),
+                        weights_vector=merged_plot.get('weights_vector'),
+                        selection_vector=merged_plot.get('selection_vector')
                     )
                     all_required.update(required)
 
 
@@ -316,7 +316,8 @@ def estimate_memory(self, branches: List[str] = None) -> dict:
         Returns
         -------
         dict
-            'bytes': int, 'human': str, 'warning': str or None
+            'bytes': int, 'human': str, 'branches': int, 'entries': int,
+            'warning': str or None
         """
         if branches is None:
             branches = list(self._available_branches)
 
@@ -8,6 +8,7 @@
 
 import uproot
 import pandas as pd
+import numpy as np
 from typing import List, Set, Optional
 
 
@@ -201,7 +202,73 @@ def get_branch_dtype(self, name: str) -> str:
             raise ValueError(f"Branch '{name}' not in TTree")
         self._open_file()
         return str(self._tree[name].interpretation)
-    
+
+    def estimate_memory(self, branches: List[str] = None) -> dict:
+        """
+        Estimate the memory needed to load branches from a single lazily-read tree.
+
+        Phase 13.58.ADF (D3): single-tree counterpart of LazyChainReader.estimate_memory.
+        Every requested branch that is present in ``available_branches`` contributes
+        ``_branch_itemsize(branch) * num_entries`` bytes, i.e. the per-branch item size
+        is taken from the TTree interpretation rather than assumed. Requested names that
+        are not available add nothing to the byte total.
+
+        Parameters
+        ----------
+        branches : List[str], optional
+            Branches to estimate. None = all available.
+
+        Returns
+        -------
+        dict
+            'bytes': int, 'human': str, 'branches': int, 'entries': int,
+            'warning': str or None. 'branches' is the number of requested names,
+            including any that were skipped as unavailable.
+        """
+        self._open_file()
+        available = self.available_branches
+        n_entries = self.num_entries
+        if branches is None:
+            branches = list(available)
+        total = sum(
+            self._branch_itemsize(name) * n_entries
+            for name in branches
+            if name in available
+        )
+
+        gib = 1024**3
+        if total > 32 * gib:
+            warning = "Estimated memory exceeds 32 GB - consider chunked loading"
+        elif total > 16 * gib:
+            warning = "Estimated memory exceeds 16 GB"
+        else:
+            warning = None
+
+        return dict(
+            bytes=total,
+            human=self._format_bytes(total),
+            branches=len(branches),
+            entries=n_entries,
+            warning=warning,
+        )
+
+    def _branch_itemsize(self, branch: str) -> int:
+        """Bytes-per-entry for a branch, from the real TTree interpretation dtype.
+
+        The interpretation's ``numpy_dtype`` is normalised with ``np.dtype`` and its
+        ``itemsize`` is returned. For a sub-array dtype (presumably how a fixed-size
+        array branch is described -- confirm against the uproot interpretation in use)
+        ``itemsize`` already covers every element of one entry, so the whole-entry size
+        is reported rather than the size of a single element.
+
+        Parameters
+        ----------
+        branch : str
+            Branch name, used as a key into ``self._tree``.
+
+        Returns
+        -------
+        int
+            Bytes occupied by one entry of the branch, or 4 when the dtype cannot be
+            determined.
+        """
+        try:
+            return int(np.dtype(self._tree[branch].interpretation.numpy_dtype).itemsize)
+        except Exception:
+            # Deliberate best-effort: an interpretation without a usable NumPy dtype
+            # must not make the whole estimate fail, so fall back to 4 bytes.
+            return 4
+
+    @staticmethod
+    def _format_bytes(n: int) -> str:
+        """Render a byte count with one decimal and a unit from B up to PB.
+
+        The value is divided by 1024 until its magnitude drops below 1024 or the
+        TB step has been passed; whatever remains after TB is reported in PB.
+        """
+        units = ('B', 'KB', 'MB', 'GB', 'TB')
+        value = n
+        idx = 0
+        while idx < len(units):
+            if -1024 < value < 1024:
+                return f"{value:.1f} {units[idx]}"
+            value = value / 1024
+            idx += 1
+        return f"{value:.1f} PB"
+
     def close(self):
         """Close file handle."""
         if self._file is not None:
 
@@ -43,20 +43,102 @@
 ITS_SEL  = f"{BASE_SEL}&(hasITSTPC)"
 
 
-def build_adf(root_path, sample=None):
-    """Load ADF. build_adf(path, 0.2) gives ~1 min dev run (20% sample)."""
-    adf = root_to_adf(root_path)
+def build_adf(root_path, sample=None, lazy=False, tree_name="tree"):
+    """Load ADF. build_adf(path, 0.2) gives ~1 min dev run (20% sample).
+
+    Phase 13.58.ADF (D4): the additive ``lazy=`` parameter swaps ONLY the constructor --
+    eager ``root_to_adf()`` (default, behaviour unchanged) or lazy ``read_tree_lazy()`` --
+    and shares every post-read step below, so the eager and lazy galleries differ only in
+    data loading (no plotting logic is forked or replicated). ``lazy=True`` requires
+    ``sample=None``: a lazy read makes an N-row frame, ``sample(frac)`` shrinks it, then a
+    branch load returns full-N rows and crashes in ``_merge_loaded_data`` -- sampled-lazy is
+    out of scope (Phase 13.58 §2). ``tree_name`` is used only by the lazy path (the eager
+    default resolution is untouched); pass the gallery file's tree name if it is not
+    ``"tree"``.
+
+    Parameters
+    ----------
+    root_path : str
+        Path handed to ``root_to_adf`` (eager) or ``AliasDataFrame.read_tree_lazy`` (lazy).
+    sample : float, optional
+        Fraction passed to ``DataFrame.sample(frac=..., random_state=42)``; ``None``
+        keeps every row. Must be ``None`` when ``lazy`` is true.
+    lazy : bool, optional
+        Select the lazy constructor instead of the eager one.
+    tree_name : str, optional
+        Tree name forwarded to ``read_tree_lazy``; ignored on the eager path.
+
+    Returns
+    -------
+    AliasDataFrame
+        The frame after metadata, time quantiles and the ``sector`` / ``time_s``
+        aliases have been applied.
+
+    Raises
+    ------
+    ValueError
+        If ``lazy`` is true and ``sample`` is not ``None``.
+    """
+    if lazy:
+        if sample is not None:
+            raise ValueError(
+                "build_adf(lazy=True) does not support sampling (sample must be None): "
+                "sampled-lazy crashes in _merge_loaded_data and is out of scope for "
+                "Phase 13.58.ADF. Run the lazy gallery unsampled."
+            )
+        adf = AliasDataFrame.read_tree_lazy(root_path, tree_name)
+    else:
+        adf = root_to_adf(root_path)
     adf.draw_lazy = True
     apply_meta(adf, df_TimeSeriesAliases)
     apply_meta(adf, df_TimeSeriesMeta)
     addTimeQuantiles(adf, varname="timeMS", step=10000, step2=100)
     if sample is not None:
         adf.df = adf.df.sample(frac=sample, random_state=42).reset_index(drop=True)
     adf.materialize_aliases(names=["sector", "time_s"])
-    print(f"ADF ready: {len(adf.df):,} tracks" + (f" ({int(sample*100)}% sample)" if sample else ""))
+    print(f"ADF ready: {len(adf.df):,} tracks"
+          + (f" ({int(sample*100)}% sample)" if sample else "")
+          + (" [lazy]" if lazy else ""))
     return adf
 
 
+def validate_lazy_vs_eager(root_path, tree_name="tree"):
+    """Phase 13.58.ADF D4 / AC-1 / AC-2 -- gallery lazy-vs-eager double-run (SERVER gate).
+
+    Requires ROOT (eager root_to_adf) + dfdraw; run on the server, unsampled. Asserts:
+      * AC-1 (clean, genuinely lazy): the lazy ADF uses read_tree_lazy, branches load on
+        demand (a figure-only branch like 'ncl' is NOT force-materialized by setup, and
+        IS loaded after the figure that needs it), and every gallery figure renders with
+        no error (AD-TS-DRAW-001).
+      * AC-2 (PP-5 identity): a representative set of draws produce identical stats lazy vs
+        eager at the data/stats level (NOT pixel). The comparison is defensive -- it equates
+        whatever numeric stats both runs return -- so it cannot false-green on a key name.
+        A key whose lazy and eager arrays differ in shape is a failure, not a skip: a
+        different result length is itself a lazy/eager divergence.
+    Sandbox cannot run this (no ROOT/dfdraw); it is the alma2 secondary integration gate.
+    The primary gate is the dedicated synthetic test (tests/test_phase1358_lazy_timeseries.py).
+
+    Parameters
+    ----------
+    root_path : str
+        Input file, forwarded to ``build_adf`` for both the eager and the lazy build.
+    tree_name : str, optional
+        Tree name forwarded to ``build_adf``.
+
+    Raises
+    ------
+    AssertionError
+        If any of the lazy-loading or stats-identity checks fails.
+    """
+    import numpy as np
+    import matplotlib.pyplot as plt
+
+    def _stats_of(result):
+        # The stats dict is taken from index 2 of the returned tuple when present;
+        # any other return shape yields an empty dict (nothing to compare).
+        if isinstance(result, tuple) and len(result) > 2 and isinstance(result[2], dict):
+            return result[2]
+        return {}
+
+    eager = build_adf(root_path, lazy=False, tree_name=tree_name)
+    lazy = build_adf(root_path, lazy=True, tree_name=tree_name)
+    assert lazy._lazy_reader is not None, "lazy build must use read_tree_lazy (not eager-in-disguise)"
+
+    # provably lazy: a figure-only branch must not be force-materialized by setup
+    forced = set(lazy._lazy_reader.loaded_branches)
+    assert "ncl" not in forced, "ncl must not be force-loaded by build_adf setup"
+
+    # AC-2 stats identity on representative draws (return_data=True; defensive key match)
+    checks = [
+        dict(expr="ncl", type="hist", bins=50, selection="ncl>30"),
+        dict(expr="dcar_tpc_vertex:tgl", type="profile", bins=50, selection=BASE_SEL),
+    ]
+    for kw in checks:
+        se = _stats_of(eager.draw(return_data=True, **kw))
+        sl = _stats_of(lazy.draw(return_data=True, **kw))
+        compared = 0
+        for key in (set(se) & set(sl)):
+            try:
+                a = np.asarray(se[key], dtype=float)
+                b = np.asarray(sl[key], dtype=float)
+            except (TypeError, ValueError):
+                # Non-numeric stat (e.g. a label): not part of the numeric identity check.
+                continue
+            assert a.shape == b.shape, (
+                f"lazy != eager stats shape for {kw}, key '{key}': {b.shape} vs {a.shape}")
+            if a.size:
+                assert np.allclose(a, b, equal_nan=True), f"lazy != eager stats for {kw}, key '{key}'"
+                compared += 1
+        assert compared > 0, f"no comparable numeric stats produced for {kw}"
+
+    assert "ncl" in lazy._lazy_reader.loaded_branches, "ncl should load lazily after its draw"
+
+    # AC-1 clean run: every gallery figure renders lazily with no error
+    fig_funcs = [v for k, v in sorted(globals().items())
+                 if k.startswith("fig") and callable(v)]
+    for fn in fig_funcs:
+        fig = fn(lazy)
+        if fig is not None:
+            plt.close(fig)
+    print(f"validate_lazy_vs_eager: OK -- {len(fig_funcs)} figures rendered lazily; "
+          f"lazy==eager stats on {len(checks)} representative draws")
+
+
 def _add(pdf, fig, title):
     """Save figure to PDF with suptitle + PDF bookmark."""
     fig.suptitle(title, fontsize=8, color="gray", y=1.0, ha="left", x=0.01)