Skip to content

Commit 7e063b2

Browse files
author
miranov25
committed
feat(AliasDataFrameRDF): Phase 2 complete - RDataFrame integration
- 46/46 tests passing (35 unit + 11 integration)
- 1-key, 2-key, 3-key subframe joins verified
- AST-based expression conversion to C++
- Topological sort with cycle detection
- Composite key linearization for N>2 keys

TODO: Add sparse key support via unique value mapping (requested by Marian for large/non-contiguous key ranges).

Prepares for RNTuple migration (TTree::Draw deprecated).

Co-authored-by: Claude (Architect/Coder)
Reviewed-by: GPT (Performance)
Reviewed-by: Gemini (C++/ROOT)
1 parent ccbd8af commit 7e063b2

3 files changed

Lines changed: 501 additions & 12 deletions

File tree

UTILS/dfextensions/AliasDataFrame/AliasDataFrameRDF.py

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -358,16 +358,27 @@ def get_ordered_defines(
358358
ValueError
359359
If circular dependency detected
360360
"""
361-
# Get schema
362-
if schema is None and aDF is not None:
363-
schema = aDF.schema
364-
365-
if schema is None:
361+
# Get all aliases - prefer aDF.aliases property which handles schema properly
362+
if aDF is not None and hasattr(aDF, 'aliases'):
363+
# AliasDataFrame stores aliases in _schema["columns"] with "expr" key
364+
# The .aliases property returns {name: expr} dict
365+
all_aliases = aDF.aliases
366+
elif schema is not None:
367+
# Fallback: try 'aliases' key or extract from 'columns'
368+
if 'aliases' in schema:
369+
all_aliases = schema['aliases']
370+
elif 'columns' in schema:
371+
# Extract aliases from columns (entries with 'expr' key)
372+
all_aliases = {
373+
k: v.get('expr', v) if isinstance(v, dict) else v
374+
for k, v in schema['columns'].items()
375+
if isinstance(v, dict) and 'expr' in v
376+
}
377+
else:
378+
all_aliases = {}
379+
else:
366380
raise ValueError("Must provide either aDF or schema")
367381

368-
# Get all aliases from schema
369-
all_aliases = schema.get('aliases', {})
370-
371382
# If specific aliases requested, use them; otherwise all
372383
if aliases is None:
373384
aliases = list(all_aliases.keys())
@@ -389,8 +400,7 @@ def get_ordered_defines(
389400
# Build result list
390401
result = []
391402
for name in ordered:
392-
info = all_aliases.get(name, {})
393-
expr = info.get('expr', '') if isinstance(info, dict) else str(info)
403+
expr = all_aliases.get(name, '')
394404
deps = extract_dependencies(expr, set(all_aliases.keys()))
395405
cpp_expr = to_cpp_expr(expr)
396406

@@ -534,8 +544,8 @@ def setup_tree_with_friends(
534544
print(f"Warning: Subframe '{sf_name}' not found")
535545
continue
536546

537-
# Get index columns
538-
index_cols = sf_info.get('index_columns', [])
547+
# Get index columns - schema uses 'index' key
548+
index_cols = sf_info.get('index', sf_info.get('index_columns', []))
539549

540550
if len(index_cols) == 0:
541551
print(f"Warning: Subframe '{sf_name}' has no index columns")
Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
"""
2+
Pytest fixtures for AliasDataFrameRDF tests.
3+
4+
Provides session-scoped test data with all 4 subframes and proper indices.
5+
"""
6+
7+
import pytest
8+
import numpy as np
9+
import pandas as pd
10+
import os
11+
import sys
12+
13+
# Put the directory one level above this file at the front of sys.path so the
# function-scope `from AliasDataFrame import AliasDataFrame` imports below
# can resolve.
_this_dir = os.path.split(os.path.abspath(__file__))[0]
_parent_dir = os.path.split(_this_dir)[0]
if _parent_dir not in sys.path:
    sys.path.insert(0, _parent_dir)
18+
19+
20+
def create_rdf_test_data(filepath: str):
    """
    Create test data with all 4 subframes and proper indices.

    Mirrors real calibration structure.

    Structure:
    - Main tree: 10,000 rows
    - Subframe T: 1-key index (track_tf_uid), 100 entries
    - Subframe R: 1-key index (firstTForbit), 50 entries
    - Subframe DTrack0: 3-key index (side, row, drift25), 8512 entries,
      registered through the linearized key column ``__adf_key_DTrack0__``
    - Subframe DITS0FitSide: 2-key index (drift25, side), 56 entries

    Side effects: seeds NumPy's global random generator with 42, writes the
    ROOT file at ``filepath`` (creating its parent directory when the path
    has one) and prints a short summary.

    Parameters
    ----------
    filepath : str
        Output ROOT file path. A bare file name (no directory component)
        is accepted and is written relative to the current directory.

    Returns
    -------
    tuple
        (filepath, aDF) - path to file and the AliasDataFrame object with aliases
    """
    from AliasDataFrame import AliasDataFrame
    from itertools import product

    np.random.seed(42)  # Reproducible
    n_rows = 10_000
    n_tf, n_orbit = 100, 50                   # sizes of subframes T and R
    max_side, max_row, max_drift = 2, 152, 28  # key ranges of DTrack0

    # NOTE: the order of the random draws below fixes the generated values;
    # do not reorder columns or frames without regenerating stored fixtures.

    # Main tree columns
    main_df = pd.DataFrame({
        'track_tf_uid': np.random.randint(0, n_tf, n_rows),     # For T join (1-key)
        'firstTForbit': np.random.randint(0, n_orbit, n_rows),  # For R join (1-key)
        'side': np.random.randint(0, max_side, n_rows),         # For DTrack0 (3-key)
        'row': np.random.randint(0, max_row, n_rows),           # For DTrack0 (3-key)
        'drift25': np.random.randint(0, max_drift, n_rows),     # For DTrack0 (3-key)
        'mX': np.random.randn(n_rows).astype(np.float32),
        'mY': np.random.randn(n_rows).astype(np.float32),
        'mZ': np.random.randn(n_rows).astype(np.float32),
        'x': np.random.randn(n_rows).astype(np.float32),
        'y': np.random.randn(n_rows).astype(np.float32),
    })

    # Subframe T: 1-key index (track_tf_uid)
    t_df = pd.DataFrame({
        'track_tf_uid': np.arange(n_tf),
        'mP2': np.random.randn(n_tf).astype(np.float32),
        'mP3': np.random.randn(n_tf).astype(np.float32),
        'mP4': np.random.randn(n_tf).astype(np.float32),
        'dy': np.random.randn(n_tf).astype(np.float32) * 0.1,
        'dz': np.random.randn(n_tf).astype(np.float32) * 0.1,
    })

    # Subframe R: 1-key index (firstTForbit)
    r_df = pd.DataFrame({
        'firstTForbit': np.arange(n_orbit),
        'refX': np.random.randn(n_orbit).astype(np.float32),
    })

    # Subframe DTrack0: 3-key index (side, row, drift25)
    # Create all combinations: 2 * 152 * 28 = 8512 entries
    keys = list(product(range(max_side), range(max_row), range(max_drift)))
    dtrack_df = pd.DataFrame({
        'side': [k[0] for k in keys],
        'row': [k[1] for k in keys],
        'drift25': [k[2] for k in keys],
        'dyC2_median': np.random.randn(len(keys)).astype(np.float32) * 0.01,
        'dzC2_median': np.random.randn(len(keys)).astype(np.float32) * 0.01,
    })

    def _composite_key(df):
        # Composite key for N>2 key join (same algorithm as AliasDataFrameTree.C):
        # __adf_key__ = k0 + k1*max0 + k2*max0*max1
        return (
            df['side'] +
            df['row'] * max_side +
            df['drift25'] * max_side * max_row
        ).astype(np.int64)

    # The same linearized key must exist on both sides of the join.
    dtrack_df['__adf_key_DTrack0__'] = _composite_key(dtrack_df)
    main_df['__adf_key_DTrack0__'] = _composite_key(main_df)

    # Subframe DITS0FitSide: 2-key index (drift25, side)
    keys2 = list(product(range(max_drift), range(max_side)))  # 28 * 2 = 56 entries
    dits_df = pd.DataFrame({
        'drift25': [k[0] for k in keys2],
        'side': [k[1] for k in keys2],
        'itsParam': np.random.randn(len(keys2)).astype(np.float32),
    })

    # Create AliasDataFrame
    aDF = AliasDataFrame(main_df)

    # Register subframes with proper index columns
    aDF.register_subframe('T', AliasDataFrame(t_df), index_columns='track_tf_uid')
    aDF.register_subframe('R', AliasDataFrame(r_df), index_columns='firstTForbit')
    # For 3-key subframe, use composite key (matches AliasDataFrameTree.C behavior)
    aDF.register_subframe('DTrack0', AliasDataFrame(dtrack_df),
                          index_columns='__adf_key_DTrack0__')
    aDF.register_subframe('DITS0FitSide', AliasDataFrame(dits_df),
                          index_columns=['drift25', 'side'])

    # Add test aliases (representative subset)
    # These cover various patterns: subframe access, arithmetic, boolean
    # Note: Use C++-compatible function names (tan, abs) not numpy (np.tan, np.abs)
    aDF.add_alias('z_calc', 'tan(T.mP3) * drift25')
    aDF.add_alias('dy_c', 'T.mP2 - mY')
    aDF.add_alias('dz_c', 'T.mP4 - mZ')
    aDF.add_alias('dyC2', 'dy_c - DTrack0.dyC2_median')
    aDF.add_alias('dzC2', 'dz_c - DTrack0.dzC2_median')
    aDF.add_alias('isValid', '(row < 152) & (abs(dyC2) < 2)')

    # Export with composite indices.
    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(filepath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    aDF.export_tree(filepath, "tree")

    print(f"Created test data: {filepath}")
    print(f"  Main tree: {n_rows} rows")
    print(f"  Subframe T: {len(t_df)} entries (1-key)")
    print(f"  Subframe R: {len(r_df)} entries (1-key)")
    print(f"  Subframe DTrack0: {len(dtrack_df)} entries (3-key)")
    print(f"  Subframe DITS0FitSide: {len(dits_df)} entries (2-key)")
    print(f"  Aliases: {len(aDF.aliases)} defined")  # Use .aliases property

    # Return both filepath and aDF object (with aliases)
    return filepath, aDF
148+
149+
150+
@pytest.fixture(scope="session")
def rdf_test_data(tmp_path_factory):
    """
    Build the RDF test data set once and share it for the whole session.

    The ROOT file is written as ``rdf_test_data.root`` inside a temporary
    directory obtained from ``tmp_path_factory.mktemp("data")``.

    Returns the ``(filepath, aDF)`` tuple produced by
    ``create_rdf_test_data``, where aDF has the aliases defined.
    """
    data_dir = tmp_path_factory.mktemp("data")
    root_path = data_dir / "rdf_test_data.root"
    return create_rdf_test_data(str(root_path))
159+
160+
161+
@pytest.fixture(scope="session")
def rdf_test_file(rdf_test_data):
    """Return the ROOT file path, i.e. the first item of the ``rdf_test_data`` pair."""
    root_path, _adf = rdf_test_data
    return root_path
165+
166+
167+
@pytest.fixture(scope="session")
def rdf_test_adf(rdf_test_data):
    """Return the AliasDataFrame, i.e. the second item of the ``rdf_test_data`` pair."""
    _root_path, alias_frame = rdf_test_data
    return alias_frame
171+
172+
173+
# =============================================================================
174+
# Persistent Fixture Data (optional - for reuse across test runs)
175+
# =============================================================================
176+
177+
# Location of the reusable fixture file: <directory of this file>/fixtures/rdf_test_data.root
PERSISTENT_FIXTURE_PATH = os.path.join(_this_dir, "fixtures", "rdf_test_data.root")


def get_or_create_persistent_fixture():
    """
    Return the persistent fixture, generating it on first use.

    When ``PERSISTENT_FIXTURE_PATH`` does not exist yet, the data set is
    created there via ``create_rdf_test_data``. Otherwise the existing file
    is loaded through ``_recreate_adf_with_schema``. Either way the result
    is a ``(filepath, aDF)`` tuple, and a one-line status message is printed.

    Reusing the stored file across test runs avoids regenerating the data
    each time.

    Usage:
        # In conftest.py, replace rdf_test_data fixture with:
        @pytest.fixture(scope="session")
        def rdf_test_data():
            return get_or_create_persistent_fixture()
    """
    # Guard clause: nothing on disk yet, so build the fixture from scratch.
    if not os.path.exists(PERSISTENT_FIXTURE_PATH):
        print(f"Creating new fixture: {PERSISTENT_FIXTURE_PATH}")
        return create_rdf_test_data(PERSISTENT_FIXTURE_PATH)

    print(f"Using existing fixture: {PERSISTENT_FIXTURE_PATH}")
    return _recreate_adf_with_schema(PERSISTENT_FIXTURE_PATH)
203+
204+
205+
def _recreate_adf_with_schema(filepath):
    """
    Rebuild the test AliasDataFrame from an already written ROOT file.

    Reads the ``"tree"`` object of ``filepath`` into a pandas DataFrame,
    wraps it in an AliasDataFrame, re-adds the test aliases (they aren't
    stored in the ROOT file) and writes the subframe index description
    directly into ``aDF._schema['subframes']``.

    Returns the tuple ``(filepath, aDF)``.
    """
    from AliasDataFrame import AliasDataFrame
    import uproot

    # Load the main DataFrame; the file is closed when the block exits.
    with uproot.open(filepath) as root_file:
        main_df = root_file["tree"].arrays(library="pd")

    aDF = AliasDataFrame(main_df)

    # Same aliases, in the same order, as create_rdf_test_data (must match).
    alias_table = (
        ('z_calc', 'tan(T.mP3) * drift25'),
        ('dy_c', 'T.mP2 - mY'),
        ('dz_c', 'T.mP4 - mZ'),
        ('dyC2', 'dy_c - DTrack0.dyC2_median'),
        ('dzC2', 'dz_c - DTrack0.dzC2_median'),
        ('isValid', '(row < 152) & (abs(dyC2) < 2)'),
    )
    for alias_name, alias_expr in alias_table:
        aDF.add_alias(alias_name, alias_expr)

    # Subframe info for setup_tree_with_friends.
    # Schema uses 'index' key, not 'index_columns'.
    subframe_indices = (
        ('T', ['track_tf_uid']),
        ('R', ['firstTForbit']),
        ('DTrack0', ['__adf_key_DTrack0__']),  # Composite key for 3-key join
        ('DITS0FitSide', ['drift25', 'side']),
    )
    aDF._schema['subframes'] = {
        sf_name: {'index': index_cols}
        for sf_name, index_cols in subframe_indices
    }

    return filepath, aDF
240+
241+
242+
# Script entry point: running this file directly (re)generates the persistent
# fixture file at PERSISTENT_FIXTURE_PATH.
if __name__ == '__main__':
    os.makedirs(os.path.dirname(PERSISTENT_FIXTURE_PATH), exist_ok=True)
    filepath, aDF = create_rdf_test_data(PERSISTENT_FIXTURE_PATH)
    print("\n" + f"Persistent fixture created at: {filepath}")
    print("This file can be reused across test runs.")

0 commit comments

Comments
 (0)