Skip to content

Commit 437e30c

Browse files
committed
Phase 13.18.GB Turn 2: fixture generator + 24 fixtures + dfGB_to_root + round-trip test
- cpp/fixtures/generate_fixtures.py : deterministic, seeded - cpp/fixtures/F_*.json (24 files, 79 KB total, max 5.4 KB) : committed as test data per T9-literal-baseline discipline - cpp/fixtures/validate_fixtures.py : structural integrity + axis coverage - cpp/dfGB_to_root.py : uproot helper for Layer B - cpp/tests/test_dfGB_roundtrip.py : P1-epsilon, 6/6 passing alma2 - cpp/conftest.py : pytest path setup for cpp/ subproject Axis coverage: fit 12:12, bnd 12:12, dim 8:8:8, method 12:12, cov 20:4. All 24 orthogonal cells covered exactly once. Reviewer pair: Claude20 + Claude21, verdict [OK]. Spec: PHASE_13_18_GB_Fixture_Specification_v1.0 (committed separately as PHASE document). Proposal: PHASE_13_18_GB_v1.1_Proposal (committed separately as PHASE document).
1 parent d0d020a commit 437e30c

29 files changed

Lines changed: 6158 additions & 0 deletions
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
"""pytest conftest at the cpp/ level.
2+
3+
Placed here (NOT in cpp/tests/) so pytest loads it while descending
4+
from the repo-level rootdir (where pytest.ini lives) into cpp/tests/.
5+
This ensures sys.path contains cpp/ BEFORE any test module is imported,
6+
making `from dfGB_to_root import ...` resolve correctly.
7+
8+
Required because the repo has a top-level pytest.ini that owns rootdir,
9+
and the cpp/ subproject is not a Python package (no __init__.py).
10+
"""
11+
import sys
12+
from pathlib import Path
13+
14+
# Absolute directory that holds this conftest.py (the cpp/ subproject root).
CPP_ROOT = Path(__file__).resolve().parent

# Prepend cpp/ to the import search path, but only when it is not already
# listed, so repeated loading of this conftest never adds duplicates.
if sys.path.count(str(CPP_ROOT)) == 0:
    sys.path.insert(0, str(CPP_ROOT))
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
#!/usr/bin/env python3
2+
"""Phase 13.18.GB Turn 2 — Python helper to dump a trained dfGB DataFrame
3+
into a ROOT .root file consumable by the C++ evaluator (Layer B).
4+
5+
The output .root file contains:
6+
- A TTree named by tree_name (default "dfGB") with one branch per
7+
dfGB column (index columns + coefficient columns + error columns).
8+
One TTree entry per dfGB row.
9+
- A TObjString named <tree_name>__gbreg_schema holding a small JSON
10+
blob with the schema fields the Layer B Option A load path reads
11+
(group_columns, predictor_columns, targets, suffix, fit_intercept).
12+
13+
The sidecar naming convention <tree_name>__gbreg_schema is the Phase
14+
13.18.GB demo-level default per proposal v1.1 §13 A-2. ADF can adopt or
15+
propose an alternative in Phase 13.19.
16+
17+
Dependencies
18+
------------
19+
uproot (pip install uproot)
20+
21+
This module does NOT require ROOT/PyROOT. uproot writes the .root file
22+
in a format ROOT reads natively.
23+
"""
24+
from __future__ import annotations
25+
26+
import json
27+
from pathlib import Path
28+
from typing import Any
29+
30+
import numpy as np
31+
import pandas as pd
32+
import uproot
33+
34+
35+
SCHEMA_SIDECAR_SUFFIX = "__gbreg_schema"
36+
37+
38+
def dfGB_to_root(
39+
dfGB: pd.DataFrame,
40+
output_path: str | Path,
41+
tree_name: str = "dfGB",
42+
*,
43+
group_columns: list[str],
44+
predictor_columns: list[str],
45+
targets: list[str],
46+
suffix: str,
47+
fit_intercept: bool,
48+
overwrite: bool = True,
49+
) -> Path:
50+
"""Dump a trained dfGB to a .root file + schema sidecar.
51+
52+
Parameters
53+
----------
54+
dfGB : pd.DataFrame
55+
The trained model DataFrame. One row per populated bin.
56+
Must contain all group_columns and every expected coefficient
57+
column derived from (targets, predictor_columns, suffix,
58+
fit_intercept).
59+
output_path : str | Path
60+
Destination .root file.
61+
tree_name : str
62+
Name of the TTree inside the file. Default "dfGB".
63+
group_columns : list[str]
64+
Group (index) column names; must be present in dfGB.
65+
predictor_columns : list[str]
66+
Predictor column names driving slope coefficient column naming.
67+
targets : list[str]
68+
Target column names driving the <target>_intercept<suffix> and
69+
<target>_slope_<pred><suffix> column naming.
70+
suffix : str
71+
Coefficient column suffix (e.g. "_fit", "_sw").
72+
fit_intercept : bool
73+
Whether the model has an intercept column per target.
74+
overwrite : bool
75+
If True and output_path exists, it is overwritten.
76+
77+
Returns
78+
-------
79+
Path
80+
The output path.
81+
82+
Raises
83+
------
84+
ValueError
85+
If dfGB is missing any required column, or if any group or
86+
coefficient column has incompatible dtype for TTree storage.
87+
"""
88+
output_path = Path(output_path)
89+
if output_path.exists() and not overwrite:
90+
raise FileExistsError(output_path)
91+
output_path.parent.mkdir(parents=True, exist_ok=True)
92+
93+
# Validate required columns are present
94+
required = list(group_columns)
95+
for t in targets:
96+
if fit_intercept:
97+
required.append(f"{t}_intercept{suffix}")
98+
for pred in predictor_columns:
99+
required.append(f"{t}_slope_{pred}{suffix}")
100+
missing = [c for c in required if c not in dfGB.columns]
101+
if missing:
102+
raise ValueError(
103+
f"dfGB missing required columns: {missing}. "
104+
f"Available columns: {list(dfGB.columns)}")
105+
106+
# Build a branches dict for uproot: column name -> numpy array.
107+
# Keep group_columns as int64 and coefficient columns as float64.
108+
# Unknown-suffix columns (e.g. "_err_sw", "_rmse_sw", "_n_fitted_sw")
109+
# are PRESERVED per P1-δ: load-time the C++ side tolerates them
110+
# without schema error.
111+
branches: dict[str, np.ndarray] = {}
112+
for col in dfGB.columns:
113+
arr = dfGB[col].to_numpy()
114+
if col in group_columns:
115+
# Enforce integer dtype for group columns
116+
if not np.issubdtype(arr.dtype, np.integer):
117+
try:
118+
arr = arr.astype(np.int64)
119+
except Exception as e:
120+
raise ValueError(
121+
f"group column {col!r} not integer-castable: {e}")
122+
else:
123+
arr = arr.astype(np.int64)
124+
elif np.issubdtype(arr.dtype, np.floating):
125+
arr = arr.astype(np.float64)
126+
elif np.issubdtype(arr.dtype, np.integer):
127+
arr = arr.astype(np.int64)
128+
else:
129+
# Non-numeric columns not supported (strings, objects etc.)
130+
raise ValueError(
131+
f"column {col!r} has unsupported dtype {arr.dtype} "
132+
f"for TTree storage")
133+
branches[col] = arr
134+
135+
# Schema sidecar JSON
136+
schema = {
137+
"group_columns": list(group_columns),
138+
"predictor_columns": list(predictor_columns),
139+
"targets": list(targets),
140+
"suffix": suffix,
141+
"fit_intercept": bool(fit_intercept),
142+
}
143+
schema_json = json.dumps(schema, sort_keys=True)
144+
sidecar_name = f"{tree_name}{SCHEMA_SIDECAR_SUFFIX}"
145+
146+
# Write the file
147+
with uproot.recreate(output_path) as f:
148+
f[tree_name] = branches
149+
# uproot writes strings as TObjString compatible records
150+
f[sidecar_name] = schema_json
151+
152+
return output_path
153+
154+
155+
def root_to_dfGB(
    input_path: str | Path,
    tree_name: str = "dfGB",
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """Read back a .root file written by dfGB_to_root.

    Parameters
    ----------
    input_path : str | Path
        The .root file to open.
    tree_name : str
        Name of the TTree to load; the schema sidecar is looked up under
        the same name with SCHEMA_SIDECAR_SUFFIX appended.

    Returns
    -------
    (df, schema) : tuple
        df     : pd.DataFrame with one column per TTree branch
        schema : dict decoded from the sidecar's JSON text

    Raises
    ------
    KeyError
        If the tree or the sidecar is not present in the file.
    RuntimeError
        If the arrays read from the tree are neither a dict nor a
        structured ndarray.
    """
    input_path = Path(input_path)
    sidecar_name = f"{tree_name}{SCHEMA_SIDECAR_SUFFIX}"

    with uproot.open(input_path) as root_file:
        if tree_name not in root_file:
            raise KeyError(f"tree {tree_name!r} not in {input_path}")

        # Accept either container shape for the branch data: a plain
        # name -> array dict, or a structured ndarray with named fields.
        columns = root_file[tree_name].arrays(library="np")
        if isinstance(columns, dict):
            field_names = list(columns)
        elif hasattr(columns, "dtype") and columns.dtype.names:
            field_names = columns.dtype.names
        else:
            raise RuntimeError(
                f"unexpected uproot arrays return type: {type(columns)}")
        df = pd.DataFrame({name: columns[name] for name in field_names})

        if sidecar_name not in root_file:
            raise KeyError(
                f"schema sidecar {sidecar_name!r} not in {input_path}")

        # The sidecar is normally already a str; otherwise use its fString
        # attribute when that is non-empty, else its str() rendering.
        stored = root_file[sidecar_name]
        if isinstance(stored, str):
            payload = stored
        else:
            payload = getattr(stored, "fString", None) or str(stored)
        schema = json.loads(payload)

    return df, schema
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
{
2+
"fixture_id": "F_01_I_n_1D_L_d",
3+
"axis_values": {
4+
"fit_intercept": true,
5+
"bounds": "nan",
6+
"dimensions": 1,
7+
"method": "lookup",
8+
"coverage": "dense"
9+
},
10+
"input": {
11+
"gb_columns": [
12+
"bin_x"
13+
],
14+
"predictor_columns": [],
15+
"targets": [
16+
"y"
17+
],
18+
"suffix": "_fit",
19+
"fit_intercept": true,
20+
"subframe_rows": [
21+
{
22+
"bin_x": 0,
23+
"y_intercept_fit": 0.23242564825522027
24+
},
25+
{
26+
"bin_x": 1,
27+
"y_intercept_fit": -0.354555855649046
28+
},
29+
{
30+
"bin_x": 2,
31+
"y_intercept_fit": -0.3414468413507963
32+
},
33+
{
34+
"bin_x": 3,
35+
"y_intercept_fit": -0.2643518926743085
36+
}
37+
],
38+
"query_positions": [
39+
[
40+
0.0
41+
],
42+
[
43+
1.0
44+
],
45+
[
46+
2.0
47+
],
48+
[
49+
3.0
50+
]
51+
],
52+
"predictor_values_per_query": [
53+
[],
54+
[],
55+
[],
56+
[]
57+
],
58+
"method": "lookup",
59+
"bounds": "nan"
60+
},
61+
"expected_output": {
62+
"y": [
63+
0.23242564825522027,
64+
-0.354555855649046,
65+
-0.3414468413507963,
66+
-0.2643518926743085
67+
]
68+
},
69+
"intermediates": {
70+
"expected_bin_centers": [
71+
[
72+
0,
73+
1,
74+
2,
75+
3
76+
]
77+
],
78+
"expected_valid_mask_flat": [
79+
true,
80+
true,
81+
true,
82+
true
83+
],
84+
"expected_coefficient_shape": [
85+
4
86+
],
87+
"expected_remap": [
88+
{
89+
"0": 0,
90+
"1": 1,
91+
"2": 2,
92+
"3": 3
93+
}
94+
]
95+
},
96+
"metadata": {
97+
"generator_version": "0.1.0",
98+
"generator_timestamp": "2026-04-13T11:49:03+00:00",
99+
"python_evaluator_hash": "reference-kernel-v0.1.0",
100+
"notes": "Baseline 1D integer lookup with intercept."
101+
}
102+
}

0 commit comments

Comments
 (0)