|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Phase 13.18.GB Turn 2 — Python helper to dump a trained dfGB DataFrame |
| 3 | +into a ROOT .root file consumable by the C++ evaluator (Layer B). |
| 4 | +
|
| 5 | +The output .root file contains: |
| 6 | + - A TTree named by tree_name (default "dfGB") with one branch per |
| 7 | + dfGB column (index columns + coefficient columns + error columns). |
| 8 | + One TTree entry per dfGB row. |
| 9 | + - A TObjString named <tree_name>__gbreg_schema holding a small JSON |
| 10 | + blob with the schema fields the Layer B Option A load path reads |
| 11 | + (group_columns, predictor_columns, targets, suffix, fit_intercept). |
| 12 | +
|
| 13 | +The sidecar naming convention <tree_name>__gbreg_schema is the Phase |
| 14 | +13.18.GB demo-level default per proposal v1.1 §13 A-2. ADF can adopt or |
| 15 | +propose an alternative in Phase 13.19. |
| 16 | +
|
| 17 | +Dependencies |
| 18 | +------------ |
| 19 | +uproot (pip install uproot) |
| 20 | +
|
| 21 | +This module does NOT require ROOT/PyROOT. uproot writes the .root file |
| 22 | +in a format ROOT reads natively. |
| 23 | +""" |
| 24 | +from __future__ import annotations |
| 25 | + |
| 26 | +import json |
| 27 | +from pathlib import Path |
| 28 | +from typing import Any |
| 29 | + |
| 30 | +import numpy as np |
| 31 | +import pandas as pd |
| 32 | +import uproot |
| 33 | + |
| 34 | + |
| 35 | +SCHEMA_SIDECAR_SUFFIX = "__gbreg_schema" |
| 36 | + |
| 37 | + |
def _as_int64(col: str, arr: np.ndarray) -> np.ndarray:
    """Cast an integer-dtype array to int64, rejecting unsigned overflow."""
    # uint64 values above the int64 maximum would wrap to negative numbers
    # under a plain astype(); compare as Python ints to stay exact.
    if arr.dtype.kind == "u" and arr.size:
        if int(arr.max()) > np.iinfo(np.int64).max:
            raise ValueError(
                f"column {col!r} has values exceeding the int64 range")
    return arr.astype(np.int64)


def _as_group_int64(col: str, arr: np.ndarray) -> np.ndarray:
    """Cast a group (index) column to int64 without losing information."""
    if np.issubdtype(arr.dtype, np.integer):
        return _as_int64(col, arr)
    try:
        if arr.dtype.kind in "fO":
            # A bare astype(int64) turns NaN/inf into garbage integers and
            # truncates fractional values without raising, which would
            # silently corrupt the bin keys. Check the values first.
            as_float = arr.astype(np.float64)
            if not np.isfinite(as_float).all():
                raise ValueError("contains NaN or infinite values")
            if (as_float != np.trunc(as_float)).any():
                raise ValueError("contains non-integral values")
            if ((as_float < -(2.0 ** 63)) | (as_float >= 2.0 ** 63)).any():
                raise ValueError("values exceed the int64 range")
        return arr.astype(np.int64)
    except (TypeError, ValueError, OverflowError) as e:
        raise ValueError(
            f"group column {col!r} not integer-castable: {e}") from e


def dfGB_to_root(
    dfGB: pd.DataFrame,
    output_path: str | Path,
    tree_name: str = "dfGB",
    *,
    group_columns: list[str],
    predictor_columns: list[str],
    targets: list[str],
    suffix: str,
    fit_intercept: bool,
    overwrite: bool = True,
) -> Path:
    """Dump a trained dfGB to a .root file + schema sidecar.

    The file receives a TTree called ``tree_name`` with one branch per
    dfGB column, plus a JSON string stored under
    ``<tree_name>__gbreg_schema`` describing the model schema.

    Parameters
    ----------
    dfGB : pd.DataFrame
        The trained model DataFrame. One row per populated bin.
        Must contain all group_columns and every expected coefficient
        column derived from (targets, predictor_columns, suffix,
        fit_intercept). Column names must be unique.
    output_path : str | Path
        Destination .root file. Missing parent directories are created
        once the input has passed validation.
    tree_name : str
        Name of the TTree inside the file. Default "dfGB".
    group_columns : list[str]
        Group (index) column names; must be present in dfGB. Stored as
        int64; values must be finite and integral.
    predictor_columns : list[str]
        Predictor column names driving slope coefficient column naming.
    targets : list[str]
        Target column names driving the <target>_intercept<suffix> and
        <target>_slope_<pred><suffix> column naming.
    suffix : str
        Coefficient column suffix (e.g. "_fit", "_sw").
    fit_intercept : bool
        Whether the model has an intercept column per target.
    overwrite : bool
        If True and output_path exists, it is overwritten.

    Returns
    -------
    Path
        The output path.

    Raises
    ------
    FileExistsError
        If output_path exists and overwrite is False.
    ValueError
        If dfGB is missing any required column, has duplicate column
        names, has a group column that cannot be represented exactly as
        int64 (NaN, inf, fractional or out-of-range values), or has any
        column with a dtype that is not integer or floating.
    """
    output_path = Path(output_path)
    if output_path.exists() and not overwrite:
        raise FileExistsError(output_path)

    # Validate required columns are present
    required = list(group_columns)
    for t in targets:
        if fit_intercept:
            required.append(f"{t}_intercept{suffix}")
        required.extend(f"{t}_slope_{pred}{suffix}" for pred in predictor_columns)
    missing = [c for c in required if c not in dfGB.columns]
    if missing:
        raise ValueError(
            f"dfGB missing required columns: {missing}. "
            f"Available columns: {list(dfGB.columns)}")

    # A duplicated name makes dfGB[col] return a 2-D frame and the later
    # entry would overwrite the earlier one in the branches dict.
    if not dfGB.columns.is_unique:
        dupes = list(dict.fromkeys(dfGB.columns[dfGB.columns.duplicated()]))
        raise ValueError(f"dfGB has duplicate column names: {dupes}")

    # Build a branches dict for uproot: column name -> numpy array.
    # Group columns are stored as int64, other integer columns as int64
    # and floating columns as float64. Columns beyond the required set
    # (e.g. "_err_sw", "_rmse_sw", "_n_fitted_sw") are written as well.
    group_set = set(group_columns)
    branches: dict[str, np.ndarray] = {}
    for col in dfGB.columns:
        arr = dfGB[col].to_numpy()
        if col in group_set:
            arr = _as_group_int64(col, arr)
        elif np.issubdtype(arr.dtype, np.floating):
            arr = arr.astype(np.float64)
        elif np.issubdtype(arr.dtype, np.integer):
            arr = _as_int64(col, arr)
        else:
            # Non-numeric columns not supported (strings, objects etc.)
            raise ValueError(
                f"column {col!r} has unsupported dtype {arr.dtype} "
                f"for TTree storage")
        branches[col] = arr

    # Schema sidecar JSON
    schema = {
        "group_columns": list(group_columns),
        "predictor_columns": list(predictor_columns),
        "targets": list(targets),
        "suffix": suffix,
        "fit_intercept": bool(fit_intercept),
    }
    schema_json = json.dumps(schema, sort_keys=True)
    sidecar_name = f"{tree_name}{SCHEMA_SIDECAR_SUFFIX}"

    # Only touch the filesystem after validation has succeeded, so a
    # rejected input leaves no stray directories behind.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with uproot.recreate(output_path) as f:
        f[tree_name] = branches
        # The schema is assigned as a plain Python str under the sidecar key.
        f[sidecar_name] = schema_json

    return output_path
| 153 | + |
| 154 | + |
def root_to_dfGB(
    input_path: str | Path,
    tree_name: str = "dfGB",
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """Load a TTree and its JSON schema sidecar back from a .root file.

    Counterpart of dfGB_to_root: the tree is read column by column into
    a DataFrame and the string stored under
    ``<tree_name>__gbreg_schema`` is parsed as JSON.

    Parameters
    ----------
    input_path : str | Path
        The .root file to read.
    tree_name : str
        Name of the TTree inside the file. Default "dfGB".

    Returns
    -------
    (df, schema) : tuple
        df : pd.DataFrame built from the TTree branches
        schema : dict parsed from the sidecar string

    Raises
    ------
    KeyError
        If the tree or the sidecar is not present in the file.
    RuntimeError
        If the arrays read from the tree are neither a dict nor a
        structured ndarray.
    """
    input_path = Path(input_path)
    sidecar_name = f"{tree_name}{SCHEMA_SIDECAR_SUFFIX}"

    with uproot.open(input_path) as root_file:
        if tree_name not in root_file:
            raise KeyError(f"tree {tree_name!r} not in {input_path}")

        # Accept either a mapping of branch name -> array or a structured
        # ndarray, so both return shapes are handled.
        columns = root_file[tree_name].arrays(library="np")
        if isinstance(columns, dict):
            table = dict(columns)
        elif hasattr(columns, "dtype") and columns.dtype.names:
            table = {field: columns[field] for field in columns.dtype.names}
        else:
            raise RuntimeError(
                f"unexpected uproot arrays return type: {type(columns)}")
        df = pd.DataFrame(table)

        if sidecar_name not in root_file:
            raise KeyError(f"schema sidecar {sidecar_name!r} not in "
                           f"{input_path}")

        # The sidecar may come back as a str or as a wrapper object; in the
        # latter case fall back to its fString attribute or its str() form.
        payload = root_file[sidecar_name]
        if not isinstance(payload, str):
            payload = getattr(payload, "fString", None) or str(payload)
        schema = json.loads(payload)

    return df, schema
0 commit comments