Skip to content

Commit b7e5761

Browse files
author
miranov25
committed
feat(export): Add columns= parameter for snapshot mode
New parameter export_tree(columns=[...]) for cache/snapshot workflows:
- Exports only the specified columns (no schema, aliases, or subframes)
- Raises ValueError if any requested column is missing
- Emits a UserWarning if subframes exist, since they will not be exported
- Converts float16 columns to float32 for uproot compatibility

3 new tests added in TestExportTreeColumns.

Design decisions (3-reviewer consensus):
- Parameter name: columns= (matches pandas/RDF conventions)
- No auto-include of index columns (explicit is better)
- Warning for subframes (2-1 vote: GPT+Gemini yes, Claude no)

Co-authored-by: Claude (Architect)
Reviewed-by: GPT, Gemini, Claude
1 parent cf68ba4 commit b7e5761

2 files changed

Lines changed: 106 additions & 7 deletions

File tree

UTILS/dfextensions/AliasDataFrame/AliasDataFrame.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3966,15 +3966,44 @@ def load(path_prefix, load_subframes=True):
39663966

39673967
return adf
39683968

3969-
def export_tree(self, filename_or_file, treename="tree", dropAliasColumns=True,compression=uproot.ZLIB(level=1)):
3969+
def export_tree(self, filename_or_file, treename="tree", dropAliasColumns=True, compression=uproot.ZLIB(level=1), columns=None):
39703970
"""
3971-
uproot.LZMA(level=5)
3972-
:param filename_or_file:
3973-
:param treename:
3974-
:param dropAliasColumns:
3975-
:param compression:
3976-
:return:
3971+
Export DataFrame to ROOT TTree.
3972+
3973+
Parameters
3974+
----------
3975+
filename_or_file : str or uproot file
3976+
Output file path or open uproot file
3977+
treename : str
3978+
Name of output tree
3979+
dropAliasColumns : bool
3980+
If True, don't export columns that are aliases
3981+
compression : uproot compression
3982+
Compression algorithm (default: ZLIB level 1)
3983+
columns : list of str, optional
3984+
If provided, export only these columns (snapshot/cache mode).
3985+
WARNING: Schema, aliases, and subframes are NOT exported.
39773986
"""
3987+
import warnings
3988+
3989+
# Snapshot mode: export only specified columns, no schema/subframes
3990+
if columns is not None:
3991+
missing = set(columns) - set(self.df.columns)
3992+
if missing:
3993+
raise ValueError(f"Requested columns not found: {sorted(missing)}")
3994+
if self._subframes.subframes:
3995+
warnings.warn(
3996+
"export_tree(columns=...) does not export subframes. "
3997+
"Only specified columns will be saved.",
3998+
UserWarning
3999+
)
4000+
dtype_casts = {col: np.float32 for col in columns if self.df[col].dtype == np.float16}
4001+
export_df = self.df[columns].astype(dtype_casts)
4002+
with uproot.recreate(filename_or_file, compression=compression) as f:
4003+
f[treename] = {col: export_df[col].values for col in export_df.columns}
4004+
return
4005+
4006+
# Full export mode: existing behavior
39784007
is_path = isinstance(filename_or_file, str)
39794008

39804009
if is_path:

UTILS/dfextensions/AliasDataFrame/tests/test_alias_dataframe.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2482,5 +2482,75 @@ def test_schema_v2_order_agnostic_load(self):
24822482
os.unlink(temp_path)
24832483

24842484

2485+
class TestExportTreeColumns(unittest.TestCase):
    """Tests for export_tree(columns=...) snapshot mode.

    In snapshot mode export_tree writes only the requested columns; it
    validates the column names before opening the output file, warns when
    registered subframes will be left out, and casts float16 columns to
    float32 before writing.
    """

    def test_export_tree_columns_subset(self):
        """Only the columns listed in ``columns=`` end up in the output tree."""
        import tempfile

        df = pd.DataFrame({
            'x': np.array([1, 2, 3], dtype=np.float32),
            'y': np.array([3, 4, 5], dtype=np.float32),
            'z': np.array([5, 6, 7], dtype=np.float32),
        })
        adf = AliasDataFrame(df)

        with tempfile.TemporaryDirectory() as tmp_dir:
            filepath = os.path.join(tmp_dir, "subset.root")
            adf.export_tree(filepath, "tree", columns=['x', 'y'])

            # Verify only x, y exported
            adf2 = AliasDataFrame.read_tree(filepath, "tree")
            self.assertIn('x', adf2.df.columns)
            self.assertIn('y', adf2.df.columns)
            self.assertNotIn('z', adf2.df.columns)

            # Verify data is correct
            np.testing.assert_array_equal(adf2.df['x'].values, [1, 2, 3])
            np.testing.assert_array_equal(adf2.df['y'].values, [3, 4, 5])

    def test_export_tree_columns_missing_raises(self):
        """A column name absent from the DataFrame raises ValueError."""
        import tempfile

        df = pd.DataFrame({'x': np.array([1, 2], dtype=np.float32)})
        adf = AliasDataFrame(df)

        with tempfile.TemporaryDirectory() as tmp_dir:
            filepath = os.path.join(tmp_dir, "out.root")
            with self.assertRaises(ValueError) as ctx:
                adf.export_tree(filepath, "tree", columns=['x', 'missing'])

            # Checked outside the assertRaises block: statements placed after
            # the raising call inside that block would never execute.
            self.assertIn("not found", str(ctx.exception))
            # Column validation runs before the output file is opened, so a
            # rejected request must not leave a file behind.
            self.assertFalse(os.path.exists(filepath))

    def test_export_tree_columns_warns_subframes(self):
        """A UserWarning is emitted when subframes exist but ``columns=`` is used."""
        import tempfile
        import warnings

        df = pd.DataFrame({
            'x': np.array([1, 2], dtype=np.float32),
            'row': np.array([0, 1], dtype=np.int32),
        })
        sub_df = pd.DataFrame({
            'row': np.array([0, 1], dtype=np.int32),
            'val': np.array([10, 20], dtype=np.float32),
        })

        adf = AliasDataFrame(df)
        adf.register_subframe('S', AliasDataFrame(sub_df), index_columns='row')

        with tempfile.TemporaryDirectory() as tmp_dir:
            filepath = os.path.join(tmp_dir, "out.root")
            with warnings.catch_warnings(record=True) as caught:
                warnings.simplefilter("always")
                adf.export_tree(filepath, "tree", columns=['x'])

            # With the "always" filter, unrelated warnings raised by third-party
            # libraries are recorded too; select the subframes warning instead
            # of asserting on the total number of recorded warnings.
            subframe_warnings = [
                rec for rec in caught if "subframes" in str(rec.message)
            ]
            self.assertEqual(len(subframe_warnings), 1)
            self.assertEqual(subframe_warnings[0].category, UserWarning)

    def test_export_tree_columns_float16(self):
        """float16 columns are exported without error and keep their values."""
        import tempfile

        # Values chosen to be exactly representable in float16, so the
        # float16 -> float32 cast done by export_tree is lossless.
        df = pd.DataFrame({
            'h': np.array([1.0, 2.5, 4.0], dtype=np.float16),
        })
        adf = AliasDataFrame(df)

        with tempfile.TemporaryDirectory() as tmp_dir:
            filepath = os.path.join(tmp_dir, "half.root")
            adf.export_tree(filepath, "tree", columns=['h'])

            adf2 = AliasDataFrame.read_tree(filepath, "tree")
            np.testing.assert_array_equal(adf2.df['h'].values, [1.0, 2.5, 4.0])
2554+
24852555
# Run the unittest suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)