Skip to content

Commit 4c269c3

Browse files
committed
Phase 13.21.ADF v1.1: dematerialize(drop=, keep=) + remove drop_materialized
Adds dematerialize() public API for memory reclamation:

- dematerialize(drop=['dyp_I2','ddxp_I2']) -- drop named aliases
- dematerialize(keep=['dy_I3','dz_I3']) -- keep named, drop rest
- dematerialize() -- drop all materialized

Raw columns are always protected. drop and keep are mutually exclusive (ValueError).

Removes drop_materialized() — dematerialize(drop=...) is a strict superset. 3 internal call sites updated. No external callers existed (verified: zero test references, zero external imports).

Composes with Phase 13.21 join caching: after dematerialization, re-materialization reuses cached join indices (index columns unchanged).

Tests J1_6..J1_10: drop+recover, keep, drop-all, raw-column protection, mutual-exclusion error.
1 parent fa7cd11 commit 4c269c3

2 files changed

Lines changed: 192 additions & 13 deletions

File tree

UTILS/dfextensions/AliasDataFrame/AliasDataFrame.py

Lines changed: 73 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9878,20 +9878,80 @@ def _get_materialized_aliases(self):
98789878
aliases = self.aliases # {name: expr}
98799879
return {name for name in aliases if name in self.df.columns}
98809880

9881-
def drop_materialized(self, aliases):
9881+
# drop_materialized removed in Phase 13.21.ADF v1.1
9882+
# Use dematerialize(drop=...) instead — strict superset.
9883+
9884+
def dematerialize(self, drop=None, keep=None):
98829885
"""
9883-
Drop materialized alias columns from DataFrame.
9884-
9885-
Args:
9886-
aliases: Alias names to drop (silently ignores non-existent)
9887-
9888-
Note:
9889-
Only drops columns that are aliases, never physical columns.
9886+
Drop materialized alias columns to reclaim memory.
9887+
9888+
Aliases and subframes are preserved — dropped columns can be
9889+
re-materialized on demand via materialize_aliases().
9890+
9891+
Raw columns (those without a matching alias) are never dropped,
9892+
regardless of the drop/keep parameters.
9893+
9894+
Phase 13.21.ADF v1.1: composable with join index caching.
9895+
After dematerialization, re-materialization reuses cached join
9896+
indices (index columns unchanged), so the round-trip is cheap.
9897+
9898+
Parameters
9899+
----------
9900+
drop : list of str, optional
9901+
Column names to drop. Only alias-backed columns are dropped;
9902+
raw columns in this list are silently ignored.
9903+
Mutually exclusive with keep.
9904+
keep : list of str, optional
9905+
Column names to preserve. All OTHER materialized alias columns
9906+
are dropped. Raw columns are always preserved regardless.
9907+
Mutually exclusive with drop.
9908+
9909+
If neither drop nor keep is given, drops ALL materialized alias
9910+
columns.
9911+
9912+
Returns
9913+
-------
9914+
list of str
9915+
Names of columns actually dropped.
9916+
9917+
Examples
9918+
--------
9919+
>>> # Drop specific columns you know are no longer needed:
9920+
>>> adf.dematerialize(drop=['dyp_I2', 'ddxp_I2', 'ddzp_I2'])
9921+
['dyp_I2', 'ddxp_I2', 'ddzp_I2']
9922+
9923+
>>> # Keep only what the next step needs:
9924+
>>> adf.dematerialize(keep=['dy_I3', 'dz_I3'])
9925+
['dyp_I2', 'ddxp_I2', ..., 'weight_trackI1', ...]
9926+
9927+
>>> # Drop everything materialized (back to raw + subframes):
9928+
>>> adf.dematerialize()
9929+
['dy_I0', 'dz_I0', 'dyp_I2', ..., 'isNotEdge', ...]
98909930
"""
9891-
alias_names = set(self.aliases.keys())
9892-
to_drop = [a for a in aliases if a in alias_names and a in self.df.columns]
9931+
import gc
9932+
9933+
if drop is not None and keep is not None:
9934+
raise ValueError("Specify drop or keep, not both")
9935+
9936+
# Materialized alias columns = columns that DO have a matching alias
9937+
materialized = [c for c in self.df.columns if c in self.aliases]
9938+
9939+
if drop is not None:
9940+
# Drop only the named columns, only if they are alias-backed
9941+
to_drop = [c for c in drop if c in materialized]
9942+
elif keep is not None:
9943+
# Drop all materialized EXCEPT keep
9944+
keep_set = set(keep)
9945+
to_drop = [c for c in materialized if c not in keep_set]
9946+
else:
9947+
# Drop all materialized
9948+
to_drop = materialized
9949+
98939950
if to_drop:
98949951
self.df = self.df.drop(columns=to_drop)
9952+
gc.collect()
9953+
9954+
return to_drop
98959955

98969956
def _resolve_draw_param(self, param_value, param_name: str):
98979957
"""
@@ -10298,7 +10358,7 @@ def draw(self,
1029810358
if cleanup_needed:
1029910359
we_added = self._get_materialized_aliases() - already_materialized
1030010360
if we_added:
10301-
self.drop_materialized(we_added)
10361+
self.dematerialize(drop=list(we_added))
1030210362

1030310363
return result
1030410364

@@ -11249,7 +11309,7 @@ def draw_batch(self,
1124911309
if we_added:
1125011310
if verbose:
1125111311
print(f"Clearing {len(we_added)} materialized aliases")
11252-
self.drop_materialized(we_added)
11312+
self.dematerialize(drop=list(we_added))
1125311313

1125411314
return results
1125511315

@@ -11537,7 +11597,7 @@ def draw_figures(
1153711597
if we_added:
1153811598
if verbose:
1153911599
print(f"[draw_figures] Clearing {len(we_added)} materialized aliases")
11540-
self.drop_materialized(we_added)
11600+
self.dematerialize(drop=list(we_added))
1154111601

1154211602
return results
1154311603

UTILS/dfextensions/AliasDataFrame/tests/test_J1_join_cache.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,125 @@ def test_J1_5_multi_subframe_pipeline(self):
270270
err_msg="J1_5: multi-subframe pipeline result mismatch"
271271
)
272272

273+
@pytest.mark.invariance
274+
def test_J1_6_dematerialize_drop_and_recover(self):
275+
"""
276+
J1_6: dematerialize(drop=[...]) drops the column;
277+
re-materialize recovers identical values.
278+
"""
279+
adf = _build_main()
280+
sf = _build_coeff_sf()
281+
adf.register_subframe('Coeff', sf, index_columns=['sec'])
282+
adf.add_alias('correction', 'Coeff.c0 + x', dtype=np.float32)
283+
adf.add_alias('scaled', 'Coeff.c1 * y', dtype=np.float32)
284+
285+
adf.materialize_aliases(names=['correction', 'scaled'])
286+
original_correction = adf.df['correction'].values.copy()
287+
original_scaled = adf.df['scaled'].values.copy()
288+
289+
# Drop only 'correction', keep 'scaled'
290+
dropped = adf.dematerialize(drop=['correction'])
291+
assert dropped == ['correction'], f"Expected ['correction'], got {dropped}"
292+
assert 'correction' not in adf.df.columns, "correction should be dropped"
293+
assert 'scaled' in adf.df.columns, "scaled should survive"
294+
295+
# Raw columns must survive
296+
assert 'x' in adf.df.columns, "raw column 'x' must survive"
297+
assert 'sec' in adf.df.columns, "raw column 'sec' must survive"
298+
299+
# Re-materialize and verify bit-exact recovery
300+
adf.materialize_aliases(names=['correction'])
301+
np.testing.assert_array_equal(
302+
adf.df['correction'].values, original_correction,
303+
err_msg="J1_6: re-materialized values differ from original"
304+
)
305+
306+
@pytest.mark.invariance
307+
def test_J1_7_dematerialize_keep(self):
308+
"""
309+
J1_7: dematerialize(keep=[...]) drops all OTHER materialized aliases;
310+
raw columns survive; re-materialization recovers values.
311+
"""
312+
adf = _build_main()
313+
sf = _build_coeff_sf()
314+
adf.register_subframe('Coeff', sf, index_columns=['sec'])
315+
adf.add_alias('a1', 'Coeff.c0', dtype=np.float32)
316+
adf.add_alias('a2', 'Coeff.c1', dtype=np.float32)
317+
adf.add_alias('a3', 'x**2', dtype=np.float32)
318+
319+
adf.materialize_aliases(names=['a1', 'a2', 'a3'])
320+
original_a2 = adf.df['a2'].values.copy()
321+
original_a3 = adf.df['a3'].values.copy()
322+
323+
# Keep only a1; drop a2 and a3
324+
dropped = adf.dematerialize(keep=['a1'])
325+
assert 'a1' in adf.df.columns, "a1 should survive (in keep list)"
326+
assert 'a2' not in adf.df.columns, "a2 should be dropped"
327+
assert 'a3' not in adf.df.columns, "a3 should be dropped"
328+
assert set(dropped) == {'a2', 'a3'}, f"Expected {{'a2','a3'}}, got {set(dropped)}"
329+
330+
# Raw columns always survive
331+
assert 'x' in adf.df.columns
332+
assert 'sec' in adf.df.columns
333+
334+
# Re-materialize dropped aliases
335+
adf.materialize_aliases(names=['a2', 'a3'])
336+
np.testing.assert_array_equal(
337+
adf.df['a2'].values, original_a2,
338+
err_msg="J1_7: re-materialized a2 differs"
339+
)
340+
np.testing.assert_array_equal(
341+
adf.df['a3'].values, original_a3,
342+
err_msg="J1_7: re-materialized a3 differs"
343+
)
344+
345+
@pytest.mark.invariance
346+
def test_J1_8_dematerialize_all(self):
347+
"""
348+
J1_8: dematerialize() with no args drops ALL materialized alias columns.
349+
Raw columns survive.
350+
"""
351+
adf = _build_main()
352+
adf.add_alias('x2', 'x**2', dtype=np.float32)
353+
adf.add_alias('y2', 'y**2', dtype=np.float32)
354+
adf.materialize_aliases(names=['x2', 'y2'])
355+
356+
raw_cols_before = [c for c in adf.df.columns if c not in adf.aliases]
357+
358+
dropped = adf.dematerialize()
359+
assert 'x2' not in adf.df.columns
360+
assert 'y2' not in adf.df.columns
361+
362+
# All raw columns still present
363+
for col in raw_cols_before:
364+
assert col in adf.df.columns, f"raw column '{col}' was dropped"
365+
366+
@pytest.mark.invariance
367+
def test_J1_9_dematerialize_ignores_raw_columns(self):
368+
"""
369+
J1_9: dematerialize(drop=['raw_col']) silently ignores raw columns.
370+
No crash, no drop.
371+
"""
372+
adf = _build_main()
373+
adf.add_alias('a1', 'x**2', dtype=np.float32)
374+
adf.materialize_aliases(names=['a1'])
375+
376+
# Try to drop a raw column — should be silently ignored
377+
dropped = adf.dematerialize(drop=['x', 'sec', 'a1'])
378+
assert 'a1' not in adf.df.columns, "alias a1 should be dropped"
379+
assert 'x' in adf.df.columns, "raw column x must survive"
380+
assert 'sec' in adf.df.columns, "raw column sec must survive"
381+
assert dropped == ['a1'], f"Only alias columns in drop list: {dropped}"
382+
383+
@pytest.mark.invariance
384+
def test_J1_10_dematerialize_drop_keep_mutual_exclusion(self):
385+
"""
386+
J1_10: specifying both drop and keep raises ValueError.
387+
"""
388+
adf = _build_main()
389+
with pytest.raises(ValueError, match="drop or keep, not both"):
390+
adf.dematerialize(drop=['x'], keep=['y'])
391+
273392

274393
class TestJ2JoinCachePerformance:
275394
"""J2 — cache hit count verification."""

0 commit comments

Comments
 (0)