Skip to content

Commit 5bb9d66

Browse files
author
miranov25
committed
fix(AliasDataFrame): cleanTemporary now removes subframe join columns
Bug: materialize_aliases with cleanTemporary=True was not removing intermediate subframe columns (pattern: {col}__{subframe_name}). Root cause: Subframe join columns added by _prepare_subframe_joins() were not tracked in the 'added' list, so cleanup logic skipped them. Fix: Added explicit cleanup for columns matching {col}__{subframe_name} pattern where subframe_name is a registered subframe. Test: Added 4 new tests for cleanTemporary behavior. Real data validation: - Before: 22 → 51 columns (29 leaked) - After: 22 → 27 columns (correct) Reviewed-by: Claude (Architect)
1 parent a788a68 commit 5bb9d66

2 files changed

Lines changed: 230 additions & 0 deletions

File tree

UTILS/dfextensions/AliasDataFrame/AliasDataFrame.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3381,7 +3381,23 @@ def _do_materialize():
33813381
# BATCH DROP: Single drop instead of per-column removal
33823382
if cleanTemporary and with_dependencies:
33833383
targets_set = set(targets)
3384+
3385+
# 1. Drop intermediate alias dependencies (existing logic)
33843386
cols_to_drop = [c for c in added if c not in targets_set and c in self.df.columns]
3387+
3388+
# 2. Drop subframe join columns (NEW: fix for subframe temporaries)
3389+
# These have pattern: {col}__{subframe_name}
3390+
if hasattr(self, '_subframes') and hasattr(self._subframes, 'subframes'):
3391+
subframe_names = set(self._subframes.subframes.keys())
3392+
for col in list(self.df.columns):
3393+
# Check if column is a subframe join column
3394+
if '__' in col and col not in targets_set:
3395+
# Extract suffix after last '__'
3396+
parts = col.rsplit('__', 1)
3397+
if len(parts) == 2 and parts[1] in subframe_names:
3398+
if col not in cols_to_drop:
3399+
cols_to_drop.append(col)
3400+
33853401
if cols_to_drop:
33863402
self.df.drop(columns=cols_to_drop, inplace=True)
33873403
if verbose:
Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
"""
2+
Test case for cleanTemporary bug fix.
3+
4+
Bug: cleanTemporary=True was not removing intermediate subframe columns
5+
after materialize_aliases().
6+
7+
Fix: Added cleanup of columns matching pattern {col}__{subframe_name}
8+
"""
9+
10+
import numpy as np
11+
import pandas as pd
12+
import pytest
13+
14+
15+
def test_clean_temporary_subframe_columns():
16+
"""Verify cleanTemporary removes intermediate subframe columns."""
17+
# Import here to avoid module-level issues
18+
from AliasDataFrame import AliasDataFrame
19+
20+
# Create main DataFrame
21+
n = 1000
22+
np.random.seed(42)
23+
main_df = pd.DataFrame({
24+
'track_id': np.random.randint(0, 100, n, dtype=np.int32),
25+
'x': np.random.randn(n).astype(np.float32),
26+
'y': np.random.randn(n).astype(np.float32),
27+
})
28+
29+
# Create subframe
30+
n_tracks = 100
31+
track_df = pd.DataFrame({
32+
'track_id': np.arange(n_tracks, dtype=np.int32),
33+
'mass': np.random.randn(n_tracks).astype(np.float32) + 1.0,
34+
'charge': np.random.choice([-1, 1], n_tracks).astype(np.int8),
35+
})
36+
37+
adf = AliasDataFrame(main_df)
38+
track_adf = AliasDataFrame(track_df)
39+
adf.register_subframe('T', track_adf, 'track_id')
40+
41+
# Define aliases with subframe dependencies
42+
adf.add_alias('r', 'sqrt(x**2 + y**2)') # Simple alias (intermediate)
43+
adf.add_alias('track_mass', 'T.mass') # Subframe ref (intermediate)
44+
adf.add_alias('final_result', 'r + track_mass') # Final target
45+
46+
# Record original columns
47+
original_columns = set(adf.df.columns)
48+
49+
# Materialize with cleanup
50+
adf.materialize_aliases(
51+
names=['final_result'],
52+
with_dependencies=True,
53+
cleanTemporary=True
54+
)
55+
56+
final_columns = set(adf.df.columns)
57+
58+
# Check: only final_result should be added
59+
new_columns = final_columns - original_columns
60+
assert new_columns == {'final_result'}, \
61+
f"Expected only 'final_result', got: {new_columns}"
62+
63+
# Check: no subframe intermediate columns should remain
64+
subframe_temps = [c for c in adf.df.columns if '__' in c]
65+
assert len(subframe_temps) == 0, \
66+
f"Temporary subframe columns not cleaned: {subframe_temps}"
67+
68+
# Check: intermediate alias 'r' should not remain
69+
assert 'r' not in adf.df.columns, "'r' intermediate should have been cleaned"
70+
71+
# Check: intermediate alias 'track_mass' should not remain
72+
assert 'track_mass' not in adf.df.columns, "'track_mass' intermediate should have been cleaned"
73+
74+
print("✅ test_clean_temporary_subframe_columns PASSED")
75+
76+
77+
def test_clean_temporary_preserves_targets():
78+
"""Verify cleanTemporary does NOT remove explicitly requested aliases."""
79+
from AliasDataFrame import AliasDataFrame
80+
81+
n = 1000
82+
np.random.seed(42)
83+
main_df = pd.DataFrame({
84+
'track_id': np.random.randint(0, 100, n, dtype=np.int32),
85+
'x': np.random.randn(n).astype(np.float32),
86+
})
87+
88+
track_df = pd.DataFrame({
89+
'track_id': np.arange(100, dtype=np.int32),
90+
'mass': np.random.randn(100).astype(np.float32),
91+
})
92+
93+
adf = AliasDataFrame(main_df)
94+
track_adf = AliasDataFrame(track_df)
95+
adf.register_subframe('T', track_adf, 'track_id')
96+
97+
adf.add_alias('track_mass', 'T.mass')
98+
adf.add_alias('result', 'x + track_mass')
99+
100+
original_columns = set(adf.df.columns)
101+
102+
# Request BOTH aliases as targets
103+
adf.materialize_aliases(
104+
names=['track_mass', 'result'],
105+
with_dependencies=True,
106+
cleanTemporary=True
107+
)
108+
109+
final_columns = set(adf.df.columns)
110+
new_columns = final_columns - original_columns
111+
112+
# Both should be preserved since both were requested
113+
assert 'track_mass' in new_columns, "Requested 'track_mass' should be preserved"
114+
assert 'result' in new_columns, "Requested 'result' should be preserved"
115+
116+
# But subframe join column should be cleaned
117+
subframe_temps = [c for c in adf.df.columns if '__T' in c]
118+
assert len(subframe_temps) == 0, \
119+
f"Temporary subframe columns not cleaned: {subframe_temps}"
120+
121+
print("✅ test_clean_temporary_preserves_targets PASSED")
122+
123+
124+
def test_clean_temporary_multiple_subframes():
125+
"""Verify cleanTemporary works with multiple subframes."""
126+
from AliasDataFrame import AliasDataFrame
127+
128+
n = 500
129+
np.random.seed(42)
130+
main_df = pd.DataFrame({
131+
'track_id': np.random.randint(0, 50, n, dtype=np.int32),
132+
'cluster_id': np.random.randint(0, 30, n, dtype=np.int32),
133+
'x': np.random.randn(n).astype(np.float32),
134+
})
135+
136+
track_df = pd.DataFrame({
137+
'track_id': np.arange(50, dtype=np.int32),
138+
'pt': np.random.randn(50).astype(np.float32),
139+
})
140+
141+
cluster_df = pd.DataFrame({
142+
'cluster_id': np.arange(30, dtype=np.int32),
143+
'energy': np.random.randn(30).astype(np.float32),
144+
})
145+
146+
adf = AliasDataFrame(main_df)
147+
adf.register_subframe('T', AliasDataFrame(track_df), 'track_id')
148+
adf.register_subframe('C', AliasDataFrame(cluster_df), 'cluster_id')
149+
150+
adf.add_alias('combined', 'T.pt + C.energy + x')
151+
152+
original_columns = set(adf.df.columns)
153+
154+
adf.materialize_aliases(
155+
names=['combined'],
156+
with_dependencies=True,
157+
cleanTemporary=True
158+
)
159+
160+
final_columns = set(adf.df.columns)
161+
new_columns = final_columns - original_columns
162+
163+
# Only 'combined' should be added
164+
assert new_columns == {'combined'}, f"Expected only 'combined', got: {new_columns}"
165+
166+
# No subframe columns from either T or C
167+
subframe_temps = [c for c in adf.df.columns if '__T' in c or '__C' in c]
168+
assert len(subframe_temps) == 0, \
169+
f"Temporary columns not cleaned: {subframe_temps}"
170+
171+
print("✅ test_clean_temporary_multiple_subframes PASSED")
172+
173+
174+
def test_no_cleanup_when_disabled():
175+
"""Verify subframe columns are preserved when cleanTemporary=False."""
176+
from AliasDataFrame import AliasDataFrame
177+
178+
n = 500
179+
np.random.seed(42)
180+
main_df = pd.DataFrame({
181+
'track_id': np.random.randint(0, 50, n, dtype=np.int32),
182+
'x': np.random.randn(n).astype(np.float32),
183+
})
184+
185+
track_df = pd.DataFrame({
186+
'track_id': np.arange(50, dtype=np.int32),
187+
'mass': np.random.randn(50).astype(np.float32),
188+
})
189+
190+
adf = AliasDataFrame(main_df)
191+
adf.register_subframe('T', AliasDataFrame(track_df), 'track_id')
192+
193+
adf.add_alias('result', 'x + T.mass')
194+
195+
adf.materialize_aliases(
196+
names=['result'],
197+
with_dependencies=True,
198+
cleanTemporary=False # Disable cleanup
199+
)
200+
201+
# Subframe column SHOULD remain when cleanup is disabled
202+
subframe_temps = [c for c in adf.df.columns if '__T' in c]
203+
assert len(subframe_temps) > 0, \
204+
"Subframe columns should be preserved when cleanTemporary=False"
205+
206+
print("✅ test_no_cleanup_when_disabled PASSED")
207+
208+
209+
if __name__ == '__main__':
210+
test_clean_temporary_subframe_columns()
211+
test_clean_temporary_preserves_targets()
212+
test_clean_temporary_multiple_subframes()
213+
test_no_cleanup_when_disabled()
214+
print("\n✅ All cleanTemporary tests PASSED!")

0 commit comments

Comments
 (0)