Skip to content

Commit f9df9cf

Browse files
author
miranov25
committed
fix(AliasDataFrame): Fill handling, batch optimization, and cycle detection fixes
BUG-2025-11-27-003: Fill Handling for Subframe Joins
- Add set_global_fill() / set_subframe_fill() API for configurable fill behavior
- fill_missing: Fill value for missing keys (default NaN)
- fill_nan / fill_inf / fill_invalid: Fill values for invalid data
- fill_mode: 'safe' (default) or 'direct' (fast path)
- Use pd.merge(indicator=True) to distinguish missing keys from data NaN
- Aggregated warnings: Single summary instead of per-column spam

BUG-2025-11-27-002: Restore Batch Materialization Optimization
- Restore context_override parameter in _eval_in_namespace()
- Restore results dict + single pd.concat (avoids O(n²) fragmentation)
- materialize_aliases() bypasses materialize_alias() for batch path
- Remove _batch_mode parameter (was source of regression)

BUG-2025-11-28-001: Cycle Detection and Index Column Fixes
- Fix false cycle detection for subframe aliases ('val' = 'T.val')
- Add token != alias check in select_aliases() graph builder
- Add index column materialization in batched path
- Improve cycle error messages with expressions and diagnostic hints

Performance: ~3x speedup on subframe-heavy workflows (216s → 72s on 13.5M rows)
Tests: 557 passed, 2 xpassed
Benchmarks: All pass, no regression detected
1 parent 6aea3a5 commit f9df9cf

4 files changed

Lines changed: 1101 additions & 9 deletions

File tree

UTILS/dfextensions/AliasDataFrame/AliasDataFrame.py

Lines changed: 47 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2238,7 +2238,9 @@ def build_graph():
22382238
for alias, expr in self.aliases.items():
22392239
g.add_node(alias)
22402240
for token in re.findall(r'\b\w+\b', expr):
2241-
if token in self.aliases:
2241+
# Exclude self-references to prevent false cycles
2242+
# (e.g., alias 'val' with expr 'T.val' extracts token 'val')
2243+
if token in self.aliases and token != alias:
22422244
g.add_edge(token, alias)
22432245
return g
22442246

@@ -2254,6 +2256,30 @@ def build_graph():
22542256
try:
22552257
ordered = list(nx.topological_sort(g.subgraph(expanded)))
22562258
result = [n for n in ordered if n in expanded]
2259+
except nx.NetworkXUnfeasible:
2260+
# Find and report cycles with helpful error message
2261+
cycles = list(nx.simple_cycles(g.subgraph(expanded)))
2262+
if cycles:
2263+
max_cycles = 5
2264+
shown = cycles[:max_cycles]
2265+
cycle_info = []
2266+
for cycle in shown:
2267+
cycle_str = ' -> '.join(cycle) + ' -> ' + cycle[0]
2268+
exprs = [f" {a} = {self.aliases.get(a, '[not found]')}" for a in cycle]
2269+
cycle_info.append(f" Cycle: {cycle_str}\n" + '\n'.join(exprs))
2270+
2271+
msg = (
2272+
f"Dependency cycle detected in aliases "
2273+
f"({len(cycles)} total, showing first {len(shown)}):\n"
2274+
+ '\n'.join(cycle_info)
2275+
+ "\n\nHint: Self-referential aliases often occur when an alias name "
2276+
"matches a subframe column name.\n"
2277+
"To diagnose: adf.validate_no_cycles(raise_on_cycle=False)"
2278+
)
2279+
raise ValueError(msg)
2280+
else:
2281+
# Shouldn't happen, but fallback
2282+
result = list(expanded)
22572283
except nx.NetworkXError:
22582284
result = list(expanded)
22592285

@@ -2498,8 +2524,7 @@ def materialize_alias(self, name, cleanTemporary=False, dtype=None, warn_missing
24982524
result = result_dtype(result)
24992525
self.df[name] = result
25002526

2501-
# Emit aggregated missing key warning BEFORE restoring config
2502-
# (must be inside try block so warn_missing_keys=False takes effect)
2527+
# Emit aggregated warning BEFORE restoring config (so warn_missing_keys=False takes effect)
25032528
self._emit_missing_key_summary()
25042529

25052530
finally:
@@ -2592,13 +2617,28 @@ def materialize_aliases(self, pattern=None, names=None, with_dependencies=True,
25922617
if verbose:
25932618
print(f"[materialize_aliases] Computing: {name}")
25942619

2595-
# First, ensure any subframe aliases referenced in expression are materialized
2596-
# This handles cases like "T.mX" where mX might be an alias in subframe T
2620+
# Handle subframe dependencies: index columns and subframe attributes
25972621
tokens = re.findall(r'(\w+)\.(\w+)', expr)
25982622
for sf_name, sf_attr in tokens:
25992623
sf = self.get_subframe(sf_name)
2600-
if sf and sf_attr in sf.aliases and sf_attr not in sf.df.columns:
2601-
sf.materialize_alias(sf_attr)
2624+
if sf:
2625+
# FIX B: Materialize index columns if they're aliases
2626+
# This was missing in the batched path but exists in materialize_alias()
2627+
entry = self._subframes.get_entry(sf_name)
2628+
if entry:
2629+
index_cols = entry['index']
2630+
if isinstance(index_cols, str):
2631+
index_cols = [index_cols]
2632+
for idx_col in index_cols:
2633+
if idx_col in self.aliases and idx_col not in self.df.columns:
2634+
if idx_col not in results: # Not yet computed in batch
2635+
if verbose:
2636+
print(f"[materialize_aliases] Materializing index: {idx_col}")
2637+
self.materialize_alias(idx_col)
2638+
2639+
# Materialize subframe attribute if it's an alias
2640+
if sf_attr in sf.aliases and sf_attr not in sf.df.columns:
2641+
sf.materialize_alias(sf_attr)
26022642

26032643
# Compute with context_override so dependent aliases can see prior results
26042644
# Note: _eval_in_namespace calls _prepare_subframe_joins which handles:

UTILS/dfextensions/AliasDataFrame/benchmarks/baseline.json

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,24 @@
11
{
22
"version": 1,
3-
"created": "2025-11-27T23:24:03.978426",
3+
"created": "2025-11-28T15:00:47.358284",
44
"host": "Marians-MBP-3.fritz.box",
55
"python_version": "3.9.6",
66
"cpu_count": 12,
77
"platform": "macOS-14.5-arm64-arm-64bit",
88
"benchmarks": {
9+
"benchmark_materialize_aliases.py": {
10+
"time_s": 1.406652083,
11+
"metrics": {
12+
"direct_vs_safe_speedup": 1.2558246197681708,
13+
"safe_vs_simple_ratio": 49.60799311584217
14+
}
15+
},
916
"benchmark_parallel.py": {
1017
"time_s": null,
1118
"metrics": {}
1219
},
1320
"benchmark_performance.py": {
14-
"time_s": 0.05617991599999983,
21+
"time_s": 0.05693191499999983,
1522
"metrics": {
1623
"all_passed": 1
1724
}

0 commit comments

Comments
 (0)