Skip to content

Commit 4312270

Browse files
author
miranov25
committed
perf: Batch materialization to avoid O(n²) DataFrame fragmentation
BUG-2025-11-27-002: materialize_aliases() took 216s for 58 aliases on 13.5M rows due to sequential column insertion causing DataFrame fragmentation. Changes: - Add context_override parameter to _eval_in_namespace() for dependency resolution - Refactor materialize_aliases() to compute all results into dict first - Single pd.concat() at end instead of 58 sequential insertions - Batch drop for cleanTemporary (single drop() call) - Add _ensure_subframe_dependencies() helper for subframe edge cases Expected improvement: 216s → <20s (>10× speedup) Tests: - Add test_batch_materialization.py (10 new tests) - All 504 tests pass Reviewed-by: Gemini, GPT, Claude 2
1 parent 4fe2170 commit 4312270

2 files changed

Lines changed: 393 additions & 14 deletions

File tree

UTILS/dfextensions/AliasDataFrame/AliasDataFrame.py

Lines changed: 164 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1514,9 +1514,40 @@ def add_alias(self, name, expression, dtype=None, is_constant=False):
15141514
# Check for cycles (catches indirect cycles like A -> B -> A)
15151515
self._check_for_cycles()
15161516

1517-
def _eval_in_namespace(self, expr, warn_missing_keys=True, alias_name=None):
1517+
def _eval_in_namespace(self, expr, warn_missing_keys=True, alias_name=None, context_override=None):
1518+
"""
1519+
Evaluate an expression in a namespace containing DataFrame columns and functions.
1520+
1521+
Parameters
1522+
----------
1523+
expr : str
1524+
Expression to evaluate
1525+
warn_missing_keys : bool, default=True
1526+
If True, warn when subframe join has missing keys
1527+
alias_name : str, optional
1528+
Name of alias being evaluated (for error messages)
1529+
context_override : dict, optional
1530+
Additional variables to include in evaluation namespace.
1531+
Used by materialize_aliases() to provide already-computed results
1532+
so that later aliases can reference earlier ones without requiring
1533+
them to be in self.df yet (enables batch materialization).
1534+
1535+
Returns
1536+
-------
1537+
pandas.Series or scalar
1538+
Result of evaluating the expression
1539+
"""
15181540
expr = self._prepare_subframe_joins(expr, warn_missing_keys=warn_missing_keys, alias_name=alias_name)
1541+
1542+
# Build namespace: DataFrame columns first
15191543
local_env = {col: self.df[col] for col in self.df.columns}
1544+
1545+
# Add context_override (previously computed aliases in batch mode)
1546+
# This allows alias B to reference alias A even if A isn't in self.df yet
1547+
if context_override:
1548+
local_env.update(context_override)
1549+
1550+
# Add functions last (so they don't get shadowed by columns)
15201551
local_env.update(self._default_functions())
15211552

15221553
try:
@@ -2062,12 +2093,21 @@ def materialize_aliases(self, pattern=None, names=None, with_dependencies=True,
20622093
list
20632094
Names of aliases that were materialized
20642095
2096+
Notes
2097+
-----
2098+
Performance optimization: Uses batch pd.concat instead of sequential
2099+
column insertion to avoid O(n²) DataFrame fragmentation.
2100+
See BUG-2025-11-27-002 for details.
2101+
20652102
Examples
20662103
--------
20672104
>>> adf.materialize_aliases(pattern=r'is.*') # All 'is*' aliases
20682105
>>> adf.materialize_aliases(names=['r', 'phi', 'cosPhi']) # Specific names
20692106
>>> adf.materialize_aliases(pattern=r'dy.*|dz.*') # dy and dz aliases
20702107
"""
2108+
import time
2109+
t_start = time.time() if verbose else None
2110+
20712111
# Get primary targets first (without dependencies)
20722112
targets = self.select_aliases(
20732113
pattern=pattern,
@@ -2090,29 +2130,139 @@ def materialize_aliases(self, pattern=None, names=None, with_dependencies=True,
20902130
with_dependencies=True
20912131
)
20922132
if verbose:
2093-
print(f"[materialize_aliases] With dependencies: {to_materialize}")
2133+
print(f"[materialize_aliases] With dependencies: {len(to_materialize)} aliases")
20942134
else:
20952135
to_materialize = targets
20962136

2097-
# Materialize in order
2098-
added = []
2137+
# =====================================================================
2138+
# BATCH MATERIALIZATION — Performance optimization (BUG-2025-11-27-002)
2139+
#
2140+
# Instead of sequential self.df[name] = result (which causes O(n²)
2141+
# DataFrame fragmentation), we:
2142+
# 1. Compute all results into a dict
2143+
# 2. Use context_override so later aliases can reference earlier ones
2144+
# 3. Single pd.concat at the end
2145+
# =====================================================================
2146+
2147+
results = {} # Collect all computed results
2148+
added = [] # Track which aliases we computed
2149+
20992150
for name in to_materialize:
2100-
if name not in self.df.columns:
2151+
if name in self.df.columns:
2152+
# Already materialized (either existed or subframe join added it)
2153+
continue
2154+
2155+
if name not in self.aliases:
21012156
if verbose:
2102-
print(f"[materialize_aliases] Materializing: {name}")
2103-
self.materialize_alias(name, cleanTemporary=False)
2104-
added.append(name)
2157+
print(f"[materialize_aliases] Warning: '{name}' not in aliases, skipping")
2158+
continue
2159+
2160+
expr = self.aliases[name]
2161+
2162+
# Handle subframe dependencies before evaluation
2163+
# This is necessary because _prepare_subframe_joins needs index columns
2164+
# and subframe attributes to exist
2165+
self._ensure_subframe_dependencies(name, expr, results, verbose)
2166+
2167+
if verbose:
2168+
print(f"[materialize_aliases] Computing: {name}")
2169+
2170+
# Evaluate with context_override containing previously computed results
2171+
# This allows alias B to reference alias A without A being in self.df yet
2172+
result = self._eval_in_namespace(
2173+
expr,
2174+
warn_missing_keys=True,
2175+
alias_name=name,
2176+
context_override=results
2177+
)
2178+
2179+
# Apply dtype if specified
2180+
result_dtype = self.alias_dtypes.get(name)
2181+
if result_dtype is not None:
2182+
try:
2183+
result = result.astype(result_dtype)
2184+
except AttributeError:
2185+
result = result_dtype(result)
2186+
2187+
results[name] = result
2188+
added.append(name)
2189+
2190+
# =====================================================================
2191+
# BATCH ASSIGNMENT — Single DataFrame operation
2192+
# This avoids the O(n²) fragmentation from sequential column insertion
2193+
# =====================================================================
2194+
if results:
2195+
new_cols_df = pd.DataFrame(results, index=self.df.index)
2196+
self.df = pd.concat([self.df, new_cols_df], axis=1)
2197+
if verbose:
2198+
print(f"[materialize_aliases] Batch-added {len(results)} columns")
21052199

2106-
# Clean temporary dependencies if requested
2200+
# =====================================================================
2201+
# BATCH CLEANUP — Single drop operation (also avoids fragmentation)
2202+
# =====================================================================
21072203
if cleanTemporary and with_dependencies:
21082204
targets_set = set(targets)
2109-
for col in added:
2110-
if col not in targets_set and col in self.df.columns:
2111-
self.df.drop(columns=[col], inplace=True)
2112-
if verbose:
2113-
print(f"[materialize_aliases] Cleaned temporary: {col}")
2205+
cols_to_drop = [col for col in added if col not in targets_set and col in self.df.columns]
2206+
if cols_to_drop:
2207+
self.df.drop(columns=cols_to_drop, inplace=True)
2208+
if verbose:
2209+
print(f"[materialize_aliases] Batch-dropped {len(cols_to_drop)} temporary columns")
2210+
2211+
if verbose:
2212+
elapsed = time.time() - t_start
2213+
print(f"[materialize_aliases] Completed in {elapsed:.2f}s ({len(added)} aliases)")
21142214

21152215
return added
2216+
2217+
def _ensure_subframe_dependencies(self, alias_name, expr, context_override, verbose=False):
2218+
"""
2219+
Ensure subframe dependencies are available before evaluating an alias.
2220+
2221+
This handles:
2222+
1. Materializing subframe index columns (if they're aliases)
2223+
2. Materializing subframe attributes (in the subframe's DataFrame)
2224+
2225+
Parameters
2226+
----------
2227+
alias_name : str
2228+
Name of the alias being evaluated
2229+
expr : str
2230+
Expression of the alias
2231+
context_override : dict
2232+
Dict of already-computed results (used to check if deps are available)
2233+
verbose : bool
2234+
If True, print progress
2235+
"""
2236+
# Find subframe references (pattern: word.word)
2237+
tokens = re.findall(r'\w+\.\w+', expr)
2238+
2239+
for token in tokens:
2240+
sf_name, sf_attr = token.split('.', 1)
2241+
sf = self.get_subframe(sf_name)
2242+
if sf is None:
2243+
continue
2244+
2245+
# Materialize subframe index columns if they're aliases
2246+
entry = self._subframes.get_entry(sf_name)
2247+
if entry:
2248+
index_cols = entry['index']
2249+
if isinstance(index_cols, str):
2250+
index_cols = [index_cols]
2251+
2252+
for idx_col in index_cols:
2253+
# Check if index column needs materialization
2254+
if idx_col in self.aliases and idx_col not in self.df.columns:
2255+
# Check if it's in context_override (already computed in this batch)
2256+
if idx_col not in context_override:
2257+
if verbose:
2258+
print(f"[materialize_aliases] Materializing index column: {idx_col}")
2259+
self.materialize_alias(idx_col, warn_missing_keys=True)
2260+
2261+
# Materialize the subframe attribute itself (in subframe's DataFrame)
2262+
if sf_attr in sf.aliases and sf_attr not in sf.df.columns:
2263+
if verbose:
2264+
print(f"[materialize_aliases] Materializing subframe attr: {sf_name}.{sf_attr}")
2265+
sf.materialize_alias(sf_attr)
21162266

21172267
def materialize_pattern(self, pattern, cleanTemporary=True, verbose=False,
21182268
only_unmaterialized=True):

0 commit comments

Comments
 (0)