Skip to content

Commit b9c2866

Browse files
author
miranov25
committed
BUG_AliasDataFrame_20260512_validate_aliases_false_positives v1.0
Fix validate_aliases() false positives — replace the regex tokenizer with the _analyze_expression() AST walker (same path as dependency_tree()). Three false-positive classes eliminated: (1) np.pi — 'np' was flagged as a missing subframe; (2) SF.col — the column-part token was rechecked as a bare unknown token; (3) mid-chain multi-level refs escaped the guard. Production result: 62 broken → 9 broken (53 false positives gone). The 9 remaining are genuine (CTPLumi.* — dots in R subframe column names, deferred to a separate phase). Tests: +5 (test_B1_validate_aliases_false_positives.py). Reviewed-by: Sonnet1
1 parent d377a7b commit b9c2866

2 files changed

Lines changed: 125 additions & 43 deletions

File tree

UTILS/dfextensions/AliasDataFrame/AliasDataFrame.py

Lines changed: 33 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -3838,65 +3838,55 @@ def _analyze_expression(self, expr):
38383838
def validate_aliases(self):
38393839
"""
38403840
Validate that all aliases can be resolved.
3841-
3841+
38423842
An alias is "broken" if it references variables that don't exist as:
38433843
- DataFrame columns
3844-
- Other defined aliases
3845-
- Subframe columns (T.column syntax)
3844+
- Other defined aliases
3845+
- Subframe columns (SubframeName.column, single or multi-level)
38463846
- Known functions/constants (np, pi, etc.)
3847-
3847+
3848+
Implementation delegates to _analyze_expression() — the same AST-based
3849+
single-pass walker used by dependency_tree() and the Arrow pipeline.
3850+
This is correct for all registered functions and subframe reference
3851+
patterns without regex fragility.
3852+
38483853
Returns
38493854
-------
38503855
list
38513856
Names of aliases that cannot be resolved
38523857
"""
38533858
broken = []
3854-
3855-
# Known functions and constants that are always available
3859+
38563860
known_names = set(self._default_functions().keys())
3857-
known_names.update(['np', 'pi', 'abs', 'int', 'float', 'round', 'sqrt',
3858-
'sin', 'cos', 'tan', 'exp', 'log', 'log10', 'atan2',
3859-
'sinh', 'cosh', 'tanh', 'arcsin', 'arccos', 'arctan'])
3860-
3861-
# All resolvable names: columns + aliases + known functions
3861+
known_names.update(['np', 'pd', 'pi', 'abs', 'int', 'float', 'round',
3862+
'sqrt', 'clip', 'sin', 'cos', 'tan', 'exp', 'log',
3863+
'log10', 'atan2', 'arctan', 'arcsin', 'arccos',
3864+
'sinh', 'cosh', 'tanh'])
38623865
resolvable = set(self.df.columns) | set(self.aliases.keys()) | known_names
3863-
3866+
38643867
for name, expr in self.aliases.items():
3865-
# Extract tokens from expression
3866-
tokens = re.findall(r'\b([a-zA-Z_][a-zA-Z0-9_]*)\b', expr)
3867-
3868+
analysis = self._analyze_expression(expr)
38683869
missing = []
3869-
for token in tokens:
3870-
# Skip numeric literals that might be partially matched
3871-
if token.isdigit():
3872-
continue
3873-
3874-
# Check if it's a subframe reference (handled separately)
3875-
if '.' in expr:
3876-
# Check for T.column pattern
3877-
subframe_refs = re.findall(r'([A-Za-z_][A-Za-z0-9_]*)\.([A-Za-z_][A-Za-z0-9_]*)', expr)
3878-
for sf_name, sf_col in subframe_refs:
3879-
if sf_name == token:
3880-
# This token is a subframe name, check if it exists
3881-
sf = self.get_subframe(sf_name)
3882-
if sf is None:
3883-
missing.append(f"{sf_name} (subframe)")
3884-
elif sf_col not in sf.df.columns and sf_col not in sf.aliases:
3885-
missing.append(f"{sf_name}.{sf_col}")
3886-
continue
3887-
3888-
# Check if token is resolvable
3889-
if token not in resolvable:
3890-
# Check if it's part of a subframe reference
3891-
if not any(token == sf_ref[0] for sf_ref in
3892-
re.findall(r'([A-Za-z_][A-Za-z0-9_]*)\.', expr)):
3893-
missing.append(token)
3894-
3870+
3871+
# Bare column refs: must be in df.columns, aliases, or known names
3872+
for ref in analysis['column_refs']:
3873+
if ref not in resolvable:
3874+
missing.append(ref)
3875+
3876+
# Subframe refs: subframe must exist and column must be accessible
3877+
for sf_name, sf_col in analysis['subframe_refs']:
3878+
sf = self.get_subframe(sf_name)
3879+
if sf is None:
3880+
missing.append(sf_name)
3881+
elif (sf_col not in sf.df.columns
3882+
and sf_col not in sf.aliases
3883+
and sf.get_subframe(sf_col) is None):
3884+
missing.append(sf_col)
3885+
38953886
if missing:
38963887
broken.append(name)
3897-
3898-
return broken
38993888

3889+
return broken
39003890
# Verbosity flags for describe_aliases (bitmask)
39013891
ALIAS_SHOW_CORE = 0x01 # name, kind, materialized, dtype, expr (always on)
39023892
ALIAS_SHOW_DEPS = 0x02 # dependency list
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""
2+
BUG_AliasDataFrame_20260512_validate_aliases_false_positives — regression tests.
3+
4+
Three false-positive classes fixed in validate_aliases():
5+
B1_1: np.pi in expression → 'np' was flagged as missing subframe
6+
B1_2: SubframeName.col → column-part token flagged as bare unknown token
7+
B1_3: arithmetic expression with no dotted refs → clean, not broken (the mid-chain class, e.g. R.CTPLumi.orbit, has no dedicated test in this file)
8+
B1_4: genuinely broken alias (registered subframe, column absent) → still detected
9+
B1_5: alias referencing a non-existent bare name → still detected
10+
"""
11+
12+
import os
13+
import sys
14+
import pytest
15+
import numpy as np
16+
import pandas as pd
17+
18+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
19+
from AliasDataFrame import AliasDataFrame
20+
21+
22+
def _main_adf():
23+
df = pd.DataFrame({
24+
'x': np.linspace(0, 1, 10, dtype=np.float32),
25+
'y': np.zeros(10, dtype=np.float32),
26+
'row': np.arange(10, dtype=np.int16),
27+
})
28+
return AliasDataFrame(df)
29+
30+
31+
def _adf_with_subframe():
32+
main = pd.DataFrame({'key': np.arange(5, dtype=np.int16),
33+
'x': np.linspace(0, 1, 5, dtype=np.float32)})
34+
sub = pd.DataFrame({'key': np.arange(5, dtype=np.int16),
35+
'coeff_a': np.ones(5, dtype=np.float32),
36+
'coeff_b': np.zeros(5, dtype=np.float32)})
37+
adf = AliasDataFrame(main)
38+
adf.register_subframe('SF', AliasDataFrame(sub), index_columns=['key'])
39+
return adf
40+
41+
42+
class TestB1ValidateAliasesFalsePositives:
43+
44+
@pytest.mark.invariance
45+
def test_B1_1_np_pi_not_broken(self):
46+
"""np.pi in expression must NOT be flagged as missing subframe."""
47+
adf = _main_adf()
48+
adf.add_alias('edge', '(x * np.pi / 18) - y', dtype=np.float32)
49+
broken = adf.validate_aliases()
50+
assert 'edge' not in broken, (
51+
f"False positive: 'edge' flagged broken; broken={broken}")
52+
53+
@pytest.mark.invariance
54+
def test_B1_2_subframe_column_not_broken(self):
55+
"""SF.coeff_a must NOT flag 'coeff_a' as a bare unknown token."""
56+
adf = _adf_with_subframe()
57+
adf.add_alias('val', 'SF.coeff_a * x + SF.coeff_b', dtype=np.float32)
58+
broken = adf.validate_aliases()
59+
assert 'val' not in broken, (
60+
f"False positive: 'val' flagged broken; broken={broken}")
61+
62+
@pytest.mark.invariance
63+
def test_B1_3_arithmetic_expression_not_broken(self):
64+
"""Pure arithmetic expressions must not be broken."""
65+
adf = _main_adf()
66+
adf.add_alias('mod3', 'row % 3', dtype=np.int16)
67+
adf.add_alias('edge2', '(x * np.pi / 18) - (y + mod3)', dtype=np.float32)
68+
broken = adf.validate_aliases()
69+
assert 'mod3' not in broken, f"False positive: mod3 broken={broken}"
70+
assert 'edge2' not in broken, f"False positive: edge2 broken={broken}"
71+
72+
@pytest.mark.invariance
73+
def test_B1_4_genuinely_broken_still_detected(self):
74+
"""Alias referencing a registered subframe's absent column IS broken."""
75+
adf = _adf_with_subframe()
76+
adf.add_alias('bad', 'SF.nonexistent_col * x', dtype=np.float32)
77+
broken = adf.validate_aliases()
78+
assert 'bad' in broken, (
79+
f"False negative: genuinely broken alias 'bad' not detected; broken={broken}")
80+
81+
@pytest.mark.invariance
82+
def test_B1_5_truly_missing_bare_token_detected(self):
83+
"""Alias referencing a non-existent bare name IS broken."""
84+
adf = _main_adf()
85+
adf.add_alias('bad2', 'x + ghost_column', dtype=np.float32)
86+
broken = adf.validate_aliases()
87+
assert 'bad2' in broken, (
88+
f"False negative: 'bad2' with unknown token not detected; broken={broken}")
89+
90+
91+
if __name__ == '__main__':
92+
pytest.main([__file__, '-v'])

0 commit comments

Comments
 (0)