Skip to content

Commit e827853

Browse files
author
miranov25
committed
PHASE 13.30.DF v1.0 — Column-reference parameter validation (Class 2)
Closes the silent-fallthrough bug class where string-valued column-reference parameters silently degraded behaviour when the named column was absent from the DataFrame. Triggered by BUG_ADF_GroupBy_Expression_Materialization (2026-05-11): adf.draw(group_by='row%3', ...) silently rendered ungrouped because ADF forwards the raw expression and dfdraw used 'if group_by in df.columns' silent-fallthrough at profile.py:454, histogram.py:307, scatter.py:168. Sub-fix 1 of Phase 13.30 'Parameter Class Validation' suite. Covers Class 2 (strict column references). Class 3 (style-key defaults — will bundle Phase 13.26 + 13.28 FIX1) and Class 5 (enum generalization) ship as separate sub-fixes under the same phase tag. Architecture mirrors Phase 13.16.DF FIX1 _*_FORWARDED_NAMES discipline: - New module plots/_validation.py with validate_column_references() - Tuples DFDraw._{PROFILE,HIST,SCATTER,DRAW}_COLUMN_REFERENCES in drawer.py (single source of truth; call sites import the tuple directly — no duplication, no drift risk) - R6 validator _validate_forwarded_names() extended to cover the new tuples at module import - Reserved DFDraw._{PROFILE,HIST,SCATTER}_COLUMN_REFERENCE_LISTS empty tuples for Phase 13.27.DF Commit 2 (selection_vector, weights_vector — append-only when Commit 2 lands) - Class-4 weights (handled by _eval_weights with df.eval fallback) is NOT in any tuple; §9.Profile.4 invariance test locks the contract. 12 §9-marked invariance tests with cross-class coverage: - TestColumnReferenceValidation_Profile (4 — incl Class-4 weights lock) - TestColumnReferenceValidation_Hist (3) - TestColumnReferenceValidation_Scatter (2) - TestR6ColumnReferenceValidator (2 — drift protection) - TestProductionReproducer (1 — locks architect's In[103]). ADF-side materialization fix tracked separately under BUG_ADF. Tests: 663 → 675 passed, 1 skipped, 0 failed.
1 parent f5b6b3d commit e827853

6 files changed

Lines changed: 469 additions & 6 deletions

File tree

UTILS/dfextensions/dfdraw/drawer.py

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -596,7 +596,47 @@ def _auto_label(self, y_expr, x_expr=None):
596596
# Private kwargs that _draw_vector injects into iter_kwargs to suppress
597597
# per-iteration legend/title/tight_layout in the underlying plot modules.
598598
_VECTOR_SUPPRESS_KWARGS = ('_suppress_legend', '_suppress_title', '_suppress_layout')
599-
599+
600+
# =========================================================================
601+
# Phase 13.30.DF v1.0 — Class-2 column-reference parameter tuples.
602+
#
603+
# Mirror of _*_FORWARDED_NAMES discipline: single source of truth + one
604+
# validation loop per plot type. plots/_validation.py iterates these to
605+
# enforce that string-valued column-reference parameters name real columns,
606+
# raising ValueError instead of silently falling back to ungrouped mode.
607+
#
608+
# Parameter class taxonomy (Phase 13.30 proposal §3):
609+
# Class 2 — column reference (strict): listed here.
610+
# Examples: group_by (today); selection_vector, weights_vector
611+
# (Phase 13.27 Commit 2 — anticipated by *_COLUMN_REFERENCE_LISTS).
612+
# Class 4 — expression-or-column (permissive): NOT listed here.
613+
# Examples: weights (handled by _eval_weights with df.eval fallback).
614+
#
615+
# Adding a Class-4 parameter here would break the existing
616+
# expression-accepting contract.
617+
#
618+
# Validated at module import by _validate_forwarded_names() (extended for
619+
# this phase): every entry must be a real parameter of the target method.
620+
# =========================================================================
621+
622+
_PROFILE_COLUMN_REFERENCES = ('group_by',)
623+
_HIST_COLUMN_REFERENCES = ('group_by',)
624+
_SCATTER_COLUMN_REFERENCES = ('group_by',)
625+
_DRAW_COLUMN_REFERENCES = ('group_by',)
626+
# hist2d / hexbin have no group_by parameter — empty tuples document that
627+
# explicitly. Adding group_by to those plot types in a future phase requires
628+
# also appending here.
629+
_HIST2D_COLUMN_REFERENCES = ()
630+
_HEXBIN_COLUMN_REFERENCES = ()
631+
632+
# Reserved for Phase 13.27.DF Commit 2 (selection_vector / weights_vector).
633+
# Each entry names a parameter that is List[str] of column references.
634+
# Empty for v1.0 of Phase 13.30; Commit 2 fills them and adds a list-aware
635+
# validator overload.
636+
_PROFILE_COLUMN_REFERENCE_LISTS = ()
637+
_HIST_COLUMN_REFERENCE_LISTS = ()
638+
_SCATTER_COLUMN_REFERENCE_LISTS = ()
639+
600640
def _draw_vector(self, y_list, x_list, draw_method,
601641
vector_style=None, group_style='color',
602642
group_by=None, **kwargs):
@@ -3264,6 +3304,12 @@ def _validate_forwarded_names():
32643304
(DFDraw._HIST_FORWARDED_NAMES, DFDraw.hist, 'hist'),
32653305
(DFDraw._SCATTER_FORWARDED_NAMES, DFDraw.scatter, 'scatter'),
32663306
(DFDraw._DRAW_FORWARDED_NAMES, DFDraw.draw, 'draw'),
3307+
# Phase 13.30.DF v1.0 — Class-2 column-reference tuples.
3308+
# Same validation: every entry must be a real parameter of the target.
3309+
(DFDraw._PROFILE_COLUMN_REFERENCES, DFDraw.profile, 'profile (col-refs)'),
3310+
(DFDraw._HIST_COLUMN_REFERENCES, DFDraw.hist, 'hist (col-refs)'),
3311+
(DFDraw._SCATTER_COLUMN_REFERENCES, DFDraw.scatter, 'scatter (col-refs)'),
3312+
(DFDraw._DRAW_COLUMN_REFERENCES, DFDraw.draw, 'draw (col-refs)'),
32673313
]
32683314
errors = []
32693315
for tup, method, name in pairs:
@@ -3274,14 +3320,15 @@ def _validate_forwarded_names():
32743320
missing = set(tup) - sig_params
32753321
if missing:
32763322
errors.append(
3277-
f"_{name.upper()}_FORWARDED_NAMES contains non-signature "
3278-
f"parameters: {sorted(missing)}"
3323+
f"_{name.upper().replace(' ', '_').replace('(', '').replace(')', '').replace('-', '_')} "
3324+
f"contains non-signature parameters: {sorted(missing)}"
32793325
)
32803326
if errors:
32813327
raise RuntimeError(
3282-
"Phase 13.16.DF FIX1 R6 validation failed at module import:\n"
3328+
"Phase 13.16.DF FIX1 R6 / Phase 13.30.DF Class-2 validation failed "
3329+
"at module import:\n"
32833330
+ "\n".join(" - " + e for e in errors)
3284-
+ "\n\nUpdate the relevant _*_FORWARDED_NAMES tuple in DFDraw."
3331+
+ "\n\nUpdate the relevant tuple in DFDraw."
32853332
)
32863333

32873334

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
"""
2+
Parameter validation for dfdraw plot calls — Phase 13.30.DF v1.0.
3+
4+
Class-2 column-reference validation: enforce that string-valued parameters
5+
naming a DataFrame column refer to an existing column, instead of silently
6+
falling through to ungrouped/unfiltered behaviour when the column is missing.
7+
8+
Background
9+
----------
10+
- BUG_ADF_GroupBy_Expression_Materialization (2026-05-11): ADF forwards
11+
`group_by="row%3"` unmaterialized; dfdraw's silent-fallthrough check
12+
(e.g. `if group_by in df.columns`) hid the bug behind a downstream
13+
matplotlib UserWarning about empty legend labels.
14+
15+
- Phase 13.30.DF sub-fix 1: introduce class-level tuple + helper so all
16+
Class-2 parameters are validated uniformly, and Phase 13.27 Commit 2's
17+
selection_vector / weights_vector get coverage for free when their
18+
tuples are populated.
19+
20+
Parameter taxonomy (see Phase 13.30 proposal §3 for the full table)
21+
-------------------------------------------------------------------
22+
- Class 2 — column reference (strict): string MUST be an existing column.
23+
This module handles Class 2. Examples: ``group_by``.
24+
25+
- Class 4 — expression-or-column (permissive): string is either column
26+
name OR pandas eval expression; dfdraw evaluates internally.
27+
Examples: ``weights`` (handled by ``_eval_weights`` in plots/profile.py).
28+
These parameters are NOT validated here.
29+
30+
References
31+
----------
32+
- BUG_ADF_GroupBy_Expression_Materialization.md
33+
- PHASE_13_30_DF_v1_0_Proposal_ParameterClassValidation.md §3, §4
34+
"""
35+
from typing import Iterable, Mapping, Any, List, Tuple
36+
import pandas as pd
37+
38+
39+
def validate_column_references(
40+
df: pd.DataFrame,
41+
kwargs: Mapping[str, Any],
42+
names: Iterable[str],
43+
context: str,
44+
) -> None:
45+
"""Validate that every Class-2 column-reference kwarg names a real column.
46+
47+
For each name in ``names``, look up the value in ``kwargs``. If the
48+
value is a non-empty string and is NOT a column in ``df``, raise
49+
``ValueError`` with an actionable message naming the offending
50+
parameter, the bad value, the available columns (first 10), and a
51+
pointer to the materialize-first remediation.
52+
53+
Class-2 parameters are strict column references — computed expressions
54+
must be materialized by the caller (e.g., via
55+
``AliasDataFrame.add_alias``). Class-4 parameters (like ``weights``,
56+
which dfdraw evaluates via ``df.eval``) are NOT validated here and
57+
must NOT be listed in ``names``.
58+
59+
Parameters
60+
----------
61+
df : pandas.DataFrame
62+
The DataFrame the plot call will operate on.
63+
kwargs : Mapping[str, Any]
64+
Keyword arguments of the calling plot function. Typically pass
65+
``locals()`` from the call site.
66+
names : Iterable[str]
67+
Names of Class-2 column-reference parameters to validate. Driven
68+
by the per-plot ``_*_COLUMN_REFERENCES`` tuple in ``drawer.py``.
69+
Pass the tuple directly to avoid duplication and drift.
70+
context : str
71+
Plot-type name (``"profile"`` / ``"hist"`` / ``"scatter"`` etc.).
72+
Used only to make the error message actionable.
73+
74+
Raises
75+
------
76+
ValueError
77+
If any value in ``names`` is a non-empty string not present in
78+
``df.columns``. Single-bad case produces a per-parameter message;
79+
multi-bad case enumerates all offenders in one error.
80+
81+
Notes
82+
-----
83+
- ``None`` values, non-string values, and the empty string are skipped
84+
(these are not the bug class we're catching — they have other paths).
85+
- Empty DataFrames (no columns at all) are passed through silently;
86+
downstream code already handles empty input.
87+
88+
Examples
89+
--------
90+
>>> df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6], "sector": [0, 1, 0]})
91+
>>> # Existing column — no raise:
92+
>>> validate_column_references(df, {"group_by": "sector"},
93+
... names=("group_by",), context="profile")
94+
95+
>>> # Missing column — raises ValueError:
96+
>>> validate_column_references(df, {"group_by": "row%3"},
97+
... names=("group_by",), context="profile")
98+
Traceback (most recent call last):
99+
...
100+
ValueError: group_by='row%3' is not a column in the DataFrame ...
101+
"""
102+
if df is None or len(getattr(df, "columns", [])) == 0:
103+
# No columns to check against; downstream code handles empty df
104+
return
105+
106+
bad: List[Tuple[str, str]] = []
107+
for name in names:
108+
value = kwargs.get(name)
109+
if value is None or not isinstance(value, str) or not value:
110+
continue
111+
if value in df.columns:
112+
continue
113+
bad.append((name, value))
114+
115+
if not bad:
116+
return
117+
118+
cols = list(df.columns)
119+
if len(cols) > 10:
120+
cols_preview = ", ".join(repr(c) for c in cols[:10]) + f", ... ({len(cols)} total)"
121+
else:
122+
cols_preview = ", ".join(repr(c) for c in cols)
123+
124+
if len(bad) == 1:
125+
name, value = bad[0]
126+
raise ValueError(
127+
f"{name}={value!r} is not a column in the DataFrame "
128+
f"(called from dfdraw {context}()). "
129+
f"If this is a computed expression, the caller must materialize "
130+
f"it into a column first. With AliasDataFrame: "
131+
f"adf.add_alias(<name>, {value!r}). "
132+
f"Available columns: [{cols_preview}]"
133+
)
134+
135+
bad_str = ", ".join(f"{n}={v!r}" for n, v in bad)
136+
raise ValueError(
137+
f"{len(bad)} column-reference parameters do not name DataFrame "
138+
f"columns (called from dfdraw {context}()): {bad_str}. "
139+
f"Materialize computed expressions into columns first. "
140+
f"Available columns: [{cols_preview}]"
141+
)

UTILS/dfextensions/dfdraw/plots/histogram.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
# Phase 13.28.DF: Robust data handling
2121
from ._data_sanitize import sanitize_for_plot
2222
from ._autorange import compute_autorange, VALID_STRATEGIES
23+
# Phase 13.30.DF: Class-2 column-reference parameter validation
24+
from ._validation import validate_column_references
2325

2426

2527
# =============================================================================
@@ -303,6 +305,15 @@ def draw_hist(
303305
elif norm == "probability":
304306
weights = np.ones_like(x_data) / len(x_data) if len(x_data) > 0 else None
305307

308+
# Phase 13.30.DF: Validate Class-2 column-reference parameters.
309+
# Catches BUG_ADF_GroupBy_Expression_Materialization (silent fallthrough below).
310+
from ..drawer import DFDraw as _DFDraw
311+
validate_column_references(
312+
df, locals(),
313+
names=_DFDraw._HIST_COLUMN_REFERENCES,
314+
context="hist",
315+
)
316+
306317
# Group-by handling
307318
if group_by is not None and group_by in df.columns:
308319
_draw_hist_grouped(

UTILS/dfextensions/dfdraw/plots/profile.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
# Phase 13.28.DF: Robust data handling
3030
from ._data_sanitize import sanitize_for_plot
3131
from ._autorange import compute_autorange, resolve_range_1d, VALID_STRATEGIES
32+
# Phase 13.30.DF: Class-2 column-reference parameter validation
33+
from ._validation import validate_column_references
3234

3335

3436
# =============================================================================
@@ -294,7 +296,18 @@ def draw_profile(
294296
"group_by_quantiles must be an integer (number of quantile bins), "
295297
"not True. Example: group_by_quantiles=4"
296298
)
297-
299+
300+
# Phase 13.30.DF: Validate Class-2 column-reference parameters.
301+
# Catches BUG_ADF_GroupBy_Expression_Materialization: caller passed a
302+
# computed expression (e.g. "row%3") where a column name is required.
303+
# Import tuple from drawer at call time to avoid circular import.
304+
from ..drawer import DFDraw as _DFDraw
305+
validate_column_references(
306+
df, locals(),
307+
names=_DFDraw._PROFILE_COLUMN_REFERENCES,
308+
context="profile",
309+
)
310+
298311
# Get style defaults
299312
if bins is None:
300313
bins = get_style_value("hist.bins", 50)

UTILS/dfextensions/dfdraw/plots/scatter.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
from ..stats import format_stats_box
2222
# Phase 13.28.DF: Robust data handling
2323
from ._data_sanitize import sanitize_for_plot
24+
# Phase 13.30.DF: Class-2 column-reference parameter validation
25+
from ._validation import validate_column_references
2426

2527

2628
def draw_scatter(
@@ -164,6 +166,15 @@ def draw_scatter(
164166
if jitter:
165167
x_data, y_data = _apply_jitter(x_data, y_data, jitter)
166168

169+
# Phase 13.30.DF: Validate Class-2 column-reference parameters.
170+
# Catches BUG_ADF_GroupBy_Expression_Materialization (silent fallthrough below).
171+
from ..drawer import DFDraw as _DFDraw
172+
validate_column_references(
173+
df, locals(),
174+
names=_DFDraw._SCATTER_COLUMN_REFERENCES,
175+
context="scatter",
176+
)
177+
167178
# Group-by handling
168179
if group_by is not None and group_by in df.columns:
169180
_draw_scatter_grouped(

0 commit comments

Comments
 (0)