PHASE 13.30.DF v1.0 — Column-reference parameter validation (Class 2)

miranov25 · miranov25 · commit e8278531caba · 2026-05-12T07:04:46.000+02:00
Closes the silent-fallthrough bug class where string-valued column-
reference parameters silently degraded behaviour when the named column
was absent from the DataFrame.

Triggered by BUG_ADF_GroupBy_Expression_Materialization (2026-05-11):
adf.draw(group_by='row%3', ...) silently rendered ungrouped because ADF
forwards the raw expression and dfdraw used 'if group_by in df.columns'
silent-fallthrough at profile.py:454, histogram.py:307, scatter.py:168.

Sub-fix 1 of Phase 13.30 'Parameter Class Validation' suite. Covers
Class 2 (strict column references). Class 3 (style-key defaults — will
bundle Phase 13.26 + 13.28 FIX1) and Class 5 (enum generalization) ship
as separate sub-fixes under the same phase tag.

Architecture mirrors Phase 13.16.DF FIX1 _*_FORWARDED_NAMES discipline:
- New module plots/_validation.py with validate_column_references()
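- Tuples DFDraw._{PROFILE,HIST,SCATTER,DRAW}_COLUMN_REFERENCES in drawer.py
  (single source of truth; call sites import the tuple directly — no
  duplication, no drift risk), plus empty _{HIST2D,HEXBIN}_COLUMN_REFERENCES
  tuples documenting that those plot types take no group_by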
- R6 validator _validate_forwarded_names() extended to cover the new
  tuples at module import
- Reserved DFDraw._{PROFILE,HIST,SCATTER}_COLUMN_REFERENCE_LISTS empty
  tuples for Phase 13.27.DF Commit 2 (selection_vector, weights_vector
  — append-only when Commit 2 lands)
- Class-4 weights (handled by _eval_weights with df.eval fallback) is
  NOT in any tuple; §9.Profile.4 invariance test locks the contract

12 §9-marked invariance tests with cross-class coverage:
- TestColumnReferenceValidation_Profile (4 — incl Class-4 weights lock)
- TestColumnReferenceValidation_Hist    (3)
- TestColumnReferenceValidation_Scatter (2)
- TestR6ColumnReferenceValidator        (2 — drift protection)
- TestProductionReproducer              (1 — locks architect's In[103])

ADF-side materialization fix tracked separately under BUG_ADF.

Tests: 663 → 675 passed, 1 skipped, 0 failed.
diff --git a/UTILS/dfextensions/dfdraw/drawer.py b/UTILS/dfextensions/dfdraw/drawer.py
@@ -596,7 +596,47 @@ def _auto_label(self, y_expr, x_expr=None):
     # Private kwargs that _draw_vector injects into iter_kwargs to suppress
     # per-iteration legend/title/tight_layout in the underlying plot modules.
     _VECTOR_SUPPRESS_KWARGS = ('_suppress_legend', '_suppress_title', '_suppress_layout')
-    
+
+    # =========================================================================
+    # Phase 13.30.DF v1.0 — Class-2 column-reference parameter tuples.
+    #
+    # Mirror of _*_FORWARDED_NAMES discipline: single source of truth + one
+    # validation loop per plot type. plots/_validation.py iterates these to
+    # enforce that string-valued column-reference parameters name real columns,
+    # raising ValueError instead of silently falling back to ungrouped mode.
+    #
+    # Parameter class taxonomy (Phase 13.30 proposal §3):
+    #   Class 2 — column reference (strict): listed here.
+    #     Examples: group_by (today); selection_vector, weights_vector
+    #     (Phase 13.27 Commit 2 — anticipated by *_COLUMN_REFERENCE_LISTS).
+    #   Class 4 — expression-or-column (permissive): NOT listed here.
+    #     Examples: weights (handled by _eval_weights with df.eval fallback).
+    #
+    # Adding a Class-4 parameter here would break the existing
+    # expression-accepting contract.
+    #
+    # Validated at module import by _validate_forwarded_names() (extended for
+    # this phase): every entry must be a real parameter of the target method.
+    # =========================================================================
+
+    _PROFILE_COLUMN_REFERENCES = ('group_by',)
+    _HIST_COLUMN_REFERENCES    = ('group_by',)
+    _SCATTER_COLUMN_REFERENCES = ('group_by',)
+    _DRAW_COLUMN_REFERENCES    = ('group_by',)
+    # hist2d / hexbin have no group_by parameter — empty tuples document that
+    # explicitly. Adding group_by to those plot types in a future phase requires
+    # also appending here.
+    _HIST2D_COLUMN_REFERENCES  = ()
+    _HEXBIN_COLUMN_REFERENCES  = ()
+
+    # Reserved for Phase 13.27.DF Commit 2 (selection_vector / weights_vector).
+    # Each entry names a parameter that is List[str] of column references.
+    # Empty for v1.0 of Phase 13.30; Commit 2 fills them and adds a list-aware
+    # validator overload.
+    _PROFILE_COLUMN_REFERENCE_LISTS = ()
+    _HIST_COLUMN_REFERENCE_LISTS    = ()
+    _SCATTER_COLUMN_REFERENCE_LISTS = ()
+
     def _draw_vector(self, y_list, x_list, draw_method,
                      vector_style=None, group_style='color',
                      group_by=None, **kwargs):
@@ -3264,6 +3304,12 @@ def _validate_forwarded_names():
         (DFDraw._HIST_FORWARDED_NAMES,    DFDraw.hist,    'hist'),
         (DFDraw._SCATTER_FORWARDED_NAMES, DFDraw.scatter, 'scatter'),
         (DFDraw._DRAW_FORWARDED_NAMES,    DFDraw.draw,    'draw'),
+        # Phase 13.30.DF v1.0 — Class-2 column-reference tuples.
+        # Same validation: every entry must be a real parameter of the target.
+        (DFDraw._PROFILE_COLUMN_REFERENCES, DFDraw.profile, 'profile (col-refs)'),
+        (DFDraw._HIST_COLUMN_REFERENCES,    DFDraw.hist,    'hist (col-refs)'),
+        (DFDraw._SCATTER_COLUMN_REFERENCES, DFDraw.scatter, 'scatter (col-refs)'),
+        (DFDraw._DRAW_COLUMN_REFERENCES,    DFDraw.draw,    'draw (col-refs)'),
     ]
     errors = []
     for tup, method, name in pairs:
@@ -3274,14 +3320,15 @@ def _validate_forwarded_names():
         missing = set(tup) - sig_params
         if missing:
             errors.append(
-                f"_{name.upper()}_FORWARDED_NAMES contains non-signature "
-                f"parameters: {sorted(missing)}"
+                f"_{name.upper().replace(' ', '_').replace('(', '').replace(')', '').replace('-', '_')} "
+                f"contains non-signature parameters: {sorted(missing)}"
             )
     if errors:
         raise RuntimeError(
-            "Phase 13.16.DF FIX1 R6 validation failed at module import:\n"
+            "Phase 13.16.DF FIX1 R6 / Phase 13.30.DF Class-2 validation failed "
+            "at module import:\n"
             + "\n".join("  - " + e for e in errors)
-            + "\n\nUpdate the relevant _*_FORWARDED_NAMES tuple in DFDraw."
+            + "\n\nUpdate the relevant tuple in DFDraw."
         )
 
 
diff --git a/UTILS/dfextensions/dfdraw/plots/_validation.py b/UTILS/dfextensions/dfdraw/plots/_validation.py
@@ -0,0 +1,141 @@
+"""
+Parameter validation for dfdraw plot calls — Phase 13.30.DF v1.0.
+
+Class-2 column-reference validation: enforce that string-valued parameters
+naming a DataFrame column refer to an existing column, instead of silently
+falling through to ungrouped/unfiltered behaviour when the column is missing.
+
+Background
+----------
+- BUG_ADF_GroupBy_Expression_Materialization (2026-05-11): ADF forwards
+  `group_by="row%3"` unmaterialized; dfdraw's silent-fallthrough check
+  (e.g. `if group_by in df.columns`) hid the bug behind a downstream
+  matplotlib UserWarning about empty legend labels.
+
+- Phase 13.30.DF sub-fix 1: introduce class-level tuple + helper so all
+  Class-2 parameters are validated uniformly, and Phase 13.27 Commit 2's
+  selection_vector / weights_vector get coverage for free when their
+  tuples are populated.
+
+Parameter taxonomy (see Phase 13.30 proposal §3 for the full table)
+-------------------------------------------------------------------
+- Class 2 — column reference (strict): string MUST be an existing column.
+  This module handles Class 2. Examples: ``group_by``.
+
+- Class 4 — expression-or-column (permissive): string is either column
+  name OR pandas eval expression; dfdraw evaluates internally.
+  Examples: ``weights`` (handled by ``_eval_weights`` in plots/profile.py).
+  These parameters are NOT validated here.
+
+References
+----------
+- BUG_ADF_GroupBy_Expression_Materialization.md
+- PHASE_13_30_DF_v1_0_Proposal_ParameterClassValidation.md §3, §4
+"""
+from typing import Iterable, Mapping, Any, List, Tuple
+import pandas as pd
+
+
+def validate_column_references(
+    df: pd.DataFrame,
+    kwargs: Mapping[str, Any],
+    names: Iterable[str],
+    context: str,
+) -> None:
+    """Validate that every Class-2 column-reference kwarg names a real column.
+
+    For each name in ``names``, look up the value in ``kwargs``. If the
+    value is a non-empty string and is NOT a column in ``df``, raise
+    ``ValueError`` with an actionable message naming the offending
+    parameter, the bad value, the available columns (first 10), and a
+    pointer to the materialize-first remediation.
+
+    Class-2 parameters are strict column references — computed expressions
+    must be materialized by the caller (e.g., via
+    ``AliasDataFrame.add_alias``). Class-4 parameters (like ``weights``,
+    which dfdraw evaluates via ``df.eval``) are NOT validated here and
+    must NOT be listed in ``names``.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        The DataFrame the plot call will operate on.
+    kwargs : Mapping[str, Any]
+        Keyword arguments of the calling plot function. Typically pass
+        ``locals()`` from the call site.
+    names : Iterable[str]
+        Names of Class-2 column-reference parameters to validate. Driven
+        by the per-plot ``_*_COLUMN_REFERENCES`` tuple in ``drawer.py``.
+        Pass the tuple directly to avoid duplication and drift.
+    context : str
+        Plot-type name (``"profile"`` / ``"hist"`` / ``"scatter"`` etc.).
+        Used only to make the error message actionable.
+
+    Raises
+    ------
+    ValueError
+        If the value of any parameter listed in ``names`` is a non-empty
+        string not present in ``df.columns``. A single offender gets a
+        per-parameter message; several offenders are listed in one error.
+
+    Notes
+    -----
+    - ``None`` values, non-string values, and the empty string are skipped
+      (these are not the bug class we're catching — they have other paths).
+    - ``df=None`` and DataFrames with no columns at all are passed through
+      silently; downstream code already handles empty input.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6], "sector": [0, 1, 0]})
+    >>> # Existing column — no raise:
+    >>> validate_column_references(df, {"group_by": "sector"},
+    ...                            names=("group_by",), context="profile")
+
+    >>> # Missing column — raises ValueError:
+    >>> validate_column_references(df, {"group_by": "row%3"},
+    ...                            names=("group_by",), context="profile")
+    Traceback (most recent call last):
+        ...
+    ValueError: group_by='row%3' is not a column in the DataFrame ...
+    """
+    if df is None or len(getattr(df, "columns", [])) == 0:
+        # No columns to check against; downstream code handles empty df
+        return
+
+    bad: List[Tuple[str, str]] = []
+    for name in names:
+        value = kwargs.get(name)
+        if value is None or not isinstance(value, str) or not value:
+            continue
+        if value in df.columns:
+            continue
+        bad.append((name, value))
+
+    if not bad:
+        return
+
+    cols = list(df.columns)
+    if len(cols) > 10:
+        cols_preview = ", ".join(repr(c) for c in cols[:10]) + f", ... ({len(cols)} total)"
+    else:
+        cols_preview = ", ".join(repr(c) for c in cols)
+
+    if len(bad) == 1:
+        name, value = bad[0]
+        raise ValueError(
+            f"{name}={value!r} is not a column in the DataFrame "
+            f"(called from dfdraw {context}()). "
+            f"If this is a computed expression, the caller must materialize "
+            f"it into a column first. With AliasDataFrame: "
+            f"adf.add_alias(<name>, {value!r}). "
+            f"Available columns: [{cols_preview}]"
+        )
+
+    bad_str = ", ".join(f"{n}={v!r}" for n, v in bad)
+    raise ValueError(
+        f"{len(bad)} column-reference parameters do not name DataFrame "
+        f"columns (called from dfdraw {context}()): {bad_str}. "
+        f"Materialize computed expressions into columns first. "
+        f"Available columns: [{cols_preview}]"
+    )
diff --git a/UTILS/dfextensions/dfdraw/plots/histogram.py b/UTILS/dfextensions/dfdraw/plots/histogram.py
@@ -20,6 +20,8 @@
 # Phase 13.28.DF: Robust data handling
 from ._data_sanitize import sanitize_for_plot
 from ._autorange import compute_autorange, VALID_STRATEGIES
+# Phase 13.30.DF: Class-2 column-reference parameter validation
+from ._validation import validate_column_references
 
 
 # =============================================================================
@@ -303,6 +305,15 @@ def draw_hist(
     elif norm == "probability":
         weights = np.ones_like(x_data) / len(x_data) if len(x_data) > 0 else None
     
+    # Phase 13.30.DF: Validate Class-2 column-reference parameters.
+    # Catches BUG_ADF_GroupBy_Expression_Materialization (silent fallthrough below).
+    from ..drawer import DFDraw as _DFDraw
+    validate_column_references(
+        df, locals(),
+        names=_DFDraw._HIST_COLUMN_REFERENCES,
+        context="hist",
+    )
+
     # Group-by handling
     if group_by is not None and group_by in df.columns:
         _draw_hist_grouped(
diff --git a/UTILS/dfextensions/dfdraw/plots/profile.py b/UTILS/dfextensions/dfdraw/plots/profile.py
@@ -29,6 +29,8 @@
 # Phase 13.28.DF: Robust data handling
 from ._data_sanitize import sanitize_for_plot
 from ._autorange import compute_autorange, resolve_range_1d, VALID_STRATEGIES
+# Phase 13.30.DF: Class-2 column-reference parameter validation
+from ._validation import validate_column_references
 
 
 # =============================================================================
@@ -294,7 +296,18 @@ def draw_profile(
             "group_by_quantiles must be an integer (number of quantile bins), "
             "not True. Example: group_by_quantiles=4"
         )
-    
+
+    # Phase 13.30.DF: Validate Class-2 column-reference parameters.
+    # Catches BUG_ADF_GroupBy_Expression_Materialization: caller passed a
+    # computed expression (e.g. "row%3") where a column name is required.
+    # Import tuple from drawer at call time to avoid circular import.
+    from ..drawer import DFDraw as _DFDraw
+    validate_column_references(
+        df, locals(),
+        names=_DFDraw._PROFILE_COLUMN_REFERENCES,
+        context="profile",
+    )
+
     # Get style defaults
     if bins is None:
         bins = get_style_value("hist.bins", 50)
diff --git a/UTILS/dfextensions/dfdraw/plots/scatter.py b/UTILS/dfextensions/dfdraw/plots/scatter.py
@@ -21,6 +21,8 @@
 from ..stats import format_stats_box
 # Phase 13.28.DF: Robust data handling
 from ._data_sanitize import sanitize_for_plot
+# Phase 13.30.DF: Class-2 column-reference parameter validation
+from ._validation import validate_column_references
 
 
 def draw_scatter(
@@ -164,6 +166,15 @@ def draw_scatter(
     if jitter:
         x_data, y_data = _apply_jitter(x_data, y_data, jitter)
     
+    # Phase 13.30.DF: Validate Class-2 column-reference parameters.
+    # Catches BUG_ADF_GroupBy_Expression_Materialization (silent fallthrough below).
+    from ..drawer import DFDraw as _DFDraw
+    validate_column_references(
+        df, locals(),
+        names=_DFDraw._SCATTER_COLUMN_REFERENCES,
+        context="scatter",
+    )
+
     # Group-by handling
     if group_by is not None and group_by in df.columns:
         _draw_scatter_grouped(
diff --git a/UTILS/dfextensions/dfdraw/tests/test_phase_13_30_column_reference_validation.py b/UTILS/dfextensions/dfdraw/tests/test_phase_13_30_column_reference_validation.py