|
| 1 | +""" |
| 2 | +Parameter validation for dfdraw plot calls — Phase 13.30.DF v1.0. |
| 3 | +
|
| 4 | +Class-2 column-reference validation: enforce that string-valued parameters |
| 5 | +naming a DataFrame column refer to an existing column, instead of silently |
| 6 | +falling through to ungrouped/unfiltered behaviour when the column is missing. |
| 7 | +
|
| 8 | +Background |
| 9 | +---------- |
| 10 | +- BUG_ADF_GroupBy_Expression_Materialization (2026-05-11): ADF forwards |
| 11 | + `group_by="row%3"` unmaterialized; dfdraw's silent-fallthrough check |
| 12 | + (e.g. `if group_by in df.columns`) hid the bug behind a downstream |
| 13 | + matplotlib UserWarning about empty legend labels. |
| 14 | +
|
| 15 | +- Phase 13.30.DF sub-fix 1: introduce class-level tuple + helper so all |
| 16 | + Class-2 parameters are validated uniformly, and Phase 13.27 Commit 2's |
| 17 | + selection_vector / weights_vector get coverage for free when their |
| 18 | + tuples are populated. |
| 19 | +
|
| 20 | +Parameter taxonomy (see Phase 13.30 proposal §3 for the full table) |
| 21 | +------------------------------------------------------------------- |
| 22 | +- Class 2 — column reference (strict): string MUST be an existing column. |
| 23 | + This module handles Class 2. Examples: ``group_by``. |
| 24 | +
|
| 25 | +- Class 4 — expression-or-column (permissive): string is either column |
| 26 | + name OR pandas eval expression; dfdraw evaluates internally. |
| 27 | + Examples: ``weights`` (handled by ``_eval_weights`` in plots/profile.py). |
| 28 | + These parameters are NOT validated here. |
| 29 | +
|
| 30 | +References |
| 31 | +---------- |
| 32 | +- BUG_ADF_GroupBy_Expression_Materialization.md |
| 33 | +- PHASE_13_30_DF_v1_0_Proposal_ParameterClassValidation.md §3, §4 |
| 34 | +""" |
| 35 | +from typing import Iterable, Mapping, Any, List, Tuple |
| 36 | +import pandas as pd |
| 37 | + |
| 38 | + |
def validate_column_references(
    df: pd.DataFrame,
    kwargs: Mapping[str, Any],
    names: Iterable[str],
    context: str,
) -> None:
    """Validate that every Class-2 column-reference kwarg names a real column.

    For each name in ``names``, look up the value in ``kwargs``. If the
    value is a non-empty string and is NOT a column in ``df``, raise
    ``ValueError`` with an actionable message naming the offending
    parameter, the bad value, the available columns (first 10), and a
    pointer to the materialize-first remediation.

    Class-2 parameters are strict column references — computed expressions
    must be materialized by the caller (e.g., via
    ``AliasDataFrame.add_alias``). Class-4 parameters (like ``weights``,
    which dfdraw evaluates via ``df.eval``) are NOT validated here and
    must NOT be listed in ``names``.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame the plot call will operate on.
    kwargs : Mapping[str, Any]
        Keyword arguments of the calling plot function. Typically pass
        ``locals()`` from the call site.
    names : Iterable[str]
        Names of Class-2 column-reference parameters to validate. Driven
        by the per-plot ``_*_COLUMN_REFERENCES`` tuple in ``drawer.py``.
        Pass the tuple directly to avoid duplication and drift. A bare
        string is treated as a single parameter name, so that
        ``names=("group_by")`` (missing trailing comma) still validates
        ``group_by`` instead of iterating over its characters.
    context : str
        Plot-type name (``"profile"`` / ``"hist"`` / ``"scatter"`` etc.).
        Used only to make the error message actionable.

    Raises
    ------
    ValueError
        If any value in ``names`` is a non-empty string not present in
        ``df.columns``. Single-bad case produces a per-parameter message;
        multi-bad case enumerates all offenders in one error.

    Notes
    -----
    - ``None`` values, non-string values, and the empty string are skipped
      (these are not the bug class we're catching — they have other paths).
    - Empty DataFrames (no columns at all) are passed through silently;
      downstream code already handles empty input.

    Examples
    --------
    >>> df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6], "sector": [0, 1, 0]})
    >>> # Existing column — no raise:
    >>> validate_column_references(df, {"group_by": "sector"},
    ...                            names=("group_by",), context="profile")

    >>> # Missing column — raises ValueError:
    >>> validate_column_references(df, {"group_by": "row%3"},
    ...                            names=("group_by",), context="profile")  # doctest: +ELLIPSIS
    Traceback (most recent call last):
        ...
    ValueError: group_by='row%3' is not a column in the DataFrame ...
    """
    if df is None or len(getattr(df, "columns", [])) == 0:
        # No columns to check against; downstream code handles empty df.
        return

    # A str is itself an Iterable[str]; iterating it would look up each
    # character in ``kwargs`` and silently validate nothing. Treat a bare
    # string as one parameter name instead.
    if isinstance(names, str):
        names = (names,)

    bad: List[Tuple[str, str]] = []
    for name in names:
        value = kwargs.get(name)
        # Only non-empty strings are candidate column references; None,
        # non-string values and "" are outside the bug class handled here.
        if not isinstance(value, str) or not value:
            continue
        if value in df.columns:
            continue
        bad.append((name, value))

    if not bad:
        return

    # Show at most the first 10 columns so the message stays readable on
    # wide DataFrames, but always report the total count when truncated.
    cols = list(df.columns)
    cols_preview = ", ".join(repr(c) for c in cols[:10])
    if len(cols) > 10:
        cols_preview += f", ... ({len(cols)} total)"

    if len(bad) == 1:
        name, value = bad[0]
        raise ValueError(
            f"{name}={value!r} is not a column in the DataFrame "
            f"(called from dfdraw {context}()). "
            f"If this is a computed expression, the caller must materialize "
            f"it into a column first. With AliasDataFrame: "
            f"adf.add_alias(<name>, {value!r}). "
            f"Available columns: [{cols_preview}]"
        )

    bad_str = ", ".join(f"{n}={v!r}" for n, v in bad)
    raise ValueError(
        f"{len(bad)} column-reference parameters do not name DataFrame "
        f"columns (called from dfdraw {context}()): {bad_str}. "
        f"Materialize computed expressions into columns first. "
        f"Available columns: [{cols_preview}]"
    )
0 commit comments