Skip to content

Commit 3b910ae

Browse files
author
miranov25
committed
PHASE 13.35.DF — group_by_bins + hist_norm for hist() (BUG-013)
Fixes 3 AttributeError crashes confirmed in 2026-05-20 live testing: T2: group_by_bins leaks to ax.hist() (no facet) T3: same crash in faceted subplot path T4: hist_norm leaks to ax.hist() Root cause: group_by_bins, group_by_quantiles, hist_norm, min_entries absent from BOTH DFDraw.hist() method signature AND draw_hist() function signature AND _HIST_FORWARDED_NAMES -> fall into **kwargs -> forwarded to _draw_hist_grouped() **hist_kwargs -> reach ax.hist() which rejects. Fixes (7 edits, ~300 LOC source + 384 LOC tests): 1. _HIST_FORWARDED_NAMES: add group_by_bins, group_by_quantiles, hist_norm, min_entries (drawer.py) 2. draw_hist() signature: add the 4 as explicit params (histogram.py) 3. group_by routing block: BUG-012 guard (float + nunique>20 + no bins -> ValueError with 'group_by_bins=N' guidance), pd.cut/qcut binning with float16->float32 upcast, df.copy() to avoid caller mutation, shared bin edges from x_data (sanitized at lines 259-307, NOT df[x].dropna() which would bypass nan_policy), stats_dict['n_groups'] population (was missing — T1 observation) 4. _draw_hist_grouped(): pop 'weights' from hist_kwargs to avoid ax.hist() double-weights TypeError; one-pass stacked loop building data_list + labels + surviving_colors in lockstep (fixes v1.2 P1-D label misalignment AND P3 color-shift); n_rendered counter in overlaid branch (v1.1 P1-C); str(group) labels (no _format_interval_label import — v1.1 P1-A); return -> int 5. _group_weights(): new module-level helper (probability/density) 6. DFDraw.hist() signature: add the 4 params (required by R6 module-import validator — discovered during implementation) 7. _dispatch_faceted_render() call: forward the 4 params explicitly (required after Edit 6 consumes them off **kwargs — discovered when T3 architect-call test initially failed) Architect's TPC/ITS production call now works end-to-end: adf.draw('dyp_I6-dyp_recoV2', type='hist', group_by='z', group_by_bins=5, min_entries=25, facet_by='sec') Live-tested: 9 sector panels x 5 drift-coordinate bins each. §9 tests (+11): HGB.1-3 group_by_bins + facet_by + shared bin edges HN.1-3 hist_norm probability/None/density (math at data layer) HGS.1-2 stats['n_groups'] + BUG-012 guard with actionable error HGSt.1 stacked + min_entries label alignment (regression lock for v1.2 P1-D, verified by failure injection) HGBC.1-2 backward compat (single-hist path; categorical group_by) Gate: 822 -> 833 / 0 / 0 / 1 skipped. Spec: PHASE_13_35_DF_v1_3_Proposal_HistGroupByNorm.md (notes repo) v1.0 -> v1.1: 5 factual source errors (Claude40 panel) v1.1 -> v1.2: 3 P1s (Sonet50/Sonnet52_R1/Sonnet53_R2 panel) v1.2 -> v1.3: 1 P1 stacked branch + 1 P2 comment + 1 §9 test add v1.3 implementation by Claude48 + 2 discovered fixes + P3 color-shift.
1 parent 458a952 commit 3b910ae

4 files changed

Lines changed: 578 additions & 20 deletions

File tree

UTILS/dfextensions/dfdraw/docs/CAPABILITY_MATRIX.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Capability Matrix — dfdraw
22

3-
**Generated:** 2026-05-18 14:39 UTC
3+
**Generated:** 2026-05-20 13:30 UTC
44
**Phase:** 13.15.DF
55
**Generator:** `scripts/generate_capability_matrix.py`
66
**Sources:** `tests/feature_taxonomy.py` + `tests/test_layer_classification.py`

UTILS/dfextensions/dfdraw/drawer.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,13 @@ def _auto_label(self, y_expr, x_expr=None):
594594
'selection_labels', 'weights_labels',
595595
'selection_categorical', 'weights_categorical',
596596
'vector_compose', 'delta_facet',
597+
# Phase 13.35.DF: float group_by binning + per-group normalization (BUG-013 fix).
598+
# Without these, group_by_bins/quantiles/hist_norm/min_entries fall into
599+
# **kwargs → forwarded to _draw_hist_grouped() **hist_kwargs → reach
600+
# ax.hist() which raises AttributeError (T2/T3/T4 from v1.3 §3.2).
601+
'group_by_bins', 'group_by_quantiles',
602+
'hist_norm',
603+
'min_entries',
597604
)
598605

599606
_SCATTER_FORWARDED_NAMES = (
@@ -3175,6 +3182,14 @@ def hist(
31753182
weights_categorical: bool = False,
31763183
vector_compose: str = "inner",
31773184
delta_facet: Optional[str] = None,
3185+
# Phase 13.35.DF: float group_by binning + per-group normalization
3186+
# (BUG-013 fix). group_by_bins / group_by_quantiles bin a float
3187+
# group_by column via pd.cut/qcut. min_entries skips groups below
3188+
# threshold. hist_norm: None | "probability" | "density" (per-group).
3189+
group_by_bins: Optional[int] = None,
3190+
group_by_quantiles: Optional[int] = None,
3191+
hist_norm: Optional[str] = None,
3192+
min_entries: int = 0,
31783193
**kwargs
31793194
) -> DrawResult:
31803195
"""
@@ -3352,6 +3367,16 @@ def hist(
33523367
weights=weights,
33533368
facet_by_bins=facet_by_bins,
33543369
facet_by_quantiles=facet_by_quantiles,
3370+
# Phase 13.35.DF: forward float group_by binning + per-group
3371+
# normalization to per-subplot draw_hist (BUG-013 fix T3).
3372+
# Without these, the architect's call
3373+
# d.hist('x', group_by='z', group_by_bins=5, facet_by='sec')
3374+
# loses group_by_bins between method-level explicit-param
3375+
# consumption and per-subplot draw_hist invocation.
3376+
group_by_bins=group_by_bins,
3377+
group_by_quantiles=group_by_quantiles,
3378+
hist_norm=hist_norm,
3379+
min_entries=min_entries,
33553380
**kwargs
33563381
)
33573382
# Facet mode (legacy path, same=True ignored in facet mode)
@@ -3378,6 +3403,11 @@ def hist(
33783403
nan_policy=nan_policy,
33793404
# Phase 13.27.DF Commit 2 FIX1 (§7b): column-name weights
33803405
weights=weights,
3406+
# Phase 13.35.DF: float group_by binning + per-group normalization
3407+
group_by_bins=group_by_bins,
3408+
group_by_quantiles=group_by_quantiles,
3409+
hist_norm=hist_norm,
3410+
min_entries=min_entries,
33813411
**kwargs
33823412
)
33833413
axes = ax

UTILS/dfextensions/dfdraw/plots/histogram.py

Lines changed: 163 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,43 @@ def _compute_robust_stats_1d(data, groups, suffix=''):
143143
return result
144144

145145

146+
def _group_weights(
147+
x_group: np.ndarray,
148+
bin_edges: Optional[np.ndarray],
149+
hist_norm: Optional[str],
150+
) -> Optional[np.ndarray]:
151+
"""Per-group histogram weights for hist_norm normalization (Phase 13.35.DF).
152+
153+
Returns
154+
-------
155+
None if hist_norm is None (raw counts — pass-through to ax.hist)
156+
ndarray of length len(x_group) otherwise:
157+
'probability' : weights = 1/n → sum(heights) = 1.0
158+
'density' : weights = 1/(n × Δx) → ∫ heights dx = 1.0
159+
160+
Raises
161+
------
162+
ValueError if hist_norm is not None / 'probability' / 'density'.
163+
"""
164+
if hist_norm is None:
165+
return None
166+
n = len(x_group)
167+
if n == 0:
168+
return None
169+
if hist_norm == "probability":
170+
return np.ones(n) / n
171+
if hist_norm == "density":
172+
if bin_edges is not None:
173+
bin_width = float(np.diff(bin_edges).mean())
174+
else:
175+
bin_width = 1.0
176+
return np.ones(n) / (n * bin_width)
177+
raise ValueError(
178+
f"hist_norm must be None, 'probability', or 'density'; "
179+
f"got {hist_norm!r}"
180+
)
181+
182+
146183
def draw_hist(
147184
df: pd.DataFrame,
148185
x: Union[str, pd.Series, np.ndarray],
@@ -176,6 +213,15 @@ def draw_hist(
176213
# via the weights= kwarg. Precedence with norm="probability": explicit
177214
# per-row weights win and are additionally scaled by 1/n_clean.
178215
weights: Optional[str] = None,
216+
# Phase 13.35.DF: float group_by binning + per-group normalization (BUG-013 fix).
217+
# group_by_bins / group_by_quantiles: bin a float group_by column via pd.cut/qcut
218+
# before the per-group rendering loop. min_entries: skip groups below threshold.
219+
# hist_norm: per-group normalization (None=raw counts, "probability"=sum=1,
220+
# "density"=area=1). Independent of single-histogram norm= parameter.
221+
group_by_bins: Optional[int] = None,
222+
group_by_quantiles: Optional[int] = None,
223+
hist_norm: Optional[str] = None,
224+
min_entries: int = 0,
179225
**kwargs
180226
) -> Tuple[plt.Figure, plt.Axes, Dict[str, Any]]:
181227
"""
@@ -374,13 +420,62 @@ def draw_hist(
374420
if group_by is not None and group_by in df.columns:
375421
# w_data + group_by raises above; here _hist_weights is either None or
376422
# the probability-synthesized 1/n array (pre-FIX1 behavior).
377-
_draw_hist_grouped(
423+
#
424+
# Phase 13.35.DF (BUG-013 fix): float group_by + binning + per-group norm.
425+
426+
col = df[group_by]
427+
428+
# BUG-012 protection: float column with high cardinality and no bins.
429+
# Threshold 20 is a heuristic — see BUG-012 report for rationale.
430+
# Catches the silent-explosion case where group_by='z' on a continuous
431+
# float column produces hundreds of one-entry groups.
432+
if (col.dtype.kind == 'f'
433+
and group_by_bins is None
434+
and group_by_quantiles is None
435+
and col.nunique() > 20):
436+
raise ValueError(
437+
f"group_by='{group_by}' is a float column with "
438+
f"{col.nunique()} unique values. "
439+
f"Add group_by_bins=N or group_by_quantiles=N to bin it. "
440+
f"Example: group_by_bins=5 or group_by_quantiles=5."
441+
)
442+
443+
# Float binning: replace group_by column values with pd.Interval labels.
444+
# Use df.copy() to avoid modifying the caller's DataFrame.
445+
if group_by_bins is not None or group_by_quantiles is not None:
446+
df = df.copy()
447+
if col.dtype == np.float16:
448+
# Match profile path pattern (drawer.py:2585): pd.cut/qcut
449+
# don't accept float16 directly; upcast.
450+
df[group_by] = df[group_by].astype(np.float32)
451+
if group_by_bins is not None:
452+
df[group_by] = pd.cut(df[group_by], bins=group_by_bins)
453+
else:
454+
df[group_by] = pd.qcut(
455+
df[group_by], q=group_by_quantiles, duplicates='drop'
456+
)
457+
458+
# Shared bin edges computed from sanitized x_data.
459+
# x_data is fully sanitized (nan_policy applied) at lines 259-307.
460+
# Do NOT use df[x].dropna() here — that would bypass the nan_policy
461+
# sanitization already applied. (v1.1 P1-B from review panel.)
462+
_bins_for_edges = bins if bins is not None else 100
463+
_, shared_edges = np.histogram(
464+
x_data, bins=_bins_for_edges, range=_used_range
465+
)
466+
467+
n_rendered = _draw_hist_grouped(
378468
df, x, ax, group_by, top_k, stacked,
379-
bins=bins, range=_used_range, density=density, weights=_hist_weights,
469+
bin_edges=shared_edges,
470+
hist_norm=hist_norm,
471+
min_entries=min_entries,
472+
density=density, weights=_hist_weights,
380473
alpha=alpha, histtype=histtype, edgecolor=edgecolor,
381-
linewidth=linewidth, **kwargs
474+
linewidth=linewidth,
475+
**kwargs # group_by_bins/hist_norm/min_entries already consumed
382476
)
383477
stats_dict["grouped"] = True
478+
stats_dict["n_groups"] = n_rendered # was missing — T1 observation
384479
else:
385480
# Single histogram
386481
ax.hist(
@@ -432,37 +527,86 @@ def _draw_hist_grouped(
432527
group_by: str,
433528
top_k: Optional[int],
434529
stacked: bool,
530+
bin_edges: Optional[np.ndarray] = None, # Phase 13.35.DF: shared edges from full dataset
531+
hist_norm: Optional[str] = None, # Phase 13.35.DF: None | "probability" | "density"
532+
min_entries: int = 0, # Phase 13.35.DF: skip groups below threshold
435533
**hist_kwargs
436-
) -> None:
437-
"""Draw grouped/overlaid histograms."""
534+
) -> int:
535+
"""Draw grouped/overlaid histograms.
536+
537+
Phase 13.35.DF: extended for float group_by binning + per-group normalization.
538+
Returns the number of groups actually rendered (post min_entries filter).
539+
"""
438540
import matplotlib.pyplot as plt
439-
440-
# Get groups
541+
542+
# Phase 13.35.DF: pop 'weights' from hist_kwargs — the grouped path uses
543+
# per-group hist_norm weights (from _group_weights), not the routing
544+
# block's _hist_weights (which is None on this path anyway because
545+
# group_by + column-name weights raises NotImplementedError earlier).
546+
# Without this pop, ax.hist sees 'weights' twice (TypeError).
547+
hist_kwargs.pop('weights', None)
548+
549+
# Get groups (pd.Interval objects when group_by_bins/_quantiles was used;
550+
# scalar values otherwise).
441551
groups = df[group_by].unique()
442-
443-
# Top-K filtering
552+
553+
# Top-K filtering (existing behavior)
444554
if top_k is not None and len(groups) > top_k:
445555
counts = df[group_by].value_counts()
446556
top_groups = counts.head(top_k).index.tolist()
447557
groups = top_groups
448-
449-
# Color palette
558+
559+
# Color palette (existing behavior)
450560
palette_name = get_style_value("colors.palette", "tab10")
451561
palette = plt.colormaps.get_cmap(palette_name)
452562
colors = [palette(i % 10) for i in range(len(groups))]
453-
563+
564+
# Use shared edges if provided; fall back to matplotlib auto-bin (or kwarg)
565+
bins_arg = bin_edges if bin_edges is not None else hist_kwargs.pop('bins', 100)
566+
454567
if stacked:
455-
# Stacked histogram
456-
# BUG_dfdraw_20260505: cast to float for boolean expressions
457-
data_list = [df[df[group_by] == g][x].dropna().values.astype(float) for g in groups]
458-
ax.hist(data_list, label=[str(g) for g in groups], color=colors,
459-
stacked=True, **hist_kwargs)
568+
# One-pass loop: build data_list, labels, AND surviving_colors in lockstep
569+
# so all three stay aligned when min_entries filters drop groups.
570+
# v1.2 used two-pass list comprehensions (data_list filtered, then labels
571+
# zipped against UNFILTERED groups) → misaligned labels.
572+
# That was v1.2 P1-D — Hard Constraint §3 silent wrong result.
573+
# P3 (color-shift) also fixed here: pre-Phase 13.35.DF, colors[:M] gave
574+
# sequential tab10 colors, not the original-index color per surviving
575+
# group; surviving_colors[i] preserves the original colors[i] mapping.
576+
data_list, labels, surviving_colors = [], [], []
577+
for i, g in enumerate(groups):
578+
d = df[df[group_by] == g][x].dropna().values.astype(float)
579+
if len(d) >= min_entries:
580+
data_list.append(d)
581+
# Label: str(g) handles both scalars and pd.Interval objects.
582+
# _format_interval_label is defined in profile.py and NOT
583+
# imported here. (v1.1 P1-A from review panel.)
584+
labels.append(str(g))
585+
surviving_colors.append(colors[i])
586+
if not data_list:
587+
return 0
588+
ax.hist(data_list, bins=bins_arg, label=labels,
589+
color=surviving_colors, stacked=True, **hist_kwargs)
590+
return len(data_list)
460591
else:
461-
# Overlaid histograms
592+
# Overlaid histograms — one ax.hist call per surviving group.
593+
# colors[i] preserves original-index color when groups are skipped
594+
# (overlaid branch was already correct in baseline; documented for parity).
595+
n_rendered = 0
462596
for i, group in enumerate(groups):
463597
# BUG_dfdraw_20260505: cast to float for boolean expressions
464598
group_data = df[df[group_by] == group][x].dropna().values.astype(float)
465-
ax.hist(group_data, label=str(group), color=colors[i], **hist_kwargs)
599+
if len(group_data) < min_entries:
600+
continue
601+
weights = _group_weights(group_data, bin_edges, hist_norm)
602+
# Label via str(group) — not _format_interval_label. (v1.1 P1-A)
603+
ax.hist(group_data, bins=bins_arg,
604+
label=str(group), color=colors[i],
605+
weights=weights, **hist_kwargs)
606+
n_rendered += 1 # count post-skip (v1.1 P1-C from review panel)
607+
if n_rendered > 0:
608+
ax.legend()
609+
return n_rendered
466610

467611

468612
def _add_stats_box(

0 commit comments

Comments
 (0)