buckaroo-data · paddymul · Jun 9, 2026 · May 31, 2026 · May 31, 2026 · chatgpt-codex-connector
diff --git a/buckaroo/buckaroo_widget.py b/buckaroo/buckaroo_widget.py
@@ -29,7 +29,7 @@
 from .pluggable_analysis_framework.col_analysis import ColAnalysis
 from buckaroo.extension_utils import copy_extend
 
-from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df, pd_to_obj, to_parquet, sd_to_parquet_b64
+from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df, pd_to_obj, to_parquet
 from .dataflow.dataflow import CustomizableDataflow
 from .dataflow.dataflow_extras import (Sampling, exception_protect)
 from .dataflow.styling_core import (ComponentConfig, DFViewerConfig, DisplayArgs, OverrideColumnConfig, PinnedRowConfig, StylingAnalysis, merge_column_config, EMPTY_DFVIEWER_CONFIG)
@@ -261,11 +261,10 @@ def post_process_df(kls, df):
         self.buckaroo_state = temp_buckaroo_state
 
     def _sd_to_jsondf(self, sd):
-        """Serialize summary stats dict. Returns parquet-b64 tagged dict by default.
-
-        Exists so this can be overridden for polars.
-        """
-        return sd_to_parquet_b64(sd)
+        """Serialize summary stats. Delegates to the dataflow so the wire
+        projection (see #880) lives in exactly one place — used by the
+        infinite-widget path, which assembles ``all_stats`` on the widget."""
+        return self.dataflow._sd_to_jsondf(sd)
 
 
 

diff --git a/buckaroo/dataflow/dataflow.py b/buckaroo/dataflow/dataflow.py
@@ -6,7 +6,7 @@
 from traitlets import Unicode, Any, observe, Dict
 
 from buckaroo.pluggable_analysis_framework.col_analysis import ColAnalysis, SDType
-from ..serialization_utils import pd_to_obj, sd_to_parquet_b64
+from ..serialization_utils import pd_to_obj, sd_to_parquet_b64, project_sd
 from buckaroo.pluggable_analysis_framework.utils import (filter_analysis)
 from buckaroo.pluggable_analysis_framework.df_stats_v2 import DfStatsV2
 from .autocleaning import SentinelAutocleaning
@@ -17,7 +17,7 @@
     OverrideColumnConfig,
     PinnedRowConfig,
     merge_sd_overrides,
-    merge_sds, merge_column_config, StylingAnalysis)
+    merge_sds, merge_column_config, StylingAnalysis, wire_stat_keys)
 
 
 from .abc_dataflow import ABCDataflow
@@ -648,11 +648,16 @@ def _get_summary_sd(self, processed_df:pd.DataFrame) -> Tuple[SDType, TDict[str,
     # ### end summary stats block        
 
     def _sd_to_jsondf(self, sd:SDType):
-        """Serialize summary stats dict. Returns parquet-b64 tagged dict by default.
+        """Serialize summary stats to the wire payload (parquet-b64 tagged dict).
 
-        Exists so this can be overridden for polars.
+        Projects ``sd`` down to just the stats the frontend reads — the
+        pinned-row values + histogram bins (see ``wire_stat_keys`` / #880) —
+        before serializing. The full ``sd`` stays on the dataflow's
+        ``merged_sd`` for styling regeneration and server-side use; only the
+        wire copy shrinks.
         """
-        return sd_to_parquet_b64(sd)
+        keep = wire_stat_keys(self.df_display_klasses.values(), self.pinned_rows)
+        return sd_to_parquet_b64(project_sd(sd, keep))
 
     def _df_to_obj(self, df:pd.DataFrame) -> TDict[str, TAny]:
         return pd_to_obj(self.sampling_klass.serialize_sample(df))

diff --git a/buckaroo/dataflow/styling_core.py b/buckaroo/dataflow/styling_core.py
@@ -472,6 +472,47 @@ def style_columns(cls, sd:SDType, df:pd.DataFrame) -> List[ColumnConfig]:
                 continue
             ret_col_config.append(base_style)
         return ret_col_config
-
 
-
+
+# Stat keys the JS color-map rule reads per column straight off the wire
+# payload (``histogram_bins`` / ``histogram_log_bins`` in gridUtils.ts),
+# independent of any pinned row.
+HISTOGRAM_BIN_WIRE_KEYS = frozenset({'histogram_bins', 'histogram_log_bins'})
+
+
+def _pinned_row_stat_keys(pinned_rows: Any) -> set:
+    """Stat keys referenced by a list of ``PinnedRowConfig`` entries.
+
+    A leading ``?`` marks an optional/scoped row whose data is keyed by the
+    unprefixed name (mirrors ``stripOptionalPinnedKey`` in gridUtils.ts).
+    """
+    keys = set()
+    for pr in pinned_rows or []:
+        pkey = pr.get('primary_key_val')
+        if not pkey:
+            continue
+        keys.add(pkey[1:] if pkey.startswith('?') else pkey)
+    return keys
+
+
+def wire_stat_keys(styling_classes: Iterable[Any], extra_pinned_rows: Any = ()) -> set:
+    """Stat keys the frontend reads from the ``all_stats`` wire payload.
+
+    The frontend reads exactly two things out of the summary-stats payload:
+    the histogram-bin arrays the color-map rule bins against
+    (``HISTOGRAM_BIN_WIRE_KEYS``), and the per-column pinned-row values it
+    looks up by ``primary_key_val``. Everything else in ``merged_sd``
+    (``value_counts``, ``histogram_args``, ``memory_usage``, the ``is_*``
+    typing flags, the heuristic ``*_frac`` cleaning stats, ...) is shipped
+    today but never read. This is the allowlist used to trim the wire copy
+    (see ``project_sd`` / #880).
+
+    ``styling_classes`` are the active ``StylingAnalysis`` subclasses (the
+    dataflow's ``df_display_klasses`` values); ``extra_pinned_rows`` carries
+    any runtime ``pinned_rows`` override set on the dataflow.
+    """
+    keys = set(HISTOGRAM_BIN_WIRE_KEYS)
+    for kls in styling_classes:
+        keys |= _pinned_row_stat_keys(getattr(kls, 'pinned_rows', None))
+    keys |= _pinned_row_stat_keys(extra_pinned_rows)
+    return keys
diff --git a/buckaroo/polars_buckaroo.py b/buckaroo/polars_buckaroo.py
@@ -9,7 +9,7 @@
 from .pluggable_analysis_framework.df_stats_v2 import PlDfStatsV2
 from .pluggable_analysis_framework.polars_analysis_management import PlDfStats
 from .customizations.pl_stats_v2 import PL_ANALYSIS_V2
-from .serialization_utils import pd_to_obj, sd_to_parquet_b64
+from .serialization_utils import pd_to_obj
 from .customizations.styling import DefaultSummaryStatsStyling, DefaultMainStyling
 from .customizations.pl_autocleaning_conf import NoCleaningConfPl
 from .dataflow.dataflow import Sampling
@@ -69,9 +69,8 @@ class PolarsBuckarooWidget(BuckarooWidget):
     DFStatsClass = PlDfStatsV2
     sampling_klass = PLSampling
 
-    def _sd_to_jsondf(self, sd):
-        """Serialize summary stats dict as parquet-b64."""
-        return sd_to_parquet_b64(sd)
+    # _sd_to_jsondf is inherited from BuckarooWidgetBase, which delegates to
+    # the dataflow so the wire-stat projection (#880) lives in one place.
 
     def _build_error_dataframe(self, e):
         return pl.DataFrame({'err': [str(e)]})

diff --git a/buckaroo/serialization_utils.py b/buckaroo/serialization_utils.py
@@ -394,6 +394,28 @@ def _stat_value_to_pa_array(val):
     return pa.array([_json_encode_cell(val)], type=pa.string())
 
 
+def project_sd(sd: Dict[str, Any], keep_keys: Any) -> Dict[str, Any]:
+    """Project a summary-stats dict down to ``keep_keys`` per column.
+
+    ``sd`` is ``{short_col: {stat_name: value}}``. Each column's inner stat
+    dict is filtered to the stats in ``keep_keys``; non-dict column values
+    (defensive — shouldn't occur) pass through untouched. The input is not
+    mutated.
+
+    Used to trim the ``all_stats`` wire payload to just the stats the frontend
+    reads before ``sd_to_parquet_b64`` (see ``wire_stat_keys`` / #880). The
+    full ``sd`` stays on the dataflow's ``merged_sd`` for styling regeneration
+    and server-side use; only the wire copy shrinks.
+    """
+    projected: Dict[str, Any] = {}
+    for col, stats in sd.items():
+        if isinstance(stats, dict):
+            projected[col] = {k: v for k, v in stats.items() if k in keep_keys}
+        else:
+            projected[col] = stats
+    return projected
+
+
 def sd_to_parquet_b64(sd: Dict[str, Any]) -> Dict[str, str]:
     """Convert a summary stats dict to a tagged parquet-b64 payload.
 

diff --git a/tests/unit/basic_widget_test.py b/tests/unit/basic_widget_test.py
@@ -126,6 +126,43 @@ def test_string_column_handling():
     assert ten_col['tooltip_config'] == {'tooltip_type': 'simple', 'val_column': 'a'}
 
 
+def _wire_stat_names(all_stats_payload):
+    """Stat names present in a decoded ``all_stats`` parquet-b64 payload."""
+    import base64
+    import io
+    import pyarrow.parquet as pq
+    assert all_stats_payload['format'] == 'parquet_b64'
+    raw = base64.b64decode(all_stats_payload['data'])
+    tbl = pq.read_table(io.BytesIO(raw))
+    return {n.split('__', 1)[1] for n in tbl.schema.names if '__' in n}
+
+
+def test_all_stats_wire_payload_trimmed_to_displayed_keys():
+    """The ``all_stats`` wire payload carries only the stats the frontend
+    reads — the pinned-row values it looks up by ``primary_key_val`` and the
+    histogram bins the color-map rule reads. Dead weight (``value_counts``,
+    ``histogram_args``, ``memory_usage``, the ``is_*`` typing flags) stays on
+    the dataflow's ``merged_sd`` for styling but never ships. See #880."""
+    df = pd.DataFrame({'int_col': [1, 2, 3, 2, 1], 'flt': [1.5, 2.5, 3.5, 1.0, 2.0]})
+    w = BuckarooWidget(df)
+
+    merged_sd = w.dataflow.widget_args_tuple[2]
+    # The full sd on the dataflow keeps everything for styling/regeneration.
+    assert 'value_counts' in merged_sd['a']
+    assert 'histogram_args' in merged_sd['a']
+
+    wire_stats = _wire_stat_names(w.df_data_dict['all_stats'])
+    # Dead weight is gone from the wire ...
+    assert 'value_counts' not in wire_stats
+    assert 'histogram_args' not in wire_stats
+    assert 'memory_usage' not in wire_stats
+    assert 'is_numeric' not in wire_stats
+    # ... but everything the frontend reads survives.
+    for needed in ('dtype', 'mean', 'std', 'min', 'max', 'median',
+                   'histogram', 'histogram_bins',
+                   'non_null_count', 'most_freq'):
+        assert needed in wire_stats, needed
+
 
 def test_non_unique_column_names():
     #you end up with columns named [0,1,2, 0,1,2]

diff --git a/tests/unit/polars_basic_widget_test.py b/tests/unit/polars_basic_widget_test.py
@@ -11,6 +11,7 @@
 from buckaroo.polars_buckaroo import PolarsBuckarooWidget, PolarsBuckarooInfiniteWidget, to_parquet
 from buckaroo.pluggable_analysis_framework.polars_analysis_management import PlDfStats
 from buckaroo.dataflow.dataflow import StylingAnalysis
+from buckaroo.styling_helpers import inherit_
 from buckaroo.serialization_utils import resolve_summary_stats_payload as _resolve_all_stats
 from buckaroo.jlisp.lisp_utils import (s, sQ)
 
@@ -36,6 +37,15 @@ class SelectOnlyAnalysis(PolarsAnalysis):
         F.all().mean().name.map(json_postfix('mean')),
         F.all().quantile(.99).name.map(json_postfix('quin99'))]
 
+
+# Pin the stats SelectOnlyAnalysis produces so they survive the #880 wire
+# projection (which trims all_stats to the displayed/pinned keys).
+SELECT_ONLY_PINNED = [inherit_('null_count'), inherit_('mean'), inherit_('quin99')]
+
+
+class SelectOnlyStyling(StylingAnalysis):
+    pinned_rows = SELECT_ONLY_PINNED
+
 test_df = pl.DataFrame({'normal_int_series' : pl.Series([1,2,3,4])})
 
 
@@ -56,19 +66,19 @@ def test_polars_all_stats():
     #dsdf = replace_in_dict(sdf, [(np.nan, None)])
     class SimplePolarsBuckaroo(PolarsBuckarooWidget):
         DFStatsClass = PlDfStats  # v1 PolarsAnalysis classes need PlDfStats
-        analysis_klasses= [SelectOnlyAnalysis, StylingAnalysis]
+        analysis_klasses= [SelectOnlyAnalysis, SelectOnlyStyling]
 
     spbw = SimplePolarsBuckaroo(test_df)
     assert spbw.dataflow.merged_sd == expected
 
     resolved_stats = _resolve_all_stats(spbw.df_data_dict['all_stats'])
-    assert resolved_stats == [
-        {'index': 'orig_col_name', 'a': 'normal_int_series', 'level_0':'orig_col_name'},
-        {'index': 'rewritten_col_name', 'a': 'a', 'level_0':'rewritten_col_name'},
-        {'index': 'null_count', 'a': 0.0, 'level_0':'null_count'},
-        {'index': 'mean', 'a': 2.5, 'level_0':'mean'},
-        {'index': 'quin99', 'a': 4.0, 'level_0':'quin99'}]
-    assert spbw.df_display_args['main']['df_viewer_config'] == EXPECTED_DF_VIEWER_CONFIG
+    # #880: the wire payload is projected to the displayed (pinned) stats.
+    # orig_col_name / rewritten_col_name stay on merged_sd (asserted above)
+    # but aren't pinned, so they no longer ship to the frontend.
+    by_index = {row['index']: row['a'] for row in resolved_stats}
+    assert by_index == {'null_count': 0.0, 'mean': 2.5, 'quin99': 4.0}
+    assert spbw.df_display_args['main']['df_viewer_config'] == dict(
+        EXPECTED_DF_VIEWER_CONFIG, pinned_rows=SELECT_ONLY_PINNED)
 
 def test_polars_boolean():
     bool_df = pl.DataFrame({'bools':[True, True, False, False, True, None]})

diff --git a/tests/unit/test_sd_to_parquet_b64.py b/tests/unit/test_sd_to_parquet_b64.py
@@ -13,7 +13,8 @@
 import numpy as np
 import pyarrow.parquet as pq
 
-from buckaroo.serialization_utils import sd_to_parquet_b64
+from buckaroo.serialization_utils import sd_to_parquet_b64, project_sd
+from buckaroo.dataflow.styling_core import wire_stat_keys
 
 
 def _decode(result):
@@ -206,6 +207,31 @@ def test_uint64_max_round_trips_in_wide_layout():
     assert table.to_pydict()['a__max'] == [2**63 + 7]
 
 
+def test_project_sd_keeps_only_requested_keys():
+    """``project_sd`` filters each column's inner dict to ``keep_keys`` and
+    leaves the input untouched (see #880)."""
+    sd = {'a': {'mean': 1.0, 'value_counts': [1, 2, 3], 'dtype': 'int64'},
+        'b': {'mean': 2.0, 'memory_usage': 999, 'dtype': 'float64'}}
+    projected = project_sd(sd, {'mean', 'dtype'})
+    assert projected == {'a': {'mean': 1.0, 'dtype': 'int64'}, 'b': {'mean': 2.0, 'dtype': 'float64'}}
+    # input is not mutated
+    assert 'value_counts' in sd['a']
+
+
+def test_wire_stat_keys_unions_pinned_rows_and_histogram_bins():
+    """``wire_stat_keys`` is the histogram-bin keys plus every pinned-row
+    ``primary_key_val`` (``?`` scope prefix stripped) across the active
+    styling classes and any runtime pinned-rows override."""
+    class _Styling:
+        pinned_rows = [{'primary_key_val': 'dtype'},
+                       {'primary_key_val': 'mean'},
+                       {'primary_key_val': '?filtered_histogram'}]
+
+    keys = wire_stat_keys([_Styling], extra_pinned_rows=[{'primary_key_val': 'std'}])
+    assert keys == {'histogram_bins', 'histogram_log_bins',
+        'dtype', 'mean', 'filtered_histogram', 'std'}
+
+
 def test_negative_int_beyond_int64_falls_back_to_json_string():
     """Ints outside both int64 and uint64 range fall back to JSON-encoded string.