Skip to content

Commit e9d158c

Browse files
paddymul and claude committed
perf(serialization): project all_stats wire payload to displayed stats only
Derive the set of stat keys the frontend needs from the active styling classes (each pinned row's primary_key_val, with the leading `?` scope prefix stripped) plus the histogram bins the color-map rule reads, and project merged_sd down to those keys before sd_to_parquet_b64. The full merged_sd stays on the dataflow for styling regeneration and server-side use (sort, column_config); only the wire copy shrinks.

- serialization_utils.project_sd: pure per-column key filter
- styling_core.wire_stat_keys: the displayed-key allowlist
- dataflow._sd_to_jsondf: the single projection choke point; the widget and polars _sd_to_jsondf now delegate here

For a 20-stat numeric frame this trims the per-column wire payload from ~43 stats to ~18. (#880)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent f43b90f commit e9d158c

7 files changed

Lines changed: 128 additions & 26 deletions

File tree

buckaroo/buckaroo_widget.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from .pluggable_analysis_framework.col_analysis import ColAnalysis
3030
from buckaroo.extension_utils import copy_extend
3131

32-
from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df, pd_to_obj, to_parquet, sd_to_parquet_b64
32+
from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df, pd_to_obj, to_parquet
3333
from .dataflow.dataflow import CustomizableDataflow
3434
from .dataflow.dataflow_extras import (Sampling, exception_protect)
3535
from .dataflow.styling_core import (ComponentConfig, DFViewerConfig, DisplayArgs, OverrideColumnConfig, PinnedRowConfig, StylingAnalysis, merge_column_config, EMPTY_DFVIEWER_CONFIG)
@@ -261,11 +261,10 @@ def post_process_df(kls, df):
261261
self.buckaroo_state = temp_buckaroo_state
262262

263263
def _sd_to_jsondf(self, sd):
264-
"""Serialize summary stats dict. Returns parquet-b64 tagged dict by default.
265-
266-
Exists so this can be overridden for polars.
267-
"""
268-
return sd_to_parquet_b64(sd)
264+
"""Serialize summary stats. Delegates to the dataflow so the wire
265+
projection (see #880) lives in exactly one place — used by the
266+
infinite-widget path, which assembles ``all_stats`` on the widget."""
267+
return self.dataflow._sd_to_jsondf(sd)
269268

270269

271270

buckaroo/dataflow/dataflow.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from traitlets import Unicode, Any, observe, Dict
77

88
from buckaroo.pluggable_analysis_framework.col_analysis import ColAnalysis, SDType
9-
from ..serialization_utils import pd_to_obj, sd_to_parquet_b64
9+
from ..serialization_utils import pd_to_obj, sd_to_parquet_b64, project_sd
1010
from buckaroo.pluggable_analysis_framework.utils import (filter_analysis)
1111
from buckaroo.pluggable_analysis_framework.df_stats_v2 import DfStatsV2
1212
from .autocleaning import SentinelAutocleaning
@@ -17,7 +17,7 @@
1717
OverrideColumnConfig,
1818
PinnedRowConfig,
1919
merge_sd_overrides,
20-
merge_sds, merge_column_config, StylingAnalysis)
20+
merge_sds, merge_column_config, StylingAnalysis, wire_stat_keys)
2121

2222

2323
from .abc_dataflow import ABCDataflow
@@ -648,11 +648,16 @@ def _get_summary_sd(self, processed_df:pd.DataFrame) -> Tuple[SDType, TDict[str,
648648
# ### end summary stats block
649649

650650
def _sd_to_jsondf(self, sd:SDType):
651-
"""Serialize summary stats dict. Returns parquet-b64 tagged dict by default.
651+
"""Serialize summary stats to the wire payload (parquet-b64 tagged dict).
652652
653-
Exists so this can be overridden for polars.
653+
Projects ``sd`` down to just the stats the frontend reads — the
654+
pinned-row values + histogram bins (see ``wire_stat_keys`` / #880) —
655+
before serializing. The full ``sd`` stays on the dataflow's
656+
``merged_sd`` for styling regeneration and server-side use; only the
657+
wire copy shrinks.
654658
"""
655-
return sd_to_parquet_b64(sd)
659+
keep = wire_stat_keys(self.df_display_klasses.values(), self.pinned_rows)
660+
return sd_to_parquet_b64(project_sd(sd, keep))
656661

657662
def _df_to_obj(self, df:pd.DataFrame) -> TDict[str, TAny]:
658663
return pd_to_obj(self.sampling_klass.serialize_sample(df))

buckaroo/dataflow/styling_core.py

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,47 @@ def style_columns(cls, sd:SDType, df:pd.DataFrame) -> List[ColumnConfig]:
471471
continue
472472
ret_col_config.append(base_style)
473473
return ret_col_config
474-
475474

476-
475+
476+
# Stat keys the JS color-map rule reads per column straight off the wire
477+
# payload (``histogram_bins`` / ``histogram_log_bins`` in gridUtils.ts),
478+
# independent of any pinned row.
479+
HISTOGRAM_BIN_WIRE_KEYS = frozenset({'histogram_bins', 'histogram_log_bins'})
480+
481+
482+
def _pinned_row_stat_keys(pinned_rows: Any) -> set:
483+
"""Stat keys referenced by a list of ``PinnedRowConfig`` entries.
484+
485+
A leading ``?`` marks an optional/scoped row whose data is keyed by the
486+
unprefixed name (mirrors ``stripOptionalPinnedKey`` in gridUtils.ts).
487+
"""
488+
keys = set()
489+
for pr in pinned_rows or []:
490+
pkey = pr.get('primary_key_val')
491+
if not pkey:
492+
continue
493+
keys.add(pkey[1:] if pkey.startswith('?') else pkey)
494+
return keys
495+
496+
497+
def wire_stat_keys(styling_classes: Iterable[Any], extra_pinned_rows: Any = ()) -> set:
498+
"""Stat keys the frontend reads from the ``all_stats`` wire payload.
499+
500+
The frontend reads exactly two things out of the summary-stats payload:
501+
the histogram-bin arrays the color-map rule bins against
502+
(``HISTOGRAM_BIN_WIRE_KEYS``), and the per-column pinned-row values it
503+
looks up by ``primary_key_val``. Everything else in ``merged_sd``
504+
(``value_counts``, ``histogram_args``, ``memory_usage``, the ``is_*``
505+
typing flags, the heuristic ``*_frac`` cleaning stats, ...) is shipped
506+
today but never read. This is the allowlist used to trim the wire copy
507+
(see ``project_sd`` / #880).
508+
509+
``styling_classes`` are the active ``StylingAnalysis`` subclasses (the
510+
dataflow's ``df_display_klasses`` values); ``extra_pinned_rows`` carries
511+
any runtime ``pinned_rows`` override set on the dataflow.
512+
"""
513+
keys = set(HISTOGRAM_BIN_WIRE_KEYS)
514+
for kls in styling_classes:
515+
keys |= _pinned_row_stat_keys(getattr(kls, 'pinned_rows', None))
516+
keys |= _pinned_row_stat_keys(extra_pinned_rows)
517+
return keys

buckaroo/polars_buckaroo.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from .pluggable_analysis_framework.df_stats_v2 import PlDfStatsV2
1010
from .pluggable_analysis_framework.polars_analysis_management import PlDfStats
1111
from .customizations.pl_stats_v2 import PL_ANALYSIS_V2
12-
from .serialization_utils import pd_to_obj, sd_to_parquet_b64
12+
from .serialization_utils import pd_to_obj
1313
from .customizations.styling import DefaultSummaryStatsStyling, DefaultMainStyling
1414
from .customizations.pl_autocleaning_conf import NoCleaningConfPl
1515
from .dataflow.dataflow import Sampling
@@ -69,9 +69,8 @@ class PolarsBuckarooWidget(BuckarooWidget):
6969
DFStatsClass = PlDfStatsV2
7070
sampling_klass = PLSampling
7171

72-
def _sd_to_jsondf(self, sd):
73-
"""Serialize summary stats dict as parquet-b64."""
74-
return sd_to_parquet_b64(sd)
72+
# _sd_to_jsondf is inherited from BuckarooWidgetBase, which delegates to
73+
# the dataflow so the wire-stat projection (#880) lives in one place.
7574

7675
def _build_error_dataframe(self, e):
7776
return pl.DataFrame({'err': [str(e)]})

buckaroo/serialization_utils.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,28 @@ def _stat_value_to_pa_array(val):
394394
return pa.array([_json_encode_cell(val)], type=pa.string())
395395

396396

397+
def project_sd(sd: Dict[str, Any], keep_keys: Any) -> Dict[str, Any]:
    """Project a summary-stats dict down to ``keep_keys`` per column.

    ``sd`` is ``{short_col: {stat_name: value}}``. Each column's inner stat
    dict is filtered to the stats named in ``keep_keys``; non-dict column
    values (defensive — shouldn't occur) pass through untouched.

    Used to trim the ``all_stats`` wire payload to just the stats the frontend
    reads before ``sd_to_parquet_b64`` (see ``wire_stat_keys`` / #880). The
    caller keeps the full ``sd``; only the projected copy shrinks.

    Args:
        sd: summary stats keyed by column name.
        keep_keys: the stat names to retain. Any iterable of hashable keys
            is accepted (set, list, tuple, generator, ...); a bare string is
            treated as a single stat name rather than as its characters.

    Returns:
        A new outer dict with a new inner dict per column. ``sd`` and its
        inner dicts are not mutated; the stat values themselves are shared
        with the input, not copied.
    """
    # Materialize once: a one-shot iterator would otherwise be exhausted by
    # the first column's membership tests, and a list would cost O(len) per
    # stat instead of O(1).
    if isinstance(keep_keys, str):
        keep = frozenset((keep_keys,))
    else:
        keep = frozenset(keep_keys)

    projected: Dict[str, Any] = {}
    for col, stats in sd.items():
        if isinstance(stats, dict):
            projected[col] = {k: v for k, v in stats.items() if k in keep}
        else:
            projected[col] = stats
    return projected
417+
418+
397419
def sd_to_parquet_b64(sd: Dict[str, Any]) -> Dict[str, str]:
398420
"""Convert a summary stats dict to a tagged parquet-b64 payload.
399421

tests/unit/polars_basic_widget_test.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from buckaroo.polars_buckaroo import PolarsBuckarooWidget, PolarsBuckarooInfiniteWidget, to_parquet
1212
from buckaroo.pluggable_analysis_framework.polars_analysis_management import PlDfStats
1313
from buckaroo.dataflow.dataflow import StylingAnalysis
14+
from buckaroo.styling_helpers import inherit_
1415
from buckaroo.serialization_utils import resolve_summary_stats_payload as _resolve_all_stats
1516
from buckaroo.jlisp.lisp_utils import (s, sQ)
1617

@@ -36,6 +37,15 @@ class SelectOnlyAnalysis(PolarsAnalysis):
3637
F.all().mean().name.map(json_postfix('mean')),
3738
F.all().quantile(.99).name.map(json_postfix('quin99'))]
3839

40+
41+
# Pin the stats SelectOnlyAnalysis produces so they survive the #880 wire
42+
# projection (which trims all_stats to the displayed/pinned keys).
43+
SELECT_ONLY_PINNED = [inherit_('null_count'), inherit_('mean'), inherit_('quin99')]
44+
45+
46+
class SelectOnlyStyling(StylingAnalysis):
47+
pinned_rows = SELECT_ONLY_PINNED
48+
3949
test_df = pl.DataFrame({'normal_int_series' : pl.Series([1,2,3,4])})
4050

4151

@@ -56,19 +66,19 @@ def test_polars_all_stats():
5666
#dsdf = replace_in_dict(sdf, [(np.nan, None)])
5767
class SimplePolarsBuckaroo(PolarsBuckarooWidget):
5868
DFStatsClass = PlDfStats # v1 PolarsAnalysis classes need PlDfStats
59-
analysis_klasses= [SelectOnlyAnalysis, StylingAnalysis]
69+
analysis_klasses= [SelectOnlyAnalysis, SelectOnlyStyling]
6070

6171
spbw = SimplePolarsBuckaroo(test_df)
6272
assert spbw.dataflow.merged_sd == expected
6373

6474
resolved_stats = _resolve_all_stats(spbw.df_data_dict['all_stats'])
65-
assert resolved_stats == [
66-
{'index': 'orig_col_name', 'a': 'normal_int_series', 'level_0':'orig_col_name'},
67-
{'index': 'rewritten_col_name', 'a': 'a', 'level_0':'rewritten_col_name'},
68-
{'index': 'null_count', 'a': 0.0, 'level_0':'null_count'},
69-
{'index': 'mean', 'a': 2.5, 'level_0':'mean'},
70-
{'index': 'quin99', 'a': 4.0, 'level_0':'quin99'}]
71-
assert spbw.df_display_args['main']['df_viewer_config'] == EXPECTED_DF_VIEWER_CONFIG
75+
# #880: the wire payload is projected to the displayed (pinned) stats.
76+
# orig_col_name / rewritten_col_name stay on merged_sd (asserted above)
77+
# but aren't pinned, so they no longer ship to the frontend.
78+
by_index = {row['index']: row['a'] for row in resolved_stats}
79+
assert by_index == {'null_count': 0.0, 'mean': 2.5, 'quin99': 4.0}
80+
assert spbw.df_display_args['main']['df_viewer_config'] == dict(
81+
EXPECTED_DF_VIEWER_CONFIG, pinned_rows=SELECT_ONLY_PINNED)
7282

7383
def test_polars_boolean():
7484
bool_df = pl.DataFrame({'bools':[True, True, False, False, True, None]})

tests/unit/test_sd_to_parquet_b64.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
import numpy as np
1414
import pyarrow.parquet as pq
1515

16-
from buckaroo.serialization_utils import sd_to_parquet_b64
16+
from buckaroo.serialization_utils import sd_to_parquet_b64, project_sd
17+
from buckaroo.dataflow.styling_core import wire_stat_keys
1718

1819

1920
def _decode(result):
@@ -206,6 +207,31 @@ def test_uint64_max_round_trips_in_wide_layout():
206207
assert table.to_pydict()['a__max'] == [2**63 + 7]
207208

208209

210+
def test_project_sd_keeps_only_requested_keys():
    """``project_sd`` trims every column's stats to ``keep_keys`` without
    mutating the dict it was given (see #880)."""
    col_a = {'mean': 1.0, 'value_counts': [1, 2, 3], 'dtype': 'int64'}
    col_b = {'mean': 2.0, 'memory_usage': 999, 'dtype': 'float64'}
    sd = {'a': col_a, 'b': col_b}

    projected = project_sd(sd, {'mean', 'dtype'})

    expected = {
        'a': {'mean': 1.0, 'dtype': 'int64'},
        'b': {'mean': 2.0, 'dtype': 'float64'},
    }
    assert projected == expected
    # The source dict still carries the stat that was dropped from the copy.
    assert 'value_counts' in sd['a']
219+
220+
221+
def test_wire_stat_keys_unions_pinned_rows_and_histogram_bins():
    """``wire_stat_keys`` returns the histogram-bin keys together with each
    pinned row's ``primary_key_val`` (leading ``?`` removed), gathered from
    the styling classes and from the runtime pinned-rows override."""
    class _Styling:
        pinned_rows = [
            {'primary_key_val': 'dtype'},
            {'primary_key_val': 'mean'},
            {'primary_key_val': '?filtered_histogram'},
        ]

    runtime_override = [{'primary_key_val': 'std'}]
    expected = {
        'histogram_bins', 'histogram_log_bins',
        'dtype', 'mean', 'filtered_histogram', 'std',
    }

    keys = wire_stat_keys([_Styling], extra_pinned_rows=runtime_override)

    assert keys == expected
233+
234+
209235
def test_negative_int_beyond_int64_falls_back_to_json_string():
210236
"""Ints outside both int64 and uint64 range fall back to JSON-encoded string.
211237

0 commit comments

Comments
 (0)