diff --git a/buckaroo/buckaroo_widget.py b/buckaroo/buckaroo_widget.py index f7431bc2c..adc6e7cc9 100644 --- a/buckaroo/buckaroo_widget.py +++ b/buckaroo/buckaroo_widget.py @@ -29,7 +29,7 @@ from .pluggable_analysis_framework.col_analysis import ColAnalysis from buckaroo.extension_utils import copy_extend -from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df, pd_to_obj, to_parquet, sd_to_parquet_b64 +from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df, pd_to_obj, to_parquet from .dataflow.dataflow import CustomizableDataflow from .dataflow.dataflow_extras import (Sampling, exception_protect) from .dataflow.styling_core import (ComponentConfig, DFViewerConfig, DisplayArgs, OverrideColumnConfig, PinnedRowConfig, StylingAnalysis, merge_column_config, EMPTY_DFVIEWER_CONFIG) @@ -261,11 +261,10 @@ def post_process_df(kls, df): self.buckaroo_state = temp_buckaroo_state def _sd_to_jsondf(self, sd): - """Serialize summary stats dict. Returns parquet-b64 tagged dict by default. - - Exists so this can be overridden for polars. - """ - return sd_to_parquet_b64(sd) + """Serialize summary stats. Delegates to the dataflow so the wire + projection (see #880) lives in exactly one place — used by the + infinite-widget path, which assembles ``all_stats`` on the widget.""" + return self.dataflow._sd_to_jsondf(sd) diff --git a/buckaroo/dataflow/dataflow.py b/buckaroo/dataflow/dataflow.py index 1f1f323c2..3d8b33047 100644 --- a/buckaroo/dataflow/dataflow.py +++ b/buckaroo/dataflow/dataflow.py @@ -6,7 +6,7 @@ from traitlets import Unicode, Any, observe, Dict from buckaroo.pluggable_analysis_framework.col_analysis import ColAnalysis, SDType -from ..serialization_utils import pd_to_obj, sd_to_parquet_b64 +from ..serialization_utils import pd_to_obj, sd_to_parquet_b64, project_sd from buckaroo.pluggable_analysis_framework.utils import (filter_analysis) from buckaroo.pluggable_analysis_framework.df_stats_v2 import DfStatsV2 from .autocleaning import SentinelAutocleaning @@ -17,7 +17,7 @@ OverrideColumnConfig, PinnedRowConfig, merge_sd_overrides, - merge_sds, merge_column_config, StylingAnalysis) + merge_sds, merge_column_config, StylingAnalysis, wire_stat_keys) from .abc_dataflow import ABCDataflow @@ -648,11 +648,16 @@ def _get_summary_sd(self, processed_df:pd.DataFrame) -> Tuple[SDType, TDict[str, # ### end summary stats block def _sd_to_jsondf(self, sd:SDType): - """Serialize summary stats dict. Returns parquet-b64 tagged dict by default. + """Serialize summary stats to the wire payload (parquet-b64 tagged dict). - Exists so this can be overridden for polars. + Projects ``sd`` down to just the stats the frontend reads — the + pinned-row values + histogram bins (see ``wire_stat_keys`` / #880) — + before serializing. The full ``sd`` stays on the dataflow's + ``merged_sd`` for styling regeneration and server-side use; only the + wire copy shrinks. """ - return sd_to_parquet_b64(sd) + keep = wire_stat_keys(self.df_display_klasses.values(), self.pinned_rows) + return sd_to_parquet_b64(project_sd(sd, keep)) def _df_to_obj(self, df:pd.DataFrame) -> TDict[str, TAny]: return pd_to_obj(self.sampling_klass.serialize_sample(df)) diff --git a/buckaroo/dataflow/styling_core.py b/buckaroo/dataflow/styling_core.py index d2496f4fe..36ef8acd7 100644 --- a/buckaroo/dataflow/styling_core.py +++ b/buckaroo/dataflow/styling_core.py @@ -472,6 +472,47 @@ def style_columns(cls, sd:SDType, df:pd.DataFrame) -> List[ColumnConfig]: continue ret_col_config.append(base_style) return ret_col_config - - + +# Stat keys the JS color-map rule reads per column straight off the wire +# payload (``histogram_bins`` / ``histogram_log_bins`` in gridUtils.ts), +# independent of any pinned row. +HISTOGRAM_BIN_WIRE_KEYS = frozenset({'histogram_bins', 'histogram_log_bins'}) + + +def _pinned_row_stat_keys(pinned_rows: Any) -> set: + """Stat keys referenced by a list of ``PinnedRowConfig`` entries. + + A leading ``?`` marks an optional/scoped row whose data is keyed by the + unprefixed name (mirrors ``stripOptionalPinnedKey`` in gridUtils.ts). + """ + keys = set() + for pr in pinned_rows or []: + pkey = pr.get('primary_key_val') + if not pkey: + continue + keys.add(pkey[1:] if pkey.startswith('?') else pkey) + return keys + + +def wire_stat_keys(styling_classes: Iterable[Any], extra_pinned_rows: Any = ()) -> set: + """Stat keys the frontend reads from the ``all_stats`` wire payload. + + The frontend reads exactly two things out of the summary-stats payload: + the histogram-bin arrays the color-map rule bins against + (``HISTOGRAM_BIN_WIRE_KEYS``), and the per-column pinned-row values it + looks up by ``primary_key_val``. Everything else in ``merged_sd`` + (``value_counts``, ``histogram_args``, ``memory_usage``, the ``is_*`` + typing flags, the heuristic ``*_frac`` cleaning stats, ...) is shipped + today but never read. This is the allowlist used to trim the wire copy + (see ``project_sd`` / #880). + + ``styling_classes`` are the active ``StylingAnalysis`` subclasses (the + dataflow's ``df_display_klasses`` values); ``extra_pinned_rows`` carries + any runtime ``pinned_rows`` override set on the dataflow. + """ + keys = set(HISTOGRAM_BIN_WIRE_KEYS) + for kls in styling_classes: + keys |= _pinned_row_stat_keys(getattr(kls, 'pinned_rows', None)) + keys |= _pinned_row_stat_keys(extra_pinned_rows) + return keys diff --git a/buckaroo/polars_buckaroo.py b/buckaroo/polars_buckaroo.py index a4d35bb69..595540b0b 100644 --- a/buckaroo/polars_buckaroo.py +++ b/buckaroo/polars_buckaroo.py @@ -9,7 +9,7 @@ from .pluggable_analysis_framework.df_stats_v2 import PlDfStatsV2 from .pluggable_analysis_framework.polars_analysis_management import PlDfStats from .customizations.pl_stats_v2 import PL_ANALYSIS_V2 -from .serialization_utils import pd_to_obj, sd_to_parquet_b64 +from .serialization_utils import pd_to_obj from .customizations.styling import DefaultSummaryStatsStyling, DefaultMainStyling from .customizations.pl_autocleaning_conf import NoCleaningConfPl from .dataflow.dataflow import Sampling @@ -69,9 +69,8 @@ class PolarsBuckarooWidget(BuckarooWidget): DFStatsClass = PlDfStatsV2 sampling_klass = PLSampling - def _sd_to_jsondf(self, sd): - """Serialize summary stats dict as parquet-b64.""" - return sd_to_parquet_b64(sd) + # _sd_to_jsondf is inherited from BuckarooWidgetBase, which delegates to + # the dataflow so the wire-stat projection (#880) lives in one place. def _build_error_dataframe(self, e): return pl.DataFrame({'err': [str(e)]}) diff --git a/buckaroo/serialization_utils.py b/buckaroo/serialization_utils.py index f3fb6ed7a..0cc5eb442 100644 --- a/buckaroo/serialization_utils.py +++ b/buckaroo/serialization_utils.py @@ -394,6 +394,28 @@ def _stat_value_to_pa_array(val): return pa.array([_json_encode_cell(val)], type=pa.string()) +def project_sd(sd: Dict[str, Any], keep_keys: Any) -> Dict[str, Any]: + """Project a summary-stats dict down to ``keep_keys`` per column. + + ``sd`` is ``{short_col: {stat_name: value}}``. Each column's inner stat + dict is filtered to the stats in ``keep_keys``; non-dict column values + (defensive — shouldn't occur) pass through untouched. The input is not + mutated. + + Used to trim the ``all_stats`` wire payload to just the stats the frontend + reads before ``sd_to_parquet_b64`` (see ``wire_stat_keys`` / #880). The + full ``sd`` stays on the dataflow's ``merged_sd`` for styling regeneration + and server-side use; only the wire copy shrinks. + """ + projected: Dict[str, Any] = {} + for col, stats in sd.items(): + if isinstance(stats, dict): + projected[col] = {k: v for k, v in stats.items() if k in keep_keys} + else: + projected[col] = stats + return projected + + def sd_to_parquet_b64(sd: Dict[str, Any]) -> Dict[str, str]: """Convert a summary stats dict to a tagged parquet-b64 payload. diff --git a/tests/unit/basic_widget_test.py b/tests/unit/basic_widget_test.py index f273e2697..b1fc77c8d 100644 --- a/tests/unit/basic_widget_test.py +++ b/tests/unit/basic_widget_test.py @@ -126,6 +126,43 @@ def test_string_column_handling(): assert ten_col['tooltip_config'] == {'tooltip_type': 'simple', 'val_column': 'a'} +def _wire_stat_names(all_stats_payload): + """Stat names present in a decoded ``all_stats`` parquet-b64 payload.""" + import base64 + import io + import pyarrow.parquet as pq + assert all_stats_payload['format'] == 'parquet_b64' + raw = base64.b64decode(all_stats_payload['data']) + tbl = pq.read_table(io.BytesIO(raw)) + return {n.split('__', 1)[1] for n in tbl.schema.names if '__' in n} + + +def test_all_stats_wire_payload_trimmed_to_displayed_keys(): + """The ``all_stats`` wire payload carries only the stats the frontend + reads — the pinned-row values it looks up by ``primary_key_val`` and the + histogram bins the color-map rule reads. Dead weight (``value_counts``, + ``histogram_args``, ``memory_usage``, the ``is_*`` typing flags) stays on + the dataflow's ``merged_sd`` for styling but never ships. See #880.""" + df = pd.DataFrame({'int_col': [1, 2, 3, 2, 1], 'flt': [1.5, 2.5, 3.5, 1.0, 2.0]}) + w = BuckarooWidget(df) + + merged_sd = w.dataflow.widget_args_tuple[2] + # The full sd on the dataflow keeps everything for styling/regeneration. + assert 'value_counts' in merged_sd['a'] + assert 'histogram_args' in merged_sd['a'] + + wire_stats = _wire_stat_names(w.df_data_dict['all_stats']) + # Dead weight is gone from the wire ... + assert 'value_counts' not in wire_stats + assert 'histogram_args' not in wire_stats + assert 'memory_usage' not in wire_stats + assert 'is_numeric' not in wire_stats + # ... but everything the frontend reads survives. + for needed in ('dtype', 'mean', 'std', 'min', 'max', 'median', + 'histogram', 'histogram_bins', + 'non_null_count', 'most_freq'): + assert needed in wire_stats, needed + def test_non_unique_column_names(): #you end up with columns named [0,1,2, 0,1,2] diff --git a/tests/unit/polars_basic_widget_test.py b/tests/unit/polars_basic_widget_test.py index 730a3c27f..c3a5098a4 100644 --- a/tests/unit/polars_basic_widget_test.py +++ b/tests/unit/polars_basic_widget_test.py @@ -11,6 +11,7 @@ from buckaroo.polars_buckaroo import PolarsBuckarooWidget, PolarsBuckarooInfiniteWidget, to_parquet from buckaroo.pluggable_analysis_framework.polars_analysis_management import PlDfStats from buckaroo.dataflow.dataflow import StylingAnalysis +from buckaroo.styling_helpers import inherit_ from buckaroo.serialization_utils import resolve_summary_stats_payload as _resolve_all_stats from buckaroo.jlisp.lisp_utils import (s, sQ) @@ -36,6 +37,15 @@ class SelectOnlyAnalysis(PolarsAnalysis): F.all().mean().name.map(json_postfix('mean')), F.all().quantile(.99).name.map(json_postfix('quin99'))] + +# Pin the stats SelectOnlyAnalysis produces so they survive the #880 wire +# projection (which trims all_stats to the displayed/pinned keys). +SELECT_ONLY_PINNED = [inherit_('null_count'), inherit_('mean'), inherit_('quin99')] + + +class SelectOnlyStyling(StylingAnalysis): + pinned_rows = SELECT_ONLY_PINNED + test_df = pl.DataFrame({'normal_int_series' : pl.Series([1,2,3,4])}) @@ -56,19 +66,19 @@ def test_polars_all_stats(): #dsdf = replace_in_dict(sdf, [(np.nan, None)]) class SimplePolarsBuckaroo(PolarsBuckarooWidget): DFStatsClass = PlDfStats # v1 PolarsAnalysis classes need PlDfStats - analysis_klasses= [SelectOnlyAnalysis, StylingAnalysis] + analysis_klasses= [SelectOnlyAnalysis, SelectOnlyStyling] spbw = SimplePolarsBuckaroo(test_df) assert spbw.dataflow.merged_sd == expected resolved_stats = _resolve_all_stats(spbw.df_data_dict['all_stats']) - assert resolved_stats == [ - {'index': 'orig_col_name', 'a': 'normal_int_series', 'level_0':'orig_col_name'}, - {'index': 'rewritten_col_name', 'a': 'a', 'level_0':'rewritten_col_name'}, - {'index': 'null_count', 'a': 0.0, 'level_0':'null_count'}, - {'index': 'mean', 'a': 2.5, 'level_0':'mean'}, - {'index': 'quin99', 'a': 4.0, 'level_0':'quin99'}] - assert spbw.df_display_args['main']['df_viewer_config'] == EXPECTED_DF_VIEWER_CONFIG + # #880: the wire payload is projected to the displayed (pinned) stats. + # orig_col_name / rewritten_col_name stay on merged_sd (asserted above) + # but aren't pinned, so they no longer ship to the frontend. + by_index = {row['index']: row['a'] for row in resolved_stats} + assert by_index == {'null_count': 0.0, 'mean': 2.5, 'quin99': 4.0} + assert spbw.df_display_args['main']['df_viewer_config'] == dict( + EXPECTED_DF_VIEWER_CONFIG, pinned_rows=SELECT_ONLY_PINNED) def test_polars_boolean(): bool_df = pl.DataFrame({'bools':[True, True, False, False, True, None]}) diff --git a/tests/unit/test_sd_to_parquet_b64.py b/tests/unit/test_sd_to_parquet_b64.py index 320206d1a..b27af5936 100644 --- a/tests/unit/test_sd_to_parquet_b64.py +++ b/tests/unit/test_sd_to_parquet_b64.py @@ -13,7 +13,8 @@ import numpy as np import pyarrow.parquet as pq -from buckaroo.serialization_utils import sd_to_parquet_b64 +from buckaroo.serialization_utils import sd_to_parquet_b64, project_sd +from buckaroo.dataflow.styling_core import wire_stat_keys def _decode(result): @@ -206,6 +207,31 @@ def test_uint64_max_round_trips_in_wide_layout(): assert table.to_pydict()['a__max'] == [2**63 + 7] +def test_project_sd_keeps_only_requested_keys(): + """``project_sd`` filters each column's inner dict to ``keep_keys`` and + leaves the input untouched (see #880).""" + sd = {'a': {'mean': 1.0, 'value_counts': [1, 2, 3], 'dtype': 'int64'}, + 'b': {'mean': 2.0, 'memory_usage': 999, 'dtype': 'float64'}} + projected = project_sd(sd, {'mean', 'dtype'}) + assert projected == {'a': {'mean': 1.0, 'dtype': 'int64'}, 'b': {'mean': 2.0, 'dtype': 'float64'}} + # input is not mutated + assert 'value_counts' in sd['a'] + + +def test_wire_stat_keys_unions_pinned_rows_and_histogram_bins(): + """``wire_stat_keys`` is the histogram-bin keys plus every pinned-row + ``primary_key_val`` (``?`` scope prefix stripped) across the active + styling classes and any runtime pinned-rows override.""" + class _Styling: + pinned_rows = [{'primary_key_val': 'dtype'}, + {'primary_key_val': 'mean'}, + {'primary_key_val': '?filtered_histogram'}] + + keys = wire_stat_keys([_Styling], extra_pinned_rows=[{'primary_key_val': 'std'}]) + assert keys == {'histogram_bins', 'histogram_log_bins', + 'dtype', 'mean', 'filtered_histogram', 'std'} + + def test_negative_int_beyond_int64_falls_back_to_json_string(): """Ints outside both int64 and uint64 range fall back to JSON-encoded string.