Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions buckaroo/buckaroo_widget.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from .pluggable_analysis_framework.col_analysis import ColAnalysis
from buckaroo.extension_utils import copy_extend

from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df, pd_to_obj, to_parquet, sd_to_parquet_b64
from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df, pd_to_obj, to_parquet
from .dataflow.dataflow import CustomizableDataflow
from .dataflow.dataflow_extras import (Sampling, exception_protect)
from .dataflow.styling_core import (ComponentConfig, DFViewerConfig, DisplayArgs, OverrideColumnConfig, PinnedRowConfig, StylingAnalysis, merge_column_config, EMPTY_DFVIEWER_CONFIG)
Expand Down Expand Up @@ -261,11 +261,10 @@ def post_process_df(kls, df):
self.buckaroo_state = temp_buckaroo_state

def _sd_to_jsondf(self, sd):
"""Serialize summary stats dict. Returns parquet-b64 tagged dict by default.

Exists so this can be overridden for polars.
"""
return sd_to_parquet_b64(sd)
"""Serialize summary stats. Delegates to the dataflow so the wire
projection (see #880) lives in exactly one place — used by the
infinite-widget path, which assembles ``all_stats`` on the widget."""
return self.dataflow._sd_to_jsondf(sd)



Expand Down
15 changes: 10 additions & 5 deletions buckaroo/dataflow/dataflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from traitlets import Unicode, Any, observe, Dict

from buckaroo.pluggable_analysis_framework.col_analysis import ColAnalysis, SDType
from ..serialization_utils import pd_to_obj, sd_to_parquet_b64
from ..serialization_utils import pd_to_obj, sd_to_parquet_b64, project_sd
from buckaroo.pluggable_analysis_framework.utils import (filter_analysis)
from buckaroo.pluggable_analysis_framework.df_stats_v2 import DfStatsV2
from .autocleaning import SentinelAutocleaning
Expand All @@ -17,7 +17,7 @@
OverrideColumnConfig,
PinnedRowConfig,
merge_sd_overrides,
merge_sds, merge_column_config, StylingAnalysis)
merge_sds, merge_column_config, StylingAnalysis, wire_stat_keys)


from .abc_dataflow import ABCDataflow
Expand Down Expand Up @@ -648,11 +648,16 @@ def _get_summary_sd(self, processed_df:pd.DataFrame) -> Tuple[SDType, TDict[str,
# ### end summary stats block

def _sd_to_jsondf(self, sd:SDType):
"""Serialize summary stats dict. Returns parquet-b64 tagged dict by default.
"""Serialize summary stats to the wire payload (parquet-b64 tagged dict).

Exists so this can be overridden for polars.
Projects ``sd`` down to just the stats the frontend reads — the
pinned-row values + histogram bins (see ``wire_stat_keys`` / #880) —
before serializing. The full ``sd`` stays on the dataflow's
``merged_sd`` for styling regeneration and server-side use; only the
wire copy shrinks.
"""
return sd_to_parquet_b64(sd)
keep = wire_stat_keys(self.df_display_klasses.values(), self.pinned_rows)
return sd_to_parquet_b64(project_sd(sd, keep))

def _df_to_obj(self, df:pd.DataFrame) -> TDict[str, TAny]:
return pd_to_obj(self.sampling_klass.serialize_sample(df))
Expand Down
45 changes: 43 additions & 2 deletions buckaroo/dataflow/styling_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,47 @@ def style_columns(cls, sd:SDType, df:pd.DataFrame) -> List[ColumnConfig]:
continue
ret_col_config.append(base_style)
return ret_col_config




# Stat keys the JS color-map rule reads per column straight off the wire
# payload (``histogram_bins`` / ``histogram_log_bins`` in gridUtils.ts),
# independent of any pinned row.
HISTOGRAM_BIN_WIRE_KEYS = frozenset({'histogram_bins', 'histogram_log_bins'})


def _pinned_row_stat_keys(pinned_rows: Any) -> set:
"""Stat keys referenced by a list of ``PinnedRowConfig`` entries.

A leading ``?`` marks an optional/scoped row whose data is keyed by the
unprefixed name (mirrors ``stripOptionalPinnedKey`` in gridUtils.ts).
"""
keys = set()
for pr in pinned_rows or []:
pkey = pr.get('primary_key_val')
if not pkey:
continue
keys.add(pkey[1:] if pkey.startswith('?') else pkey)
return keys


def wire_stat_keys(styling_classes: Iterable[Any], extra_pinned_rows: Any = ()) -> set:
"""Stat keys the frontend reads from the ``all_stats`` wire payload.

The frontend reads exactly two things out of the summary-stats payload:
the histogram-bin arrays the color-map rule bins against
(``HISTOGRAM_BIN_WIRE_KEYS``), and the per-column pinned-row values it
looks up by ``primary_key_val``. Everything else in ``merged_sd``
(``value_counts``, ``histogram_args``, ``memory_usage``, the ``is_*``
typing flags, the heuristic ``*_frac`` cleaning stats, ...) is shipped
today but never read. This is the allowlist used to trim the wire copy
(see ``project_sd`` / #880).

``styling_classes`` are the active ``StylingAnalysis`` subclasses (the
dataflow's ``df_display_klasses`` values); ``extra_pinned_rows`` carries
any runtime ``pinned_rows`` override set on the dataflow.
"""
keys = set(HISTOGRAM_BIN_WIRE_KEYS)
for kls in styling_classes:
keys |= _pinned_row_stat_keys(getattr(kls, 'pinned_rows', None))
keys |= _pinned_row_stat_keys(extra_pinned_rows)
return keys
7 changes: 3 additions & 4 deletions buckaroo/polars_buckaroo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from .pluggable_analysis_framework.df_stats_v2 import PlDfStatsV2
from .pluggable_analysis_framework.polars_analysis_management import PlDfStats
from .customizations.pl_stats_v2 import PL_ANALYSIS_V2
from .serialization_utils import pd_to_obj, sd_to_parquet_b64
from .serialization_utils import pd_to_obj
from .customizations.styling import DefaultSummaryStatsStyling, DefaultMainStyling
from .customizations.pl_autocleaning_conf import NoCleaningConfPl
from .dataflow.dataflow import Sampling
Expand Down Expand Up @@ -69,9 +69,8 @@ class PolarsBuckarooWidget(BuckarooWidget):
DFStatsClass = PlDfStatsV2
sampling_klass = PLSampling

def _sd_to_jsondf(self, sd):
"""Serialize summary stats dict as parquet-b64."""
return sd_to_parquet_b64(sd)
# _sd_to_jsondf is inherited from BuckarooWidgetBase, which delegates to
# the dataflow so the wire-stat projection (#880) lives in one place.

def _build_error_dataframe(self, e):
return pl.DataFrame({'err': [str(e)]})
Expand Down
22 changes: 22 additions & 0 deletions buckaroo/serialization_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,28 @@ def _stat_value_to_pa_array(val):
return pa.array([_json_encode_cell(val)], type=pa.string())


def project_sd(sd: Dict[str, Any], keep_keys: Any) -> Dict[str, Any]:
"""Project a summary-stats dict down to ``keep_keys`` per column.

``sd`` is ``{short_col: {stat_name: value}}``. Each column's inner stat
dict is filtered to the stats in ``keep_keys``; non-dict column values
(defensive — shouldn't occur) pass through untouched. The input is not
mutated.

Used to trim the ``all_stats`` wire payload to just the stats the frontend
reads before ``sd_to_parquet_b64`` (see ``wire_stat_keys`` / #880). The
full ``sd`` stays on the dataflow's ``merged_sd`` for styling regeneration
and server-side use; only the wire copy shrinks.
"""
projected: Dict[str, Any] = {}
for col, stats in sd.items():
if isinstance(stats, dict):
projected[col] = {k: v for k, v in stats.items() if k in keep_keys}
else:
projected[col] = stats
return projected


def sd_to_parquet_b64(sd: Dict[str, Any]) -> Dict[str, str]:
"""Convert a summary stats dict to a tagged parquet-b64 payload.

Expand Down
37 changes: 37 additions & 0 deletions tests/unit/basic_widget_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,43 @@ def test_string_column_handling():
assert ten_col['tooltip_config'] == {'tooltip_type': 'simple', 'val_column': 'a'}


def _wire_stat_names(all_stats_payload):
"""Stat names present in a decoded ``all_stats`` parquet-b64 payload."""
import base64
import io
import pyarrow.parquet as pq
assert all_stats_payload['format'] == 'parquet_b64'
raw = base64.b64decode(all_stats_payload['data'])
tbl = pq.read_table(io.BytesIO(raw))
return {n.split('__', 1)[1] for n in tbl.schema.names if '__' in n}


def test_all_stats_wire_payload_trimmed_to_displayed_keys():
"""The ``all_stats`` wire payload carries only the stats the frontend
reads — the pinned-row values it looks up by ``primary_key_val`` and the
histogram bins the color-map rule reads. Dead weight (``value_counts``,
``histogram_args``, ``memory_usage``, the ``is_*`` typing flags) stays on
the dataflow's ``merged_sd`` for styling but never ships. See #880."""
df = pd.DataFrame({'int_col': [1, 2, 3, 2, 1], 'flt': [1.5, 2.5, 3.5, 1.0, 2.0]})
w = BuckarooWidget(df)

merged_sd = w.dataflow.widget_args_tuple[2]
# The full sd on the dataflow keeps everything for styling/regeneration.
assert 'value_counts' in merged_sd['a']
assert 'histogram_args' in merged_sd['a']

wire_stats = _wire_stat_names(w.df_data_dict['all_stats'])
# Dead weight is gone from the wire ...
assert 'value_counts' not in wire_stats

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Include the implementation before asserting trimmed wire stats

In this commit the production serializers are unchanged: DataFlow._sd_to_jsondf, BuckarooWidgetBase._sd_to_jsondf, and PolarsBuckarooWidget._sd_to_jsondf still pass the complete sd directly to sd_to_parquet_b64(sd), which emits every {column}__{stat} in the schema. As a result, once the test environment has pandas/pyarrow installed, wire_stats will still include value_counts (and the other dead-weight keys) from merged_sd, so this new test fails in CI rather than documenting a shipped optimization.

Useful? React with 👍 / 👎.

assert 'histogram_args' not in wire_stats
assert 'memory_usage' not in wire_stats
assert 'is_numeric' not in wire_stats
# ... but everything the frontend reads survives.
for needed in ('dtype', 'mean', 'std', 'min', 'max', 'median',
'histogram', 'histogram_bins',
'non_null_count', 'most_freq'):
assert needed in wire_stats, needed


def test_non_unique_column_names():
#you end up with columns named [0,1,2, 0,1,2]
Expand Down
26 changes: 18 additions & 8 deletions tests/unit/polars_basic_widget_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from buckaroo.polars_buckaroo import PolarsBuckarooWidget, PolarsBuckarooInfiniteWidget, to_parquet
from buckaroo.pluggable_analysis_framework.polars_analysis_management import PlDfStats
from buckaroo.dataflow.dataflow import StylingAnalysis
from buckaroo.styling_helpers import inherit_
from buckaroo.serialization_utils import resolve_summary_stats_payload as _resolve_all_stats
from buckaroo.jlisp.lisp_utils import (s, sQ)

Expand All @@ -36,6 +37,15 @@ class SelectOnlyAnalysis(PolarsAnalysis):
F.all().mean().name.map(json_postfix('mean')),
F.all().quantile(.99).name.map(json_postfix('quin99'))]


# Pin the stats SelectOnlyAnalysis produces so they survive the #880 wire
# projection (which trims all_stats to the displayed/pinned keys).
SELECT_ONLY_PINNED = [inherit_('null_count'), inherit_('mean'), inherit_('quin99')]


class SelectOnlyStyling(StylingAnalysis):
pinned_rows = SELECT_ONLY_PINNED

test_df = pl.DataFrame({'normal_int_series' : pl.Series([1,2,3,4])})


Expand All @@ -56,19 +66,19 @@ def test_polars_all_stats():
#dsdf = replace_in_dict(sdf, [(np.nan, None)])
class SimplePolarsBuckaroo(PolarsBuckarooWidget):
DFStatsClass = PlDfStats # v1 PolarsAnalysis classes need PlDfStats
analysis_klasses= [SelectOnlyAnalysis, StylingAnalysis]
analysis_klasses= [SelectOnlyAnalysis, SelectOnlyStyling]

spbw = SimplePolarsBuckaroo(test_df)
assert spbw.dataflow.merged_sd == expected

resolved_stats = _resolve_all_stats(spbw.df_data_dict['all_stats'])
assert resolved_stats == [
{'index': 'orig_col_name', 'a': 'normal_int_series', 'level_0':'orig_col_name'},
{'index': 'rewritten_col_name', 'a': 'a', 'level_0':'rewritten_col_name'},
{'index': 'null_count', 'a': 0.0, 'level_0':'null_count'},
{'index': 'mean', 'a': 2.5, 'level_0':'mean'},
{'index': 'quin99', 'a': 4.0, 'level_0':'quin99'}]
assert spbw.df_display_args['main']['df_viewer_config'] == EXPECTED_DF_VIEWER_CONFIG
# #880: the wire payload is projected to the displayed (pinned) stats.
# orig_col_name / rewritten_col_name stay on merged_sd (asserted above)
# but aren't pinned, so they no longer ship to the frontend.
by_index = {row['index']: row['a'] for row in resolved_stats}
assert by_index == {'null_count': 0.0, 'mean': 2.5, 'quin99': 4.0}
assert spbw.df_display_args['main']['df_viewer_config'] == dict(
EXPECTED_DF_VIEWER_CONFIG, pinned_rows=SELECT_ONLY_PINNED)

def test_polars_boolean():
bool_df = pl.DataFrame({'bools':[True, True, False, False, True, None]})
Expand Down
28 changes: 27 additions & 1 deletion tests/unit/test_sd_to_parquet_b64.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
import numpy as np
import pyarrow.parquet as pq

from buckaroo.serialization_utils import sd_to_parquet_b64
from buckaroo.serialization_utils import sd_to_parquet_b64, project_sd
from buckaroo.dataflow.styling_core import wire_stat_keys


def _decode(result):
Expand Down Expand Up @@ -206,6 +207,31 @@ def test_uint64_max_round_trips_in_wide_layout():
assert table.to_pydict()['a__max'] == [2**63 + 7]


def test_project_sd_keeps_only_requested_keys():
"""``project_sd`` filters each column's inner dict to ``keep_keys`` and
leaves the input untouched (see #880)."""
sd = {'a': {'mean': 1.0, 'value_counts': [1, 2, 3], 'dtype': 'int64'},
'b': {'mean': 2.0, 'memory_usage': 999, 'dtype': 'float64'}}
projected = project_sd(sd, {'mean', 'dtype'})
assert projected == {'a': {'mean': 1.0, 'dtype': 'int64'}, 'b': {'mean': 2.0, 'dtype': 'float64'}}
# input is not mutated
assert 'value_counts' in sd['a']


def test_wire_stat_keys_unions_pinned_rows_and_histogram_bins():
"""``wire_stat_keys`` is the histogram-bin keys plus every pinned-row
``primary_key_val`` (``?`` scope prefix stripped) across the active
styling classes and any runtime pinned-rows override."""
class _Styling:
pinned_rows = [{'primary_key_val': 'dtype'},
{'primary_key_val': 'mean'},
{'primary_key_val': '?filtered_histogram'}]

keys = wire_stat_keys([_Styling], extra_pinned_rows=[{'primary_key_val': 'std'}])
assert keys == {'histogram_bins', 'histogram_log_bins',
'dtype', 'mean', 'filtered_histogram', 'std'}


def test_negative_int_beyond_int64_falls_back_to_json_string():
"""Ints outside both int64 and uint64 range fall back to JSON-encoded string.

Expand Down
Loading