Skip to content

Commit 0eecccd

Browse files
authored
fix(datasets): avoid exponential blow-up of nested struct sample values (#9506)
## Summary

`NarwhalsTableManager.get_sample_values` recursively re-stringified nested list/dict cells, so each ancestor level re-escaped its children's repr. For deeply nested polars `Struct`/`List` columns the output grew ~8× per nesting level and produced multi-GB strings that hung the browser when the dataframe was registered as a dataset.

Replace the recursion with one `json.dumps` pass, preserving the `Enum.name`-at-any-depth contract via a `default=` callback. Scalar paths are unchanged.

Fixes #9378.

## Test plan

- [x] New tests in `tests/_plugins/ui/_impl/tables/test_narwhals.py` cover: bounded time/size at nesting depth 8, JSON-shaped output, and a non-JSON leaf (datetime) embedded in a struct.
- [x] `uv run --group test pytest tests/_plugins/ui/_impl/tables/` — 468 passed.
- [x] `make py-check` clean.
- [x] Manual: register a polars df with a depth-8 nested struct via a top-level variable; the browser no longer hangs and the SQL column hover shows a readable JSON preview.
1 parent af0556c commit 0eecccd

2 files changed

Lines changed: 85 additions & 7 deletions

File tree

marimo/_plugins/ui/_impl/tables/narwhals_table.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
import datetime
55
import functools
66
import io
7+
import json
78
import math
9+
from enum import Enum
810
from functools import cached_property
911
from typing import TYPE_CHECKING, Any, Literal, cast
1012

@@ -698,17 +700,22 @@ def get_sample_values(self, column: str) -> list[str | int | float]:
698700
# Sample 3 values from the column
699701
SAMPLE_SIZE = 3
700702
try:
701-
from enum import Enum
703+
704+
def _json_default(o: Any) -> str:
    """Stringify a leaf that ``json`` cannot encode natively (e.g. datetimes).

    Enum members never reach this hook: ``_enum_names`` has already replaced
    them with their names before ``json.dumps`` runs.
    """
    return str(o)


def _enum_names(value: Any) -> Any:
    """Return a copy of ``value`` with every nested Enum member replaced by its name.

    ``json.dumps`` only invokes ``default=`` for objects it cannot encode.
    ``IntEnum``/``StrEnum`` (and other int/str-mixin enums) are encoded
    natively as their *value*, so a ``default=`` hook alone would silently
    drop the member name. This is a single linear walk; nothing is
    re-stringified at ancestor levels. Dict keys are left untouched.
    """
    if isinstance(value, Enum):
        return value.name
    if isinstance(value, dict):
        return {k: _enum_names(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [_enum_names(v) for v in value]
    return value


def to_primitive(value: Any) -> str | int | float:
    """Convert one sampled cell into a str/int/float preview value.

    - Enum members become their ``name`` (checked first so ``IntEnum`` does
      not fall into the numeric branch).
    - ints and floats are returned unchanged.
    - lists and dicts are serialized once with ``json.dumps``; nested Enum
      members appear as their names and non-JSON leaves via ``str()``.
      ``ensure_ascii=False`` keeps non-ASCII text readable instead of
      expanding it to ``\\uXXXX`` escapes.
    - anything else, or a container ``json`` rejects (e.g. unsupported key
      types, circular references), falls back to ``str(value)``.
    """
    if isinstance(value, Enum):
        return value.name
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, (list, dict)):
        try:
            return json.dumps(
                _enum_names(value),
                default=_json_default,
                ensure_ascii=False,
            )
        except (TypeError, ValueError):
            return str(value)
    return str(value)
713720

714721
if self.data[column].dtype == nw.Datetime:

tests/_plugins/ui/_impl/tables/test_narwhals.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1535,6 +1535,77 @@ def test_get_sample_values(df: Any) -> None:
15351535
assert sample_values == ["b'bytes1'", "b'bytes2'", "b'bytes3'"]
15361536

15371537

1538+
@pytest.mark.skipif(not HAS_DEPS, reason="polars not installed")
def test_get_sample_values_nested_struct_is_bounded() -> None:
    """Sampling a depth-8 nested struct column stays fast and small.

    Checks that three samples come back, that the call takes under one
    second, and that every sample is a string shorter than 1,000,000
    characters.
    """
    import polars as pl

    def leaf(row_idx: int):
        # Innermost record; also reused as the first entry of each "items" list.
        return {
            "kind": chr(65 + (row_idx % 26)),
            "scores": [row_idx + 1, row_idx + 2, row_idx + 3],
            "meta": {"city": "Zurich", "active": row_idx % 2 == 0},
        }

    def build_payload(row_idx: int, depth: int):
        # Grow the structure from the inside out, one wrapper per level.
        payload = leaf(row_idx)
        for level in range(1, depth + 1):
            payload = {
                "level": level,
                "items": [
                    leaf(row_idx),
                    {
                        "branch": row_idx,
                        "child": payload,
                    },
                ],
                "summary": {"row": row_idx, "depth": level},
            }
        return payload

    rows = [build_payload(i, 8) for i in range(3)]
    frame = pl.DataFrame({"payload": rows})
    manager = NarwhalsTableManager.from_dataframe(
        nw.from_native(frame, eager_only=True)
    )

    started = time.perf_counter()
    samples = manager.get_sample_values("payload")
    elapsed = time.perf_counter() - started

    assert len(samples) == 3
    assert elapsed < 1.0, (
        f"get_sample_values took {elapsed:.2f}s (expected < 1s)"
    )
    for sample in samples:
        assert isinstance(sample, str)
        assert len(sample) < 1_000_000
1578+
1579+
1580+
@pytest.mark.skipif(not HAS_DEPS, reason="polars not installed")
def test_get_sample_values_nested_struct_is_json() -> None:
    """A struct cell is sampled as a JSON string that round-trips via json.loads."""
    import polars as pl

    frame = pl.DataFrame({"x": [{"a": 1, "b": [1, 2, 3]}]})
    native = nw.from_native(frame, eager_only=True)
    manager = NarwhalsTableManager.from_dataframe(native)

    samples = manager.get_sample_values("x")

    assert len(samples) == 1
    first = samples[0]
    # JSON objects open with a double-quoted key, unlike Python's dict repr.
    assert first.startswith('{"')
    assert json.loads(first) == {"a": 1, "b": [1, 2, 3]}
1594+
1595+
1596+
@pytest.mark.skipif(not HAS_DEPS, reason="polars not installed")
def test_get_sample_values_nested_struct_with_non_json_leaf() -> None:
    """A datetime nested inside a struct still appears in the sampled value."""
    import polars as pl

    cell = {"when": datetime.datetime(2026, 1, 1), "n": 1}
    frame = pl.DataFrame({"x": [cell]})
    manager = NarwhalsTableManager.from_dataframe(
        nw.from_native(frame, eager_only=True)
    )

    samples = manager.get_sample_values("x")

    assert len(samples) == 1
    # The year must survive whatever string form the datetime leaf takes.
    assert "2026" in samples[0]
1607+
1608+
15381609
@pytest.mark.skipif(not HAS_DEPS, reason="optional dependencies not installed")
15391610
@pytest.mark.parametrize(
15401611
"df",

0 commit comments

Comments
 (0)