Skip to content

Commit c02b489

Browse files
committed
Preliminary support for views during sort_by operations
1 parent bea8483 commit c02b489

3 files changed

Lines changed: 154 additions & 73 deletions

File tree

doc/getting_started/tutorials/13.ctable-basics.ipynb

Lines changed: 5 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -774,12 +774,7 @@
774774
"cell_type": "markdown",
775775
"id": "4f466e5d",
776776
"metadata": {},
777-
"source": [
778-
"### 3.3 Sorting\n",
779-
"\n",
780-
"`sort_by()` returns a sorted copy by default (or sorts in-place with `inplace=True`).\n",
781-
"Multi-column sorting is supported — primary key first."
782-
]
777+
"source": "### 3.3 Sorting\n\n`sort_by()` returns a sorted copy by default (or sorts in-place with `inplace=True`).\nPass `view=True` for a zero-copy sorted **view** that shares the table's data and gathers\nrows on demand — ideal for reading a sorted slice of a large table without copying it.\nMulti-column sorting is supported — primary key first."
783778
},
784779
{
785780
"cell_type": "code",
@@ -1197,37 +1192,9 @@
11971192
"start_time": "2026-05-21T09:38:01.039615Z"
11981193
}
11991194
},
1200-
"source": [
1201-
"# Top 10 hottest days in Madrid across the whole year\n",
1202-
"# Sort the full table, then filter — views cannot be sorted directly\n",
1203-
"hottest_all = climate.sort_by(\"temperature\", ascending=False)\n",
1204-
"madrid_sorted = hottest_all.where(hottest_all.city == \"Madrid\")\n",
1205-
"print(\"10 hottest days in Madrid:\")\n",
1206-
"print(madrid_sorted.select([\"city\", \"day\", \"temperature\", \"humidity\"]).head(10))"
1207-
],
1208-
"outputs": [
1209-
{
1210-
"name": "stdout",
1211-
"output_type": "stream",
1212-
"text": [
1213-
"10 hottest days in Madrid:\n",
1214-
" city day temperature humidity\n",
1215-
"0 Madrid 191 31.399208 42.543335\n",
1216-
"1 Madrid 190 31.232576 44.303246\n",
1217-
"2 Madrid 227 31.227442 46.992290\n",
1218-
"3 Madrid 194 30.915184 35.044228\n",
1219-
"4 Madrid 186 30.879374 48.080303\n",
1220-
"5 Madrid 202 30.745684 43.722813\n",
1221-
"6 Madrid 177 30.469023 38.390163\n",
1222-
"7 Madrid 163 30.215179 46.051888\n",
1223-
"8 Madrid 181 30.181025 43.726521\n",
1224-
"9 Madrid 184 29.936199 50.654797\n",
1225-
"\n",
1226-
"[10 rows x 4 columns]\n"
1227-
]
1228-
}
1229-
],
1230-
"execution_count": 21
1195+
"source": "# Top 10 hottest days in Madrid across the whole year.\n# Views *can* be sorted: sort_by() on a where()-view returns a zero-copy sorted\n# view — it shares the table's columns and gathers rows on demand, no full-table\n# copy. (On a base table, pass view=True for the same lazy behaviour.)\nmadrid = climate.where(climate.city == \"Madrid\")\nmadrid_sorted = madrid.sort_by(\"temperature\", ascending=False)\nprint(\"10 hottest days in Madrid:\")\nprint(madrid_sorted.select([\"city\", \"day\", \"temperature\", \"humidity\"]).head(10))",
1196+
"outputs": [],
1197+
"execution_count": null
12311198
},
12321199
{
12331200
"cell_type": "markdown",
@@ -2876,30 +2843,7 @@
28762843
"cell_type": "markdown",
28772844
"id": "405cd155",
28782845
"metadata": {},
2879-
"source": [
2880-
"---\n",
2881-
"## Summary\n",
2882-
"\n",
2883-
"Here's everything we covered:\n",
2884-
"\n",
2885-
"| Feature | API |\n",
2886-
"|---------|-----|\n",
2887-
"| Create | `CTable(Schema)`, `CTable(Schema, new_data=...)` |\n",
2888-
"| Insert | `append(row)`, `extend(list_or_array)` |\n",
2889-
"| View | `head()`, `tail()`, `print(t)`, `t.info()` |\n",
2890-
"| Filter | `where(expr)` → view |\n",
2891-
"| Project | `select([cols])` → view |\n",
2892-
"| Sort | `sort_by(cols)`, `sort_by(cols, inplace=True)` |\n",
2893-
"| Aggregates | `col.sum()`, `.mean()`, `.std()`, `.min()`, `.max()` |\n",
2894-
"| Stats | `describe()`, `cov()` |\n",
2895-
"| Mutate | `delete()`, `compact()`, `add_column()`, `drop_column()`, `assign()` |\n",
2896-
"| Persist | `save(path)`, `to_b2z()`, `to_b2d()`, `CTable.open(path)`, `CTable.load(path)` |\n",
2897-
"| Interop | `to_arrow()`, `from_arrow()`, `to_csv()`, `from_csv()` |\n",
2898-
"| Nullable | `null_value=` on spec, `is_null()`, `notnull()`, `null_count()` |\n",
2899-
"\n",
2900-
"CTable is designed for **compressed analytical workloads** — large tables that need to stay small in RAM\n",
2901-
"while still being fast to query and easy to persist."
2902-
]
2846+
"source": "---\n## Summary\n\nHere's everything we covered:\n\n| Feature | API |\n|---------|-----|\n| Create | `CTable(Schema)`, `CTable(Schema, new_data=...)` |\n| Insert | `append(row)`, `extend(list_or_array)` |\n| View | `head()`, `tail()`, `print(t)`, `t.info()` |\n| Filter | `where(expr)` → view |\n| Project | `select([cols])` → view |\n| Sort | `sort_by(cols)`, `sort_by(cols, view=True)`, `sort_by(cols, inplace=True)` |\n| Aggregates | `col.sum()`, `.mean()`, `.std()`, `.min()`, `.max()` |\n| Stats | `describe()`, `cov()` |\n| Mutate | `delete()`, `compact()`, `add_column()`, `drop_column()`, `assign()` |\n| Persist | `save(path)`, `to_b2z()`, `to_b2d()`, `CTable.open(path)`, `CTable.load(path)` |\n| Interop | `to_arrow()`, `from_arrow()`, `to_csv()`, `from_csv()` |\n| Nullable | `null_value=` on spec, `is_null()`, `notnull()`, `null_count()` |\n\nCTable is designed for **compressed analytical workloads** — large tables that need to stay small in RAM\nwhile still being fast to query and easy to persist."
29032847
}
29042848
],
29052849
"metadata": {

src/blosc2/ctable.py

Lines changed: 76 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9920,7 +9920,7 @@ def _normalise_sort_keys(
99209920
)
99219921
return cols, ascending
99229922

9923-
def _sorted_positions_from_full_index(self, name: str, ascending: bool) -> np.ndarray | None:
9923+
def _sorted_positions_from_full_index(self, name: str, ascending: bool) -> np.ndarray | None: # noqa: C901
99249924
"""Return live physical positions from a matching FULL index, if available.
99259925
99269926
Reads the pre-sorted positions sidecar directly rather than going through
@@ -9931,10 +9931,11 @@ def _sorted_positions_from_full_index(self, name: str, ascending: bool) -> np.nd
99319931
catalog = root._get_index_catalog()
99329932
descriptor = None
99339933

9934+
null_value = None
99349935
if name in root._cols:
99359936
col_info = root._schema.columns_by_name.get(name)
9936-
if col_info is not None and getattr(col_info.spec, "null_value", None) is not None:
9937-
return None
9937+
if col_info is not None:
9938+
null_value = getattr(col_info.spec, "null_value", None)
99389939
descriptor = catalog.get(name)
99399940
if descriptor is None or descriptor.get("kind") != "full" or descriptor.get("stale", False):
99409941
descriptor = None
@@ -9960,8 +9961,12 @@ def _sorted_positions_from_full_index(self, name: str, ascending: bool) -> np.nd
99609961
# machinery which is built for selective range queries and is ~70x slower
99619962
# for full-table streaming.
99629963
if positions_path is not None:
9963-
# Persistent table: positions live in a sidecar .b2nd file.
9964-
positions_nd = blosc2.open(positions_path, mode="r")
9964+
# Persistent table: positions live in a sidecar .b2nd file. Use the
9965+
# sidecar opener so .b2z (zip) stores are read at their zip offset —
9966+
# blosc2.open() would look for a standalone file that isn't there.
9967+
from blosc2.indexing import _open_sidecar_file
9968+
9969+
positions_nd = _open_sidecar_file(positions_path)
99659970
else:
99669971
# In-memory table: positions live in the sidecar handle cache.
99679972
from blosc2.indexing import _SIDECAR_HANDLE_CACHE, _sidecar_handle_cache_key
@@ -9976,13 +9981,45 @@ def _sorted_positions_from_full_index(self, name: str, ascending: bool) -> np.nd
99769981
return None
99779982

99789983
positions = np.asarray(positions_nd[:], dtype=np.int64)
9979-
valid = root._valid_rows[:]
9980-
positions = np.asarray(positions, dtype=np.int64)
9981-
positions = positions[(positions >= 0) & (positions < len(valid))]
9982-
positions = positions[valid[positions]]
9984+
total = len(root._valid_rows)
9985+
# Index sidecars can carry padding positions beyond the live range, so
9986+
# the bounds check always runs — but the ``.all()`` test skips the clipping
9987+
# copy (a temporary as large as the positions array) when nothing is out of range.
9988+
in_bounds = (positions >= 0) & (positions < total)
9989+
if not bool(in_bounds.all()):
9990+
positions = positions[in_bounds]
9991+
del in_bounds
9992+
# Validity filtering only matters when the table has gaps (deleted rows);
9993+
# for a compact table every clipped position is already live.
9994+
if root._n_rows is None or root._n_rows != total:
9995+
valid = root._valid_rows[:]
9996+
positions = positions[valid[positions]]
99839997
if self is not root:
99849998
current_valid = self._valid_rows[:]
99859999
positions = positions[current_valid[positions]]
10000+
10001+
if null_value is not None:
10002+
# The index sorts by raw value, but sort_by's contract is nulls-last.
10003+
# Partition explicitly so it holds for any sentinel (NaN sorts last,
10004+
# an integer sentinel like INT64_MIN sorts first) and either order.
10005+
# Free each column-sized temporary as soon as it is consumed to keep
10006+
# peak memory near the size of the permutation itself.
10007+
raw = np.asarray(root._cols[name][:])
10008+
if isinstance(null_value, float) and np.isnan(null_value):
10009+
null_phys = np.isnan(raw)
10010+
else:
10011+
null_phys = raw == null_value
10012+
del raw
10013+
if null_phys.any():
10014+
is_null = null_phys[positions]
10015+
del null_phys
10016+
nulls = positions[is_null]
10017+
nonnull = positions[~is_null]
10018+
del is_null, positions
10019+
if not ascending:
10020+
nonnull = nonnull[::-1]
10021+
return np.concatenate([nonnull, nulls])
10022+
998610023
if not ascending:
998710024
positions = positions[::-1]
998810025
return positions
@@ -10047,8 +10084,15 @@ def sort_by(
1004710084
ascending: bool | list[bool] = True,
1004810085
*,
1004910086
inplace: bool = False,
10087+
view: bool = False,
1005010088
) -> CTable:
10051-
"""Return a copy of the table sorted by one or more columns.
10089+
"""Return the table sorted by one or more columns.
10090+
10091+
By default, sorting a base table materialises a new in-memory copy of the sorted rows.
10092+
Pass ``view=True`` to instead get a lightweight **sorted view** that
10093+
shares the parent's column data and gathers rows on demand in sorted
10094+
order — no whole-table copy. This is ideal for reading a sorted slice
10095+
of a large persistent table (e.g. ``t.sort_by("col", view=True)[:10]``).
1005210096
1005310097
Parameters
1005410098
----------
@@ -10069,17 +10113,31 @@ def sort_by(
1006910113
``self`` (like :meth:`compact` but sorted). If ``False``
1007010114
(default), return a new in-memory CTable leaving this one
1007110115
untouched.
10116+
view:
10117+
If ``True``, return a zero-copy sorted **view** over this table
10118+
instead of materialising a copy: it shares the parent's columns and
10119+
stores only the sort permutation, gathering rows on demand in sorted
10120+
order. Slicing the view (``sv[start:stop:step]``) keeps the sorted
10121+
order and touches only the rows read. A single-column sort backed by
10122+
a non-stale ``FULL`` index reuses its pre-sorted positions (no sort at
10123+
read time); otherwise only the sort-key column(s) are materialised to
10124+
build the permutation — never the whole table. Mutually exclusive
10125+
with ``inplace``. Sorting an existing view is always lazy regardless
10126+
of this flag.
1007210127
1007310128
Raises
1007410129
------
1007510130
ValueError
10076-
If called on a view or a read-only table when ``inplace=True``.
10131+
If called on a view or a read-only table when ``inplace=True``, or if
10132+
both ``inplace`` and ``view`` are ``True``.
1007710133
KeyError
1007810134
If any column name is not found.
1007910135
TypeError
1008010136
If a column used as a sort key does not support ordering
1008110137
(e.g. complex numbers).
1008210138
"""
10139+
if inplace and view:
10140+
raise ValueError("inplace=True and view=True are mutually exclusive.")
1008310141
if self.base is not None and inplace:
1008410142
raise ValueError(
1008510143
"Cannot sort a view inplace (would modify shared column data). Use sort_by(inplace=False) to get a sorted copy."
@@ -10120,7 +10178,7 @@ def sort_by(
1012010178
# use those positions directly, so columns are fetched on demand and in
1012110179
# the correct sorted order — identical performance to pre-projecting
1012210180
# with columns= before calling sort_by.
10123-
if self.base is not None:
10181+
if self.base is not None or view:
1012410182
result = CTable._make_view(self, self._valid_rows)
1012510183
result._cached_live_positions = sorted_pos
1012610184
result._n_rows = n
@@ -11332,6 +11390,12 @@ def _run_row_logic(self, ind: int | slice | str | Iterable) -> CTable:
1133211390

1133311391
mant_pos = true_pos[ind]
1133411392

11393+
# For an ordered view (sorted view or position view), preserve the row
11394+
# order and any duplicates by carrying the positions forward. A boolean
11395+
# mask is physical-order and set-like, so it would silently drop both.
11396+
if getattr(self, "_cached_live_positions", None) is not None:
11397+
return self._view_from_positions(np.asarray(mant_pos))
11398+
1133511399
new_mask_np = np.zeros(len(self._valid_rows), dtype=bool)
1133611400
new_mask_np[mant_pos] = True
1133711401

tests/ctable/test_sort_by.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,5 +414,78 @@ def test_sort_unprojected_view_opens_only_needed_columns(tmp_path):
414414
t.close()
415415

416416

417+
def test_sort_view_zero_copy_slice(tmp_path):
418+
"""sort_by(view=True) returns a zero-copy view whose slices keep sorted order."""
419+
rng = np.random.default_rng(0)
420+
n = 1000
421+
score = rng.integers(0, 50, n).astype(np.float64) # duplicates on purpose
422+
ids = np.arange(n)
423+
data = list(zip(ids.tolist(), score.tolist(), [True] * n, strict=True))
424+
425+
urlpath = str(tmp_path / "sort-view.b2z")
426+
t = CTable(Row, new_data=data, urlpath=urlpath, mode="w")
427+
t.create_index("id", kind=blosc2.IndexKind.FULL) # id has a FULL index
428+
429+
sv = t.sort_by("score", view=True)
430+
assert sv.base is not None # a view, not a materialised copy
431+
432+
order = np.argsort(score, kind="stable")
433+
for sl in [slice(0, 10), slice(-10, None), slice(None, None, 2), slice(100, 50, -1), slice(5, 25, 3)]:
434+
np.testing.assert_array_equal(np.asarray(sv[sl]["score"][:]), score[order][sl])
435+
436+
# Descending, and a FULL-index-backed single-column sort, both stay ordered.
437+
svd = t.sort_by("score", ascending=False, view=True)
438+
np.testing.assert_array_equal(np.asarray(svd[:10]["score"][:]), score[order[::-1]][:10])
439+
svf = t.sort_by("id", view=True)
440+
np.testing.assert_array_equal(np.asarray(svf[:10]["id"][:]), np.arange(10))
441+
442+
443+
@pytest.mark.parametrize("ascending", [True, False])
444+
def test_sort_view_full_index_nullable_persistent(tmp_path, ascending):
445+
"""A FULL index on a nullable column accelerates sort_by(view=True) on a .b2z,
446+
and the result keeps nulls last (matching the materialised copy path)."""
447+
448+
@dataclass
449+
class NullRow:
450+
key: int = blosc2.field(blosc2.int64(ge=0))
451+
val: float = blosc2.field(blosc2.float64(null_value=float("nan")), default=float("nan"))
452+
453+
rng = np.random.default_rng(1)
454+
n = 2000
455+
val = rng.integers(0, 100, n).astype(np.float64)
456+
val[rng.choice(n, 50, replace=False)] = np.nan # scattered nulls
457+
data = list(zip(range(n), val.tolist(), strict=True))
458+
459+
urlpath = str(tmp_path / "nullable.b2z")
460+
t = CTable(NullRow, new_data=data, urlpath=urlpath, mode="w")
461+
t.create_index("val", kind=blosc2.IndexKind.FULL)
462+
t.close()
463+
464+
t = blosc2.CTable.open(urlpath, mode="r")
465+
try:
466+
# Reference: copy path (its nulls-last behaviour is the contract).
467+
ref = np.asarray(t.sort_by("val", ascending=ascending)["val"][:])
468+
got = np.asarray(t.sort_by("val", ascending=ascending, view=True)["val"][:])
469+
np.testing.assert_array_equal(got, ref) # assert_array_equal treats NaNs at matching positions as equal
470+
# Nulls must be last regardless of direction.
471+
assert np.isnan(got[-50:]).all()
472+
assert not np.isnan(got[:-50]).any()
473+
finally:
474+
t.close()
475+
476+
477+
def test_sort_view_false_returns_copy():
478+
"""The default (view=False) still returns an independent in-memory copy."""
479+
t = CTable(Row, new_data=DATA)
480+
cp = t.sort_by("score")
481+
assert cp.base is None
482+
483+
484+
def test_sort_view_inplace_mutually_exclusive():
485+
t = CTable(Row, new_data=DATA)
486+
with pytest.raises(ValueError, match="mutually exclusive"):
487+
t.sort_by("score", inplace=True, view=True)
488+
489+
417490
if __name__ == "__main__":
418491
pytest.main(["-v", __file__])

0 commit comments

Comments
 (0)