Skip to content

Commit f205a1e

Browse files
committed
Index on b2z
1 parent fdb4f2a commit f205a1e

6 files changed

Lines changed: 355 additions & 27 deletions

File tree

examples/ctable/index_on_b2z.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
"""Demonstrate that CTable indexes survive a .b2z round-trip.
2+
3+
Steps
4+
-----
5+
1. Build a small CTable with synthetic sensor data and save it as .b2z.
6+
2. Measure query speed with full scan (no index).
7+
3. Reopen in append mode, create FULL indexes, close (triggers rezip).
8+
4. Reopen read-only — indexes are present and queries are faster.
9+
"""
10+
11+
import os
12+
import shutil
13+
import time
14+
from dataclasses import dataclass
15+
16+
import numpy as np
17+
18+
import blosc2
19+
20+
# ---------------------------------------------------------------------------
21+
# 1. Schema and synthetic data
22+
# ---------------------------------------------------------------------------
23+
24+
25+
@dataclass
class Reading:
    """Row schema for the demo CTable.

    Each field pairs a Python annotation with a blosc2 column type
    (presumably declaring the stored dtype — confirm against the
    blosc2.field API).
    """

    sensor_id: int = blosc2.field(blosc2.int32())
    timestamp: int = blosc2.field(blosc2.int64())
    value: float = blosc2.field(blosc2.float64())
    active: bool = blosc2.field(blosc2.bool())
31+
32+
33+
# Synthetic data size and a seeded RNG so every run produces identical data.
N = 5_000_000
rng = np.random.default_rng(42)

# On-disk locations: the unpacked directory form and the zipped single file.
B2D = "/tmp/sensors.b2d"
B2Z = "/tmp/sensors.b2z"

# Start from a clean slate: remove any leftovers from a previous run.
for p in (B2D, B2Z):
    if os.path.exists(p):
        if os.path.isdir(p):
            shutil.rmtree(p)
        else:
            os.remove(p)
41+
42+
# ---------------------------------------------------------------------------
43+
# 2. Create and zip
44+
# ---------------------------------------------------------------------------
45+
46+
# Build the table on disk (.b2d directory form), then pack it into a single
# .b2z archive and drop the directory so only the zipped form remains.
print(f"Creating CTable with {N:,} rows ...")
ct = blosc2.CTable(Reading, urlpath=B2D, mode="w", expected_size=N)
ct.extend(
    {
        "sensor_id": rng.integers(0, 100, N, dtype=np.int32),
        "timestamp": np.arange(N, dtype=np.int64),  # monotonically increasing
        "value": rng.uniform(-50.0, 150.0, N),
        "active": rng.integers(0, 2, N).astype(bool),
    }
)
ct.close()

# Pack the directory into a single .b2z file.
store = blosc2.TreeStore(B2D, mode="r")
store.to_b2z(filename=B2Z, overwrite=True)
store.discard()  # NOTE(review): presumably releases the store handle — confirm API
shutil.rmtree(B2D)  # the unpacked directory is no longer needed
print(f" saved → {B2Z} ({os.path.getsize(B2Z) / 1e6:.1f} MB)")
63+
64+
# ---------------------------------------------------------------------------
65+
# 3. Baseline: full scan (no index)
66+
# ---------------------------------------------------------------------------
67+
68+
# (label, query-builder) pairs evaluated in both the full-scan and indexed
# passes. Labels key the `baseline` dict and appear in the printed comparison
# table, so each label must describe its predicate exactly.
QUERIES = [
    ("value > 100.0", lambda ct: ct.where(ct["value"] > 100.0)),
    ("value > 120.0", lambda ct: ct.where(ct["value"] > 120.0)),
    ("value between 0 and 10", lambda ct: ct.where((ct["value"] >= 0.0) & (ct["value"] <= 10.0))),
    # Fixed label: the predicate compares against 4_500_000, not 450_000.
    ("timestamp > 4_500_000", lambda ct: ct.where(ct["timestamp"] > 4_500_000)),
    ("timestamp > 4_999_000", lambda ct: ct.where(ct["timestamp"] > 4_999_000)),
]
75+
76+
77+
def bench(fn, reps=5):
    """Call *fn* ``reps`` times and return ``(best_ms, last_result)``.

    ``best_ms`` is the minimum wall-clock duration in milliseconds across
    all repetitions; ``last_result`` is whatever the final call returned.
    """
    best_ms = float("inf")
    result = None
    for _ in range(reps):
        start = time.perf_counter()
        result = fn()
        elapsed_ms = (time.perf_counter() - start) * 1000
        if elapsed_ms < best_ms:
            best_ms = elapsed_ms
    return best_ms, result
84+
85+
86+
print("\n--- Full scan (no index) ---")
baseline = {}  # label -> (best ms, row count); compared against the indexed pass later
ct = blosc2.CTable.open(B2Z, mode="r")
assert not ct.indexes, "expected no indexes yet"
for label, fn in QUERIES:
    # Bind fn as a default argument so each lambda captures the current
    # value rather than the loop variable (late-binding closure pitfall).
    ms, view = bench(lambda fn=fn: fn(ct))
    baseline[label] = (ms, len(view))
    print(f" {label:<38} {ms:7.1f} ms {len(view):>8,} rows")
ct.close()
95+
96+
# ---------------------------------------------------------------------------
97+
# 4. Build indexes (append mode → rezip on close)
98+
# ---------------------------------------------------------------------------
99+
100+
print("\nBuilding indexes (mode='a') ...")
# Append mode allows mutating the .b2z archive; per the module docstring,
# closing it triggers a rezip.
ct = blosc2.CTable.open(B2Z, mode="a")

t0 = time.perf_counter()
ct.create_index("value", kind=blosc2.IndexKind.FULL)
print(f" value FULL index {(time.perf_counter() - t0) * 1000:.0f} ms")

t0 = time.perf_counter()
ct.create_index("timestamp", kind=blosc2.IndexKind.FULL)
print(f" timestamp FULL index {(time.perf_counter() - t0) * 1000:.0f} ms")

# close() performs the rezip, so it is timed separately from index creation.
t0 = time.perf_counter()
ct.close()
print(
    f" closed + rezipped {(time.perf_counter() - t0) * 1000:.0f} ms "
    f"({os.path.getsize(B2Z) / 1e6:.1f} MB)"
)
117+
118+
# ---------------------------------------------------------------------------
119+
# 5. Read-only: verify indexes survived, benchmark
120+
# ---------------------------------------------------------------------------
121+
122+
print("\nReopening .b2z read-only ...")
ct = blosc2.CTable.open(B2Z, mode="r")
# The indexes were created in the previous (append-mode) session; verify
# they survived the rezip round-trip.
found = [idx.col_name for idx in ct.indexes]
print(f" indexes present: {found}")
assert "value" in found, "index for 'value' missing after round-trip!"
assert "timestamp" in found, "index for 'timestamp' missing after round-trip!"

print()
print(f"{'query':<38} {'no index':>9} {'indexed':>9} {'speedup':>8} {'rows':>8}")
print("-" * 78)

for label, fn in QUERIES:
    # fn bound as a default argument to avoid the late-binding closure pitfall.
    i_ms, view = bench(lambda fn=fn: fn(ct))
    b_ms, b_n = baseline[label]
    sp = b_ms / i_ms if i_ms > 0 else float("inf")
    # Indexed and full-scan paths must agree on the result set size.
    assert len(view) == b_n, f"row count mismatch for {label!r}"
    print(f" {label:<38} {b_ms:8.1f}ms {i_ms:8.1f}ms {sp:7.1f}x {len(view):>8,}")

ct.close()
141+
142+
# ---------------------------------------------------------------------------
143+
# 6. Cleanup
144+
# ---------------------------------------------------------------------------
145+
146+
os.remove(B2Z)  # remove the demo artifact so repeated runs start clean
print("\nDone.")

src/blosc2/ctable.py

Lines changed: 94 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1457,12 +1457,18 @@ def _init_columns(
14571457
dparams=col_storage.get("dparams"),
14581458
)
14591459
continue
1460+
# Recompute chunks/blocks using the actual dtype so that wide
1461+
# string columns (e.g. U183642) don't produce multi-GB chunks.
1462+
chunks = col_storage["chunks"]
1463+
blocks = col_storage["blocks"]
1464+
if col.config.chunks is None and col.config.blocks is None:
1465+
chunks, blocks = compute_chunks_blocks((expected_size,), dtype=col.dtype)
14601466
self._cols[col.name] = storage.create_column(
14611467
col.name,
14621468
dtype=col.dtype,
14631469
shape=(expected_size,),
1464-
chunks=col_storage["chunks"],
1465-
blocks=col_storage["blocks"],
1470+
chunks=chunks,
1471+
blocks=blocks,
14661472
cparams=col_storage.get("cparams"),
14671473
dparams=col_storage.get("dparams"),
14681474
)
@@ -3494,8 +3500,10 @@ def sort_by(
34943500
If a column used as a sort key does not support ordering
34953501
(e.g. complex numbers).
34963502
"""
3497-
if self.base is not None:
3498-
raise ValueError("Cannot sort a view. Materialise it first with .to_table() or sort the parent.")
3503+
if self.base is not None and inplace:
3504+
raise ValueError(
3505+
"Cannot sort a view inplace (would modify shared column data). Use sort_by(inplace=False) to get a sorted copy."
3506+
)
34993507
if inplace and self._read_only:
35003508
raise ValueError("Table is read-only (opened with mode='r').")
35013509

@@ -3522,8 +3530,15 @@ def sort_by(
35223530
sorted_pos = live_pos[order]
35233531

35243532
if inplace:
3525-
for _col_name, arr in self._cols.items():
3526-
arr[:n] = arr[sorted_pos]
3533+
for col in self._schema.columns:
3534+
arr = self._cols[col.name]
3535+
if self._is_list_column(col):
3536+
new_arr = ListArray(spec=col.spec)
3537+
new_arr.extend((arr[int(pos)] for pos in sorted_pos), validate=False)
3538+
new_arr.flush()
3539+
self._cols[col.name] = new_arr
3540+
else:
3541+
arr[:n] = arr[sorted_pos]
35273542
self._valid_rows[:n] = True
35283543
self._valid_rows[n:] = False
35293544
self._n_rows = n
@@ -3537,7 +3552,7 @@ def sort_by(
35373552
col_name = col.name
35383553
arr = self._cols[col_name]
35393554
if self._is_list_column(col):
3540-
result._cols[col_name].extend(arr[int(pos)] for pos in sorted_pos)
3555+
result._cols[col_name].extend((arr[int(pos)] for pos in sorted_pos), validate=False)
35413556
result._cols[col_name].flush()
35423557
else:
35433558
result._cols[col_name][:n] = arr[sorted_pos]
@@ -3547,11 +3562,66 @@ def sort_by(
35473562
result._last_pos = n
35483563
return result
35493564

3550-
def _empty_copy(self) -> CTable:
3565+
def copy(self, compact: bool = True) -> CTable:
    """Return a new standalone in-memory copy of this table.

    Parameters
    ----------
    compact:
        If ``True`` (default), only live (non-deleted) rows are copied.
        The result is a dense table with no tombstones and no parent
        dependency — ideal for materialising a filtered view.
        If ``False``, all physical slots are copied including deleted gaps,
        preserving the tombstone state exactly.
    """
    # Snapshot the tombstone bitmap and locate the live (non-deleted) slots.
    valid_np = self._valid_rows[:]
    live_pos = np.where(valid_np)[0]
    n_live = len(live_pos)

    if compact:
        n = n_live
    else:
        # High watermark: number of slots ever written.
        # List columns are written sequentially with no gaps — their length
        # is the exact high watermark. For scalar-only tables fall back to
        # the last live position + 1 (writes are always sequential so no
        # deleted slot can exist beyond the last live one).
        n = 0
        for col in self._schema.columns:
            if self._is_list_column(col):
                n = len(self._cols[col.name])
                break
        if n == 0:
            n = int(live_pos[-1]) + 1 if n_live > 0 else 0

    result = self._empty_copy(capacity=n)

    for col in self._schema.columns:
        col_name = col.name
        arr = self._cols[col_name]
        if self._is_list_column(col):
            # List columns are append-only: stream the selected rows through
            # extend() rather than slice-assigning.
            src = (arr[int(pos)] for pos in live_pos) if compact else (arr[i] for i in range(n))
            result._cols[col_name].extend(src, validate=False)
            result._cols[col_name].flush()
        else:
            result._cols[col_name][:n] = arr[live_pos] if compact else arr[:n]

    if compact:
        # Dense result: every copied slot is live.
        result._valid_rows[:n] = True
        result._n_rows = n
        result._last_pos = n - 1 if n > 0 else None
    else:
        # Exact replica of the tombstone state up to the high watermark.
        result._valid_rows[:n] = valid_np[:n]
        result._n_rows = n_live
        result._last_pos = None  # recomputed lazily on next append

    return result
3619+
3620+
def _empty_copy(self, capacity: int | None = None) -> CTable:
35513621
"""Return a new empty in-memory CTable with the same schema and capacity."""
35523622
from blosc2 import compute_chunks_blocks
35533623

3554-
capacity = max(self._n_rows, 1)
3624+
capacity = max(capacity if capacity is not None else self._n_rows, 1)
35553625
default_chunks, default_blocks = compute_chunks_blocks((capacity,))
35563626
mem_storage = InMemoryTableStorage()
35573627

@@ -4385,10 +4455,18 @@ def _try_index_where(self, expr_result: blosc2.LazyExpr) -> np.ndarray | None:
43854455
primary_col_name, primary_col_arr, _ = indexed_columns[0]
43864456

43874457
# Inject every usable table-owned descriptor so plan_query can combine them.
4458+
# In .b2z read mode all columns share the same urlpath, so _array_key()
4459+
# returns the same key for every column — causing _SIDECAR_HANDLE_CACHE
4460+
# collisions across queries. Clear stale handles before each injection so
4461+
# the upcoming query always loads the correct sidecar for this column.
4462+
from blosc2.indexing import _clear_cached_data
4463+
43884464
for _col_name, col_arr, descriptor in indexed_columns:
43894465
arr_key = _array_key(col_arr)
43904466
if _is_persistent_array(col_arr):
43914467
store = _PERSISTENT_INDEXES.get(arr_key) or _default_index_store()
4468+
if store["indexes"].get(descriptor["token"]) is not descriptor:
4469+
_clear_cached_data(col_arr, descriptor["token"])
43924470
store["indexes"][descriptor["token"]] = descriptor
43934471
_PERSISTENT_INDEXES[arr_key] = store
43944472
else:
@@ -4603,7 +4681,11 @@ def extend(self, data: list | CTable | Any, *, validate: bool | None = None) ->
46034681
raw_columns[name] = data._cols[name][: data._n_rows]
46044682
provided_names.add(name)
46054683
else:
4606-
if isinstance(data, np.ndarray) and data.dtype.names is not None:
4684+
if isinstance(data, dict):
4685+
provided_names = set(data) & set(current_col_names)
4686+
new_nrows = len(next(iter(data.values())))
4687+
raw_columns = {name: data[name] for name in provided_names}
4688+
elif isinstance(data, np.ndarray) and data.dtype.names is not None:
46074689
new_nrows = len(data)
46084690
raw_columns = {name: data[name] for name in data.dtype.names if name in current_col_names}
46094691
provided_names = set(raw_columns)
@@ -4634,7 +4716,7 @@ def extend(self, data: list | CTable | Any, *, validate: bool | None = None) ->
46344716
list_processed_cols[name] = list(raw_columns[name])
46354717
else:
46364718
target_dtype = self._cols[name].dtype
4637-
scalar_processed_cols[name] = blosc2.asarray(raw_columns[name], dtype=target_dtype)
4719+
scalar_processed_cols[name] = np.ascontiguousarray(raw_columns[name], dtype=target_dtype)
46384720

46394721
end_pos = start_pos + new_nrows
46404722

@@ -4649,7 +4731,7 @@ def extend(self, data: list | CTable | Any, *, validate: bool | None = None) ->
46494731
for name in current_col_names:
46504732
col_meta = self._schema.columns_by_name[name]
46514733
if self._is_list_column(col_meta):
4652-
self._cols[name].extend(list_processed_cols[name])
4734+
self._cols[name].extend(list_processed_cols[name], validate=do_validate)
46534735
else:
46544736
self._cols[name][start_pos:end_pos] = scalar_processed_cols[name][:]
46554737

0 commit comments

Comments
 (0)