Merge branch 'main' into feat/memory-store-registry

d-v-b · web-flow · commit b3f368f0a63e · 2026-02-25T15:39:51.000-05:00
diff --git a/changes/3713.misc.md b/changes/3713.misc.md
@@ -0,0 +1 @@
+Vectorize get_chunk_slice for faster sharded array writes.
diff --git a/changes/3717.misc.md b/changes/3717.misc.md
@@ -0,0 +1 @@
+Add benchmarks for Morton order computation with non-power-of-2 and near-miss shard shapes, covering both pure computation and end-to-end read/write performance.
diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py
@@ -46,6 +46,8 @@
 from zarr.core.indexing import (
     BasicIndexer,
     SelectorTuple,
+    _morton_order,
+    _morton_order_keys,
     c_order_iter,
     get_indexer,
     morton_order_iter,
@@ -144,6 +146,45 @@ def get_chunk_slice(self, chunk_coords: tuple[int, ...]) -> tuple[int, int] | No
         else:
             return (int(chunk_start), int(chunk_start + chunk_len))
 
+    def get_chunk_slices_vectorized(
+        self, chunk_coords_array: npt.NDArray[np.integer[Any]]
+    ) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint64], npt.NDArray[np.bool_]]:
+        """Look up the byte ranges of many chunks with a single vectorized index read.
+
+        Parameters
+        ----------
+        chunk_coords_array : ndarray of shape (n_chunks, n_dims)
+            Chunk coordinates; each is reduced modulo ``offsets_and_lengths.shape[:-1]``.
+
+        Returns
+        -------
+        starts : ndarray of shape (n_chunks,)
+            Start byte positions for each chunk (``MAX_UINT_64`` where ``valid`` is False).
+        ends : ndarray of shape (n_chunks,)
+            ``starts + lengths`` for each chunk; only meaningful where ``valid`` is True.
+        valid : ndarray of shape (n_chunks,)
+            Boolean mask, True where the stored start offset is not ``MAX_UINT_64``.
+        """
+        # Map the coordinates into the index grid with an elementwise modulo
+        shard_shape = np.array(self.offsets_and_lengths.shape[:-1], dtype=np.uint64)
+        localized = chunk_coords_array.astype(np.uint64) % shard_shape
+
+        # One index array per dimension, so the lookup below is a single fancy-index read
+        index_tuple = tuple(localized[:, i] for i in range(localized.shape[1]))
+
+        # Fetch all (offset, length) pairs at once
+        offsets_and_lengths = self.offsets_and_lengths[index_tuple]
+        starts = offsets_and_lengths[:, 0]
+        lengths = offsets_and_lengths[:, 1]
+
+        # A start offset equal to MAX_UINT_64 marks an empty chunk
+        valid = starts != MAX_UINT_64
+
+        # End offsets; callers must ignore these wherever ``valid`` is False
+        ends = starts + lengths
+
+        return starts, ends, valid
+
     def set_chunk_slice(self, chunk_coords: tuple[int, ...], chunk_slice: slice | None) -> None:
         localized_chunk = self._localize_chunk(chunk_coords)
         if chunk_slice is None:
@@ -225,6 +266,34 @@ def __len__(self) -> int:
     def __iter__(self) -> Iterator[tuple[int, ...]]:
         return c_order_iter(self.index.offsets_and_lengths.shape[:-1])
 
+    def to_dict_vectorized(
+        self,
+        chunk_coords_array: npt.NDArray[np.integer[Any]],
+    ) -> dict[tuple[int, ...], Buffer | None]:
+        """Build a dict of chunk coordinates to buffers using vectorized lookup.
+
+        Parameters
+        ----------
+        chunk_coords_array : ndarray of shape (n_chunks, n_dims)
+            All chunk coordinates of this shard, in the order given by ``_morton_order``.
+
+        Returns
+        -------
+        dict mapping each chunk coordinate tuple to its slice of ``self.buf``, or None if empty
+        """
+        starts, ends, valid = self.index.get_chunk_slices_vectorized(chunk_coords_array)
+        chunks_per_shard = tuple(self.index.offsets_and_lengths.shape[:-1])
+        chunk_coords_keys = _morton_order_keys(chunks_per_shard)
+        # NOTE: key i is paired with row i of the lookup, so the input must be in that order.
+        result: dict[tuple[int, ...], Buffer | None] = {}
+        for i, coords in enumerate(chunk_coords_keys):
+            if valid[i]:
+                result[coords] = self.buf[int(starts[i]) : int(ends[i])]
+            else:
+                result[coords] = None
+
+        return result
+
 
 @dataclass(frozen=True)
 class ShardingCodec(
@@ -511,7 +580,8 @@ async def _encode_partial_single(
             chunks_per_shard=chunks_per_shard,
         )
         shard_reader = shard_reader or _ShardReader.create_empty(chunks_per_shard)
-        shard_dict = {k: shard_reader.get(k) for k in morton_order_iter(chunks_per_shard)}
+        # Use vectorized lookup for better performance
+        shard_dict = shard_reader.to_dict_vectorized(np.asarray(_morton_order(chunks_per_shard)))
 
         indexer = list(
             get_indexer(
diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
@@ -1504,70 +1504,67 @@ def decode_morton_vectorized(
 
 
 @lru_cache(maxsize=16)
-def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
+def _morton_order(chunk_shape: tuple[int, ...]) -> npt.NDArray[np.intp]:
     n_total = product(chunk_shape)
-    if n_total == 0:
-        return ()
-
-    # Optimization: Remove singleton dimensions to enable magic number usage
-    # for shapes like (1,1,32,32,32). Compute Morton on squeezed shape, then expand.
-    singleton_dims = tuple(i for i, s in enumerate(chunk_shape) if s == 1)
-    if singleton_dims:
-        squeezed_shape = tuple(s for s in chunk_shape if s != 1)
-        if squeezed_shape:
-            # Compute Morton order on squeezed shape
-            squeezed_order = _morton_order(squeezed_shape)
-            # Expand coordinates to include singleton dimensions (always 0)
-            expanded: list[tuple[int, ...]] = []
-            for coord in squeezed_order:
-                full_coord: list[int] = []
-                squeezed_idx = 0
-                for i in range(len(chunk_shape)):
-                    if chunk_shape[i] == 1:
-                        full_coord.append(0)
-                    else:
-                        full_coord.append(coord[squeezed_idx])
-                        squeezed_idx += 1
-                expanded.append(tuple(full_coord))
-            return tuple(expanded)
-        else:
-            # All dimensions are singletons, just return the single point
-            return ((0,) * len(chunk_shape),)
-
     n_dims = len(chunk_shape)
-
-    # Find the largest power-of-2 hypercube that fits within chunk_shape.
-    # Within this hypercube, Morton codes are guaranteed to be in bounds.
-    min_dim = min(chunk_shape)
-    if min_dim >= 1:
-        power = min_dim.bit_length() - 1  # floor(log2(min_dim))
-        hypercube_size = 1 << power  # 2^power
-        n_hypercube = hypercube_size**n_dims
-    else:
-        n_hypercube = 0
-
-    # Within the hypercube, no bounds checking needed - use vectorized decoding
-    order: list[tuple[int, ...]]
-    if n_hypercube > 0:
-        z_values = np.arange(n_hypercube, dtype=np.intp)
-        hypercube_coords = decode_morton_vectorized(z_values, chunk_shape)
-        order = [tuple(row) for row in hypercube_coords]
+    if n_total == 0:
+        out = np.empty((0, n_dims), dtype=np.intp)
+        out.flags.writeable = False
+        return out
+
+    # Ceiling hypercube: smallest power-of-2 hypercube whose Morton codes span
+    # all valid coordinates in chunk_shape. (c-1).bit_length() gives the number
+    # of bits needed to index c values (0 for singleton dims). n_z = 2**total_bits
+    # is the size of this hypercube.
+    total_bits = sum((c - 1).bit_length() for c in chunk_shape)
+    n_z = 1 << total_bits if total_bits > 0 else 1
+
+    # Choose a strategy from the overgeneration ratio n_z / n_total. Decoding the whole
+    # ceiling hypercube is wasteful when n_z >> n_total (e.g. (33,33,33): n_z=262144,
+    # n_total=35937), so those shapes take the argsort branch instead.
+    order: npt.NDArray[np.intp]
+    if n_z <= 4 * n_total:
+        # Ceiling strategy: decode all n_z codes vectorized, filter in-bounds.
+        # Works well when the overgeneration ratio n_z/n_total is small (≤4).
+        z_values = np.arange(n_z, dtype=np.intp)
+        all_coords = decode_morton_vectorized(z_values, chunk_shape)
+        shape_arr = np.array(chunk_shape, dtype=np.intp)
+        valid_mask = np.all(all_coords < shape_arr, axis=1)
+        order = all_coords[valid_mask]
     else:
-        order = []
+        # Argsort strategy: enumerate all n_total valid coordinates directly,
+        # encode each to a Morton code, then sort by code. Avoids the >4x
+        # overgeneration penalty (about 7.3x for near-miss shapes like (33,33,33)).
+        # Cost: O(n_total * bits) encode + O(n_total log n_total) sort,
+        # vs O(n_z * bits) with n_z > 4 * n_total for ceiling.
+        grids = np.meshgrid(*[np.arange(c, dtype=np.intp) for c in chunk_shape], indexing="ij")
+        all_coords = np.stack([g.ravel() for g in grids], axis=1)
+
+        # Encode all coordinates to Morton codes (vectorized).
+        bits_per_dim = tuple((c - 1).bit_length() for c in chunk_shape)
+        max_coord_bits = max(bits_per_dim)
+        z_codes = np.zeros(n_total, dtype=np.intp)
+        output_bit = 0
+        for coord_bit in range(max_coord_bits):
+            for dim in range(n_dims):
+                if coord_bit < bits_per_dim[dim]:
+                    z_codes |= ((all_coords[:, dim] >> coord_bit) & 1) << output_bit
+                    output_bit += 1
+
+        sort_idx: npt.NDArray[np.intp] = np.argsort(z_codes, kind="stable")
+        order = np.asarray(all_coords[sort_idx], dtype=np.intp)
+
+    order.flags.writeable = False
+    return order
 
-    # For remaining elements, bounds checking is needed
-    i = n_hypercube
-    while len(order) < n_total:
-        m = decode_morton(i, chunk_shape)
-        if all(x < y for x, y in zip(m, chunk_shape, strict=False)):
-            order.append(m)
-        i += 1
 
-    return tuple(order)
+@lru_cache(maxsize=16)
+def _morton_order_keys(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
+    return tuple(tuple(int(x) for x in row) for row in _morton_order(chunk_shape))
 
 
 def morton_order_iter(chunk_shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
-    return iter(_morton_order(tuple(chunk_shape)))
+    return iter(_morton_order_keys(tuple(chunk_shape)))
 
 
 def c_order_iter(chunks_per_shard: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
diff --git a/src/zarr/testing/stateful.py b/src/zarr/testing/stateful.py
@@ -340,13 +340,13 @@ def delete_array_using_del(self, data: DataObject) -> None:
         self.all_arrays.remove(array_path)
 
     @precondition(lambda self: self.store.supports_deletes)
-    @precondition(lambda self: len(self.all_groups) >= 2)  # fixme don't delete root
+    @precondition(lambda self: bool(self.all_groups))
     @rule(data=st.data())
     def delete_group_using_del(self, data: DataObject) -> None:
-        # ensure that we don't include the root group in the list of member names that we try
-        # to delete
-        member_names = tuple(filter(lambda v: "/" in v, sorted(self.all_groups)))
-        group_path = data.draw(st.sampled_from(member_names), label="Group deletion target")
+        group_path = data.draw(
+            st.sampled_from(sorted(self.all_groups)),
+            label="Group deletion target",
+        )
         prefix, group_name = split_prefix_name(group_path)
         note(f"Deleting group '{group_path=!r}', {prefix=!r}, {group_name=!r} using delete")
         members = zarr.open_group(store=self.model, path=group_path).members(max_depth=None)
@@ -359,9 +359,7 @@ def delete_group_using_del(self, data: DataObject) -> None:
             group = zarr.open_group(store=store, path=prefix)
             group[group_name]  # check that it exists
             del group[group_name]
-        if group_path != "/":
-            # The root group is always present
-            self.all_groups.remove(group_path)
+        self.all_groups.remove(group_path)
 
     # # --------------- assertions -----------------
     # def check_group_arrays(self, group):
diff --git a/tests/benchmarks/test_indexing.py b/tests/benchmarks/test_indexing.py
@@ -74,7 +74,7 @@ def test_sharded_morton_indexing(
     The Morton order cache is cleared before each iteration to measure the
     full computation cost.
     """
-    from zarr.core.indexing import _morton_order
+    from zarr.core.indexing import _morton_order, _morton_order_keys
 
     # Create array where each shard contains many small chunks
     # e.g., shards=(32,32,32) with chunks=(2,2,2) means 16x16x16 = 4096 chunks per shard
@@ -98,14 +98,18 @@ def test_sharded_morton_indexing(
 
     def read_with_cache_clear() -> None:
         _morton_order.cache_clear()
+        _morton_order_keys.cache_clear()
         getitem(data, indexer)
 
     benchmark(read_with_cache_clear)
 
 
 # Benchmark with larger chunks_per_shard to make Morton order impact more visible
 large_morton_shards = (
-    (32,) * 3,  # With 1x1x1 chunks: 32x32x32 = 32768 chunks per shard
+    (32,) * 3,  # With 1x1x1 chunks: 32x32x32 = 32768 chunks per shard (power-of-2)
+    (30,) * 3,  # With 1x1x1 chunks: 30x30x30 = 27000 chunks per shard (non-power-of-2)
+    (33,)
+    * 3,  # With 1x1x1 chunks: 33x33x33 = 35937 chunks per shard (near-miss: just above power-of-2)
 )
 
 
@@ -122,7 +126,7 @@ def test_sharded_morton_indexing_large(
     the Morton order computation a more significant portion of total time.
     The Morton order cache is cleared before each iteration.
     """
-    from zarr.core.indexing import _morton_order
+    from zarr.core.indexing import _morton_order, _morton_order_keys
 
     # 1x1x1 chunks means chunks_per_shard equals shard shape
     shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
@@ -145,6 +149,7 @@ def test_sharded_morton_indexing_large(
 
     def read_with_cache_clear() -> None:
         _morton_order.cache_clear()
+        _morton_order_keys.cache_clear()
         getitem(data, indexer)
 
     benchmark(read_with_cache_clear)
@@ -164,7 +169,7 @@ def test_sharded_morton_single_chunk(
     computing the full Morton order, making the optimization impact clear.
     The Morton order cache is cleared before each iteration.
     """
-    from zarr.core.indexing import _morton_order
+    from zarr.core.indexing import _morton_order, _morton_order_keys
 
     # 1x1x1 chunks means chunks_per_shard equals shard shape
     shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
@@ -187,16 +192,21 @@ def test_sharded_morton_single_chunk(
 
     def read_with_cache_clear() -> None:
         _morton_order.cache_clear()
+        _morton_order_keys.cache_clear()
         getitem(data, indexer)
 
     benchmark(read_with_cache_clear)
 
 
 # Benchmark for morton_order_iter directly (no I/O)
 morton_iter_shapes = (
-    (8, 8, 8),  # 512 elements
-    (16, 16, 16),  # 4096 elements
-    (32, 32, 32),  # 32768 elements
+    (8, 8, 8),  # 512 elements    (power-of-2)
+    (10, 10, 10),  # 1000 elements   (non-power-of-2)
+    (16, 16, 16),  # 4096 elements   (power-of-2)
+    (20, 20, 20),  # 8000 elements   (non-power-of-2)
+    (32, 32, 32),  # 32768 elements  (power-of-2)
+    (30, 30, 30),  # 27000 elements  (non-power-of-2)
+    (33, 33, 33),  # 35937 elements  (near-miss: just above power-of-2, n_z=262144)
 )
 
 
@@ -211,10 +221,11 @@ def test_morton_order_iter(
     optimization impact without array read/write overhead.
     The cache is cleared before each iteration.
     """
-    from zarr.core.indexing import _morton_order, morton_order_iter
+    from zarr.core.indexing import _morton_order, _morton_order_keys, morton_order_iter
 
     def compute_morton_order() -> None:
         _morton_order.cache_clear()
+        _morton_order_keys.cache_clear()
         # Consume the iterator to force computation
         list(morton_order_iter(shape))
 
@@ -239,7 +250,7 @@ def test_sharded_morton_write_single_chunk(
     """
     import numpy as np
 
-    from zarr.core.indexing import _morton_order
+    from zarr.core.indexing import _morton_order, _morton_order_keys
 
     # 1x1x1 chunks means chunks_per_shard equals shard shape
     shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
@@ -262,6 +273,7 @@ def test_sharded_morton_write_single_chunk(
 
     def write_with_cache_clear() -> None:
         _morton_order.cache_clear()
+        _morton_order_keys.cache_clear()
         data[indexer] = write_data
 
     benchmark(write_with_cache_clear)

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Vectorize get_chunk_slice for faster sharded array writes.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Add benchmarks for Morton order computation with non-power-of-2 and near-miss shard shapes, covering both pure computation and end-to-end read/write performance.`