perf: Vectorize get_chunk_slice for faster sharded writes

mkitti · claude · mkitti · commit 6129cd32ec13 · 2026-02-17T01:09:08.000-05:00
Add vectorized methods to _ShardIndex and _ShardReader for batch
chunk slice lookups, reducing per-chunk function call overhead
when writing to shards.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/changes/3713.misc.md b/changes/3713.misc.md
@@ -0,0 +1 @@
+Vectorize get_chunk_slice for faster sharded array writes.
diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py
@@ -46,6 +46,7 @@
 from zarr.core.indexing import (
     BasicIndexer,
     SelectorTuple,
+    _morton_order,
     c_order_iter,
     get_indexer,
     morton_order_iter,
@@ -138,6 +139,45 @@ def get_chunk_slice(self, chunk_coords: tuple[int, ...]) -> tuple[int, int] | No
         else:
             return (int(chunk_start), int(chunk_start + chunk_len))
 
+    def get_chunk_slices_vectorized(
+        self, chunk_coords_array: npt.NDArray[np.uint64]
+    ) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint64], npt.NDArray[np.bool_]]:
+        """Look up the byte ranges of many chunks with a single batched index read.
+
+        Parameters
+        ----------
+        chunk_coords_array : ndarray of shape (n_chunks, n_dims)
+            Chunk coordinates to look up; each is wrapped modulo the index grid shape.
+
+        Returns
+        -------
+        starts : ndarray of shape (n_chunks,)
+            Start byte position of each chunk (first value of its index entry).
+        ends : ndarray of shape (n_chunks,)
+            ``starts + lengths`` for each chunk; only meaningful where ``valid`` is True.
+        valid : ndarray of shape (n_chunks,)
+            Boolean mask, True where the stored start is not ``MAX_UINT_64``.
+        """
+        # Wrap coordinates into the index grid (every axis of the table except the last)
+        shard_shape = np.array(self.offsets_and_lengths.shape[:-1], dtype=np.uint64)
+        localized = chunk_coords_array % shard_shape
+
+        # One 1-D column per axis, so the lookup below pairs coordinates row by row
+        index_tuple = tuple(localized[:, i] for i in range(localized.shape[1]))
+
+        # Fetch all offsets and lengths at once
+        offsets_and_lengths = self.offsets_and_lengths[index_tuple]
+        starts = offsets_and_lengths[:, 0]
+        lengths = offsets_and_lengths[:, 1]
+
+        # Only the start is tested against the MAX_UINT_64 sentinel, not the length
+        valid = starts != MAX_UINT_64
+
+        # Computed for every row, including rows where `valid` is False
+        ends = starts + lengths
+
+        return starts, ends, valid
+
     def set_chunk_slice(self, chunk_coords: tuple[int, ...], chunk_slice: slice | None) -> None:
         localized_chunk = self._localize_chunk(chunk_coords)
         if chunk_slice is None:
@@ -219,6 +259,35 @@ def __len__(self) -> int:
     def __iter__(self) -> Iterator[tuple[int, ...]]:
         return c_order_iter(self.index.offsets_and_lengths.shape[:-1])
 
+    def to_dict_vectorized(
+        self,
+        chunk_coords_array: npt.NDArray[np.uint64],
+        chunk_coords_tuples: tuple[tuple[int, ...], ...],
+    ) -> dict[tuple[int, ...], Buffer | None]:
+        """Map each chunk coordinate to its slice of ``self.buf`` via one batched lookup.
+
+        Parameters
+        ----------
+        chunk_coords_array : ndarray of shape (n_chunks, n_dims)
+            Array of chunk coordinates passed to the index for the batched lookup.
+        chunk_coords_tuples : tuple of tuples
+            The same coordinates, in the same row order, as tuples; used as the dict keys.
+
+        Returns
+        -------
+        dict mapping each coordinate tuple to a Buffer, or None where the lookup is not valid
+        """
+        starts, ends, valid = self.index.get_chunk_slices_vectorized(chunk_coords_array)
+
+        result: dict[tuple[int, ...], Buffer | None] = {}
+        for i, coords in enumerate(chunk_coords_tuples):  # row i pairs with this key
+            if valid[i]:
+                result[coords] = self.buf[int(starts[i]) : int(ends[i])]
+            else:
+                result[coords] = None
+
+        return result
+
 
 @dataclass(frozen=True)
 class ShardingCodec(
@@ -505,7 +574,10 @@ async def _encode_partial_single(
             chunks_per_shard=chunks_per_shard,
         )
         shard_reader = shard_reader or _ShardReader.create_empty(chunks_per_shard)
-        shard_dict = {k: shard_reader.get(k) for k in morton_order_iter(chunks_per_shard)}
+        # Use vectorized lookup for better performance
+        morton_coords = _morton_order(chunks_per_shard)
+        chunk_coords_array = np.array(morton_coords, dtype=np.uint64)
+        shard_dict = shard_reader.to_dict_vectorized(chunk_coords_array, morton_coords)
 
         indexer = list(
             get_indexer(

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Vectorize get_chunk_slice for faster sharded array writes.`