Skip to content

Commit acfc59f

Browse files
committed
fix: dispatch partial-decode in SyncCodecPipeline.read_sync
read_sync was always fetching the full chunk/shard blob and decoding it through the full codec chain. For sharded arrays, this meant a single-element read fetched the entire shard (~125x more IO than needed) and decoded every inner chunk (~125x more compute). Mirror the partial-encode dispatch already in write_sync: when the AB codec implements partial decoding (i.e. ShardingCodec), let the codec own its IO via _decode_partial_sync, fetching only the inner-chunk byte ranges that overlap the read selection.

Add ShardingCodec._decode_partial_sync — the sync equivalent of _decode_partial_single. It reads the shard index (or the full shard if the selection covers everything), decodes only the needed inner chunks through the inner ChunkTransform, and scatters the results into the output buffer.

Also extend tests/test_pipeline_parity.py with test_pipeline_read_parity: parametric over (codec config, layout, selection), where selections include scalar reads, partial slices, strided reads, and full reads. The original parity test only exercised full reads — this new test covers the partial-read code path that the regression hit.

Benchmark on shape=(105,)^3, chunks=(10,)^3, shards=(50,)^3, MemoryStore:

| Selection            | batched | sync (before) | sync (after) |
|----------------------|---------|---------------|--------------|
| scalar (0,0,0)       | 0.46 ms | 1.6 ms        | 0.24 ms      |
| full slice           | 83.4 ms | (n/a)         | 17.5 ms      |
| strided 4            | 82.8 ms | (n/a)         | 16.7 ms      |
| sub-block (10:-10:4) | 42.3 ms | (n/a)         | 9.7 ms       |

Fixes the codspeed regression on test_slice_indexing[(50,50,50)-(0,0,0)-memory] (was 4.6x slower, now 1.9x faster) and similar partial-read cases.
1 parent 1a1ff73 commit acfc59f

3 files changed

Lines changed: 185 additions & 0 deletions

File tree

src/zarr/codecs/sharding.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -882,6 +882,92 @@ async def _decode_partial_single(
882882
else:
883883
return out
884884

885+
def _decode_partial_sync(
886+
self,
887+
byte_getter: Any,
888+
selection: SelectorTuple,
889+
shard_spec: ArraySpec,
890+
) -> NDBuffer | None:
891+
"""Sync equivalent of ``_decode_partial_single``.
892+
893+
Reads only the inner-chunk byte ranges that overlap ``selection``
894+
(plus the shard index) and decodes them through the inner codec
895+
chain. The store must support ``get_sync`` with byte ranges.
896+
"""
897+
shard_shape = shard_spec.shape
898+
chunk_shape = self.chunk_shape
899+
chunks_per_shard = self._get_chunks_per_shard(shard_spec)
900+
chunk_spec = self._get_chunk_spec(shard_spec)
901+
inner_transform = self._get_inner_chunk_transform(shard_spec)
902+
903+
indexer = get_indexer(
904+
selection,
905+
shape=shard_shape,
906+
chunk_grid=ChunkGrid.from_sizes(shard_shape, chunk_shape),
907+
)
908+
909+
out = shard_spec.prototype.nd_buffer.empty(
910+
shape=indexer.shape,
911+
dtype=shard_spec.dtype.to_native_dtype(),
912+
order=shard_spec.order,
913+
)
914+
915+
indexed_chunks = list(indexer)
916+
all_chunk_coords = {chunk_coords for chunk_coords, *_ in indexed_chunks}
917+
918+
# Read just the inner chunks we need.
919+
if self._is_total_shard(all_chunk_coords, chunks_per_shard):
920+
shard_bytes = byte_getter.get_sync(prototype=chunk_spec.prototype)
921+
if shard_bytes is None:
922+
return None
923+
shard_reader = self._shard_reader_from_bytes_sync(shard_bytes, chunks_per_shard)
924+
shard_dict: ShardMapping = shard_reader
925+
else:
926+
shard_index_size = self._shard_index_size(chunks_per_shard)
927+
if self.index_location == ShardingCodecIndexLocation.start:
928+
index_bytes = byte_getter.get_sync(
929+
prototype=numpy_buffer_prototype(),
930+
byte_range=RangeByteRequest(0, shard_index_size),
931+
)
932+
else:
933+
index_bytes = byte_getter.get_sync(
934+
prototype=numpy_buffer_prototype(),
935+
byte_range=SuffixByteRequest(shard_index_size),
936+
)
937+
if index_bytes is None:
938+
return None
939+
shard_index = self._decode_shard_index_sync(index_bytes, chunks_per_shard)
940+
shard_dict_mut: dict[tuple[int, ...], Buffer | None] = {}
941+
for chunk_coords in all_chunk_coords:
942+
chunk_byte_slice = shard_index.get_chunk_slice(chunk_coords)
943+
if chunk_byte_slice is not None:
944+
chunk_bytes = byte_getter.get_sync(
945+
prototype=chunk_spec.prototype,
946+
byte_range=RangeByteRequest(chunk_byte_slice[0], chunk_byte_slice[1]),
947+
)
948+
if chunk_bytes is not None:
949+
shard_dict_mut[chunk_coords] = chunk_bytes
950+
shard_dict = shard_dict_mut
951+
952+
# Decode each needed inner chunk and scatter into out.
953+
fill_value = shard_spec.fill_value
954+
if fill_value is None:
955+
fill_value = shard_spec.dtype.default_scalar()
956+
for chunk_coords, chunk_selection, out_selection, _ in indexed_chunks:
957+
try:
958+
chunk_bytes = shard_dict[chunk_coords]
959+
except KeyError:
960+
chunk_bytes = None
961+
if chunk_bytes is None:
962+
out[out_selection] = fill_value
963+
continue
964+
chunk_array = inner_transform.decode_chunk(chunk_bytes, chunk_spec)
965+
out[out_selection] = chunk_array[chunk_selection]
966+
967+
if hasattr(indexer, "sel_shape"):
968+
return out.reshape(indexer.sel_shape)
969+
return out
970+
885971
async def _encode_single(
886972
self,
887973
shard_array: NDBuffer,

src/zarr/core/codec_pipeline.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -904,6 +904,12 @@ def read_sync(
904904
When ``n_workers > 0`` and there are multiple chunks, the decode
905905
step is parallelized across threads. This helps when codecs
906906
release the GIL (e.g. gzip, blosc, zstd).
907+
908+
Mirrors ``BatchedCodecPipeline.read_batch``: when the AB codec
909+
supports partial decoding (e.g. sharding), the codec handles its
910+
own IO and only fetches the inner-chunk byte ranges that overlap
911+
the read selection. Otherwise the pipeline fetches the full
912+
blob and decodes the whole chunk.
907913
"""
908914
assert self._sync_transform is not None
909915
transform = self._sync_transform
@@ -915,6 +921,25 @@ def read_sync(
915921
fill = fill_value_or_default(batch[0][1])
916922
_missing = GetResult(status="missing")
917923

924+
# Partial-decode fast path: the AB codec owns IO (read only the
925+
# byte ranges needed for the requested selection). Same condition
926+
# and dispatch as BatchedCodecPipeline.read_batch.
927+
if self.supports_partial_decode:
928+
codec = self.array_bytes_codec
929+
assert hasattr(codec, "_decode_partial_sync")
930+
partial_results: list[GetResult] = []
931+
for byte_getter, chunk_spec, chunk_selection, out_selection, _ in batch:
932+
decoded = codec._decode_partial_sync(byte_getter, chunk_selection, chunk_spec)
933+
if decoded is None:
934+
out[out_selection] = fill
935+
partial_results.append(_missing)
936+
continue
937+
if drop_axes:
938+
decoded = decoded.squeeze(axis=drop_axes)
939+
out[out_selection] = decoded
940+
partial_results.append(GetResult(status="present"))
941+
return tuple(partial_results)
942+
918943
# Phase 1: fetch all chunks (IO, sequential)
919944
raw_buffers: list[Buffer | None] = [
920945
bg.get_sync(prototype=cs.prototype) # type: ignore[attr-defined]

tests/test_pipeline_parity.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,3 +279,77 @@ def test_pipeline_parity(
279279
sync_arr,
280280
err_msg="BatchedCodecPipeline could not correctly read SyncCodecPipeline's output",
281281
)
282+
283+
284+
# ---------------------------------------------------------------------------
285+
# Read parity: cover partial reads (not just full reads as in the matrix above)
286+
# ---------------------------------------------------------------------------
287+
288+
289+
def _read_selections(shape: tuple[int, ...]) -> list[tuple[str, Any]]:
290+
"""Selections that exercise the partial-decode path differently."""
291+
if len(shape) == 1:
292+
n = shape[0]
293+
return [
294+
("scalar-first", (0,)),
295+
("scalar-mid", (n // 2,)),
296+
("partial-slice", (slice(n // 4, 3 * n // 4),)),
297+
("strided", (slice(0, n, 3),)),
298+
("full", (slice(None),)),
299+
]
300+
return [
301+
("scalar-first", (0,) * len(shape)),
302+
("scalar-mid", tuple(s // 2 for s in shape)),
303+
("partial-slice", tuple(slice(s // 4, 3 * s // 4) for s in shape)),
304+
("full", (slice(None),) * len(shape)),
305+
]
306+
307+
308+
def _read_matrix() -> Iterator[Any]:
    """Enumerate pytest params: every codec config x layout x read selection."""
    for codec_id, codec_kwargs in CODEC_CONFIGS:
        for layout_id, layout in LAYOUT_CONFIGS:
            selections = _read_selections(layout["shape"])
            for sel_id, sel in selections:
                param_id = f"{layout_id}-{codec_id}-{sel_id}"
                yield pytest.param(codec_kwargs, layout, sel, id=param_id)
319+
320+
@pytest.mark.parametrize(
    ("codec_kwargs", "layout", "selection"),
    list(_read_matrix()),
)
def test_pipeline_read_parity(
    codec_kwargs: CodecConfig,
    layout: LayoutConfig,
    selection: Any,
) -> None:
    """Partial reads via SyncCodecPipeline must match BatchedCodecPipeline.

    The full-write/full-read parity test above never exercises partial
    reads (e.g. a single element of a sharded array), which go through a
    separate code path (``_decode_partial_single`` on the sharding
    codec). Here the array is filled once under one pipeline, then the
    same selection is read under both pipelines and the results compared.
    """
    # Fill under batched (the canonical pipeline) so the contents are
    # well-defined regardless of the codec under test.
    store, _full = _write_under_pipeline(
        _BATCHED, codec_kwargs, layout, _full_overwrite(layout["shape"]), True
    )

    def _read_with(pipeline_path: str) -> Any:
        # Open read-only under the requested pipeline, apply the selection.
        with zarr_config.set({"codec_pipeline.path": pipeline_path}):
            return zarr.open_array(store=store, mode="r")[selection]

    batched_result = _read_with(_BATCHED)
    sync_result = _read_with(_SYNC)

    np.testing.assert_array_equal(
        sync_result,
        batched_result,
        err_msg=(
            f"SyncCodecPipeline read returned different result than BatchedCodecPipeline "
            f"for selection {selection!r}"
        ),
    )

0 commit comments

Comments
 (0)