Skip to content

Commit 2a3b404

Browse files
committed
refactor is_complete_chunk usage, add chunkrequest
1 parent 4632322 commit 2a3b404

File tree

6 files changed

+133
-118
lines changed

6 files changed

+133
-118
lines changed

src/zarr/abc/codec.py

Lines changed: 11 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
from zarr.abc.store import ByteGetter, ByteSetter, Store
2020
from zarr.core.array_spec import ArraySpec
2121
from zarr.core.chunk_grids import ChunkGrid
22+
from zarr.core.codec_pipeline import ChunkRequest
2223
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
2324
from zarr.core.indexing import ChunkProjection, SelectorTuple
2425
from zarr.core.metadata import ArrayMetadata
@@ -751,7 +752,7 @@ async def encode(
751752
@abstractmethod
752753
async def read(
753754
self,
754-
batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
755+
batch_info: Iterable[ChunkRequest],
755756
out: NDBuffer,
756757
drop_axes: tuple[int, ...] = (),
757758
) -> None:
@@ -760,12 +761,10 @@ async def read(
760761
761762
Parameters
762763
----------
763-
batch_info : Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]]
764-
Ordered set of information about the chunks.
765-
The first slice selection determines which parts of the chunk will be fetched.
766-
The second slice selection determines where in the output array the chunk data will be written.
767-
The ByteGetter is used to fetch the necessary bytes.
768-
The chunk spec contains information about the construction of an array from the bytes.
764+
batch_info : Iterable[ChunkRequest]
765+
Ordered set of chunk requests. Each ``ChunkRequest`` carries the
766+
store path (``byte_setter``), the ``ArraySpec`` for that chunk,
767+
chunk and output selections, and whether the chunk is complete.
769768
770769
If the Store returns ``None`` for a chunk, then the chunk was not
771770
written and the implementation must set the values of that chunk (or
@@ -778,7 +777,7 @@ async def read(
778777
@abstractmethod
779778
async def write(
780779
self,
781-
batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
780+
batch_info: Iterable[ChunkRequest],
782781
value: NDBuffer,
783782
drop_axes: tuple[int, ...] = (),
784783
) -> None:
@@ -788,12 +787,10 @@ async def write(
788787
789788
Parameters
790789
----------
791-
batch_info : Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]]
792-
Ordered set of information about the chunks.
793-
The first slice selection determines which parts of the chunk will be encoded.
794-
The second slice selection determines where in the value array the chunk data is located.
795-
The ByteSetter is used to fetch and write the necessary bytes.
796-
The chunk spec contains information about the chunk.
790+
batch_info : Iterable[ChunkRequest]
791+
Ordered set of chunk requests. Each ``ChunkRequest`` carries the
792+
store path (``byte_setter``), the ``ArraySpec`` for that chunk,
793+
chunk and output selections, and whether the chunk is complete.
797794
value : NDBuffer
798795
"""
799796
...

src/zarr/codecs/sharding.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -393,7 +393,7 @@ async def _decode_single(
393393

394394
transform = self._get_chunk_transform(chunk_spec)
395395
fill_value = fill_value_or_default(chunk_spec)
396-
for chunk_coords, chunk_selection, out_selection, _ in indexer:
396+
for chunk_coords, chunk_selection, out_selection, _is_complete in indexer:
397397
chunk_bytes = shard_dict.get(chunk_coords)
398398
if chunk_bytes is not None:
399399
chunk_array = await transform.decode_chunk_async(chunk_bytes)
@@ -461,7 +461,7 @@ async def _decode_partial_single(
461461
# decode chunks and write them into the output buffer
462462
transform = self._get_chunk_transform(chunk_spec)
463463
fill_value = fill_value_or_default(chunk_spec)
464-
for chunk_coords, chunk_selection, out_selection, _ in indexed_chunks:
464+
for chunk_coords, chunk_selection, out_selection, _is_complete in indexed_chunks:
465465
chunk_bytes = shard_dict.get(chunk_coords)
466466
if chunk_bytes is not None:
467467
chunk_array = await transform.decode_chunk_async(chunk_bytes)
@@ -541,14 +541,14 @@ async def _encode_partial_single(
541541
fill_value = fill_value_or_default(chunk_spec)
542542

543543
is_scalar = len(shard_array.shape) == 0
544-
for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer:
544+
for chunk_coords, chunk_selection, out_selection, is_complete in indexer:
545545
value = shard_array if is_scalar else shard_array[out_selection]
546-
if is_complete_chunk and not is_scalar and value.shape == chunk_spec.shape:
546+
if is_complete and not is_scalar and value.shape == chunk_spec.shape:
547547
# Complete overwrite with matching shape — use value directly
548548
chunk_data = value
549549
else:
550550
# Read-modify-write: decode existing or create new, merge data
551-
if is_complete_chunk:
551+
if is_complete:
552552
existing_bytes = None
553553
else:
554554
existing_bytes = shard_dict.get(chunk_coords)

src/zarr/core/array.py

Lines changed: 13 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,7 @@
4848
V2ChunkKeyEncoding,
4949
parse_chunk_key_encoding,
5050
)
51+
from zarr.core.codec_pipeline import ChunkRequest
5152
from zarr.core.common import (
5253
JSON,
5354
ZARR_JSON,
@@ -5602,12 +5603,12 @@ async def _get_selection(
56025603
# reading chunks and decoding them
56035604
await codec_pipeline.read(
56045605
[
5605-
(
5606-
store_path / metadata.encode_chunk_key(chunk_coords),
5607-
metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype),
5608-
chunk_selection,
5609-
out_selection,
5610-
is_complete_chunk,
5606+
ChunkRequest(
5607+
byte_setter=store_path / metadata.encode_chunk_key(chunk_coords),
5608+
chunk_spec=metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype),
5609+
chunk_selection=chunk_selection,
5610+
out_selection=out_selection,
5611+
is_complete_chunk=is_complete_chunk,
56115612
)
56125613
for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer
56135614
],
@@ -5912,12 +5913,12 @@ async def _set_selection(
59125913
# merging with existing data and encoding chunks
59135914
await codec_pipeline.write(
59145915
[
5915-
(
5916-
store_path / metadata.encode_chunk_key(chunk_coords),
5917-
metadata.get_chunk_spec(chunk_coords, _config, prototype),
5918-
chunk_selection,
5919-
out_selection,
5920-
is_complete_chunk,
5916+
ChunkRequest(
5917+
byte_setter=store_path / metadata.encode_chunk_key(chunk_coords),
5918+
chunk_spec=metadata.get_chunk_spec(chunk_coords, _config, prototype),
5919+
chunk_selection=chunk_selection,
5920+
out_selection=out_selection,
5921+
is_complete_chunk=is_complete_chunk,
59215922
)
59225923
for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer
59235924
],

src/zarr/core/codec_pipeline.py

Lines changed: 57 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -220,6 +220,20 @@ def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int:
220220
return byte_length
221221

222222

223+
@dataclass(slots=True)
224+
class ChunkRequest:
225+
"""A single chunk's worth of metadata for a pipeline read or write.
226+
227+
Replaces the anonymous 5-tuples formerly threaded through ``batch_info``.
228+
"""
229+
230+
byte_setter: ByteSetter
231+
chunk_spec: ArraySpec
232+
chunk_selection: SelectorTuple
233+
out_selection: SelectorTuple
234+
is_complete_chunk: bool
235+
236+
223237
@dataclass(frozen=True)
224238
class BatchedCodecPipeline(CodecPipeline):
225239
"""Default codec pipeline.
@@ -400,48 +414,40 @@ async def encode_partial_batch(
400414

401415
async def read_batch(
402416
self,
403-
batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
417+
batch_info: Iterable[ChunkRequest],
404418
out: NDBuffer,
405419
drop_axes: tuple[int, ...] = (),
406420
) -> None:
421+
batch_info = list(batch_info)
407422
if self.supports_partial_decode:
408423
chunk_array_batch = await self.decode_partial_batch(
409-
[
410-
(byte_getter, chunk_selection, chunk_spec)
411-
for byte_getter, chunk_spec, chunk_selection, *_ in batch_info
412-
]
424+
[(req.byte_setter, req.chunk_selection, req.chunk_spec) for req in batch_info]
413425
)
414-
for chunk_array, (_, chunk_spec, _, out_selection, _) in zip(
415-
chunk_array_batch, batch_info, strict=False
416-
):
426+
for chunk_array, req in zip(chunk_array_batch, batch_info, strict=False):
417427
if chunk_array is not None:
418-
out[out_selection] = chunk_array
428+
out[req.out_selection] = chunk_array
419429
else:
420-
out[out_selection] = fill_value_or_default(chunk_spec)
430+
out[req.out_selection] = fill_value_or_default(req.chunk_spec)
421431
else:
422432
chunk_bytes_batch = await concurrent_map(
423-
[(byte_getter, chunk_spec.prototype) for byte_getter, chunk_spec, *_ in batch_info],
433+
[(req.byte_setter, req.chunk_spec.prototype) for req in batch_info],
424434
lambda byte_getter, prototype: byte_getter.get(prototype),
425435
config.get("async.concurrency"),
426436
)
427437
chunk_array_batch = await self.decode_batch(
428438
[
429-
(chunk_bytes, chunk_spec)
430-
for chunk_bytes, (_, chunk_spec, *_) in zip(
431-
chunk_bytes_batch, batch_info, strict=False
432-
)
439+
(chunk_bytes, req.chunk_spec)
440+
for chunk_bytes, req in zip(chunk_bytes_batch, batch_info, strict=False)
433441
],
434442
)
435-
for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip(
436-
chunk_array_batch, batch_info, strict=False
437-
):
443+
for chunk_array, req in zip(chunk_array_batch, batch_info, strict=False):
438444
if chunk_array is not None:
439-
tmp = chunk_array[chunk_selection]
445+
tmp = chunk_array[req.chunk_selection]
440446
if drop_axes != ():
441447
tmp = tmp.squeeze(axis=drop_axes)
442-
out[out_selection] = tmp
448+
out[req.out_selection] = tmp
443449
else:
444-
out[out_selection] = fill_value_or_default(chunk_spec)
450+
out[req.out_selection] = fill_value_or_default(req.chunk_spec)
445451

446452
def _merge_chunk_array(
447453
self,
@@ -450,13 +456,11 @@ def _merge_chunk_array(
450456
out_selection: SelectorTuple,
451457
chunk_spec: ArraySpec,
452458
chunk_selection: SelectorTuple,
453-
is_complete_chunk: bool,
454459
drop_axes: tuple[int, ...],
455460
) -> NDBuffer:
456461
if (
457-
is_complete_chunk
462+
existing_chunk_array is None
458463
and value.shape == chunk_spec.shape
459-
# Guard that this is not a partial chunk at the end with is_complete_chunk=True
460464
and value[out_selection].shape == chunk_spec.shape
461465
):
462466
return value
@@ -489,24 +493,30 @@ def _merge_chunk_array(
489493

490494
async def write_batch(
491495
self,
492-
batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
496+
batch_info: Iterable[ChunkRequest],
493497
value: NDBuffer,
494498
drop_axes: tuple[int, ...] = (),
495499
) -> None:
500+
batch_info = list(batch_info)
496501
if self.supports_partial_encode:
497502
# Pass scalar values as is
498503
if len(value.shape) == 0:
499504
await self.encode_partial_batch(
500505
[
501-
(byte_setter, value, chunk_selection, chunk_spec)
502-
for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info
506+
(req.byte_setter, value, req.chunk_selection, req.chunk_spec)
507+
for req in batch_info
503508
],
504509
)
505510
else:
506511
await self.encode_partial_batch(
507512
[
508-
(byte_setter, value[out_selection], chunk_selection, chunk_spec)
509-
for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info
513+
(
514+
req.byte_setter,
515+
value[req.out_selection],
516+
req.chunk_selection,
517+
req.chunk_spec,
518+
)
519+
for req in batch_info
510520
],
511521
)
512522

@@ -523,61 +533,48 @@ async def _read_key(
523533
chunk_bytes_batch = await concurrent_map(
524534
[
525535
(
526-
None if is_complete_chunk else byte_setter,
527-
chunk_spec.prototype,
536+
None if req.is_complete_chunk else req.byte_setter,
537+
req.chunk_spec.prototype,
528538
)
529-
for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch_info
539+
for req in batch_info
530540
],
531541
_read_key,
532542
config.get("async.concurrency"),
533543
)
534544
chunk_array_decoded = await self.decode_batch(
535545
[
536-
(chunk_bytes, chunk_spec)
537-
for chunk_bytes, (_, chunk_spec, *_) in zip(
538-
chunk_bytes_batch, batch_info, strict=False
539-
)
546+
(chunk_bytes, req.chunk_spec)
547+
for chunk_bytes, req in zip(chunk_bytes_batch, batch_info, strict=False)
540548
],
541549
)
542550

543551
chunk_array_merged = [
544552
self._merge_chunk_array(
545553
chunk_array,
546554
value,
547-
out_selection,
548-
chunk_spec,
549-
chunk_selection,
550-
is_complete_chunk,
555+
req.out_selection,
556+
req.chunk_spec,
557+
req.chunk_selection,
551558
drop_axes,
552559
)
553-
for chunk_array, (
554-
_,
555-
chunk_spec,
556-
chunk_selection,
557-
out_selection,
558-
is_complete_chunk,
559-
) in zip(chunk_array_decoded, batch_info, strict=False)
560+
for chunk_array, req in zip(chunk_array_decoded, batch_info, strict=False)
560561
]
561562
chunk_array_batch: list[NDBuffer | None] = []
562-
for chunk_array, (_, chunk_spec, *_) in zip(
563-
chunk_array_merged, batch_info, strict=False
564-
):
563+
for chunk_array, req in zip(chunk_array_merged, batch_info, strict=False):
565564
if chunk_array is None:
566565
chunk_array_batch.append(None) # type: ignore[unreachable]
567566
else:
568-
if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
569-
fill_value_or_default(chunk_spec)
567+
if not req.chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
568+
fill_value_or_default(req.chunk_spec)
570569
):
571570
chunk_array_batch.append(None)
572571
else:
573572
chunk_array_batch.append(chunk_array)
574573

575574
chunk_bytes_batch = await self.encode_batch(
576575
[
577-
(chunk_array, chunk_spec)
578-
for chunk_array, (_, chunk_spec, *_) in zip(
579-
chunk_array_batch, batch_info, strict=False
580-
)
576+
(chunk_array, req.chunk_spec)
577+
for chunk_array, req in zip(chunk_array_batch, batch_info, strict=False)
581578
],
582579
)
583580

@@ -589,10 +586,8 @@ async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> Non
589586

590587
await concurrent_map(
591588
[
592-
(byte_setter, chunk_bytes)
593-
for chunk_bytes, (byte_setter, *_) in zip(
594-
chunk_bytes_batch, batch_info, strict=False
595-
)
589+
(req.byte_setter, chunk_bytes)
590+
for chunk_bytes, req in zip(chunk_bytes_batch, batch_info, strict=False)
596591
],
597592
_write_key,
598593
config.get("async.concurrency"),
@@ -618,7 +613,7 @@ async def encode(
618613

619614
async def read(
620615
self,
621-
batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
616+
batch_info: Iterable[ChunkRequest],
622617
out: NDBuffer,
623618
drop_axes: tuple[int, ...] = (),
624619
) -> None:
@@ -633,7 +628,7 @@ async def read(
633628

634629
async def write(
635630
self,
636-
batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
631+
batch_info: Iterable[ChunkRequest],
637632
value: NDBuffer,
638633
drop_axes: tuple[int, ...] = (),
639634
) -> None:

0 commit comments

Comments (0)