Skip to content

Commit 3297e0d

Browse files
committed
execute performance improvement plan
1 parent 9071954 commit 3297e0d

16 files changed

Lines changed: 1814 additions & 847 deletions

File tree

src/zarr/abc/codec.py

Lines changed: 183 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -2,7 +2,8 @@
22

33
from abc import abstractmethod
44
from collections.abc import Mapping
5-
from typing import TYPE_CHECKING, Generic, Protocol, TypeGuard, TypeVar, runtime_checkable
5+
from dataclasses import dataclass
6+
from typing import TYPE_CHECKING, Any, Generic, Protocol, TypeGuard, TypeVar, runtime_checkable
67

78
from typing_extensions import ReadOnly, TypedDict
89

@@ -19,7 +20,7 @@
1920
from zarr.core.array_spec import ArraySpec
2021
from zarr.core.chunk_grids import ChunkGrid
2122
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
22-
from zarr.core.indexing import SelectorTuple
23+
from zarr.core.indexing import ChunkProjection, SelectorTuple
2324
from zarr.core.metadata import ArrayMetadata
2425

2526
__all__ = [
@@ -32,6 +33,7 @@
3233
"CodecInput",
3334
"CodecOutput",
3435
"CodecPipeline",
36+
"PreparedWrite",
3537
"SupportsSyncCodec",
3638
]
3739

@@ -200,9 +202,188 @@ class ArrayArrayCodec(BaseCodec[NDBuffer, NDBuffer]):
200202
"""Base class for array-to-array codecs."""
201203

202204

205+
def _is_complete_selection(selection: Any, shape: tuple[int, ...]) -> bool:
206+
"""Check whether a chunk selection covers the entire chunk shape."""
207+
if not isinstance(selection, tuple):
208+
selection = (selection,)
209+
for sel, dim_len in zip(selection, shape, strict=False):
210+
if isinstance(sel, int):
211+
if dim_len != 1:
212+
return False
213+
elif isinstance(sel, slice):
214+
start, stop, step = sel.indices(dim_len)
215+
if not (start == 0 and stop == dim_len and step == 1):
216+
return False
217+
else:
218+
return False
219+
return True
220+
221+
222+
@dataclass
class PreparedWrite:
    """Result of prepare_write: existing encoded chunk bytes + selection info.

    Produced by ``ArrayBytesCodec.prepare_write`` / ``prepare_write_sync`` and
    consumed by the codec pipeline, which decodes/merges the per-chunk buffers
    and then hands the instance to ``finalize_write`` for serialization.
    Comments below each field document that field.
    """

    # Per-inner-chunk encoded buffers keyed by chunk coordinates; ``None``
    # marks a chunk that is absent from the store.
    chunk_dict: dict[tuple[int, ...], Buffer | None]
    inner_codec_chain: Any  # CodecChain
    inner_chunk_spec: ArraySpec
    indexer: list[ChunkProjection]
    value_selection: SelectorTuple | None = None
    # If not None, slice value with this before using inner out_selections.
    # For sharding: the outer out_selection from batch_info.
    # For non-sharded: None (inner out_selection IS the outer out_selection).
    write_full_shard: bool = True
    # True when the entire shard blob will be written from scratch (either
    # because the shard doesn't exist yet or because the selection is complete).
    # Used by ShardingCodec.finalize_write to decide between set vs set_range.
    is_complete_shard: bool = False
    # True when the outer selection covers the entire shard. When True,
    # the indexer is empty and finalize_write receives the shard value
    # via shard_data. The codec then encodes the full shard in one shot
    # rather than iterating over individual inner chunks.
    shard_data: NDBuffer | None = None
    # The full shard value for complete-selection writes. Set by the pipeline
    # when is_complete_shard is True, before calling finalize_write.
246+
247+
203248
class ArrayBytesCodec(BaseCodec[NDBuffer, Buffer]):
    """Base class for array-to-bytes codecs.

    Besides the encode/decode interface inherited from ``BaseCodec``, this
    class provides the default single-chunk implementation of the
    serialize/deserialize + prepare/finalize read-write protocol used by the
    codec pipeline. ``ShardingCodec`` overrides these hooks to map one stored
    blob onto many inner chunks.
    """

    @property
    def inner_codec_chain(self) -> Any:
        """The codec chain for decoding inner chunks after deserialization.

        Returns None by default — the pipeline should use its own codec_chain.
        ShardingCodec overrides to return its inner codec chain.
        """
        return None

    def deserialize(
        self, raw: Buffer | None, chunk_spec: ArraySpec
    ) -> dict[tuple[int, ...], Buffer | None]:
        """Pure compute: unpack stored bytes into per-inner-chunk buffers.

        Default implementation: single chunk at (0,).
        ShardingCodec overrides to decode shard index and slice blob into per-chunk buffers.
        """
        return {(0,): raw}

    def serialize(
        self, chunk_dict: dict[tuple[int, ...], Buffer | None], chunk_spec: ArraySpec
    ) -> Buffer | None:
        """Pure compute: pack per-inner-chunk buffers into a storage blob.

        Default implementation: return the single chunk's bytes (or None if absent).
        ShardingCodec overrides to concatenate chunks + build index.
        Returns None if all chunks are empty (caller should delete the key).
        """
        return chunk_dict.get((0,))

    @staticmethod
    def _select_region(
        chunk_array: NDBuffer | None, chunk_selection: SelectorTuple
    ) -> NDBuffer | None:
        # Shared tail of prepare_read / prepare_read_sync: slice the decoded
        # chunk down to the requested region, or propagate a missing chunk.
        if chunk_array is None:
            return None
        return chunk_array[chunk_selection]

    def _make_prepared_write(
        self,
        existing: Buffer | None,
        chunk_spec: ArraySpec,
        chunk_selection: SelectorTuple,
        out_selection: SelectorTuple,
        codec_chain: Any,
        is_complete: bool,
    ) -> PreparedWrite:
        # Shared tail of prepare_write / prepare_write_sync: deserialize the
        # existing blob (None when the chunk is fully overwritten or absent)
        # and pair it with a one-entry indexer for the pipeline to merge into.
        chunk_dict = self.deserialize(existing, chunk_spec)
        inner_chain = self.inner_codec_chain or codec_chain
        return PreparedWrite(
            chunk_dict=chunk_dict,
            inner_codec_chain=inner_chain,
            inner_chunk_spec=chunk_spec,
            indexer=[((0,), chunk_selection, out_selection, is_complete)],  # type: ignore[list-item]
        )

    def prepare_read_sync(
        self,
        byte_getter: Any,
        chunk_spec: ArraySpec,
        chunk_selection: SelectorTuple,
        codec_chain: Any,
        aa_chain: Any,
        ab_pair: Any,
        bb_chain: Any,
    ) -> NDBuffer | None:
        """IO + full decode for the selected region. Returns decoded sub-array."""
        raw = byte_getter.get_sync(prototype=chunk_spec.prototype)
        chunk_array: NDBuffer | None = codec_chain.decode_chunk(
            raw, chunk_spec, aa_chain, ab_pair, bb_chain
        )
        return self._select_region(chunk_array, chunk_selection)

    def prepare_write_sync(
        self,
        byte_setter: Any,
        chunk_spec: ArraySpec,
        chunk_selection: SelectorTuple,
        out_selection: SelectorTuple,
        codec_chain: Any,
    ) -> PreparedWrite:
        """IO + deserialize. Returns PreparedWrite for the pipeline to decode/merge/encode."""
        is_complete = _is_complete_selection(chunk_selection, chunk_spec.shape)
        existing: Buffer | None = None
        if not is_complete:
            # Only fetch the stored chunk when we must merge into it; a
            # complete overwrite never needs the existing bytes.
            existing = byte_setter.get_sync(prototype=chunk_spec.prototype)
        return self._make_prepared_write(
            existing, chunk_spec, chunk_selection, out_selection, codec_chain, is_complete
        )

    async def prepare_read(
        self,
        byte_getter: Any,
        chunk_spec: ArraySpec,
        chunk_selection: SelectorTuple,
        codec_chain: Any,
        aa_chain: Any,
        ab_pair: Any,
        bb_chain: Any,
    ) -> NDBuffer | None:
        """Async IO + full decode for the selected region. Returns decoded sub-array."""
        raw = await byte_getter.get(prototype=chunk_spec.prototype)
        chunk_array: NDBuffer | None = codec_chain.decode_chunk(
            raw, chunk_spec, aa_chain, ab_pair, bb_chain
        )
        return self._select_region(chunk_array, chunk_selection)

    async def prepare_write(
        self,
        byte_setter: Any,
        chunk_spec: ArraySpec,
        chunk_selection: SelectorTuple,
        out_selection: SelectorTuple,
        codec_chain: Any,
    ) -> PreparedWrite:
        """Async IO + deserialize. Returns PreparedWrite for the pipeline to decode/merge/encode."""
        is_complete = _is_complete_selection(chunk_selection, chunk_spec.shape)
        existing: Buffer | None = None
        if not is_complete:
            # Only fetch the stored chunk when we must merge into it; a
            # complete overwrite never needs the existing bytes.
            existing = await byte_setter.get(prototype=chunk_spec.prototype)
        return self._make_prepared_write(
            existing, chunk_spec, chunk_selection, out_selection, codec_chain, is_complete
        )

    def finalize_write_sync(
        self, prepared: PreparedWrite, chunk_spec: ArraySpec, byte_setter: Any
    ) -> None:
        """Serialize prepared chunk_dict and write to store.

        Default: serialize to a single blob and call set (or delete if all empty).
        ShardingCodec overrides this for byte-range writes when inner codecs are fixed-size.
        """
        blob = self.serialize(prepared.chunk_dict, chunk_spec)
        if blob is None:
            byte_setter.delete_sync()
        else:
            byte_setter.set_sync(blob)

    async def finalize_write(
        self, prepared: PreparedWrite, chunk_spec: ArraySpec, byte_setter: Any
    ) -> None:
        """Async version of finalize_write_sync."""
        blob = self.serialize(prepared.chunk_dict, chunk_spec)
        if blob is None:
            await byte_setter.delete()
        else:
            await byte_setter.set(blob)
386+
206387

207388
class BytesBytesCodec(BaseCodec[Buffer, Buffer]):
208389
"""Base class for bytes-to-bytes codecs."""

src/zarr/abc/store.py

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@
2020
"ByteGetter",
2121
"ByteSetter",
2222
"Store",
23-
"SyncByteGetter",
24-
"SyncByteSetter",
2523
"set_or_delete",
2624
]
2725

@@ -473,6 +471,21 @@ async def set(self, key: str, value: Buffer) -> None:
473471
"""
474472
...
475473

474+
async def set_range(self, key: str, value: Buffer, start: int) -> None:
    """Write ``value`` into an existing key beginning at byte offset ``start``.

    The key must already exist and ``start + len(value)`` must not exceed
    the current size of the stored value.

    Parameters
    ----------
    key : str
        Key whose stored value is partially overwritten.
    value : Buffer
        Bytes written at offset ``start``.
    start : int
        Byte offset at which to begin writing.

    Raises
    ------
    NotImplementedError
        Always, unless a subclass overrides this method — in-place range
        writes are an optional store capability.
    """
    raise NotImplementedError(f"{type(self).__name__} does not support set_range")
488+
476489
async def set_if_not_exists(self, key: str, value: Buffer) -> None:
477490
"""
478491
Store a key to ``value`` if the key is not already present.
@@ -702,29 +715,13 @@ async def get(
702715

703716
async def set(self, value: Buffer) -> None: ...
704717

718+
async def set_range(self, value: Buffer, start: int) -> None:
    """Write ``value`` into this key beginning at byte offset ``start``."""
    ...
719+
705720
async def delete(self) -> None: ...
706721

707722
async def set_if_not_exists(self, default: Buffer) -> None: ...
708723

709724

710-
@runtime_checkable
711-
class SyncByteGetter(Protocol):
712-
"""Protocol for stores that support synchronous byte reads."""
713-
714-
def get_sync(
715-
self, prototype: BufferPrototype, byte_range: ByteRequest | None = None
716-
) -> Buffer | None: ...
717-
718-
719-
@runtime_checkable
720-
class SyncByteSetter(SyncByteGetter, Protocol):
721-
"""Protocol for stores that support synchronous byte reads, writes, and deletes."""
722-
723-
def set_sync(self, value: Buffer) -> None: ...
724-
725-
def delete_sync(self) -> None: ...
726-
727-
728725
async def set_or_delete(byte_setter: ByteSetter, value: Buffer | None) -> None:
729726
"""Set or delete a value in a byte setter
730727

0 commit comments

Comments
 (0)