ChunkTransform requires sync codecs

d-v-b · d-v-b · commit d22b6f0d29a5 · 2026-03-16T21:45:04.000+01:00
diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py
@@ -2,7 +2,7 @@
 
 from dataclasses import dataclass, field
 from itertools import islice, pairwise
-from typing import TYPE_CHECKING, Any, TypeVar, cast
+from typing import TYPE_CHECKING, Any, TypeVar
 from warnings import warn
 
 from zarr.abc.codec import (
@@ -80,6 +80,9 @@ class ChunkTransform:
     The chunk's ``shape`` and ``dtype`` reflect the representation
     **after** all ArrayArrayCodec layers have been applied — i.e. the
     spec that feeds the ArrayBytesCodec.
+
+    All codecs must implement ``SupportsSyncCodec``. Construction will
+    raise ``TypeError`` if any codec does not.
     """
 
     codecs: tuple[Codec, ...]
@@ -92,9 +95,15 @@ class ChunkTransform:
     _ab_codec: ArrayBytesCodec = field(init=False, repr=False, compare=False)
     _ab_spec: ArraySpec = field(init=False, repr=False, compare=False)
     _bb_codecs: tuple[BytesBytesCodec, ...] = field(init=False, repr=False, compare=False)
-    _all_sync: bool = field(init=False, repr=False, compare=False)
 
     def __post_init__(self) -> None:
+        non_sync = [c for c in self.codecs if not isinstance(c, SupportsSyncCodec)]
+        if non_sync:
+            names = ", ".join(type(c).__name__ for c in non_sync)
+            raise TypeError(
+                f"All codecs must implement SupportsSyncCodec. The following do not: {names}"
+            )
+
         aa, ab, bb = codecs_from_list(list(self.codecs))
 
         layers: tuple[tuple[ArrayArrayCodec, ArraySpec], ...] = ()
@@ -107,7 +116,6 @@ def __post_init__(self) -> None:
         self._ab_codec = ab
         self._ab_spec = spec
         self._bb_codecs = bb
-        self._all_sync = all(isinstance(c, SupportsSyncCodec) for c in self.codecs)
 
     @property
     def shape(self) -> tuple[int, ...]:
@@ -119,26 +127,22 @@ def dtype(self) -> ZDType[TBaseDType, TBaseScalar]:
         """Dtype after all ArrayArrayCodec layers (input to the ArrayBytesCodec)."""
         return self._ab_spec.dtype
 
-    @property
-    def all_sync(self) -> bool:
-        return self._all_sync
-
     def decode_chunk(
         self,
         chunk_bytes: Buffer,
     ) -> NDBuffer:
         """Decode a single chunk through the full codec chain, synchronously.
 
-        Pure compute -- no IO. Only callable when all codecs support sync.
+        Pure compute -- no IO.
         """
         bb_out: Any = chunk_bytes
         for bb_codec in reversed(self._bb_codecs):
-            bb_out = cast("SupportsSyncCodec", bb_codec)._decode_sync(bb_out, self._ab_spec)
+            bb_out = bb_codec._decode_sync(bb_out, self._ab_spec)  # type: ignore[union-attr]
 
-        ab_out: Any = cast("SupportsSyncCodec", self._ab_codec)._decode_sync(bb_out, self._ab_spec)
+        ab_out: Any = self._ab_codec._decode_sync(bb_out, self._ab_spec)  # type: ignore[union-attr]
 
         for aa_codec, spec in reversed(self.layers):
-            ab_out = cast("SupportsSyncCodec", aa_codec)._decode_sync(ab_out, spec)
+            ab_out = aa_codec._decode_sync(ab_out, spec)  # type: ignore[union-attr]
 
         return ab_out  # type: ignore[no-any-return]
 
@@ -148,23 +152,23 @@ def encode_chunk(
     ) -> Buffer | None:
         """Encode a single chunk through the full codec chain, synchronously.
 
-        Pure compute -- no IO. Only callable when all codecs support sync.
+        Pure compute -- no IO.
         """
         aa_out: Any = chunk_array
 
         for aa_codec, spec in self.layers:
             if aa_out is None:
                 return None
-            aa_out = cast("SupportsSyncCodec", aa_codec)._encode_sync(aa_out, spec)
+            aa_out = aa_codec._encode_sync(aa_out, spec)  # type: ignore[union-attr]
 
         if aa_out is None:
             return None
-        bb_out: Any = cast("SupportsSyncCodec", self._ab_codec)._encode_sync(aa_out, self._ab_spec)
+        bb_out: Any = self._ab_codec._encode_sync(aa_out, self._ab_spec)  # type: ignore[union-attr]
 
         for bb_codec in self._bb_codecs:
             if bb_out is None:
                 return None
-            bb_out = cast("SupportsSyncCodec", bb_codec)._encode_sync(bb_out, self._ab_spec)
+            bb_out = bb_codec._encode_sync(bb_out, self._ab_spec)  # type: ignore[union-attr]
 
         return bb_out  # type: ignore[no-any-return]
 
diff --git a/tests/test_sync_codec_pipeline.py b/tests/test_sync_codec_pipeline.py
@@ -3,13 +3,16 @@
 from typing import Any
 
 import numpy as np
+import pytest
 
+from zarr.abc.codec import ArrayBytesCodec
 from zarr.codecs.bytes import BytesCodec
+from zarr.codecs.crc32c_ import Crc32cCodec
 from zarr.codecs.gzip import GzipCodec
 from zarr.codecs.transpose import TransposeCodec
 from zarr.codecs.zstd import ZstdCodec
 from zarr.core.array_spec import ArrayConfig, ArraySpec
-from zarr.core.buffer import NDBuffer, default_buffer_prototype
+from zarr.core.buffer import Buffer, NDBuffer, default_buffer_prototype
 from zarr.core.codec_pipeline import ChunkTransform
 from zarr.core.dtype import get_data_type_from_native_dtype
 
@@ -30,24 +33,26 @@ def _make_nd_buffer(arr: np.ndarray[Any, np.dtype[Any]]) -> NDBuffer:
 
 
 class TestChunkTransform:
-    def test_all_sync(self) -> None:
+    def test_construction_bytes_only(self) -> None:
+        # Construction succeeds when all codecs implement SupportsSyncCodec.
         spec = _make_array_spec((100,), np.dtype("float64"))
-        chain = ChunkTransform(codecs=(BytesCodec(),), array_spec=spec)
-        assert chain.all_sync is True
+        ChunkTransform(codecs=(BytesCodec(),), array_spec=spec)
 
-    def test_all_sync_with_compression(self) -> None:
+    def test_construction_with_compression(self) -> None:
+        # AB + BB codec chain where both implement SupportsSyncCodec.
         spec = _make_array_spec((100,), np.dtype("float64"))
-        chain = ChunkTransform(codecs=(BytesCodec(), GzipCodec()), array_spec=spec)
-        assert chain.all_sync is True
+        ChunkTransform(codecs=(BytesCodec(), GzipCodec()), array_spec=spec)
 
-    def test_all_sync_full_chain(self) -> None:
+    def test_construction_full_chain(self) -> None:
+        # All three codec types (AA + AB + BB), all implementing SupportsSyncCodec.
         spec = _make_array_spec((3, 4), np.dtype("float64"))
-        chain = ChunkTransform(
+        ChunkTransform(
             codecs=(TransposeCodec(order=(1, 0)), BytesCodec(), ZstdCodec()), array_spec=spec
         )
-        assert chain.all_sync is True
 
     def test_encode_decode_roundtrip_bytes_only(self) -> None:
+        # Minimal round-trip: BytesCodec serializes the array to bytes and back.
+        # No compression, no AA transform.
         arr = np.arange(100, dtype="float64")
         spec = _make_array_spec(arr.shape, arr.dtype)
         chain = ChunkTransform(codecs=(BytesCodec(),), array_spec=spec)
@@ -59,11 +64,14 @@ def test_encode_decode_roundtrip_bytes_only(self) -> None:
         np.testing.assert_array_equal(arr, decoded.as_numpy_array())
 
     def test_layers_no_aa_codecs(self) -> None:
+        # When there are no ArrayArrayCodecs, layers should be empty.
         spec = _make_array_spec((100,), np.dtype("float64"))
         chunk = ChunkTransform(codecs=(BytesCodec(), GzipCodec()), array_spec=spec)
         assert chunk.layers == ()
 
     def test_layers_with_transpose(self) -> None:
+        # With one AA codec (TransposeCodec), layers should contain exactly one
+        # entry pairing the codec with its input ArraySpec.
         spec = _make_array_spec((3, 4), np.dtype("float64"))
         transpose = TransposeCodec(order=(1, 0))
         chunk = ChunkTransform(codecs=(transpose, BytesCodec(), ZstdCodec()), array_spec=spec)
@@ -72,19 +80,24 @@ def test_layers_with_transpose(self) -> None:
         assert chunk.layers[0][1] is spec
 
     def test_shape_dtype_no_aa_codecs(self) -> None:
+        # Without AA codecs, shape and dtype should match the input ArraySpec
+        # (no transforms applied before the AB codec).
         spec = _make_array_spec((100,), np.dtype("float64"))
         chunk = ChunkTransform(codecs=(BytesCodec(),), array_spec=spec)
         assert chunk.shape == (100,)
         assert chunk.dtype == spec.dtype
 
     def test_shape_dtype_with_transpose(self) -> None:
+        # TransposeCodec(order=(1,0)) on a (3, 4) array produces (4, 3).
+        # shape/dtype reflect what the AB codec sees after all AA transforms.
         spec = _make_array_spec((3, 4), np.dtype("float64"))
         chunk = ChunkTransform(codecs=(TransposeCodec(order=(1, 0)), BytesCodec()), array_spec=spec)
-        # After transpose (1,0), shape (3,4) becomes (4,3)
         assert chunk.shape == (4, 3)
         assert chunk.dtype == spec.dtype
 
     def test_encode_decode_roundtrip_with_compression(self) -> None:
+        # Round-trip with a BB codec (GzipCodec) to verify that bytes-bytes
+        # compression/decompression is wired correctly.
         arr = np.arange(100, dtype="float64")
         spec = _make_array_spec(arr.shape, arr.dtype)
         chain = ChunkTransform(codecs=(BytesCodec(), GzipCodec(level=1)), array_spec=spec)
@@ -96,6 +109,9 @@ def test_encode_decode_roundtrip_with_compression(self) -> None:
         np.testing.assert_array_equal(arr, decoded.as_numpy_array())
 
     def test_encode_decode_roundtrip_with_transpose(self) -> None:
+        # Full AA + AB + BB chain round-trip. Transpose permutes axes on encode,
+        # then BytesCodec serializes, then ZstdCodec compresses. Decode reverses
+        # all three stages. Verifies the full pipeline works end to end.
         arr = np.arange(12, dtype="float64").reshape(3, 4)
         spec = _make_array_spec(arr.shape, arr.dtype)
         chain = ChunkTransform(
@@ -108,3 +124,115 @@ def test_encode_decode_roundtrip_with_transpose(self) -> None:
         assert encoded is not None
         decoded = chain.decode_chunk(encoded)
         np.testing.assert_array_equal(arr, decoded.as_numpy_array())
+
+    def test_rejects_non_sync_codec(self) -> None:
+        # Construction must raise TypeError when a codec does not implement SupportsSyncCodec.
+
+        class AsyncOnlyCodec(ArrayBytesCodec):
+            is_fixed_size = True
+
+            async def _decode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> NDBuffer:
+                raise NotImplementedError  # pragma: no cover
+
+            async def _encode_single(
+                self, chunk_array: NDBuffer, chunk_spec: ArraySpec
+            ) -> Buffer | None:
+                raise NotImplementedError  # pragma: no cover
+
+            def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
+                return input_byte_length  # pragma: no cover
+
+        spec = _make_array_spec((100,), np.dtype("float64"))
+        with pytest.raises(TypeError, match="AsyncOnlyCodec"):
+            ChunkTransform(codecs=(AsyncOnlyCodec(),), array_spec=spec)
+
+    def test_rejects_mixed_sync_and_non_sync(self) -> None:
+        # Even if some codecs support sync, a single non-sync codec should
+        # cause construction to fail.
+
+        class AsyncOnlyCodec(ArrayBytesCodec):
+            is_fixed_size = True
+
+            async def _decode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> NDBuffer:
+                raise NotImplementedError  # pragma: no cover
+
+            async def _encode_single(
+                self, chunk_array: NDBuffer, chunk_spec: ArraySpec
+            ) -> Buffer | None:
+                raise NotImplementedError  # pragma: no cover
+
+            def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
+                return input_byte_length  # pragma: no cover
+
+        spec = _make_array_spec((3, 4), np.dtype("float64"))
+        with pytest.raises(TypeError, match="AsyncOnlyCodec"):
+            ChunkTransform(
+                codecs=(TransposeCodec(order=(1, 0)), AsyncOnlyCodec()),
+                array_spec=spec,
+            )
+
+    def test_compute_encoded_size_bytes_only(self) -> None:
+        # BytesCodec is size-preserving: encoded size == input size.
+        spec = _make_array_spec((100,), np.dtype("float64"))
+        chain = ChunkTransform(codecs=(BytesCodec(),), array_spec=spec)
+        assert chain.compute_encoded_size(800, spec) == 800
+
+    def test_compute_encoded_size_with_crc32c(self) -> None:
+        # Crc32cCodec appends a 4-byte checksum, so encoded size = input + 4.
+        spec = _make_array_spec((100,), np.dtype("float64"))
+        chain = ChunkTransform(codecs=(BytesCodec(), Crc32cCodec()), array_spec=spec)
+        assert chain.compute_encoded_size(800, spec) == 804
+
+    def test_compute_encoded_size_with_transpose(self) -> None:
+        # TransposeCodec reorders axes but doesn't change the byte count.
+        # Verifies that compute_encoded_size walks through AA codecs correctly.
+        spec = _make_array_spec((3, 4), np.dtype("float64"))
+        chain = ChunkTransform(codecs=(TransposeCodec(order=(1, 0)), BytesCodec()), array_spec=spec)
+        assert chain.compute_encoded_size(96, spec) == 96
+
+    def test_encode_chunk_returns_none_propagation(self) -> None:
+        # When an AA codec returns None (signaling "this chunk is the fill value,
+        # don't store it"), encode_chunk must short-circuit and return None
+        # instead of passing None into the next codec.
+
+        class NoneReturningAACodec(TransposeCodec):
+            """An ArrayArrayCodec that always returns None from encode."""
+
+            def _encode_sync(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer | None:
+                return None
+
+        spec = _make_array_spec((3, 4), np.dtype("float64"))
+        chain = ChunkTransform(
+            codecs=(NoneReturningAACodec(order=(1, 0)), BytesCodec()),
+            array_spec=spec,
+        )
+        arr = np.arange(12, dtype="float64").reshape(3, 4)
+        nd_buf = _make_nd_buffer(arr)
+        assert chain.encode_chunk(nd_buf) is None
+
+    def test_encode_decode_roundtrip_with_crc32c(self) -> None:
+        # Round-trip through BytesCodec + Crc32cCodec. Crc32c appends a checksum
+        # on encode and verifies it on decode, so this tests that the BB codec
+        # pipeline runs correctly in both directions.
+        arr = np.arange(100, dtype="float64")
+        spec = _make_array_spec(arr.shape, arr.dtype)
+        chain = ChunkTransform(codecs=(BytesCodec(), Crc32cCodec()), array_spec=spec)
+        nd_buf = _make_nd_buffer(arr)
+
+        encoded = chain.encode_chunk(nd_buf)
+        assert encoded is not None
+        decoded = chain.decode_chunk(encoded)
+        np.testing.assert_array_equal(arr, decoded.as_numpy_array())
+
+    def test_encode_decode_roundtrip_int32(self) -> None:
+        # Round-trip with int32 data to verify that the codec chain is not
+        # float-specific. Exercises a different dtype path through BytesCodec.
+        arr = np.arange(50, dtype="int32")
+        spec = _make_array_spec(arr.shape, arr.dtype)
+        chain = ChunkTransform(codecs=(BytesCodec(), ZstdCodec(level=1)), array_spec=spec)
+        nd_buf = _make_nd_buffer(arr)
+
+        encoded = chain.encode_chunk(nd_buf)
+        assert encoded is not None
+        decoded = chain.decode_chunk(encoded)
+        np.testing.assert_array_equal(arr, decoded.as_numpy_array())