Skip to content

Commit 021662c

Browse files
authored
fix: use contextual arrayspec when validating / evolving codecs (zarr-developers#3941)
* fix: use contextual arrayspec when validating / evolving codecs * docs: changelog * fix: gate tests depending on presence of compiled backend
1 parent ad374b5 commit 021662c

6 files changed

Lines changed: 88 additions & 2 deletions

File tree

changes/3938.bugfix.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fixed a `CastValue` validation bug where the "can we use an out-of-range mode" check
2+
inspected the source dtype instead of the target dtype. This meant arrays with a
3+
float source dtype and an integer target dtype incorrectly raised a `ValueError`
4+
when configured with a `wrap` out-of-range mode.

changes/3941.bugfix.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Fixed a bug where the codec pipeline evolved each codec against the original
2+
array spec instead of the spec produced by upstream array-to-array codecs. This
3+
caused failures whenever an upstream codec changed the dtype between codec
4+
boundaries — e.g. arrays using `CastValue` to convert a single-byte source dtype
5+
(`int8`) to a multi-byte target dtype (`int16`) raised a `ValueError` from
6+
`BytesCodec` about a missing `endian` configuration.

src/zarr/core/codec_pipeline.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,18 @@ class BatchedCodecPipeline(CodecPipeline):
187187
batch_size: int
188188

189189
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
190-
return type(self).from_codecs(c.evolve_from_array_spec(array_spec=array_spec) for c in self)
190+
# Each codec must be evolved against the spec it will actually see
191+
# at run-time, not the original array spec. Earlier array->array
192+
# codecs may transform the dtype (e.g. cast_value), so the spec
193+
# threaded into later codecs (the array->bytes serializer and any
194+
# bytes->bytes filters) must reflect those transformations.
195+
evolved: list[Codec] = []
196+
spec = array_spec
197+
for codec in self:
198+
evolved_codec = codec.evolve_from_array_spec(array_spec=spec)
199+
evolved.append(evolved_codec)
200+
spec = evolved_codec.resolve_metadata(spec)
201+
return type(self).from_codecs(evolved)
191202

192203
@classmethod
193204
def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self:

src/zarr/core/metadata/v3.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -503,7 +503,21 @@ def __init__(
503503
config=ArrayConfig.from_dict({}), # TODO: config is not needed here.
504504
prototype=default_buffer_prototype(), # TODO: prototype is not needed here.
505505
)
506-
codecs_parsed = tuple(c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial)
506+
# Thread the spec through evolution: each codec must be evolved against
507+
# the spec it will actually see at run-time, not the original array spec.
508+
# Earlier array->array codecs may transform the dtype (e.g. cast_value),
509+
# so the spec passed to later codecs must reflect those transformations.
510+
# Per-codec validate() must run before resolve_metadata(), since the
511+
# latter may rely on invariants the former checks (e.g. cast_value
512+
# rejects complex source dtypes that would otherwise crash _do_cast).
513+
evolved: list[Codec] = []
514+
spec = array_spec
515+
for c in codecs_parsed_partial:
516+
evolved_codec = c.evolve_from_array_spec(spec)
517+
evolved_codec.validate(shape=spec.shape, dtype=spec.dtype, chunk_grid=chunk_grid_parsed)
518+
evolved.append(evolved_codec)
519+
spec = evolved_codec.resolve_metadata(spec)
520+
codecs_parsed = tuple(evolved)
507521
validate_codecs(codecs_parsed_partial, data_type)
508522

509523
object.__setattr__(self, "shape", shape_parsed)

tests/test_codec_pipeline.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
from __future__ import annotations
22

3+
import numpy as np
34
import pytest
45

56
import zarr
7+
from zarr.codecs import BytesCodec, CastValue
68
from zarr.core.array import _get_chunk_spec
79
from zarr.core.buffer.core import default_buffer_prototype
810
from zarr.core.indexing import BasicIndexer
@@ -70,3 +72,51 @@ async def test_read_returns_get_results(
7072
assert len(results) == len(expected_statuses)
7173
for result, expected_status in zip(results, expected_statuses, strict=True):
7274
assert result["status"] == expected_status
75+
76+
77+
try:
78+
import cast_value_rs # noqa: F401
79+
80+
_HAS_CAST_VALUE_RS = True
81+
except ModuleNotFoundError:
82+
_HAS_CAST_VALUE_RS = False
83+
84+
requires_cast_value_rs = pytest.mark.skipif(
85+
not _HAS_CAST_VALUE_RS, reason="cast-value-rs not installed"
86+
)
87+
88+
89+
@requires_cast_value_rs
90+
@pytest.mark.parametrize(
91+
("source_dtype", "target_dtype"),
92+
[
93+
# Source is single-byte (no endianness); target is multi-byte (has endianness).
94+
# Without the fix, BytesCodec.evolve_from_array_spec sees the source dtype,
95+
# strips its `endian` to None, and then chokes when the chunk_spec dtype
96+
# gets transformed to the multi-byte target before bytes-decoding.
97+
("int8", "int16"),
98+
("uint8", "int32"),
99+
("int8", "float32"),
100+
# Source is multi-byte; target is single-byte (the reverse direction also
101+
# exercises the spec-threading logic).
102+
("int16", "int8"),
103+
],
104+
)
105+
def test_codec_pipeline_threads_dtype_through_evolve(source_dtype: str, target_dtype: str) -> None:
106+
"""Regression for #3937: each codec must be evolved against the spec it
107+
will see at runtime, not the original array spec. cast_value transforms
108+
the dtype between AA codecs and the array->bytes serializer."""
109+
arr = zarr.create_array(
110+
store={},
111+
shape=(4,),
112+
chunks=(4,),
113+
dtype=source_dtype,
114+
fill_value=0,
115+
filters=[CastValue(data_type=target_dtype)],
116+
serializer=BytesCodec(endian="little"),
117+
compressors=[],
118+
zarr_format=3,
119+
overwrite=True,
120+
)
121+
arr[:] = np.asarray([0, 1, 2, 3], dtype=source_dtype)
122+
np.testing.assert_array_equal(arr[:], np.asarray([0, 1, 2, 3], dtype=source_dtype))

tests/test_codecs/test_cast_value.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ def test_validation_rejects_invalid(case: ExpectErr[dict[str, Any]]) -> None:
165165
)
166166

167167

168+
@requires_cast_value_rs
168169
@pytest.mark.parametrize(
169170
("source_dtype", "target_dtype"),
170171
[

0 commit comments

Comments
 (0)