Skip to content

Commit e9d06c2

Browse files
committed
Merge branch 'main' into poc/unified-chunk-grid
2 parents 5cef539 + 8f14d67 commit e9d06c2

File tree

10 files changed

+174
-9
lines changed

10 files changed

+174
-9
lines changed

changes/3748.feature.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added `array.read_missing_chunks` configuration option. When set to `False`, reading missing chunks raises a `ChunkNotFoundError` instead of filling them with the array's fill value.

docs/user-guide/arrays.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,13 +158,25 @@ print(f"Shape after second append: {z.shape}")
158158

159159
Zarr arrays are parametrized with a configuration that determines certain aspects of array behavior.
160160

161-
We currently support two configuration options for arrays: `write_empty_chunks` and `order`.
161+
We currently support three configuration options for arrays: `write_empty_chunks`, `read_missing_chunks`, and `order`.
162162

163163
| field | type | default | description |
164164
| - | - | - | - |
165165
| `write_empty_chunks` | `bool` | `False` | Controls whether empty chunks are written to storage. See [Empty chunks](performance.md#empty-chunks). |
166+
| `read_missing_chunks` | `bool` | `True` | Controls whether missing chunks are filled with the array's fill value on read. If `False`, reading missing chunks raises a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError]. |
166167
| `order` | `Literal["C", "F"]` | `"C"` | The memory layout of arrays returned when reading data from the store. |
167168

169+
!!! info
170+
The Zarr V3 spec states that readers should interpret an uninitialized chunk as containing the
171+
array's `fill_value`. By default, Zarr-Python follows this behavior: a missing chunk is treated
172+
as uninitialized and filled with the array's `fill_value`. However, if you know that all chunks
173+
have been written (i.e., are initialized), you may want to treat a missing chunk as an error. Set
174+
`read_missing_chunks=False` to raise a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError] instead.
175+
176+
!!! note
177+
`write_empty_chunks=False` skips writing chunks that are entirely the array's fill value.
178+
If `read_missing_chunks=False`, attempting to read these missing chunks will raise a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError].
179+
168180
You can specify the configuration when you create an array with the `config` keyword argument.
169181
`config` can be passed as either a `dict` or an `ArrayConfig` object.
170182

docs/user-guide/config.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ Configuration options include the following:
3131
- Default array order in memory `array.order`
3232
- Whether empty chunks are written to storage `array.write_empty_chunks`
3333
- Enable experimental rectilinear chunks `array.rectilinear_chunks`
34+
- Whether missing chunks are filled with the array's fill value on read `array.read_missing_chunks` (default `True`). Set to `False` to raise a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError] instead.
3435
- Async and threading options, e.g. `async.concurrency` and `threading.max_workers`
3536
- Selections of implementations of codecs, codec pipelines and buffers
3637
- Enabling GPU support with `zarr.config.enable_gpu()`. See GPU support for more.

src/zarr/core/array.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@
128128
from zarr.core.sync import sync
129129
from zarr.errors import (
130130
ArrayNotFoundError,
131+
ChunkNotFoundError,
131132
MetadataValidationError,
132133
ZarrDeprecationWarning,
133134
ZarrUserWarning,
@@ -5849,7 +5850,8 @@ async def _get_selection(
58495850
_config = replace(_config, order=order)
58505851

58515852
# reading chunks and decoding them
5852-
await codec_pipeline.read(
5853+
indexed_chunks = list(indexer)
5854+
results = await codec_pipeline.read(
58535855
[
58545856
(
58555857
store_path / metadata.encode_chunk_key(chunk_coords),
@@ -5858,11 +5860,26 @@ async def _get_selection(
58585860
out_selection,
58595861
is_complete_chunk,
58605862
)
5861-
for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer
5863+
for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexed_chunks
58625864
],
58635865
out_buffer,
58645866
drop_axes=indexer.drop_axes,
58655867
)
5868+
if _config.read_missing_chunks is False:
5869+
missing_info = []
5870+
for i, result in enumerate(results):
5871+
if result["status"] == "missing":
5872+
coords = indexed_chunks[i][0]
5873+
key = metadata.encode_chunk_key(coords)
5874+
missing_info.append(f" chunk '{key}' (grid position {coords})")
5875+
if missing_info:
5876+
chunks_str = "\n".join(missing_info)
5877+
raise ChunkNotFoundError(
5878+
f"{len(missing_info)} chunk(s) not found in store '{store_path}'.\n"
5879+
f"Set the 'array.read_missing_chunks' config to True to fill "
5880+
f"missing chunks with the fill value.\n"
5881+
f"Missing chunks:\n{chunks_str}"
5882+
)
58665883
if isinstance(indexer, BasicIndexer) and indexer.shape == ():
58675884
return out_buffer.as_scalar()
58685885
return out_buffer.as_ndarray_like()

src/zarr/core/array_spec.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class ArrayConfigParams(TypedDict):
2828

2929
order: NotRequired[MemoryOrder]
3030
write_empty_chunks: NotRequired[bool]
31+
read_missing_chunks: NotRequired[bool]
3132

3233

3334
@dataclass(frozen=True)
@@ -41,17 +42,25 @@ class ArrayConfig:
4142
The memory layout of the arrays returned when reading data from the store.
4243
write_empty_chunks : bool
4344
If True, empty chunks will be written to the store.
45+
read_missing_chunks : bool
46+
If True, missing chunks will be filled with the array's fill value on read.
47+
If False, reading missing chunks will raise a ``ChunkNotFoundError``.
4448
"""
4549

4650
order: MemoryOrder
4751
write_empty_chunks: bool
52+
read_missing_chunks: bool
4853

49-
def __init__(self, order: MemoryOrder, write_empty_chunks: bool) -> None:
54+
def __init__(
55+
self, order: MemoryOrder, write_empty_chunks: bool, *, read_missing_chunks: bool = True
56+
) -> None:
5057
order_parsed = parse_order(order)
5158
write_empty_chunks_parsed = parse_bool(write_empty_chunks)
59+
read_missing_chunks_parsed = parse_bool(read_missing_chunks)
5260

5361
object.__setattr__(self, "order", order_parsed)
5462
object.__setattr__(self, "write_empty_chunks", write_empty_chunks_parsed)
63+
object.__setattr__(self, "read_missing_chunks", read_missing_chunks_parsed)
5564

5665
@classmethod
5766
def from_dict(cls, data: ArrayConfigParams) -> Self:
@@ -62,7 +71,9 @@ def from_dict(cls, data: ArrayConfigParams) -> Self:
6271
"""
6372
kwargs_out: ArrayConfigParams = {}
6473
for f in fields(ArrayConfig):
65-
field_name = cast("Literal['order', 'write_empty_chunks']", f.name)
74+
field_name = cast(
75+
"Literal['order', 'write_empty_chunks', 'read_missing_chunks']", f.name
76+
)
6677
if field_name not in data:
6778
kwargs_out[field_name] = zarr_config.get(f"array.{field_name}")
6879
else:
@@ -73,7 +84,11 @@ def to_dict(self) -> ArrayConfigParams:
7384
"""
7485
Serialize an instance of this class to a dict.
7586
"""
76-
return {"order": self.order, "write_empty_chunks": self.write_empty_chunks}
87+
return {
88+
"order": self.order,
89+
"write_empty_chunks": self.write_empty_chunks,
90+
"read_missing_chunks": self.read_missing_chunks,
91+
}
7792

7893

7994
ArrayConfigLike = ArrayConfig | ArrayConfigParams

src/zarr/core/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def enable_gpu(self) -> ConfigSet:
9696
"array": {
9797
"order": "C",
9898
"write_empty_chunks": False,
99+
"read_missing_chunks": True,
99100
"target_shard_size_bytes": None,
100101
"rectilinear_chunks": False,
101102
},

src/zarr/core/dtype/npy/time.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,8 @@ def cast_scalar(self, data: object) -> np.timedelta64:
545545
raise a TypeError.
546546
"""
547547
if self._check_scalar(data):
548+
if isinstance(data, np.timedelta64) and np.isnat(data):
549+
return np.timedelta64("NaT", self.unit)
548550
return self._cast_scalar_unchecked(data)
549551
msg = (
550552
f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the "
@@ -559,7 +561,7 @@ def default_scalar(self) -> np.timedelta64:
559561
This method provides a default value for the timedelta64 scalar, which is
560562
a 'Not-a-Time' (NaT) value.
561563
"""
562-
return np.timedelta64("NaT")
564+
return np.timedelta64("NaT", self.unit)
563565

564566
def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64:
565567
"""

src/zarr/errors.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"ArrayNotFoundError",
44
"BaseZarrError",
55
"BoundsCheckError",
6+
"ChunkNotFoundError",
67
"ContainsArrayAndGroupError",
78
"ContainsArrayError",
89
"ContainsGroupError",
@@ -144,3 +145,9 @@ class BoundsCheckError(IndexError): ...
144145

145146

146147
class ArrayIndexError(IndexError): ...
148+
149+
150+
class ChunkNotFoundError(BaseZarrError):
151+
"""
152+
Raised when a chunk that was expected to exist in storage was not retrieved successfully.
153+
"""

tests/test_config.py

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from zarr.core.codec_pipeline import BatchedCodecPipeline
2424
from zarr.core.config import BadConfigError, config
2525
from zarr.core.indexing import SelectorTuple
26-
from zarr.errors import ZarrUserWarning
26+
from zarr.errors import ChunkNotFoundError, ZarrUserWarning
2727
from zarr.registry import (
2828
fully_qualified_name,
2929
get_buffer_class,
@@ -53,6 +53,7 @@ def test_config_defaults_set() -> None:
5353
"array": {
5454
"order": "C",
5555
"write_empty_chunks": False,
56+
"read_missing_chunks": True,
5657
"target_shard_size_bytes": None,
5758
"rectilinear_chunks": False,
5859
},
@@ -320,6 +321,108 @@ class NewCodec2(BytesCodec):
320321
get_codec_class("new_codec")
321322

322323

324+
@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"])
325+
@pytest.mark.parametrize(
326+
"kwargs",
327+
[
328+
{"shards": (4, 4)},
329+
{"compressors": None},
330+
],
331+
ids=["partial_decode", "full_decode"],
332+
)
333+
def test_config_read_missing_chunks(store: Store, kwargs: dict[str, Any]) -> None:
334+
arr = zarr.create_array(
335+
store=store,
336+
shape=(4, 4),
337+
chunks=(2, 2),
338+
dtype="int32",
339+
fill_value=42,
340+
**kwargs,
341+
)
342+
343+
# default behavior: missing chunks are filled with the fill value
344+
result = zarr.open_array(store)[:]
345+
assert np.array_equal(result, np.full((4, 4), 42, dtype="int32"))
346+
347+
# with read_missing_chunks=False, reading missing chunks raises an error
348+
with config.set({"array.read_missing_chunks": False}):
349+
with pytest.raises(ChunkNotFoundError):
350+
zarr.open_array(store)[:]
351+
352+
# after writing data, all chunks exist and no error is raised
353+
arr[:] = np.arange(16, dtype="int32").reshape(4, 4)
354+
with config.set({"array.read_missing_chunks": False}):
355+
result = zarr.open_array(store)[:]
356+
assert np.array_equal(result, np.arange(16, dtype="int32").reshape(4, 4))
357+
358+
359+
@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"])
360+
def test_config_read_missing_chunks_sharded_inner(store: Store) -> None:
361+
"""Because the shard index and inner chunks should be stored
362+
together in a single storage object (read: a file or blob),
363+
we delegate to the shard index the responsibility of determining
364+
what chunks should be present.
365+
366+
Thus, `read_missing_chunks` raises an error only if the entire *shard*
367+
is missing. Missing inner chunks are filled with the array's fill value
368+
and do not raise an error, even if `read_missing_chunks=False` at the
369+
array level.
370+
"""
371+
arr = zarr.create_array(
372+
store=store,
373+
shape=(8, 4),
374+
chunks=(2, 2),
375+
shards=(4, 4),
376+
dtype="int32",
377+
fill_value=42,
378+
)
379+
380+
# write only one inner chunk in the first shard, leaving the second shard empty
381+
arr[0:2, 0:2] = np.ones((2, 2), dtype="int32")
382+
383+
with config.set({"array.read_missing_chunks": False}):
384+
a = zarr.open_array(store)
385+
386+
# first shard exists: missing inner chunks are filled, no error
387+
result = a[:4]
388+
expected = np.full((4, 4), 42, dtype="int32")
389+
expected[0:2, 0:2] = 1
390+
assert np.array_equal(result, expected)
391+
392+
# second shard is entirely missing: raises an error
393+
with pytest.raises(ChunkNotFoundError):
394+
a[4:]
395+
396+
397+
@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"])
398+
def test_config_read_missing_chunks_write_empty_chunks(store: Store) -> None:
399+
"""write_empty_chunks=False drops chunks equal to fill_value, which then
400+
appear missing to read_missing_chunks=False."""
401+
arr = zarr.create_array(
402+
store=store,
403+
shape=(4,),
404+
chunks=(2,),
405+
dtype="int32",
406+
fill_value=0,
407+
config={"write_empty_chunks": False, "read_missing_chunks": False},
408+
)
409+
410+
# write non-fill-value data: chunks are stored
411+
arr[:] = [1, 2, 3, 4]
412+
assert np.array_equal(arr[:], [1, 2, 3, 4])
413+
414+
# overwrite with fill_value: chunks are dropped by write_empty_chunks=False
415+
arr[:] = 0
416+
with pytest.raises(ChunkNotFoundError):
417+
arr[:]
418+
419+
# with write_empty_chunks=True, chunks are kept and no error is raised
420+
with config.set({"array.write_empty_chunks": True}):
421+
arr = zarr.open_array(store)
422+
arr[:] = 0
423+
assert np.array_equal(arr[:], [0, 0, 0, 0])
424+
425+
323426
@pytest.mark.parametrize(
324427
"key",
325428
[

tests/test_dtype/test_npy/test_time.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ class TestTimeDelta64(_TestTimeBase):
115115

116116
cast_value_params = (
117117
(TimeDelta64(unit="ns", scale_factor=1), "1", np.timedelta64(1, "ns")),
118-
(TimeDelta64(unit="ns", scale_factor=1), "NaT", np.timedelta64("NaT")),
118+
(TimeDelta64(unit="ns", scale_factor=1), "NaT", np.timedelta64("NaT", "ns")),
119119
)
120120
invalid_scalar_params = (
121121
(TimeDelta64(unit="Y", scale_factor=1), 1.3),
@@ -148,6 +148,12 @@ def test_time_scale_factor_too_low() -> None:
148148
TimeDelta64(scale_factor=scale_factor)
149149

150150

151+
def test_default_is_NaT() -> None:
152+
np.testing.assert_equal(
153+
TimeDelta64(unit="ns", scale_factor=1).default_scalar(), np.timedelta64("NaT", "ns")
154+
)
155+
156+
151157
def test_time_scale_factor_too_high() -> None:
152158
"""
153159
Test that an invalid unit raises a ValueError.

0 commit comments

Comments
 (0)