Fix partial shard reads when some chunks in a shard are all fill value, and add tests

aldenks · aldenks · commit c65cf828eef6 · 2025-07-21T16:47:46.000-04:00
diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py
@@ -90,9 +90,9 @@ async def get(
         self, prototype: BufferPrototype, byte_range: ByteRequest | None = None
     ) -> Buffer | None:
         assert byte_range is None, "byte_range is not supported within shards"
-        assert (
-            prototype == default_buffer_prototype()
-        ), f"prototype is not supported within shards currently. diff: {prototype} != {default_buffer_prototype()}"
+        assert prototype == default_buffer_prototype(), (
+            f"prototype is not supported within shards currently. diff: {prototype} != {default_buffer_prototype()}"
+        )
         return self.shard_dict.get(self.chunk_coords)
 
 
@@ -124,9 +124,7 @@ def chunks_per_shard(self) -> ChunkCoords:
     def _localize_chunk(self, chunk_coords: ChunkCoords) -> ChunkCoords:
         return tuple(
             chunk_i % shard_i
-            for chunk_i, shard_i in zip(
-                chunk_coords, self.offsets_and_lengths.shape, strict=False
-            )
+            for chunk_i, shard_i in zip(chunk_coords, self.offsets_and_lengths.shape, strict=False)
         )
 
     def is_all_empty(self) -> bool:
@@ -143,9 +141,7 @@ def get_chunk_slice(self, chunk_coords: ChunkCoords) -> tuple[int, int] | None:
         else:
             return (int(chunk_start), int(chunk_start + chunk_len))
 
-    def set_chunk_slice(
-        self, chunk_coords: ChunkCoords, chunk_slice: slice | None
-    ) -> None:
+    def set_chunk_slice(self, chunk_coords: ChunkCoords, chunk_slice: slice | None) -> None:
         localized_chunk = self._localize_chunk(chunk_coords)
         if chunk_slice is None:
             self.offsets_and_lengths[localized_chunk] = (MAX_UINT_64, MAX_UINT_64)
@@ -167,11 +163,7 @@ def is_dense(self, chunk_byte_length: int) -> bool:
 
         # Are all non-empty offsets unique?
         if len(
-            {
-                offset
-                for offset, _ in sorted_offsets_and_lengths
-                if offset != MAX_UINT_64
-            }
+            {offset for offset, _ in sorted_offsets_and_lengths if offset != MAX_UINT_64}
         ) != len(sorted_offsets_and_lengths):
             return False
 
@@ -275,9 +267,7 @@ def __setitem__(self, chunk_coords: ChunkCoords, value: Buffer) -> None:
         chunk_start = len(self.buf)
         chunk_length = len(value)
         self.buf += value
-        self.index.set_chunk_slice(
-            chunk_coords, slice(chunk_start, chunk_start + chunk_length)
-        )
+        self.index.set_chunk_slice(chunk_coords, slice(chunk_start, chunk_start + chunk_length))
 
     def __delitem__(self, chunk_coords: ChunkCoords) -> None:
         raise NotImplementedError
@@ -291,9 +281,7 @@ async def finalize(
         if index_location == ShardingCodecIndexLocation.start:
             empty_chunks_mask = self.index.offsets_and_lengths[..., 0] == MAX_UINT_64
             self.index.offsets_and_lengths[~empty_chunks_mask, 0] += len(index_bytes)
-            index_bytes = await index_encoder(
-                self.index
-            )  # encode again with corrected offsets
+            index_bytes = await index_encoder(self.index)  # encode again with corrected offsets
             out_buf = index_bytes + self.buf
         else:
             out_buf = self.buf + index_bytes
@@ -371,8 +359,7 @@ def __init__(
         chunk_shape: ChunkCoordsLike,
         codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),),
         index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()),
-        index_location: ShardingCodecIndexLocation
-        | str = ShardingCodecIndexLocation.end,
+        index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end,
     ) -> None:
         chunk_shape_parsed = parse_shapelike(chunk_shape)
         codecs_parsed = parse_codecs(codecs)
@@ -402,9 +389,7 @@ def __setstate__(self, state: dict[str, Any]) -> None:
         object.__setattr__(self, "chunk_shape", parse_shapelike(config["chunk_shape"]))
         object.__setattr__(self, "codecs", parse_codecs(config["codecs"]))
         object.__setattr__(self, "index_codecs", parse_codecs(config["index_codecs"]))
-        object.__setattr__(
-            self, "index_location", parse_index_location(config["index_location"])
-        )
+        object.__setattr__(self, "index_location", parse_index_location(config["index_location"]))
 
         # Use instance-local lru_cache to avoid memory leaks
         # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec))
@@ -433,9 +418,7 @@ def to_dict(self) -> dict[str, JSON]:
 
     def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
         shard_spec = self._get_chunk_spec(array_spec)
-        evolved_codecs = tuple(
-            c.evolve_from_array_spec(array_spec=shard_spec) for c in self.codecs
-        )
+        evolved_codecs = tuple(c.evolve_from_array_spec(array_spec=shard_spec) for c in self.codecs)
         if evolved_codecs != self.codecs:
             return replace(self, codecs=evolved_codecs)
         return self
@@ -610,9 +593,7 @@ async def _encode_single(
             shard_array,
         )
 
-        return await shard_builder.finalize(
-            self.index_location, self._encode_shard_index
-        )
+        return await shard_builder.finalize(self.index_location, self._encode_shard_index)
 
     async def _encode_partial_single(
         self,
@@ -672,8 +653,7 @@ def _is_total_shard(
         self, all_chunk_coords: set[ChunkCoords], chunks_per_shard: ChunkCoords
     ) -> bool:
         return len(all_chunk_coords) == product(chunks_per_shard) and all(
-            chunk_coords in all_chunk_coords
-            for chunk_coords in c_order_iter(chunks_per_shard)
+            chunk_coords in all_chunk_coords for chunk_coords in c_order_iter(chunks_per_shard)
         )
 
     async def _decode_shard_index(
@@ -699,9 +679,7 @@ async def _encode_shard_index(self, index: _ShardIndex) -> Buffer:
                 .encode(
                     [
                         (
-                            get_ndbuffer_class().from_numpy_array(
-                                index.offsets_and_lengths
-                            ),
+                            get_ndbuffer_class().from_numpy_array(index.offsets_and_lengths),
                             self._get_index_chunk_spec(index.chunks_per_shard),
                         )
                     ],
@@ -810,9 +788,10 @@ async def _load_partial_shard_maybe(
             _ChunkCoordsByteSlice(chunk_coords, slice(*chunk_byte_slice))
             for chunk_coords in all_chunk_coords
             # Drop chunks where index lookup fails
+            # e.g. when write_empty_chunks = False and the chunk is empty
             if (chunk_byte_slice := shard_index.get_chunk_slice(chunk_coords))
         ]
-        if len(chunks) < len(all_chunk_coords):
+        if len(chunks) == 0:
             return None
 
         groups = self._coalesce_chunks(chunks)
@@ -854,9 +833,7 @@ def _coalesce_chunks(
 
         for chunk in sorted_chunks[1:]:
             gap_to_chunk = chunk.byte_slice.start - current_group[-1].byte_slice.stop
-            size_if_coalesced = (
-                chunk.byte_slice.stop - current_group[0].byte_slice.start
-            )
+            size_if_coalesced = chunk.byte_slice.stop - current_group[0].byte_slice.start
             if gap_to_chunk < max_gap_bytes and size_if_coalesced < coalesce_max_bytes:
                 current_group.append(chunk)
             else:
@@ -899,9 +876,7 @@ async def _get_group_bytes(
 
         return shard_dict
 
-    def compute_encoded_size(
-        self, input_byte_length: int, shard_spec: ArraySpec
-    ) -> int:
+    def compute_encoded_size(self, input_byte_length: int, shard_spec: ArraySpec) -> int:
         chunks_per_shard = self._get_chunks_per_shard(shard_spec)
         return input_byte_length + self._shard_index_size(chunks_per_shard)
 
diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py
@@ -344,6 +344,79 @@ def test_sharding_multiple_chunks_partial_shard_read(
         assert isinstance(kwargs["byte_range"], (SuffixByteRequest, RangeByteRequest))
 
 
+@pytest.mark.parametrize("index_location", ["start", "end"])
+@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"])
+def test_sharding_read_empty_chunks_within_non_empty_shard_write_empty_false(
+    store: Store, index_location: ShardingCodecIndexLocation
+) -> None:
+    """
+    Case where
+        - some, but not all, chunks in the last shard are empty
+        - the last shard is not complete (array length is not a multiple of shard shape),
+          this takes us down the partial shard read path
+        - write_empty_chunks=False so the shard index will have fewer entries than chunks in the shard
+    """
+    # array with mixed empty and non-empty chunks in second shard
+    data = np.array([
+        # shard 0. full 8 elements, all chunks have some non-fill data
+        0, 1, 2, 3, 4, 5, 6, 7,
+        # shard 1. 6 elements (< shard shape)
+        2, 0, # chunk 0, written
+        0, 0, # chunk 1, all fill, not written
+        4, 5  # chunk 2, written
+    ], dtype="int32")  # fmt: off
+
+    spath = StorePath(store)
+    a = zarr.create_array(
+        spath,
+        shape=(14,),
+        chunks=(2,),
+        shards={"shape": (8,), "index_location": index_location},
+        dtype="int32",
+        fill_value=0,
+        filters=None,
+        compressors=None,
+        config={"write_empty_chunks": False},
+    )
+    a[:] = data
+
+    assert np.array_equal(a[:], data)
+
+
+@pytest.mark.parametrize("index_location", ["start", "end"])
+@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"])
+def test_sharding_read_empty_chunks_within_empty_shard_write_empty_false(
+    store: Store, index_location: ShardingCodecIndexLocation
+) -> None:
+    """
+    Case where
+        - all chunks in last shard are empty
+        - the last shard is not complete (array length is not a multiple of shard shape),
+          this takes us down the partial shard read path
+        - write_empty_chunks=False so the shard index will have no entries
+    """
+    fill_value = -99
+    shard_size = 8
+    data = np.arange(14, dtype="int32")
+    data[shard_size:] = fill_value  # 2nd shard is all fill value
+
+    spath = StorePath(store)
+    a = zarr.create_array(
+        spath,
+        shape=(14,),
+        chunks=(2,),
+        shards={"shape": (shard_size,), "index_location": index_location},
+        dtype="int32",
+        fill_value=fill_value,
+        filters=None,
+        compressors=None,
+        config={"write_empty_chunks": False},
+    )
+    a[:] = data
+
+    assert np.array_equal(a[:], data)
+
+
 @pytest.mark.parametrize("index_location", ["start", "end"])
 @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"])
 def test_sharding_partial_shard_read__index_load_fails(
@@ -577,7 +650,6 @@ def test_nested_sharding_create_array(
         filters=None,
         compressors=None,
     )
-    print(a.metadata.to_dict())
 
     a[:, :, :] = data
 
@@ -637,7 +709,6 @@ async def test_delete_empty_shards(store: Store) -> None:
         compressors=None,
         fill_value=1,
     )
-    print(a.metadata.to_dict())
     await _AsyncArrayProxy(a)[:, :].set(np.zeros((16, 16)))
     await _AsyncArrayProxy(a)[8:, :].set(np.ones((8, 16)))
     await _AsyncArrayProxy(a)[:, 8:].set(np.ones((16, 8)))
@@ -682,7 +753,6 @@ async def test_sharding_with_empty_inner_chunk(
     )
     data[:4, :4] = fill_value
     await a.setitem(..., data)
-    print("read data")
     data_read = await a.getitem(...)
     assert np.array_equal(data_read, data)
 
diff --git a/tests/test_properties.py b/tests/test_properties.py
@@ -76,17 +76,11 @@ def deep_equal(a: Any, b: Any) -> bool:
 
 
 @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning")
-@given(data=st.data(), zarr_format=zarr_formats)
-def test_array_roundtrip(data: st.DataObject, zarr_format: int) -> None:
-    nparray = data.draw(numpy_arrays(zarr_formats=st.just(zarr_format)))
-    zarray = data.draw(
-        arrays(arrays=st.just(nparray), zarr_formats=st.just(zarr_format))
-    )
-    try:
-        assert_array_equal(nparray, zarray[:])
-    except Exception as e:
-        breakpoint()
-        raise e
+@given(data=st.data())
+def test_array_roundtrip(data: st.DataObject) -> None:
+    nparray = data.draw(numpy_arrays())
+    zarray = data.draw(arrays(arrays=st.just(nparray)))
+    assert_array_equal(nparray, zarray[:])
 
 
 @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning")
@@ -98,20 +92,12 @@ def test_array_creates_implicit_groups(array):
         parent = "/".join(ancestry[: i + 1])
         if array.metadata.zarr_format == 2:
             assert (
-                sync(
-                    array.store.get(
-                        f"{parent}/.zgroup", prototype=default_buffer_prototype()
-                    )
-                )
+                sync(array.store.get(f"{parent}/.zgroup", prototype=default_buffer_prototype()))
                 is not None
             )
         elif array.metadata.zarr_format == 3:
             assert (
-                sync(
-                    array.store.get(
-                        f"{parent}/zarr.json", prototype=default_buffer_prototype()
-                    )
-                )
+                sync(array.store.get(f"{parent}/zarr.json", prototype=default_buffer_prototype()))
                 is not None
             )
 
@@ -129,9 +115,7 @@ def test_basic_indexing(data: st.DataObject) -> None:
     actual = zarray[indexer]
     assert_array_equal(nparray[indexer], actual)
 
-    new_data = data.draw(
-        numpy_arrays(shapes=st.just(actual.shape), dtype=nparray.dtype)
-    )
+    new_data = data.draw(numpy_arrays(shapes=st.just(actual.shape), dtype=nparray.dtype))
     zarray[indexer] = new_data
     nparray[indexer] = new_data
     assert_array_equal(nparray, zarray[:])
@@ -153,9 +137,7 @@ def test_oindex(data: st.DataObject) -> None:
         if isinstance(idxr, np.ndarray) and idxr.size != np.unique(idxr).size:
             # behaviour of setitem with repeated indices is not guaranteed in practice
             assume(False)
-    new_data = data.draw(
-        numpy_arrays(shapes=st.just(actual.shape), dtype=nparray.dtype)
-    )
+    new_data = data.draw(numpy_arrays(shapes=st.just(actual.shape), dtype=nparray.dtype))
     nparray[npindexer] = new_data
     zarray.oindex[zindexer] = new_data
     assert_array_equal(nparray, zarray[:])
@@ -231,33 +213,7 @@ def test_roundtrip_array_metadata_from_json(data: st.DataObject, zarr_format: in
     orig = metadata.to_dict()
     rt = metadata_roundtripped.to_dict()
 
-    assert deep_equal(
-        orig, rt
-    ), f"Roundtrip mismatch:\nOriginal: {orig}\nRoundtripped: {rt}"
-
-
-# @st.composite
-# def advanced_indices(draw, *, shape):
-#     basic_idxr = draw(
-#         basic_indices(
-#             shape=shape, min_dims=len(shape), max_dims=len(shape), allow_ellipsis=False
-#         ).filter(lambda x: isinstance(x, tuple))
-#     )
-
-#     int_idxr = draw(
-#         npst.integer_array_indices(shape=shape, result_shape=npst.array_shapes(max_dims=1))
-#     )
-#     args = tuple(
-#         st.sampled_from((l, r)) for l, r in zip_longest(basic_idxr, int_idxr, fillvalue=slice(None))
-#     )
-#     return draw(st.tuples(*args))
-
-
-# @given(st.data())
-# def test_roundtrip_object_array(data):
-#     nparray = data.draw(np_arrays)
-#     zarray = data.draw(arrays(arrays=st.just(nparray)))
-#     assert_array_equal(nparray, zarray[:])
+    assert deep_equal(orig, rt), f"Roundtrip mismatch:\nOriginal: {orig}\nRoundtripped: {rt}"
 
 
 def serialized_complex_float_is_valid(
@@ -333,9 +289,7 @@ def test_array_metadata_meets_spec(meta: ArrayV2Metadata | ArrayV3Metadata) -> N
     # version-specific validations
     if isinstance(meta, ArrayV2Metadata):
         assert asdict_dict["filters"] != ()
-        assert asdict_dict["filters"] is None or isinstance(
-            asdict_dict["filters"], tuple
-        )
+        assert asdict_dict["filters"] is None or isinstance(asdict_dict["filters"], tuple)
         assert asdict_dict["zarr_format"] == 2
     else:
         assert asdict_dict["zarr_format"] == 3