
Commit 1fc17c7

Merge branch 'main' into mkitti-morton-order-shard-indexing-benchmarks
2 parents: 094bfbd + 306e480

8 files changed: 108 additions & 26 deletions

File tree

changes/3657.bugfix.md
changes/3702.bugfix.md
changes/3704.misc.md
src/zarr/codecs/bytes.py
src/zarr/core/array.py
src/zarr/storage/_obstore.py
src/zarr/testing/store.py
tests/test_array.py

changes/3657.bugfix.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Fix the obstore `_transform_list_dir` implementation to correctly relativize paths (removing the incorrect `lstrip` usage).

changes/3702.bugfix.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Skip chunk coordinate enumeration in resize when the array is only growing, avoiding unbounded memory usage for large arrays.

changes/3704.misc.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Remove an expensive `isinstance` check from the bytes codec decoding routine.

src/zarr/codecs/bytes.py

Lines changed: 2 additions & 9 deletions
@@ -5,10 +5,8 @@
 from enum import Enum
 from typing import TYPE_CHECKING
 
-import numpy as np
-
 from zarr.abc.codec import ArrayBytesCodec
-from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer
+from zarr.core.buffer import Buffer, NDBuffer
 from zarr.core.common import JSON, parse_enum, parse_named_configuration
 from zarr.core.dtype.common import HasEndianness
 
@@ -72,20 +70,15 @@ async def _decode_single(
         chunk_bytes: Buffer,
         chunk_spec: ArraySpec,
     ) -> NDBuffer:
-        assert isinstance(chunk_bytes, Buffer)
         # TODO: remove endianness enum in favor of literal union
         endian_str = self.endian.value if self.endian is not None else None
         if isinstance(chunk_spec.dtype, HasEndianness):
             dtype = replace(chunk_spec.dtype, endianness=endian_str).to_native_dtype()  # type: ignore[call-arg]
         else:
             dtype = chunk_spec.dtype.to_native_dtype()
         as_array_like = chunk_bytes.as_array_like()
-        if isinstance(as_array_like, NDArrayLike):
-            as_nd_array_like = as_array_like
-        else:
-            as_nd_array_like = np.asanyarray(as_array_like)
         chunk_array = chunk_spec.prototype.nd_buffer.from_ndarray_like(
-            as_nd_array_like.view(dtype=dtype)
+            as_array_like.view(dtype=dtype)  # type: ignore[attr-defined]
         )
 
         # ensure correct chunk shape
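
With the isinstance branch gone, _decode_single assumes the buffer's array-like already supports a numpy-style view(), reinterpreting the raw bytes in place instead of routing through np.asanyarray. A minimal numpy sketch of that zero-copy reinterpretation (standalone, not zarr's actual buffer classes):

    import numpy as np

    # view() reinterprets the same memory under a new dtype; nothing is copied.
    raw = np.frombuffer(b"\x01\x00\x00\x00\x02\x00\x00\x00", dtype=np.uint8)
    print(raw.view(dtype="<i4"))  # -> [1 2], two little-endian int32 values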

src/zarr/core/array.py

Lines changed: 4 additions & 1 deletion
@@ -5990,7 +5990,10 @@ async def _resize(
     assert len(new_shape) == len(array.metadata.shape)
     new_metadata = array.metadata.update_shape(new_shape)
 
-    if delete_outside_chunks:
+    # ensure deletion only runs when the array is shrinking, since the delete_outside_chunks path is unbounded in memory
+    only_growing = all(new >= old for new, old in zip(new_shape, array.metadata.shape, strict=True))
+
+    if delete_outside_chunks and not only_growing:
         # Remove all chunks outside of the new shape
         old_chunk_coords = set(array.metadata.chunk_grid.all_chunk_coords(array.metadata.shape))
         new_chunk_coords = set(array.metadata.chunk_grid.all_chunk_coords(new_shape))
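
The guard only skips deletion when every dimension grows or stays equal; a single shrinking dimension still takes the enumeration path. A small illustration of the predicate with hypothetical shapes:

    old_shape = (10, 10)
    for new_shape in [(20, 20), (5, 5), (20, 5)]:
        only_growing = all(new >= old for new, old in zip(new_shape, old_shape, strict=True))
        print(new_shape, "skips deletion" if only_growing else "runs deletion")
    # (20, 20) skips deletion  -- pure growth, no chunk coords enumerated
    # (5, 5) runs deletion     -- shrink
    # (20, 5) runs deletion    -- one shrinking dimension is enough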

src/zarr/storage/_obstore.py

Lines changed: 8 additions & 4 deletions
@@ -4,6 +4,8 @@
 import contextlib
 import pickle
 from collections import defaultdict
+from itertools import chain
+from operator import itemgetter
 from typing import TYPE_CHECKING, Generic, Self, TypedDict, TypeVar
 
 from zarr.abc.store import (
@@ -15,6 +17,7 @@
 )
 from zarr.core.common import concurrent_map
 from zarr.core.config import config
+from zarr.storage._utils import _relativize_path
 
 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator, Coroutine, Iterable, Sequence
@@ -263,10 +266,11 @@ async def _transform_list_dir(
     # We assume that the underlying object-store implementation correctly handles the
     # prefix, so we don't double-check that the returned results actually start with the
     # given prefix.
-    prefixes = [obj.lstrip(prefix).lstrip("/") for obj in list_result["common_prefixes"]]
-    objects = [obj["path"].removeprefix(prefix).lstrip("/") for obj in list_result["objects"]]
-    for item in prefixes + objects:
-        yield item
+    prefix = prefix.rstrip("/")
+    for path in chain(
+        list_result["common_prefixes"], map(itemgetter("path"), list_result["objects"])
+    ):
+        yield _relativize_path(path=path, prefix=prefix)
 
 
 class _BoundedRequest(TypedDict):
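
The underlying bug: str.lstrip interprets its argument as a set of characters to strip, not as a prefix, so it can also consume the leading characters of a child name. Reproducing with the key names used in the strengthened test below:

    prefix = "foo/bar"
    key = "foo/bar/foobar_first_child"

    # lstrip strips from the character set {f, o, /, b, a, r} and eats "foobar":
    print(key.lstrip(prefix).lstrip("/"))        # -> _first_child   (wrong)

    # removeprefix strips exactly the given prefix string:
    print(key.removeprefix(prefix).lstrip("/"))  # -> foobar_first_child (correct)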

src/zarr/testing/store.py

Lines changed: 24 additions & 12 deletions
@@ -492,24 +492,36 @@ async def test_list_empty_path(self, store: S) -> None:
         assert observed_prefix_sorted == expected_prefix_sorted
 
     async def test_list_dir(self, store: S) -> None:
-        root = "foo"
-        store_dict = {
-            root + "/zarr.json": self.buffer_cls.from_bytes(b"bar"),
-            root + "/c/1": self.buffer_cls.from_bytes(b"\x01"),
-        }
+        roots_and_keys: list[tuple[str, dict[str, Buffer]]] = [
+            (
+                "foo",
+                {
+                    "foo/zarr.json": self.buffer_cls.from_bytes(b"bar"),
+                    "foo/c/1": self.buffer_cls.from_bytes(b"\x01"),
+                },
+            ),
+            (
+                "foo/bar",
+                {
+                    "foo/bar/foobar_first_child": self.buffer_cls.from_bytes(b"1"),
+                    "foo/bar/foobar_second_child/zarr.json": self.buffer_cls.from_bytes(b"2"),
+                },
+            ),
+        ]
 
         assert await _collect_aiterator(store.list_dir("")) == ()
-        assert await _collect_aiterator(store.list_dir(root)) == ()
 
-        await store._set_many(store_dict.items())
+        for root, store_dict in roots_and_keys:
+            assert await _collect_aiterator(store.list_dir(root)) == ()
 
-        keys_observed = await _collect_aiterator(store.list_dir(root))
-        keys_expected = {k.removeprefix(root + "/").split("/")[0] for k in store_dict}
+            await store._set_many(store_dict.items())
 
-        assert sorted(keys_observed) == sorted(keys_expected)
+            keys_observed = await _collect_aiterator(store.list_dir(root))
+            keys_expected = {k.removeprefix(root + "/").split("/")[0] for k in store_dict}
+            assert sorted(keys_observed) == sorted(keys_expected)
 
-        keys_observed = await _collect_aiterator(store.list_dir(root + "/"))
-        assert sorted(keys_expected) == sorted(keys_observed)
+            keys_observed = await _collect_aiterator(store.list_dir(root + "/"))
+            assert sorted(keys_expected) == sorted(keys_observed)
 
     async def test_set_if_not_exists(self, store: S) -> None:
         key = "k"
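
A standalone sketch of what the new nested-root case exercises: list_dir must yield only the immediate children relative to the prefix. This assumes zarr-python's async store API (MemoryStore, default_buffer_prototype, list_dir); treat it as illustrative rather than a copy of the test:

    import asyncio

    from zarr.core.buffer import default_buffer_prototype
    from zarr.storage import MemoryStore

    async def main() -> None:
        store = MemoryStore()
        buf = default_buffer_prototype().buffer.from_bytes(b"1")
        await store.set("foo/bar/foobar_first_child", buf)
        await store.set("foo/bar/foobar_second_child/zarr.json", buf)
        # Only the immediate children of the prefix should come back.
        print(sorted([key async for key in store.list_dir("foo/bar")]))
        # expected: ['foobar_first_child', 'foobar_second_child']

    asyncio.run(main())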

tests/test_array.py

Lines changed: 67 additions & 0 deletions
@@ -781,6 +781,73 @@ def test_resize_2d(store: MemoryStore, zarr_format: ZarrFormat) -> None:
     assert new_shape == result.shape
 
 
+@pytest.mark.parametrize("store", ["memory"], indirect=True)
+def test_resize_growing_skips_chunk_enumeration(
+    store: MemoryStore, zarr_format: ZarrFormat
+) -> None:
+    """Growing an array should not enumerate chunk coords for deletion (#3650 mitigation)."""
+    from zarr.core.chunk_grids import RegularChunkGrid
+
+    z = zarr.create(
+        shape=(10, 10),
+        chunks=(5, 5),
+        dtype="i4",
+        fill_value=0,
+        store=store,
+        zarr_format=zarr_format,
+    )
+    z[:] = np.ones((10, 10), dtype="i4")
+
+    # growth only - ensure no chunk coords are enumerated
+    with mock.patch.object(
+        RegularChunkGrid,
+        "all_chunk_coords",
+        wraps=z.metadata.chunk_grid.all_chunk_coords,
+    ) as mock_coords:
+        z.resize((20, 20))
+    mock_coords.assert_not_called()
+
+    assert z.shape == (20, 20)
+    np.testing.assert_array_equal(np.ones((10, 10), dtype="i4"), z[:10, :10])
+    np.testing.assert_array_equal(np.zeros((10, 10), dtype="i4"), z[10:, 10:])
+
+    # shrink - ensure no regression of behaviour
+    with mock.patch.object(
+        RegularChunkGrid,
+        "all_chunk_coords",
+        wraps=z.metadata.chunk_grid.all_chunk_coords,
+    ) as mock_coords:
+        z.resize((5, 5))
+    assert mock_coords.call_count > 0
+
+    assert z.shape == (5, 5)
+    np.testing.assert_array_equal(np.ones((5, 5), dtype="i4"), z[:])
+
+    # mixed: grow dim 0, shrink dim 1 - ensure deletion path runs
+    z2 = zarr.create(
+        shape=(10, 10),
+        chunks=(5, 5),
+        dtype="i4",
+        fill_value=0,
+        store=store,
+        zarr_format=zarr_format,
+        overwrite=True,
+    )
+    z2[:] = np.ones((10, 10), dtype="i4")
+
+    with mock.patch.object(
+        RegularChunkGrid,
+        "all_chunk_coords",
+        wraps=z2.metadata.chunk_grid.all_chunk_coords,
+    ) as mock_coords:
+        z2.resize((20, 5))
+    assert mock_coords.call_count > 0
+
+    assert z2.shape == (20, 5)
+    np.testing.assert_array_equal(np.ones((10, 5), dtype="i4"), z2[:10, :])
+    np.testing.assert_array_equal(np.zeros((10, 5), dtype="i4"), z2[10:, :])
+
+
 @pytest.mark.parametrize("store", ["memory"], indirect=True)
 def test_append_1d(store: MemoryStore, zarr_format: ZarrFormat) -> None:
     a = np.arange(105)
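
The test's spy technique: mock.patch.object(..., wraps=...) replaces the method at class level but delegates every call to the real implementation, so behaviour is unchanged while call counts are recorded. A self-contained sketch of the pattern with a hypothetical Grid class:

    from unittest import mock

    class Grid:
        def coords(self) -> list[int]:
            return [0, 1]

    g = Grid()
    # The patched attribute delegates to the real bound method via wraps=,
    # so the return value is unchanged and the call is recorded on the spy.
    with mock.patch.object(Grid, "coords", wraps=g.coords) as spy:
        assert g.coords() == [0, 1]
    assert spy.call_count == 1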
