
Commit 1fc17c7

Merge branch 'main' into mkitti-morton-order-shard-indexing-benchmarks
2 parents: 094bfbd + 306e480

8 files changed: 108 additions & 26 deletions

File tree

changes/3657.bugfix.md
changes/3702.bugfix.md
changes/3704.misc.md
src/zarr/codecs/bytes.py
src/zarr/core/array.py
src/zarr/storage/_obstore.py
src/zarr/testing/store.py
tests/test_array.py

changes/3657.bugfix.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Fix the obstore `_transform_list_dir` implementation to correctly relativize paths (removing the incorrect `lstrip` usage).

changes/3702.bugfix.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Skip chunk coordinate enumeration in resize when the array is only growing, avoiding unbounded memory usage for large arrays.

changes/3704.misc.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Remove an expensive `isinstance` check from the bytes codec decoding routine.

src/zarr/codecs/bytes.py

Lines changed: 2 additions & 9 deletions
@@ -5,10 +5,8 @@
 from enum import Enum
 from typing import TYPE_CHECKING
 
-import numpy as np
-
 from zarr.abc.codec import ArrayBytesCodec
-from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer
+from zarr.core.buffer import Buffer, NDBuffer
 from zarr.core.common import JSON, parse_enum, parse_named_configuration
 from zarr.core.dtype.common import HasEndianness
 
@@ -72,20 +70,15 @@ async def _decode_single(
         chunk_bytes: Buffer,
         chunk_spec: ArraySpec,
     ) -> NDBuffer:
-        assert isinstance(chunk_bytes, Buffer)
         # TODO: remove endianness enum in favor of literal union
         endian_str = self.endian.value if self.endian is not None else None
         if isinstance(chunk_spec.dtype, HasEndianness):
             dtype = replace(chunk_spec.dtype, endianness=endian_str).to_native_dtype()  # type: ignore[call-arg]
         else:
             dtype = chunk_spec.dtype.to_native_dtype()
         as_array_like = chunk_bytes.as_array_like()
-        if isinstance(as_array_like, NDArrayLike):
-            as_nd_array_like = as_array_like
-        else:
-            as_nd_array_like = np.asanyarray(as_array_like)
         chunk_array = chunk_spec.prototype.nd_buffer.from_ndarray_like(
-            as_nd_array_like.view(dtype=dtype)
+            as_array_like.view(dtype=dtype)  # type: ignore[attr-defined]
         )
 
         # ensure correct chunk shape
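
With the isinstance branch gone, _decode_single assumes the buffer's array-like already supports a numpy-style view(), reinterpreting the raw bytes in place instead of routing through np.asanyarray. A minimal numpy sketch of that zero-copy reinterpretation (standalone, not zarr's actual buffer classes):

    import numpy as np

    # view() reinterprets the same memory under a new dtype; nothing is copied.
    raw = np.frombuffer(b"\x01\x00\x00\x00\x02\x00\x00\x00", dtype=np.uint8)
    print(raw.view(dtype="<i4"))  # -> [1 2], two little-endian int32 values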

src/zarr/core/array.py

Lines changed: 4 additions & 1 deletion
@@ -5990,7 +5990,10 @@ async def _resize(
     assert len(new_shape) == len(array.metadata.shape)
     new_metadata = array.metadata.update_shape(new_shape)
 
-    if delete_outside_chunks:
+    # ensure deletion only runs when the array is shrinking, since the delete_outside_chunks path is unbounded in memory
+    only_growing = all(new >= old for new, old in zip(new_shape, array.metadata.shape, strict=True))
+
+    if delete_outside_chunks and not only_growing:
         # Remove all chunks outside of the new shape
         old_chunk_coords = set(array.metadata.chunk_grid.all_chunk_coords(array.metadata.shape))
         new_chunk_coords = set(array.metadata.chunk_grid.all_chunk_coords(new_shape))
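
The guard only skips deletion when every dimension grows or stays equal; a single shrinking dimension still takes the enumeration path. A small illustration of the predicate with hypothetical shapes:

    old_shape = (10, 10)
    for new_shape in [(20, 20), (5, 5), (20, 5)]:
        only_growing = all(new >= old for new, old in zip(new_shape, old_shape, strict=True))
        print(new_shape, "skips deletion" if only_growing else "runs deletion")
    # (20, 20) skips deletion  -- pure growth, no chunk coords enumerated
    # (5, 5) runs deletion     -- shrink
    # (20, 5) runs deletion    -- one shrinking dimension is enough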

src/zarr/storage/_obstore.py

Lines changed: 8 additions & 4 deletions
@@ -4,6 +4,8 @@
 import contextlib
 import pickle
 from collections import defaultdict
+from itertools import chain
+from operator import itemgetter
 from typing import TYPE_CHECKING, Generic, Self, TypedDict, TypeVar
 
 from zarr.abc.store import (
@@ -15,6 +17,7 @@
 )
 from zarr.core.common import concurrent_map
 from zarr.core.config import config
+from zarr.storage._utils import _relativize_path
 
 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator, Coroutine, Iterable, Sequence
@@ -263,10 +266,11 @@ async def _transform_list_dir(
     # We assume that the underlying object-store implementation correctly handles the
     # prefix, so we don't double-check that the returned results actually start with the
     # given prefix.
-    prefixes = [obj.lstrip(prefix).lstrip("/") for obj in list_result["common_prefixes"]]
-    objects = [obj["path"].removeprefix(prefix).lstrip("/") for obj in list_result["objects"]]
-    for item in prefixes + objects:
-        yield item
+    prefix = prefix.rstrip("/")
+    for path in chain(
+        list_result["common_prefixes"], map(itemgetter("path"), list_result["objects"])
+    ):
+        yield _relativize_path(path=path, prefix=prefix)
 
 
 class _BoundedRequest(TypedDict):
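
The underlying bug: str.lstrip interprets its argument as a set of characters to strip, not as a prefix, so it can also consume the leading characters of a child name. Reproducing with the key names used in the strengthened test below:

    prefix = "foo/bar"
    key = "foo/bar/foobar_first_child"

    # lstrip strips from the character set {f, o, /, b, a, r} and eats "foobar":
    print(key.lstrip(prefix).lstrip("/"))        # -> _first_child   (wrong)

    # removeprefix strips exactly the given prefix string:
    print(key.removeprefix(prefix).lstrip("/"))  # -> foobar_first_child (correct)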

src/zarr/testing/store.py

Lines changed: 24 additions & 12 deletions
@@ -492,24 +492,36 @@ async def test_list_empty_path(self, store: S) -> None:
         assert observed_prefix_sorted == expected_prefix_sorted
 
     async def test_list_dir(self, store: S) -> None:
-        root = "foo"
-        store_dict = {
-            root + "/zarr.json": self.buffer_cls.from_bytes(b"bar"),
-            root + "/c/1": self.buffer_cls.from_bytes(b"\x01"),
-        }
+        roots_and_keys: list[tuple[str, dict[str, Buffer]]] = [
+            (
+                "foo",
+                {
+                    "foo/zarr.json": self.buffer_cls.from_bytes(b"bar"),
+                    "foo/c/1": self.buffer_cls.from_bytes(b"\x01"),
+                },
+            ),
+            (
+                "foo/bar",
+                {
+                    "foo/bar/foobar_first_child": self.buffer_cls.from_bytes(b"1"),
+                    "foo/bar/foobar_second_child/zarr.json": self.buffer_cls.from_bytes(b"2"),
+                },
+            ),
+        ]
 
         assert await _collect_aiterator(store.list_dir("")) == ()
-        assert await _collect_aiterator(store.list_dir(root)) == ()
 
-        await store._set_many(store_dict.items())
+        for root, store_dict in roots_and_keys:
+            assert await _collect_aiterator(store.list_dir(root)) == ()
 
-        keys_observed = await _collect_aiterator(store.list_dir(root))
-        keys_expected = {k.removeprefix(root + "/").split("/")[0] for k in store_dict}
+            await store._set_many(store_dict.items())
 
-        assert sorted(keys_observed) == sorted(keys_expected)
+            keys_observed = await _collect_aiterator(store.list_dir(root))
+            keys_expected = {k.removeprefix(root + "/").split("/")[0] for k in store_dict}
+            assert sorted(keys_observed) == sorted(keys_expected)
 
-        keys_observed = await _collect_aiterator(store.list_dir(root + "/"))
-        assert sorted(keys_expected) == sorted(keys_observed)
+            keys_observed = await _collect_aiterator(store.list_dir(root + "/"))
+            assert sorted(keys_expected) == sorted(keys_observed)
 
     async def test_set_if_not_exists(self, store: S) -> None:
         key = "k"
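
A standalone sketch of what the new nested-root case exercises: list_dir must yield only the immediate children relative to the prefix. This assumes zarr-python's async store API (MemoryStore, default_buffer_prototype, list_dir); treat it as illustrative rather than a copy of the test:

    import asyncio

    from zarr.core.buffer import default_buffer_prototype
    from zarr.storage import MemoryStore

    async def main() -> None:
        store = MemoryStore()
        buf = default_buffer_prototype().buffer.from_bytes(b"1")
        await store.set("foo/bar/foobar_first_child", buf)
        await store.set("foo/bar/foobar_second_child/zarr.json", buf)
        # Only the immediate children of the prefix should come back.
        print(sorted([key async for key in store.list_dir("foo/bar")]))
        # expected: ['foobar_first_child', 'foobar_second_child']

    asyncio.run(main())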

tests/test_array.py

Lines changed: 67 additions & 0 deletions
@@ -781,6 +781,73 @@ def test_resize_2d(store: MemoryStore, zarr_format: ZarrFormat) -> None:
     assert new_shape == result.shape
 
 
+@pytest.mark.parametrize("store", ["memory"], indirect=True)
+def test_resize_growing_skips_chunk_enumeration(
+    store: MemoryStore, zarr_format: ZarrFormat
+) -> None:
+    """Growing an array should not enumerate chunk coords for deletion (#3650 mitigation)."""
+    from zarr.core.chunk_grids import RegularChunkGrid
+
+    z = zarr.create(
+        shape=(10, 10),
+        chunks=(5, 5),
+        dtype="i4",
+        fill_value=0,
+        store=store,
+        zarr_format=zarr_format,
+    )
+    z[:] = np.ones((10, 10), dtype="i4")
+
+    # growth only - ensure no chunk coords are enumerated
+    with mock.patch.object(
+        RegularChunkGrid,
+        "all_chunk_coords",
+        wraps=z.metadata.chunk_grid.all_chunk_coords,
+    ) as mock_coords:
+        z.resize((20, 20))
+    mock_coords.assert_not_called()
+
+    assert z.shape == (20, 20)
+    np.testing.assert_array_equal(np.ones((10, 10), dtype="i4"), z[:10, :10])
+    np.testing.assert_array_equal(np.zeros((10, 10), dtype="i4"), z[10:, 10:])
+
+    # shrink - ensure no regression of behaviour
+    with mock.patch.object(
+        RegularChunkGrid,
+        "all_chunk_coords",
+        wraps=z.metadata.chunk_grid.all_chunk_coords,
+    ) as mock_coords:
+        z.resize((5, 5))
+    assert mock_coords.call_count > 0
+
+    assert z.shape == (5, 5)
+    np.testing.assert_array_equal(np.ones((5, 5), dtype="i4"), z[:])
+
+    # mixed: grow dim 0, shrink dim 1 - ensure deletion path runs
+    z2 = zarr.create(
+        shape=(10, 10),
+        chunks=(5, 5),
+        dtype="i4",
+        fill_value=0,
+        store=store,
+        zarr_format=zarr_format,
+        overwrite=True,
+    )
+    z2[:] = np.ones((10, 10), dtype="i4")
+
+    with mock.patch.object(
+        RegularChunkGrid,
+        "all_chunk_coords",
+        wraps=z2.metadata.chunk_grid.all_chunk_coords,
+    ) as mock_coords:
+        z2.resize((20, 5))
+    assert mock_coords.call_count > 0
+
+    assert z2.shape == (20, 5)
+    np.testing.assert_array_equal(np.ones((10, 5), dtype="i4"), z2[:10, :])
+    np.testing.assert_array_equal(np.zeros((10, 5), dtype="i4"), z2[10:, :])
+
+
 @pytest.mark.parametrize("store", ["memory"], indirect=True)
 def test_append_1d(store: MemoryStore, zarr_format: ZarrFormat) -> None:
     a = np.arange(105)
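
The test's spy technique: mock.patch.object(..., wraps=...) replaces the method at class level but delegates every call to the real implementation, so behaviour is unchanged while call counts are recorded. A self-contained sketch of the pattern with a hypothetical Grid class:

    from unittest import mock

    class Grid:
        def coords(self) -> list[int]:
            return [0, 1]

    g = Grid()
    # The patched attribute delegates to the real bound method via wraps=,
    # so the return value is unchanged and the call is recorded on the spy.
    with mock.patch.object(Grid, "coords", wraps=g.coords) as spy:
        assert g.coords() == [0, 1]
    assert spy.call_count == 1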
