Skip to content

Commit 7b663ff

Browse files
committed
feat: deterministic but random order
1 parent a89249a commit 7b663ff

2 files changed

Lines changed: 73 additions & 40 deletions

File tree

src/zarr/codecs/sharding.py

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
import random
43
from collections.abc import Iterable, Mapping, MutableMapping, Sequence
54
from dataclasses import dataclass, replace
65
from enum import Enum
@@ -48,7 +47,6 @@
4847
BasicIndexer,
4948
ChunkProjection,
5049
SelectorTuple,
51-
_morton_order,
5250
_morton_order_keys,
5351
c_order_iter,
5452
get_indexer,
@@ -315,6 +313,7 @@ class ShardingCodec(
315313
chunk_shape: tuple[int, ...]
316314
codecs: tuple[Codec, ...]
317315
index_codecs: tuple[Codec, ...]
316+
rng: np.random.Generator | None
318317
index_location: ShardingCodecIndexLocation = ShardingCodecIndexLocation.end
319318
subchunk_write_order: SubchunkWriteOrder = "morton"
320319

@@ -326,6 +325,7 @@ def __init__(
326325
index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()),
327326
index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end,
328327
subchunk_write_order: SubchunkWriteOrder = "morton",
328+
rng: np.random.Generator | None = None,
329329
) -> None:
330330
chunk_shape_parsed = parse_shapelike(chunk_shape)
331331
codecs_parsed = parse_codecs(codecs)
@@ -341,6 +341,7 @@ def __init__(
341341
object.__setattr__(self, "index_codecs", index_codecs_parsed)
342342
object.__setattr__(self, "index_location", index_location_parsed)
343343
object.__setattr__(self, "subchunk_write_order", subchunk_write_order)
344+
object.__setattr__(self, "rng", rng)
344345

345346
# Use instance-local lru_cache to avoid memory leaks
346347

@@ -353,14 +354,15 @@ def __init__(
353354

354355
# todo: typedict return type
355356
def __getstate__(self) -> dict[str, Any]:
356-
return self.to_dict()
357+
return {"rng": self.rng, **self.to_dict()}
357358

358359
def __setstate__(self, state: dict[str, Any]) -> None:
359360
config = state["configuration"]
360361
object.__setattr__(self, "chunk_shape", parse_shapelike(config["chunk_shape"]))
361362
object.__setattr__(self, "codecs", parse_codecs(config["codecs"]))
362363
object.__setattr__(self, "index_codecs", parse_codecs(config["index_codecs"]))
363364
object.__setattr__(self, "index_location", parse_index_location(config["index_location"]))
365+
object.__setattr__(self, "rng", state["rng"])
364366

365367
# Use instance-local lru_cache to avoid memory leaks
366368
# object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec))
@@ -550,21 +552,12 @@ def _subchunk_order_iter(self, chunks_per_shard: tuple[int, ...]) -> Iterable[tu
550552
subchunk_iter = (c[::-1] for c in np.ndindex(chunks_per_shard[::-1]))
551553
case "unordered":
552554
subchunk_list = list(np.ndindex(chunks_per_shard))
553-
random.shuffle(subchunk_list)
555+
(self.rng if self.rng is not None else np.random.default_rng()).shuffle(
556+
subchunk_list
557+
)
554558
subchunk_iter = iter(subchunk_list)
555559
return subchunk_iter
556560

557-
def _subchunk_order_vectorized(self, chunks_per_shard: tuple[int, ...]) -> npt.NDArray[np.intp]:
558-
match self.subchunk_write_order:
559-
case "morton":
560-
subchunk_order_vectorized = _morton_order(chunks_per_shard)
561-
case _:
562-
subchunk_order_vectorized = np.fromiter(
563-
self._subchunk_order_iter(chunks_per_shard),
564-
dtype=np.dtype((int, len(chunks_per_shard))),
565-
)
566-
return subchunk_order_vectorized
567-
568561
async def _encode_single(
569562
self,
570563
shard_array: NDBuffer,
@@ -623,7 +616,7 @@ async def _encode_partial_single(
623616
)
624617

625618
if self._is_complete_shard_write(indexer, chunks_per_shard):
626-
shard_dict = dict.fromkeys(self._subchunk_order_iter(chunks_per_shard))
619+
shard_dict = dict.fromkeys(np.ndindex(chunks_per_shard))
627620
else:
628621
shard_reader = await self._load_full_shard_maybe(
629622
byte_getter=byte_setter,
@@ -633,7 +626,7 @@ async def _encode_partial_single(
633626
shard_reader = shard_reader or _ShardReader.create_empty(chunks_per_shard)
634627
# Use vectorized lookup for better performance
635628
shard_dict = shard_reader.to_dict_vectorized(
636-
self._subchunk_order_vectorized(chunks_per_shard)
629+
np.array(list(np.ndindex(chunks_per_shard)))
637630
)
638631

639632
await self.codec_pipeline.write(

tests/test_codecs/test_sharding.py

Lines changed: 63 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -560,26 +560,10 @@ def test_sharding_mixed_integer_list_indexing(store: Store) -> None:
560560
np.testing.assert_array_equal(c3, s3)
561561

562562

563-
@pytest.mark.parametrize(
564-
"subchunk_write_order",
565-
get_args(SubchunkWriteOrder),
566-
)
567-
async def test_encoded_subchunk_write_order(subchunk_write_order: SubchunkWriteOrder) -> None:
568-
"""Subchunks must be physically laid out in the shard in the order specified by
569-
``subchunk_write_order``. We verify this by decoding the shard index and sorting
570-
the chunk coordinates by their byte offset."""
571-
# Use a non-square chunks_per_shard so all three orderings are distinguishable.
572-
chunks_per_shard = (3, 2)
573-
chunk_shape = (4, 4)
574-
shard_shape = tuple(c * s for c, s in zip(chunks_per_shard, chunk_shape, strict=True))
575-
576-
codec = ShardingCodec(
577-
chunk_shape=chunk_shape,
578-
codecs=[BytesCodec()],
579-
index_codecs=[BytesCodec(), Crc32cCodec()],
580-
index_location=ShardingCodecIndexLocation.end,
581-
subchunk_write_order=subchunk_write_order,
582-
)
563+
async def stored_data_and_get_order(
564+
codec: ShardingCodec, chunks_per_shard: tuple[int, ...]
565+
) -> list[tuple[int, ...]]:
566+
shard_shape = tuple(c * s for c, s in zip(chunks_per_shard, codec.chunk_shape, strict=True))
583567
store = MemoryStore()
584568
arr = zarr.create_array(
585569
StorePath(store),
@@ -609,9 +593,65 @@ async def test_encoded_subchunk_write_order(subchunk_write_order: SubchunkWriteO
609593
)
610594

611595
# The physical write order is recovered by sorting coordinates by start offset.
612-
actual_order = [coord for _, coord in sorted(offset_to_coord.items())]
613-
expected_order = list(codec._subchunk_order_iter(chunks_per_shard))
614-
assert (actual_order == expected_order) == (subchunk_write_order != "unordered")
596+
return [coord for _, coord in sorted(offset_to_coord.items())]
597+
598+
599+
@pytest.mark.parametrize(
600+
"subchunk_write_order",
601+
get_args(SubchunkWriteOrder),
602+
)
603+
async def test_encoded_subchunk_write_order(subchunk_write_order: SubchunkWriteOrder) -> None:
604+
"""Subchunks must be physically laid out in the shard in the order specified by
605+
``subchunk_write_order``. We verify this by decoding the shard index and sorting
606+
the chunk coordinates by their byte offset."""
607+
# Use a non-square chunks_per_shard so all three orderings are distinguishable.
608+
chunks_per_shard = (3, 2)
609+
chunk_shape = (4, 4)
610+
seed = 0
611+
codec = ShardingCodec(
612+
chunk_shape=chunk_shape,
613+
codecs=[BytesCodec()],
614+
index_codecs=[BytesCodec(), Crc32cCodec()],
615+
index_location=ShardingCodecIndexLocation.end,
616+
subchunk_write_order=subchunk_write_order,
617+
rng=np.random.default_rng(seed=seed),
618+
)
619+
620+
actual_order = await stored_data_and_get_order(codec, chunks_per_shard)
621+
if subchunk_write_order != "unordered":
622+
expected_order = list(codec._subchunk_order_iter(chunks_per_shard))
623+
assert actual_order == expected_order
624+
else:
625+
same_order_same_seed = list(
626+
ShardingCodec(
627+
chunk_shape=chunk_shape,
628+
codecs=[BytesCodec()],
629+
index_codecs=[BytesCodec(), Crc32cCodec()],
630+
index_location=ShardingCodecIndexLocation.end,
631+
subchunk_write_order=subchunk_write_order,
632+
rng=np.random.default_rng(seed=seed),
633+
)._subchunk_order_iter(chunks_per_shard)
634+
)
635+
assert actual_order == same_order_same_seed
636+
637+
638+
async def test_unordered_can_be_seeded() -> None:
639+
orders = []
640+
chunks_per_shard = (3, 2)
641+
chunk_shape = (4, 4)
642+
seed = 0
643+
for _ in range(4):
644+
codec = ShardingCodec(
645+
chunk_shape=chunk_shape,
646+
codecs=[BytesCodec()],
647+
index_codecs=[BytesCodec(), Crc32cCodec()],
648+
index_location=ShardingCodecIndexLocation.end,
649+
subchunk_write_order="unordered",
650+
rng=np.random.default_rng(seed=seed),
651+
)
652+
# The physical write order is recovered by sorting coordinates by start offset.
653+
orders.append(await stored_data_and_get_order(codec, chunks_per_shard))
654+
assert all(orders[0] == o for o in orders)
615655

616656

617657
@pytest.mark.parametrize(

0 commit comments

Comments (0)