Skip to content

Commit b4892af

Browse files
authored
Merge branch 'main' into indexing-test-cleanup
2 parents da7ce66 + c0e2afa commit b4892af

17 files changed

Lines changed: 473 additions & 255 deletions

File tree

changes/2929.bugfix.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix equality comparison of `ArrayV2Metadata` and `ArrayV3Metadata` objects with a
2+
`NaN` fill value. Such objects are now compared by their JSON-serialized form, so two
3+
otherwise-identical metadata objects with a `NaN` (or infinite) fill value compare equal.

changes/3826.feature.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added a `subchunk_write_order` option to `ShardingCodec` to allow for `morton`, `unordered`, `lexicographic`, and `colexicographic` subchunk orderings.

changes/3973.removal.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Removed the NumPy 1.x implementation of the `VariableLengthUTF8` data type because NumPy 1.x is no longer supported under [SPEC0](https://scientific-python.org/specs/spec-0000/).

docs/user-guide/performance.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,13 @@ bytes within chunks of an array may improve the compression ratio, depending on
113113
the structure of the data, the compression algorithm used, and which compression
114114
filters (e.g., byte-shuffle) have been applied.
115115

116+
### Subchunk memory layout
117+
118+
The order of chunks **within each shard** can be changed via the `subchunk_write_order` parameter of the `ShardingCodec`. That parameter is a string which must be one of `["morton", "lexicographic", "colexicographic", "unordered"]`.
119+
120+
By default [`morton`](https://en.wikipedia.org/wiki/Z-order_curve) order provides good spatial locality however [`lexicographic` (i.e., row-major)](https://en.wikipedia.org/wiki/Row-_and_column-major_order), for example, may be better suited to "batched" workflows where some form of sequential reading through a fixed number of outer dimensions is desired. The options are `lexicographic`, `morton`, `unordered` (i.e., random), and `colexicographic`.
121+
122+
116123
### Empty chunks
117124

118125
It is possible to configure how Zarr handles the storage of chunks that are "empty"

src/zarr/codecs/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
Zstd,
3030
)
3131
from zarr.codecs.scale_offset import ScaleOffset
32-
from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
32+
from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation, SubchunkWriteOrder
3333
from zarr.codecs.transpose import TransposeCodec
3434
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
3535
from zarr.codecs.zstd import ZstdCodec
@@ -47,6 +47,7 @@
4747
"ScaleOffset",
4848
"ShardingCodec",
4949
"ShardingCodecIndexLocation",
50+
"SubchunkWriteOrder",
5051
"TransposeCodec",
5152
"VLenBytesCodec",
5253
"VLenUTF8Codec",

src/zarr/codecs/sharding.py

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from dataclasses import dataclass, replace
55
from enum import Enum
66
from functools import lru_cache
7-
from typing import TYPE_CHECKING, Any, NamedTuple, cast
7+
from typing import TYPE_CHECKING, Any, Literal, NamedTuple, cast
88

99
import numpy as np
1010
import numpy.typing as npt
@@ -46,8 +46,6 @@
4646
BasicIndexer,
4747
ChunkProjection,
4848
SelectorTuple,
49-
_morton_order,
50-
_morton_order_keys,
5149
c_order_iter,
5250
get_indexer,
5351
morton_order_iter,
@@ -64,7 +62,7 @@
6462

6563
if TYPE_CHECKING:
6664
from collections.abc import Iterator
67-
from typing import Self
65+
from typing import Final, Self
6866

6967
from zarr.core.common import JSON
7068
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
@@ -83,6 +81,15 @@ class ShardingCodecIndexLocation(Enum):
8381
end = "end"
8482

8583

84+
SubchunkWriteOrder = Literal["morton", "unordered", "lexicographic", "colexicographic"]
85+
SUBCHUNK_WRITE_ORDER: Final[tuple[str, str, str, str]] = (
86+
"morton",
87+
"unordered",
88+
"lexicographic",
89+
"colexicographic",
90+
)
91+
92+
8693
def parse_index_location(data: object) -> ShardingCodecIndexLocation:
8794
return parse_enum(data, ShardingCodecIndexLocation)
8895

@@ -272,14 +279,13 @@ def to_dict_vectorized(
272279
dict mapping chunk coordinate tuples to Buffer or None
273280
"""
274281
starts, ends, valid = self.index.get_chunk_slices_vectorized(chunk_coords_array)
275-
chunk_coords_keys = _morton_order_keys(self.index.chunks_per_shard)
276282

277283
result: dict[tuple[int, ...], Buffer | None] = {}
278-
for i, coords in enumerate(chunk_coords_keys):
284+
for i, coords in enumerate(chunk_coords_array):
279285
if valid[i]:
280-
result[coords] = self.buf[int(starts[i]) : int(ends[i])]
286+
result[tuple(coords.ravel())] = self.buf[int(starts[i]) : int(ends[i])]
281287
else:
282-
result[coords] = None
288+
result[tuple(coords.ravel())] = None
283289

284290
return result
285291

@@ -293,7 +299,9 @@ class ShardingCodec(
293299
chunk_shape: tuple[int, ...]
294300
codecs: tuple[Codec, ...]
295301
index_codecs: tuple[Codec, ...]
302+
rng: np.random.Generator | None
296303
index_location: ShardingCodecIndexLocation = ShardingCodecIndexLocation.end
304+
subchunk_write_order: SubchunkWriteOrder = "morton"
297305

298306
def __init__(
299307
self,
@@ -302,16 +310,24 @@ def __init__(
302310
codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),),
303311
index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()),
304312
index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end,
313+
subchunk_write_order: SubchunkWriteOrder = "morton",
314+
rng: np.random.Generator | None = None,
305315
) -> None:
306316
chunk_shape_parsed = parse_shapelike(chunk_shape)
307317
codecs_parsed = parse_codecs(codecs)
308318
index_codecs_parsed = parse_codecs(index_codecs)
309319
index_location_parsed = parse_index_location(index_location)
320+
if subchunk_write_order not in SUBCHUNK_WRITE_ORDER:
321+
raise ValueError(
322+
f"Unrecognized subchunk write order: {subchunk_write_order}. Only {SUBCHUNK_WRITE_ORDER} are allowed."
323+
)
310324

311325
object.__setattr__(self, "chunk_shape", chunk_shape_parsed)
312326
object.__setattr__(self, "codecs", codecs_parsed)
313327
object.__setattr__(self, "index_codecs", index_codecs_parsed)
314328
object.__setattr__(self, "index_location", index_location_parsed)
329+
object.__setattr__(self, "subchunk_write_order", subchunk_write_order)
330+
object.__setattr__(self, "rng", rng)
315331

316332
# Use instance-local lru_cache to avoid memory leaks
317333

@@ -324,14 +340,15 @@ def __init__(
324340

325341
# todo: typedict return type
326342
def __getstate__(self) -> dict[str, Any]:
327-
return self.to_dict()
343+
return {"rng": self.rng, **self.to_dict()}
328344

329345
def __setstate__(self, state: dict[str, Any]) -> None:
330346
config = state["configuration"]
331347
object.__setattr__(self, "chunk_shape", parse_shapelike(config["chunk_shape"]))
332348
object.__setattr__(self, "codecs", parse_codecs(config["codecs"]))
333349
object.__setattr__(self, "index_codecs", parse_codecs(config["index_codecs"]))
334350
object.__setattr__(self, "index_location", parse_index_location(config["index_location"]))
351+
object.__setattr__(self, "rng", state["rng"])
335352

336353
# Use instance-local lru_cache to avoid memory leaks
337354
# object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec))
@@ -509,6 +526,24 @@ async def _decode_partial_single(
509526
else:
510527
return out
511528

529+
def _subchunk_order_iter(
530+
self, chunks_per_shard: tuple[int, ...], subchunk_write_order: SubchunkWriteOrder
531+
) -> Iterable[tuple[int, ...]]:
532+
match subchunk_write_order:
533+
case "morton":
534+
subchunk_iter = morton_order_iter(chunks_per_shard)
535+
case "lexicographic":
536+
subchunk_iter = np.ndindex(chunks_per_shard)
537+
case "colexicographic":
538+
subchunk_iter = (c[::-1] for c in np.ndindex(chunks_per_shard[::-1]))
539+
case "unordered":
540+
subchunk_list = list(np.ndindex(chunks_per_shard))
541+
(self.rng if self.rng is not None else np.random.default_rng()).shuffle(
542+
subchunk_list
543+
)
544+
subchunk_iter = iter(subchunk_list)
545+
return subchunk_iter
546+
512547
async def _encode_single(
513548
self,
514549
shard_array: NDBuffer,
@@ -526,8 +561,7 @@ async def _encode_single(
526561
chunk_grid=ChunkGrid.from_sizes(shard_shape, chunk_shape),
527562
)
528563
)
529-
530-
shard_builder = dict.fromkeys(morton_order_iter(chunks_per_shard))
564+
shard_builder = dict.fromkeys(self._subchunk_order_iter(chunks_per_shard, "lexicographic"))
531565

532566
await self.codec_pipeline.write(
533567
[
@@ -570,7 +604,7 @@ async def _encode_partial_single(
570604
)
571605

572606
if self._is_complete_shard_write(indexer, chunks_per_shard):
573-
shard_dict = dict.fromkeys(morton_order_iter(chunks_per_shard))
607+
shard_dict = dict.fromkeys(self._subchunk_order_iter(chunks_per_shard, "lexicographic"))
574608
else:
575609
shard_reader = await self._load_full_shard_maybe(
576610
byte_getter=byte_setter,
@@ -580,7 +614,7 @@ async def _encode_partial_single(
580614
shard_reader = shard_reader or _ShardReader.create_empty(chunks_per_shard)
581615
# Use vectorized lookup for better performance
582616
shard_dict = shard_reader.to_dict_vectorized(
583-
np.asarray(_morton_order(chunks_per_shard))
617+
np.array(list(self._subchunk_order_iter(chunks_per_shard, "lexicographic")))
584618
)
585619

586620
await self.codec_pipeline.write(
@@ -619,7 +653,7 @@ async def _encode_shard_dict(
619653

620654
template = buffer_prototype.buffer.create_zero_length()
621655
chunk_start = 0
622-
for chunk_coords in morton_order_iter(chunks_per_shard):
656+
for chunk_coords in self._subchunk_order_iter(chunks_per_shard, self.subchunk_write_order):
623657
value = map.get(chunk_coords)
624658
if value is None:
625659
continue

0 commit comments

Comments
 (0)