Skip to content

Commit 36caf1f

Browse files
mkitti and d-v-b authored
perf: Add additional sharding benchmarks (zarr-developers#3712)
* test: Add sharding indexing benchmarks * test: Add morton_order_iter benchmark tests * tests: Add single chunk write test for sharding * Document changes -------- Co-authored-by: Davis Bennett <davis.v.bennett@gmail.com>
1 parent 306e480 commit 36caf1f

File tree

2 files changed

+216
-0
lines changed

2 files changed

+216
-0
lines changed

changes/3712.misc.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added benchmarks for Morton order computation in sharded arrays.

tests/benchmarks/test_indexing.py

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,218 @@ def test_slice_indexing(
5050

5151
data[:] = 1
5252
benchmark(getitem, data, indexer)
53+
54+
55+
# Shard shapes used to benchmark the Morton order fast path: the sharding
# codec uses Morton order internally when iterating a shard's chunks, and
# power-of-2 shard extents hit its optimized path.
morton_shards = (
    (16, 16, 16),  # 2x2x2 chunks -> 8x8x8 = 512 chunks per shard
    (32, 32, 32),  # 2x2x2 chunks -> 16x16x16 = 4096 chunks per shard
)
61+
62+
63+
@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
@pytest.mark.parametrize("shards", morton_shards, ids=str)
def test_sharded_morton_indexing(
    store: Store,
    shards: tuple[int, ...],
    benchmark: BenchmarkFixture,
) -> None:
    """Benchmark sharded array indexing with power-of-2 chunks per shard.

    This benchmark exercises the Morton order iteration path in the sharding
    codec, which benefits from the hypercube and vectorization optimizations.
    The Morton order cache is cleared before each iteration to measure the
    full computation cost.
    """
    from zarr.core.indexing import _morton_order

    # Derive dimensionality from the parametrized shard shape rather than
    # hard-coding 3, so non-3-D shard parameters also work.
    ndim = len(shards)

    # Create an array where each shard contains many small chunks, e.g.
    # shards=(32, 32, 32) with chunks=(2, 2, 2) -> 16x16x16 = 4096 chunks/shard.
    shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
    chunks = (2,) * ndim  # small chunks to maximize chunks per shard

    data = create_array(
        store=store,
        shape=shape,
        dtype="uint8",
        chunks=chunks,
        shards=shards,
        compressors=None,
        filters=None,
        fill_value=0,
    )

    data[:] = 1
    # Read a region the size of one shard to exercise Morton order iteration.
    indexer = tuple(slice(s) for s in shards)

    def read_with_cache_clear() -> None:
        # Clearing the cache forces the full Morton order computation on
        # every benchmark round, not just the first.
        _morton_order.cache_clear()
        getitem(data, indexer)

    benchmark(read_with_cache_clear)
104+
105+
106+
# A larger chunks-per-shard configuration, so the Morton order computation
# is a bigger fraction of total benchmark time.
large_morton_shards = (
    (32, 32, 32),  # 1x1x1 chunks -> 32x32x32 = 32768 chunks per shard
)
110+
111+
112+
@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
@pytest.mark.parametrize("shards", large_morton_shards, ids=str)
def test_sharded_morton_indexing_large(
    store: Store,
    shards: tuple[int, ...],
    benchmark: BenchmarkFixture,
) -> None:
    """Benchmark sharded array indexing with large chunks_per_shard.

    Uses 1x1x1 chunks to maximize chunks_per_shard (32^3 = 32768), making
    the Morton order computation a more significant portion of total time.
    The Morton order cache is cleared before each iteration.
    """
    from zarr.core.indexing import _morton_order

    # Derive dimensionality from the parametrized shard shape rather than
    # hard-coding 3, so non-3-D shard parameters also work.
    ndim = len(shards)

    # Unit chunks mean chunks_per_shard equals the shard shape itself.
    shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
    chunks = (1,) * ndim

    data = create_array(
        store=store,
        shape=shape,
        dtype="uint8",
        chunks=chunks,
        shards=shards,
        compressors=None,
        filters=None,
        fill_value=0,
    )

    data[:] = 1
    # Read one full shard.
    indexer = tuple(slice(s) for s in shards)

    def read_with_cache_clear() -> None:
        # Force the full Morton order computation on every round.
        _morton_order.cache_clear()
        getitem(data, indexer)

    benchmark(read_with_cache_clear)
151+
152+
153+
@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
@pytest.mark.parametrize("shards", large_morton_shards, ids=str)
def test_sharded_morton_single_chunk(
    store: Store,
    shards: tuple[int, ...],
    benchmark: BenchmarkFixture,
) -> None:
    """Benchmark reading a single chunk from a large shard.

    This isolates the Morton order computation overhead by minimizing I/O.
    Reading one chunk from a shard with 32^3 = 32768 chunks still requires
    computing the full Morton order, making the optimization impact clear.
    The Morton order cache is cleared before each iteration.
    """
    from zarr.core.indexing import _morton_order

    # Derive dimensionality from the parametrized shard shape rather than
    # hard-coding 3, so non-3-D shard parameters also work.
    ndim = len(shards)

    # Unit chunks mean chunks_per_shard equals the shard shape itself.
    shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
    chunks = (1,) * ndim

    data = create_array(
        store=store,
        shape=shape,
        dtype="uint8",
        chunks=chunks,
        shards=shards,
        compressors=None,
        filters=None,
        fill_value=0,
    )

    data[:] = 1
    # Read only a single (unit) chunk from the first shard.
    indexer = (slice(1),) * ndim

    def read_with_cache_clear() -> None:
        # Force the full Morton order computation on every round.
        _morton_order.cache_clear()
        getitem(data, indexer)

    benchmark(read_with_cache_clear)
193+
194+
195+
# Chunk-grid shapes for benchmarking morton_order_iter directly, with no
# array I/O involved. Element counts: 8^3=512, 16^3=4096, 32^3=32768.
morton_iter_shapes = tuple((n, n, n) for n in (8, 16, 32))
201+
202+
203+
@pytest.mark.parametrize("shape", morton_iter_shapes, ids=str)
def test_morton_order_iter(
    shape: tuple[int, ...],
    benchmark: BenchmarkFixture,
) -> None:
    """Benchmark morton_order_iter directly without I/O.

    This isolates the Morton order computation to measure the
    optimization impact without array read/write overhead.
    The cache is cleared before each iteration.
    """
    from zarr.core.indexing import _morton_order, morton_order_iter

    def run_once() -> None:
        # Start from a cold cache so each round pays the full cost,
        # then drain the iterator to force the whole computation.
        _morton_order.cache_clear()
        list(morton_order_iter(shape))

    benchmark(run_once)
222+
223+
224+
@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
@pytest.mark.parametrize("shards", large_morton_shards, ids=str)
def test_sharded_morton_write_single_chunk(
    store: Store,
    shards: tuple[int, ...],
    benchmark: BenchmarkFixture,
) -> None:
    """Benchmark writing a single chunk to a large shard.

    This is the clearest end-to-end demonstration of Morton order optimization.
    Writing a single chunk to a shard with 32^3 = 32768 chunks requires
    computing the full Morton order, but minimizes I/O overhead.

    Expected improvement: ~160ms (matching Morton computation speedup of ~178ms).
    The Morton order cache is cleared before each iteration.
    """
    import numpy as np

    from zarr.core.indexing import _morton_order

    # Derive dimensionality from the parametrized shard shape rather than
    # hard-coding 3, so non-3-D shard parameters also work.
    ndim = len(shards)

    # Unit chunks mean chunks_per_shard equals the shard shape itself.
    shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
    chunks = (1,) * ndim

    data = create_array(
        store=store,
        shape=shape,
        dtype="uint8",
        chunks=chunks,
        shards=shards,
        compressors=None,
        filters=None,
        fill_value=0,
    )

    # Write data covering exactly one (unit) chunk.
    write_data = np.ones((1,) * ndim, dtype="uint8")
    indexer = (slice(1),) * ndim

    def write_with_cache_clear() -> None:
        # Force the full Morton order computation on every round.
        _morton_order.cache_clear()
        data[indexer] = write_data

    benchmark(write_with_cache_clear)

0 commit comments

Comments
 (0)