@@ -50,3 +50,218 @@ def test_slice_indexing(
5050
5151 data [:] = 1
5252 benchmark (getitem , data , indexer )
53+
54+
# Benchmark for Morton order optimization with power-of-2 shards.
# The sharding codec iterates chunks within a shard in Morton order.
morton_shards = (
    (16, 16, 16),  # with 2x2x2 chunks: 8**3 = 512 chunks per shard
    (32, 32, 32),  # with 2x2x2 chunks: 16**3 = 4096 chunks per shard
)
61+
62+
@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
@pytest.mark.parametrize("shards", morton_shards, ids=str)
def test_sharded_morton_indexing(
    store: Store,
    shards: tuple[int, ...],
    benchmark: BenchmarkFixture,
) -> None:
    """Benchmark sharded array indexing with power-of-2 chunks per shard.

    This benchmark exercises the Morton order iteration path in the sharding
    codec, which benefits from the hypercube and vectorization optimizations.
    The Morton order cache is cleared before each iteration to measure the
    full computation cost.
    """
    from zarr.core.indexing import _morton_order

    # Create an array where each shard contains many small chunks, e.g.
    # shards=(32, 32, 32) with chunks=(2, 2, 2) -> 16**3 = 4096 chunks/shard.
    # Derive the rank from ``shards`` rather than hard-coding 3D so the
    # parametrization can be extended to other dimensionalities.
    ndim = len(shards)
    shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
    chunks = (2,) * ndim  # small chunks to maximize chunks per shard

    data = create_array(
        store=store,
        shape=shape,
        dtype="uint8",
        chunks=chunks,
        shards=shards,
        compressors=None,
        filters=None,
        fill_value=0,
    )

    data[:] = 1
    # Read a sub-shard region (one full shard per axis) to exercise Morton
    # order iteration; per-axis slices also handle non-cubic shards.
    indexer = tuple(slice(s) for s in shards)

    def read_with_cache_clear() -> None:
        # Clear the memoized Morton order so every benchmark round pays the
        # full computation cost instead of hitting the cache.
        _morton_order.cache_clear()
        getitem(data, indexer)

    benchmark(read_with_cache_clear)
104+
105+
# Benchmark with a larger chunks_per_shard so the Morton order cost is a
# bigger share of total runtime.
large_morton_shards = (
    (32, 32, 32),  # with 1x1x1 chunks: 32**3 = 32768 chunks per shard
)
110+
111+
@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
@pytest.mark.parametrize("shards", large_morton_shards, ids=str)
def test_sharded_morton_indexing_large(
    store: Store,
    shards: tuple[int, ...],
    benchmark: BenchmarkFixture,
) -> None:
    """Benchmark sharded array indexing with large chunks_per_shard.

    Uses 1x1x1 chunks to maximize chunks_per_shard (32^3 = 32768), making
    the Morton order computation a more significant portion of total time.
    The Morton order cache is cleared before each iteration.
    """
    from zarr.core.indexing import _morton_order

    # 1x1x1 chunks means chunks_per_shard equals the shard shape. Derive the
    # rank from ``shards`` rather than hard-coding 3D.
    ndim = len(shards)
    shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
    chunks = (1,) * ndim  # 1x1x1 chunks: chunks_per_shard == shards

    data = create_array(
        store=store,
        shape=shape,
        dtype="uint8",
        chunks=chunks,
        shards=shards,
        compressors=None,
        filters=None,
        fill_value=0,
    )

    data[:] = 1
    # Read exactly one full shard; per-axis slices handle non-cubic shards.
    indexer = tuple(slice(s) for s in shards)

    def read_with_cache_clear() -> None:
        # Pay the full Morton computation cost on every round.
        _morton_order.cache_clear()
        getitem(data, indexer)

    benchmark(read_with_cache_clear)
151+
152+
@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
@pytest.mark.parametrize("shards", large_morton_shards, ids=str)
def test_sharded_morton_single_chunk(
    store: Store,
    shards: tuple[int, ...],
    benchmark: BenchmarkFixture,
) -> None:
    """Benchmark reading a single chunk from a large shard.

    This isolates the Morton order computation overhead by minimizing I/O.
    Reading one chunk from a shard with 32^3 = 32768 chunks still requires
    computing the full Morton order, making the optimization impact clear.
    The Morton order cache is cleared before each iteration.
    """
    from zarr.core.indexing import _morton_order

    # 1x1x1 chunks means chunks_per_shard equals the shard shape. Derive the
    # rank from ``shards`` rather than hard-coding 3D.
    ndim = len(shards)
    shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
    chunks = (1,) * ndim  # 1x1x1 chunks: chunks_per_shard == shards

    data = create_array(
        store=store,
        shape=shape,
        dtype="uint8",
        chunks=chunks,
        shards=shards,
        compressors=None,
        filters=None,
        fill_value=0,
    )

    data[:] = 1
    # Read only a single chunk (one element per axis) from the shard.
    indexer = (slice(1),) * ndim

    def read_with_cache_clear() -> None:
        # Pay the full Morton computation cost on every round.
        _morton_order.cache_clear()
        getitem(data, indexer)

    benchmark(read_with_cache_clear)
193+
194+
# Shapes for benchmarking morton_order_iter directly (no I/O involved).
morton_iter_shapes = (
    (8,) * 3,  # 512 elements
    (16,) * 3,  # 4096 elements
    (32,) * 3,  # 32768 elements
)
201+
202+
@pytest.mark.parametrize("shape", morton_iter_shapes, ids=str)
def test_morton_order_iter(
    shape: tuple[int, ...],
    benchmark: BenchmarkFixture,
) -> None:
    """Benchmark morton_order_iter directly without I/O.

    This isolates the Morton order computation to measure the
    optimization impact without array read/write overhead.
    The cache is cleared before each iteration.
    """
    from zarr.core.indexing import _morton_order, morton_order_iter

    def compute_morton_order() -> None:
        _morton_order.cache_clear()
        # Drain the iterator so the full order is actually computed.
        for _ in morton_order_iter(shape):
            pass

    benchmark(compute_morton_order)
222+
223+
@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
@pytest.mark.parametrize("shards", large_morton_shards, ids=str)
def test_sharded_morton_write_single_chunk(
    store: Store,
    shards: tuple[int, ...],
    benchmark: BenchmarkFixture,
) -> None:
    """Benchmark writing a single chunk to a large shard.

    This is the clearest end-to-end demonstration of Morton order optimization.
    Writing a single chunk to a shard with 32^3 = 32768 chunks requires
    computing the full Morton order, but minimizes I/O overhead.

    Expected improvement: ~160ms (matching Morton computation speedup of ~178ms).
    The Morton order cache is cleared before each iteration.
    """
    import numpy as np

    from zarr.core.indexing import _morton_order

    # 1x1x1 chunks means chunks_per_shard equals the shard shape. Derive the
    # rank from ``shards`` rather than hard-coding 3D.
    ndim = len(shards)
    shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
    chunks = (1,) * ndim  # 1x1x1 chunks: chunks_per_shard == shards

    data = create_array(
        store=store,
        shape=shape,
        dtype="uint8",
        chunks=chunks,
        shards=shards,
        compressors=None,
        filters=None,
        fill_value=0,
    )

    # Write data covering exactly one chunk (one element per axis).
    write_data = np.ones((1,) * ndim, dtype="uint8")
    indexer = (slice(1),) * ndim

    def write_with_cache_clear() -> None:
        # Pay the full Morton computation cost on every round.
        _morton_order.cache_clear()
        data[indexer] = write_data

    benchmark(write_with_cache_clear)