Skip to content

Commit 6996284

Browse files
committed
remove batch size parameter; add changelog entry
1 parent b1b876a commit 6996284

5 files changed

Lines changed: 45 additions & 48 deletions

File tree

changes/3715.misc.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
Added several performance optimizations to chunk encoding and decoding. Low-latency stores that do not benefit from
2+
`async` operations can now implement synchronous I/O methods, which will be used when available during chunk processing.
3+
Similarly, codecs can implement a synchronous API which will be used if available during chunk processing.
4+
These changes remove unnecessary interactions with the event loop.
5+
6+
The synchronous chunk processing path optionally uses a thread pool to parallelize execution. The number of threads is chosen
7+
based on the estimated compute load of each chunk, which takes into account known encoding and decoding profiles for
8+
different codecs. This algorithm is aware of the latency required for setting up the thread pool, and for
9+
single-chunk workloads we skip the thread pool entirely (unless a minimum worker count is configured).
10+
11+
Use of the thread pool can be disabled in the global configuration. The minimum number of threads
12+
and the maximum number of threads can likewise be set via the configuration.

src/zarr/core/codec_pipeline.py

Lines changed: 24 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import os
44
from concurrent.futures import ThreadPoolExecutor
55
from dataclasses import dataclass
6-
from itertools import islice, pairwise
6+
from itertools import pairwise
77
from typing import TYPE_CHECKING, Any, TypeVar, cast
88
from warnings import warn
99

@@ -46,14 +46,6 @@ def _unzip2(iterable: Iterable[tuple[T, U]]) -> tuple[list[T], list[U]]:
4646
return (out0, out1)
4747

4848

49-
def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]:
50-
if n < 1:
51-
raise ValueError("n must be at least one")
52-
it = iter(iterable)
53-
while batch := tuple(islice(it, n)):
54-
yield batch
55-
56-
5749
def resolve_batched(codec: Codec, chunk_specs: Iterable[ArraySpec]) -> Iterable[ArraySpec]:
5850
return [codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs]
5951

@@ -153,25 +145,37 @@ def _choose_workers(
153145
*,
154146
is_encode: bool = False,
155147
) -> int:
156-
"""Decide how many thread pool workers to use (0 = don't use pool)."""
157-
if n_chunks < 2:
148+
"""Decide how many thread pool workers to use (0 = don't use pool).
149+
150+
Respects ``threading.codec_workers`` config:
151+
- ``enabled``: if False, always returns 0.
152+
- ``min``: floor for the number of workers.
153+
- ``max``: ceiling for the number of workers (default: ``os.cpu_count()``).
154+
"""
155+
codec_workers = config.get("threading.codec_workers")
156+
if not codec_workers.get("enabled", True):
158157
return 0
159158

159+
min_workers: int = codec_workers.get("min", 0)
160+
max_workers: int = codec_workers.get("max") or os.cpu_count() or 4
161+
162+
if n_chunks < 2:
163+
return min_workers
164+
160165
per_chunk_ns = _estimate_chunk_work_ns(chunk_nbytes, codecs, is_encode=is_encode)
161166

162-
if per_chunk_ns < _POOL_OVERHEAD_NS:
167+
if per_chunk_ns < _POOL_OVERHEAD_NS and min_workers == 0:
163168
return 0
164169

165170
total_work_ns = per_chunk_ns * n_chunks
166171
total_dispatch_ns = n_chunks * 50_000 # ~50us per task
167-
if total_work_ns < total_dispatch_ns * 3:
172+
if total_work_ns < total_dispatch_ns * 3 and min_workers == 0:
168173
return 0
169174

170175
target_per_worker_ns = 1_000_000 # 1ms
171176
workers = max(1, int(total_work_ns / target_per_worker_ns))
172177

173-
cpu_count = os.cpu_count() or 4
174-
return min(workers, n_chunks, cpu_count)
178+
return max(min_workers, min(workers, n_chunks, max_workers))
175179

176180

177181
def _get_pool(max_workers: int) -> ThreadPoolExecutor:
@@ -208,7 +212,6 @@ class BatchedCodecPipeline(CodecPipeline):
208212
array_array_codecs: tuple[ArrayArrayCodec, ...]
209213
array_bytes_codec: ArrayBytesCodec
210214
bytes_bytes_codecs: tuple[BytesBytesCodec, ...]
211-
batch_size: int
212215

213216
@property
214217
def _all_sync(self) -> bool:
@@ -219,14 +222,13 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
219222
return type(self).from_codecs(c.evolve_from_array_spec(array_spec=array_spec) for c in self)
220223

221224
@classmethod
222-
def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self:
225+
def from_codecs(cls, codecs: Iterable[Codec]) -> Self:
223226
array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(list(codecs))
224227

225228
return cls(
226229
array_array_codecs=array_array_codecs,
227230
array_bytes_codec=array_bytes_codec,
228231
bytes_bytes_codecs=bytes_bytes_codecs,
229-
batch_size=batch_size or config.get("codec_pipeline.batch_size"),
230232
)
231233

232234
@property
@@ -478,10 +480,7 @@ async def decode(
478480
]
479481

480482
# Async fallback: layer-by-layer across all chunks.
481-
output: list[NDBuffer | None] = []
482-
for batch_info in batched(items, self.batch_size):
483-
output.extend(await self.decode_batch(batch_info))
484-
return output
483+
return list(await self.decode_batch(items))
485484

486485
async def encode(
487486
self,
@@ -496,10 +495,7 @@ async def encode(
496495
return [self._encode_one(chunk_array, chunk_spec) for chunk_array, chunk_spec in items]
497496

498497
# Async fallback: layer-by-layer across all chunks.
499-
output: list[Buffer | None] = []
500-
for single_batch_info in batched(items, self.batch_size):
501-
output.extend(await self.encode_batch(single_batch_info))
502-
return output
498+
return list(await self.encode_batch(items))
503499

504500
# -------------------------------------------------------------------
505501
# Async read / write (IO overlap via concurrent_map)
@@ -610,14 +606,7 @@ async def read(
610606
out: NDBuffer,
611607
drop_axes: tuple[int, ...] = (),
612608
) -> None:
613-
await concurrent_map(
614-
[
615-
(single_batch_info, out, drop_axes)
616-
for single_batch_info in batched(batch_info, self.batch_size)
617-
],
618-
self.read_batch,
619-
config.get("async.concurrency"),
620-
)
609+
await self.read_batch(batch_info, out, drop_axes)
621610

622611
def _merge_chunk_array(
623612
self,
@@ -840,14 +829,7 @@ async def write(
840829
value: NDBuffer,
841830
drop_axes: tuple[int, ...] = (),
842831
) -> None:
843-
await concurrent_map(
844-
[
845-
(single_batch_info, value, drop_axes)
846-
for single_batch_info in batched(batch_info, self.batch_size)
847-
],
848-
self.write_batch,
849-
config.get("async.concurrency"),
850-
)
832+
await self.write_batch(batch_info, value, drop_axes)
851833

852834
# -------------------------------------------------------------------
853835
# Fully synchronous read / write (no event loop)

src/zarr/core/config.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,13 @@ def enable_gpu(self) -> ConfigSet:
9999
"target_shard_size_bytes": None,
100100
},
101101
"async": {"concurrency": 10, "timeout": None},
102-
"threading": {"max_workers": None},
102+
"threading": {
103+
"max_workers": None,
104+
"codec_workers": {"enabled": True, "min": 0, "max": None},
105+
},
103106
"json_indent": 2,
104107
"codec_pipeline": {
105108
"path": "zarr.core.codec_pipeline.BatchedCodecPipeline",
106-
"batch_size": 1,
107109
},
108110
"codecs": {
109111
"blosc": "zarr.codecs.blosc.BloscCodec",

tests/package_with_entrypoint/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) ->
4040

4141

4242
class TestEntrypointCodecPipeline(CodecPipeline):
43-
def __init__(self, batch_size: int = 1) -> None:
43+
def __init__(self) -> None:
4444
pass
4545

4646
async def encode(

tests/test_config.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,11 +56,13 @@ def test_config_defaults_set() -> None:
5656
"target_shard_size_bytes": None,
5757
},
5858
"async": {"concurrency": 10, "timeout": None},
59-
"threading": {"max_workers": None},
59+
"threading": {
60+
"max_workers": None,
61+
"codec_workers": {"enabled": True, "min": 0, "max": None},
62+
},
6063
"json_indent": 2,
6164
"codec_pipeline": {
6265
"path": "zarr.core.codec_pipeline.BatchedCodecPipeline",
63-
"batch_size": 1,
6466
},
6567
"codecs": {
6668
"blosc": "zarr.codecs.blosc.BloscCodec",
@@ -103,7 +105,6 @@ def test_config_defaults_set() -> None:
103105
assert config.get("array.order") == "C"
104106
assert config.get("async.concurrency") == 10
105107
assert config.get("async.timeout") is None
106-
assert config.get("codec_pipeline.batch_size") == 1
107108
assert config.get("json_indent") == 2
108109

109110

0 commit comments

Comments
 (0)