
Commit 850bbe4

d-v-b and claude committed
feat: implement thread-pool parallelism for sync read/write
read_sync and write_sync now support an n_workers parameter. When > 0, the decode (read) or decode+merge+encode (write) compute steps are parallelized across threads via ThreadPoolExecutor.map. IO remains sequential. This helps when codecs release the GIL (gzip, blosc, zstd); e.g. gzip decompression is 41% of read time and runs entirely in C.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 01f4445 commit 850bbe4
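Why threads help at all here: CPython's gzip module, like the blosc and zstd codecs, releases the GIL during (de)compression, so a thread pool can overlap that C-level work without multiprocessing. A minimal, self-contained sketch of the effect (not part of this commit; standard library only):

import gzip
from concurrent.futures import ThreadPoolExecutor

# Toy stand-ins for compressed chunk payloads; real chunks would come from a store.
payloads = [gzip.compress(bytes(range(256)) * 4000) for _ in range(8)]

def decode(buf: bytes) -> bytes:
    # gzip.decompress runs in C and releases the GIL, so worker threads can overlap.
    return gzip.decompress(buf)

sequential = [decode(p) for p in payloads]

# Threaded decode of the same payloads; map preserves input order.
with ThreadPoolExecutor(max_workers=4) as pool:
    threaded = list(pool.map(decode, payloads))

assert sequential == threaded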

1 file changed (66 additions & 22 deletions)

src/zarr/core/codec_pipeline.py
@@ -940,7 +940,12 @@ def read_sync(
         drop_axes: tuple[int, ...] = (),
         n_workers: int = 0,
     ) -> tuple[GetResult, ...]:
-        """Synchronous read: fetch -> decode -> scatter, per chunk."""
+        """Synchronous read: fetch -> decode -> scatter, per chunk.
+
+        When ``n_workers > 0`` and there are multiple chunks, the decode
+        step is parallelized across threads. This helps when codecs
+        release the GIL (e.g. gzip, blosc, zstd).
+        """
         assert self._sync_transform is not None
         transform = self._sync_transform
 
@@ -951,20 +956,39 @@ def read_sync(
         fill = fill_value_or_default(batch[0][1])
         _missing = GetResult(status="missing")
 
-        results: list[GetResult] = []
-        for bg, chunk_spec, chunk_selection, out_selection, _ in batch:
-            raw = bg.get_sync(prototype=chunk_spec.prototype)  # type: ignore[attr-defined]
-            if raw is None:
-                out[out_selection] = fill
-                results.append(_missing)
-                continue
+        # Phase 1: fetch all chunks (IO, sequential)
+        raw_buffers: list[Buffer | None] = [
+            bg.get_sync(prototype=cs.prototype)  # type: ignore[attr-defined]
+            for bg, cs, *_ in batch
+        ]
 
+        # Phase 2: decode (compute, optionally threaded)
+        def _decode_one(raw: Buffer | None, chunk_spec: ArraySpec) -> NDBuffer | None:
+            if raw is None:
+                return None
             chunk_shape = (
                 chunk_spec.shape
                 if chunk_spec.shape != transform.array_spec.shape
                 else None
             )
-            decoded = transform.decode_chunk(raw, chunk_shape=chunk_shape)
+            return transform.decode_chunk(raw, chunk_shape=chunk_shape)
+
+        specs = [cs for _, cs, *_ in batch]
+        if n_workers > 0 and len(batch) > 1:
+            with ThreadPoolExecutor(max_workers=n_workers) as pool:
+                decoded_list = list(pool.map(_decode_one, raw_buffers, specs))
+        else:
+            decoded_list = [_decode_one(raw, spec) for raw, spec in zip(raw_buffers, specs, strict=True)]
+
+        # Phase 3: scatter (sequential, writes to shared output buffer)
+        results: list[GetResult] = []
+        for (_, _chunk_spec, chunk_selection, out_selection, _), decoded in zip(
+            batch, decoded_list, strict=True
+        ):
+            if decoded is None:
+                out[out_selection] = fill
+                results.append(_missing)
+                continue
 
             selected = decoded[chunk_selection]
             if drop_axes:
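The read path above splits the work into three phases: sequential fetch (IO), optionally threaded decode (compute), and sequential scatter into the shared output buffer. A minimal sketch of the same split, using hypothetical fetch/decode/scatter callables in place of the store, the codec transform, and the output assignment:

from concurrent.futures import ThreadPoolExecutor
from typing import Any, Callable, Sequence

def read_batch(
    keys: Sequence[Any],
    fetch: Callable[[Any], bytes | None],
    decode: Callable[[bytes | None], Any],
    scatter: Callable[[Any, Any], None],
    n_workers: int = 0,
) -> None:
    # Phase 1: IO stays sequential.
    raw = [fetch(k) for k in keys]
    # Phase 2: compute is threaded only when requested and there is more than one chunk.
    if n_workers > 0 and len(keys) > 1:
        with ThreadPoolExecutor(max_workers=n_workers) as pool:
            decoded = list(pool.map(decode, raw))
    else:
        decoded = [decode(r) for r in raw]
    # Phase 3: write into the shared output sequentially, in input order.
    for key, chunk in zip(keys, decoded, strict=True):
        scatter(key, chunk)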
@@ -981,31 +1005,42 @@ def write_sync(
         drop_axes: tuple[int, ...] = (),
         n_workers: int = 0,
     ) -> None:
-        """Synchronous write: merge -> encode -> store, per chunk."""
+        """Synchronous write: fetch existing -> merge+encode -> store.
+
+        When ``n_workers > 0`` and there are multiple chunks, the
+        merge+encode step is parallelized across threads.
+        """
         assert self._sync_transform is not None
         transform = self._sync_transform
 
         batch = list(batch_info)
         if not batch:
             return
 
-        for bs, chunk_spec, chunk_selection, out_selection, is_complete in batch:
+        # Phase 1: fetch existing chunks (IO, sequential)
+        existing_buffers: list[Buffer | None] = [
+            None if ic else bs.get_sync(prototype=cs.prototype)  # type: ignore[attr-defined]
+            for bs, cs, _, _, ic in batch
+        ]
+
+        # Phase 2: decode + merge + encode (compute, optionally threaded)
+        def _process_one(
+            idx: int,
+        ) -> Buffer | None:
+            _, chunk_spec, chunk_selection, out_selection, is_complete = batch[idx]
+            existing_bytes = existing_buffers[idx]
             chunk_shape = (
                 chunk_spec.shape
                 if chunk_spec.shape != transform.array_spec.shape
                 else None
             )
 
-            # Decode existing chunk if partial write
             existing_chunk_array: NDBuffer | None = None
-            if not is_complete:
-                existing_bytes = bs.get_sync(prototype=chunk_spec.prototype)  # type: ignore[attr-defined]
-                if existing_bytes is not None:
-                    existing_chunk_array = transform.decode_chunk(
-                        existing_bytes, chunk_shape=chunk_shape
-                    )
+            if existing_bytes is not None:
+                existing_chunk_array = transform.decode_chunk(
+                    existing_bytes, chunk_shape=chunk_shape
+                )
 
-            # Merge
             chunk_array = self._merge_chunk_array(
                 existing_chunk_array,
                 value,
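Where the read path maps _decode_one over two parallel lists, the write path maps its worker over integer indices so one function can pick up both the batch tuple and the pre-fetched existing buffer. A tiny, generic sketch of that index-based pool.map pattern (hypothetical names, not the commit's code):

from concurrent.futures import ThreadPoolExecutor

items = ["a", "b", "c"]
extras = [1, 2, 3]

def process_one(idx: int) -> str:
    # Indexing keeps several per-item sequences together without zipping
    # multiple iterables into pool.map.
    return f"{items[idx]}:{extras[idx]}"

with ThreadPoolExecutor(max_workers=2) as pool:
    results = list(pool.map(process_one, range(len(items))))

assert results == ["a:1", "b:2", "c:3"]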
@@ -1020,10 +1055,19 @@ def write_sync(
             if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
                 fill_value_or_default(chunk_spec)
             ):
-                bs.delete_sync()  # type: ignore[attr-defined]
-                continue
+                return None
+
+            return transform.encode_chunk(chunk_array, chunk_shape=chunk_shape)
+
+        indices = list(range(len(batch)))
+        if n_workers > 0 and len(batch) > 1:
+            with ThreadPoolExecutor(max_workers=n_workers) as pool:
+                encoded_list = list(pool.map(_process_one, indices))
+        else:
+            encoded_list = [_process_one(i) for i in indices]
 
-            encoded = transform.encode_chunk(chunk_array, chunk_shape=chunk_shape)
+        # Phase 3: store results (IO, sequential)
+        for (bs, *_rest), encoded in zip(batch, encoded_list, strict=True):
             if encoded is None:
                 bs.delete_sync()  # type: ignore[attr-defined]
             else:
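Phase 3 in both methods depends on ThreadPoolExecutor.map returning results in input order rather than completion order; that is what lets zip(batch, decoded_list) and zip(batch, encoded_list) pair each chunk with its own result. A short demonstration of that ordering guarantee:

import time
from concurrent.futures import ThreadPoolExecutor

def slow_identity(x: int) -> int:
    time.sleep(0.05 * (3 - x))  # later inputs finish first
    return x

with ThreadPoolExecutor(max_workers=3) as pool:
    # Results come back in input order, not completion order.
    assert list(pool.map(slow_identity, [0, 1, 2])) == [0, 1, 2]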
