zarr-developers
diff --git a/‎changes/3925.feature.md‎
Lines changed: 1 addition & 0 deletions b/‎changes/3925.feature.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/zarr/abc/store.py‎
Lines changed: 62 additions & 1 deletion b/‎src/zarr/abc/store.py‎
Lines changed: 62 additions & 1 deletion
diff --git a/‎src/zarr/core/_coalesce.py‎
Lines changed: 222 additions & 0 deletions b/‎src/zarr/core/_coalesce.py‎
Lines changed: 222 additions & 0 deletions
diff --git a/‎src/zarr/storage/_wrapper.py‎
Lines changed: 27 additions & 1 deletion b/‎src/zarr/storage/_wrapper.py‎
Lines changed: 27 additions & 1 deletion
@@ -0,0 +1 @@
+Add `zarr.abc.store.Store.get_ranges` for concurrent, coalesced multi-range reads from a single key. The method is defined on the `Store` ABC with a default implementation built on `Store.get`, so every store inherits a working version; stores with native multi-range backends (e.g. `FsspecStore`) can override for efficiency. Coalescing knobs (`max_concurrency`, `max_gap_bytes`, `max_coalesced_bytes`) are passed as keyword arguments to `get_ranges`. Failures from underlying fetches surface as a `BaseExceptionGroup` (PEP 654); callers should use `except*` to filter for specific exception types such as `FileNotFoundError`.
@@ -4,13 +4,14 @@
 import json
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
+from functools import partial
 from itertools import starmap
 from typing import TYPE_CHECKING, Literal, Protocol, runtime_checkable
 
 from zarr.core.sync import sync
 
 if TYPE_CHECKING:
-    from collections.abc import AsyncGenerator, AsyncIterator, Iterable
+    from collections.abc import AsyncGenerator, AsyncIterator, Iterable, Sequence
     from types import TracebackType
     from typing import Any, Self
 
@@ -616,6 +617,66 @@ async def _get_many(
         for req in requests:
             yield (req[0], await self.get(*req))
 
+    async def get_ranges(
+        self,
+        key: str,
+        byte_ranges: Sequence[ByteRequest | None],
+        *,
+        prototype: BufferPrototype,
+        max_concurrency: int = 10,
+        max_gap_bytes: int = 1 << 20,  # 1 MiB
+        max_coalesced_bytes: int = 16 << 20,  # 16 MiB
+    ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]:
+        """Fetch several byte ranges of a single `key`, coalescing neighbors.
+
+        This default implementation hands `self.get` (with `key` and
+        `prototype` pre-bound) to the coalescer, so every subclass inherits a
+        working version. Stores with a more efficient backend (e.g. ranged
+        HTTP, S3 byte-range fetches) should override it.
+
+        Each yielded batch corresponds to one underlying I/O operation and is
+        a sequence of `(input_index, Buffer | None)` tuples. Batches are
+        yielded in completion order, not in the order of `byte_ranges`.
+
+        Parameters
+        ----------
+        key
+            Storage key to read from.
+        byte_ranges
+            Ranges to read. A `None` entry requests the whole value.
+        prototype
+            Buffer prototype, passed through to `self.get`.
+        max_concurrency
+            Upper limit on the number of merged fetches in flight at once.
+        max_gap_bytes
+            Two `RangeByteRequest`s whose gap is at most this many bytes may
+            be served by a single fetch.
+        max_coalesced_bytes
+            Largest span a single merged fetch is allowed to cover.
+
+        Raises
+        ------
+        BaseExceptionGroup
+            Fetch failures are delivered as a `BaseExceptionGroup` (PEP 654)
+            and should be handled with `except*`. The group can contain
+            `FileNotFoundError` (a fetch returned `None`, i.e. `key` is
+            absent) as well as whatever `self.get` raised for a range.
+            Outstanding fetches are cancelled as soon as one task fails, so
+            the group typically holds a single non-`CancelledError`
+            exception even under high concurrency.
+        """
+        # Imported lazily: zarr.core._coalesce itself imports from this module.
+        from zarr.core._coalesce import coalesced_get
+
+        # Bind key and prototype so the coalescer only supplies the byte range.
+        read_range = partial(self.get, key, prototype)
+        batches = coalesced_get(
+            read_range,
+            byte_ranges,
+            max_concurrency=max_concurrency,
+            max_gap_bytes=max_gap_bytes,
+            max_coalesced_bytes=max_coalesced_bytes,
+        )
+        async for batch in batches:
+            yield batch
+
+
     async def getsize(self, key: str) -> int:
         """
         Return the size, in bytes, of a value in a Store.
 
@@ -0,0 +1,222 @@
+# src/zarr/core/_coalesce.py
+from __future__ import annotations
+
+import asyncio
+from typing import TYPE_CHECKING, NamedTuple
+
+from zarr.abc.store import RangeByteRequest
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator, Awaitable, Callable, Sequence
+
+    from zarr.abc.store import ByteRequest
+    from zarr.core.buffer import Buffer
+
+
+class _WorkerCtx(NamedTuple):
+    """Shared state passed to the per-task worker coroutines.
+
+    Bundling these lets the workers declare their dependencies as one
+    parameter instead of capturing them implicitly via closure. One
+    instance is built by `coalesced_get` and handed to every
+    `_fetch_single` / `_fetch_group` task it spawns.
+    """
+
+    # Reads one byte range. The workers await it and treat a `None` result
+    # as "key is absent" (they raise `FileNotFoundError`).
+    fetch: Callable[[ByteRequest | None], Awaitable[Buffer | None]]
+    # Held by each worker around its `fetch` call to bound concurrent I/O.
+    semaphore: asyncio.Semaphore
+
+
+async def _fetch_single(
+    ctx: _WorkerCtx, idx: int, req: ByteRequest | None
+) -> Sequence[tuple[int, Buffer | None]]:
+    """Fetch one byte range and return it as a one-element batch.
+
+    Parameters
+    ----------
+    ctx
+        Worker context providing `fetch` and the concurrency `semaphore`.
+    idx
+        Position of `req` in the caller's input sequence; echoed back in the
+        result so the caller can match buffers to inputs.
+    req
+        The request to pass to `ctx.fetch`. `None` is forwarded unchanged.
+
+    Returns
+    -------
+    Sequence[tuple[int, Buffer | None]]
+        A one-element tuple `((idx, buffer),)`.
+
+    Raises
+    ------
+    FileNotFoundError
+        If `ctx.fetch` returns `None`. The message names the failing request
+        so the error stays diagnosable after it is wrapped in an exception
+        group.
+    """
+    # The semaphore is held only for the I/O itself, not for the None check.
+    async with ctx.semaphore:
+        buf = await ctx.fetch(req)
+    if buf is None:
+        raise FileNotFoundError(f"key not found while fetching byte range {req!r}")
+    return ((idx, buf),)
+
+
+async def _fetch_group(
+    ctx: _WorkerCtx, members: list[tuple[int, RangeByteRequest]]
+) -> Sequence[tuple[int, Buffer | None]]:
+    """Fetch one merged byte range and slice it back into per-input buffers.
+
+    Parameters
+    ----------
+    ctx
+        Worker context providing `fetch` and the concurrency `semaphore`.
+    members
+        Non-empty list of `(input_index, RangeByteRequest)` pairs, already
+        sorted by `start`; callers in this module build it from the sorted
+        mergeable list. The first member's `start` is used as the start of
+        the merged fetch.
+
+    Returns
+    -------
+    Sequence[tuple[int, Buffer | None]]
+        One `(input_index, buffer)` tuple per member, in `members` order.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the fetch returns `None` (the key is absent). The message names
+        the merged span so the error stays diagnosable after it is wrapped
+        in an exception group.
+    """
+    # A lone range needs no merging or slicing; reuse the single-fetch path.
+    if len(members) == 1:
+        solo_idx, solo_req = members[0]
+        return await _fetch_single(ctx, solo_idx, solo_req)
+
+    start = members[0][1].start
+    # Members may overlap or nest, so the last member is not necessarily the
+    # one that reaches furthest.
+    end = max(r.end for _, r in members)
+    merged = RangeByteRequest(start, end)
+    async with ctx.semaphore:
+        big = await ctx.fetch(merged)
+    if big is None:
+        raise FileNotFoundError(
+            f"key not found while fetching merged byte range {merged!r} "
+            f"covering {len(members)} requests"
+        )
+    # Offsets inside `big` are relative to the merged start.
+    return tuple((idx, big[r.start - start : r.end - start]) for idx, r in members)
+
+
+def coalesce_ranges(
+    byte_ranges: Sequence[ByteRequest | None],
+    *,
+    max_gap_bytes: int,
+    max_coalesced_bytes: int,
+) -> tuple[
+    list[list[tuple[int, RangeByteRequest]]],
+    list[tuple[int, ByteRequest | None]],
+]:
+    """Build the I/O plan for a batch of byte-range requests.
+
+    No I/O happens here. The returned plan says which inputs share one
+    coalesced fetch (a group) and which must be fetched exactly as given
+    (an uncoalescable item).
+
+    Both tuning knobs are required keyword arguments: `Store.get_ranges` is
+    the public entry point and owns the canonical defaults, so they are not
+    duplicated in this function.
+
+    Parameters
+    ----------
+    byte_ranges
+        Input ranges. `None` means "the whole value".
+    max_gap_bytes
+        A range joins the current group only if its `start` minus the
+        group's running `end` is at most this many bytes.
+    max_coalesced_bytes
+        A range joins the current group only if the resulting merged span
+        is at most this many bytes.
+
+    Returns
+    -------
+    groups
+        Merged groups, each a list of `(input_index, RangeByteRequest)`
+        pairs sorted by `start`. A one-element group is a
+        `RangeByteRequest` that did not merge with any neighbor.
+    uncoalescable
+        `(input_index, request)` pairs, in input order, for every input
+        that is not a `RangeByteRequest` (`OffsetByteRequest`,
+        `SuffixByteRequest`, `None`).
+
+    Notes
+    -----
+    Only `RangeByteRequest` inputs take part in coalescing, and a range is
+    merged only when both the gap condition and the size condition hold.
+    """
+    # Split the inputs by kind, remembering each one's original position.
+    ranged: list[tuple[int, RangeByteRequest]] = []
+    uncoalescable: list[tuple[int, ByteRequest | None]] = []
+    for position, request in enumerate(byte_ranges):
+        if isinstance(request, RangeByteRequest):
+            ranged.append((position, request))
+        else:
+            uncoalescable.append((position, request))
+
+    # Walk the ranges in ascending start order. The extent of the open group
+    # is kept in two running variables so each step is O(1).
+    groups: list[list[tuple[int, RangeByteRequest]]] = []
+    span_start = 0
+    span_end = 0
+    for entry in sorted(ranged, key=lambda item: item[1].start):
+        current = entry[1]
+        if groups:
+            merged_end = max(span_end, current.end)
+            close_enough = current.start - span_end <= max_gap_bytes
+            small_enough = merged_end - span_start <= max_coalesced_bytes
+            if close_enough and small_enough:
+                groups[-1].append(entry)
+                span_end = merged_end
+                continue
+        # Either this is the first range or it cannot join: open a new group.
+        groups.append([entry])
+        span_start, span_end = current.start, current.end
+
+    return groups, uncoalescable
+
+
+async def coalesced_get(
+    fetch: Callable[[ByteRequest | None], Awaitable[Buffer | None]],
+    byte_ranges: Sequence[ByteRequest | None],
+    *,
+    max_concurrency: int,
+    max_gap_bytes: int,
+    max_coalesced_bytes: int,
+) -> AsyncGenerator[Sequence[tuple[int, Buffer | None]]]:
+    """Read many byte ranges through `fetch` with coalescing and concurrency.
+
+    Nearby ranges are merged into a single underlying I/O, and merged fetches
+    are run concurrently. Each yield corresponds to exactly one underlying I/O
+    operation: a sequence of `(input_index, result)` tuples for all input
+    ranges served by that I/O. Tuples within a yielded sequence are ordered by
+    start offset. Yields across groups are in completion order, not input
+    order.
+
+    All tuning knobs are required keyword arguments. `Store.get_ranges` is
+    the public entry point and owns the canonical default values; this
+    function takes them explicitly to avoid duplicating policy.
+
+    Parameters
+    ----------
+    fetch
+        Callable that reads one byte range and returns a `Buffer` (or `None`
+        if the underlying key does not exist). Typically constructed via
+        `functools.partial(store.get, key, prototype)`.
+    byte_ranges
+        Input ranges. `None` means "the whole value".
+    max_concurrency
+        Maximum number of merged fetches in flight at once. Must be at
+        least 1.
+    max_gap_bytes
+        Forwarded to `coalesce_ranges`.
+    max_coalesced_bytes
+        Forwarded to `coalesce_ranges`.
+
+    Yields
+    ------
+    Sequence[tuple[int, Buffer | None]]
+        Per-I/O batch of `(input_index, result)` tuples.
+
+    Raises
+    ------
+    ValueError
+        If `max_concurrency` is less than 1. Because this is an async
+        generator, the error is raised when iteration starts, not when the
+        generator object is created.
+    BaseExceptionGroup
+        Failures from underlying fetches (see Notes).
+
+    Notes
+    -----
+    - Only `RangeByteRequest` inputs are coalesced. `OffsetByteRequest`,
+      `SuffixByteRequest`, and `None` are each treated as uncoalescable
+      (one fetch, one single-tuple yield per input).
+    - Failures from underlying fetches surface as a `BaseExceptionGroup`
+      (PEP 654). Inner exceptions include `FileNotFoundError` if a fetch
+      returns `None`, plus any exception `fetch` raises. Pending fetches are
+      cancelled as soon as one task fails, so the group typically contains a
+      single non-`CancelledError` exception even under high concurrency.
+    - Groups completed before the failure remain observable on the yields
+      preceding the raise.
+    - `GeneratorExit` raised by `aclose()` is filtered out so the iterator
+      closes cleanly; callers don't see a group containing only it.
+    """
+    # asyncio.Semaphore accepts 0, but then no worker could ever acquire it
+    # and iteration would hang forever. Reject it (and negatives) up front,
+    # regardless of whether there is any work to do.
+    if max_concurrency < 1:
+        raise ValueError(f"max_concurrency must be at least 1, got {max_concurrency}")
+
+    if not byte_ranges:
+        return
+
+    groups, singles = coalesce_ranges(
+        byte_ranges,
+        max_gap_bytes=max_gap_bytes,
+        max_coalesced_bytes=max_coalesced_bytes,
+    )
+
+    ctx = _WorkerCtx(fetch=fetch, semaphore=asyncio.Semaphore(max_concurrency))
+
+    # Launch all work as tasks. The semaphore bounds actual I/O concurrency.
+    # TaskGroup wraps task exceptions in BaseExceptionGroup; we propagate the
+    # group unchanged as part of the public contract (callers handle batch
+    # failures via `except*` / PEP 654). GeneratorExit (raised when the
+    # consumer calls aclose()) is filtered out so close completes cleanly.
+    try:
+        async with asyncio.TaskGroup() as tg:
+            tasks = [
+                *(tg.create_task(_fetch_group(ctx, group)) for group in groups),
+                *(tg.create_task(_fetch_single(ctx, i, single)) for i, single in singles),
+            ]
+
+            # Hand each batch to the consumer as soon as its I/O finishes.
+            for fut in asyncio.as_completed(tasks):
+                yield await fut
+    except BaseExceptionGroup as eg:
+        # Strip GeneratorExits (consumer aclose()) and propagate whatever remains.
+        _, other_errors = eg.split(GeneratorExit)
+
+        if other_errors is not None:
+            raise other_errors from None
@@ -3,7 +3,7 @@
 from typing import TYPE_CHECKING, cast
 
 if TYPE_CHECKING:
-    from collections.abc import AsyncGenerator, AsyncIterator, Iterable
+    from collections.abc import AsyncGenerator, AsyncIterator, Iterable, Sequence
     from types import TracebackType
     from typing import Any, Self
 
@@ -103,6 +103,32 @@ async def get_partial_values(
     ) -> list[Buffer | None]:
         return await self._store.get_partial_values(prototype, key_ranges)
 
+    async def get_ranges(
+        self,
+        key: str,
+        byte_ranges: Sequence[ByteRequest | None],
+        *,
+        prototype: BufferPrototype,
+        max_concurrency: int | None = None,
+        max_gap_bytes: int | None = None,
+        max_coalesced_bytes: int | None = None,
+    ) -> AsyncIterator[Sequence[tuple[int, Buffer | None]]]:
+        """Delegate `get_ranges` to the wrapped store and re-yield its batches.
+
+        This wrapper declares no defaults of its own for the coalescing
+        knobs. A knob left as `None` is not forwarded, so the wrapped store's
+        default applies; any other value is forwarded as a keyword argument.
+        """
+        requested = {
+            "max_concurrency": max_concurrency,
+            "max_gap_bytes": max_gap_bytes,
+            "max_coalesced_bytes": max_coalesced_bytes,
+        }
+        # Forward only the knobs the caller actually set.
+        overrides: dict[str, int] = {
+            name: value for name, value in requested.items() if value is not None
+        }
+        inner = self._store.get_ranges(key, byte_ranges, prototype=prototype, **overrides)
+        async for batch in inner:
+            yield batch
+
+
     async def exists(self, key: str) -> bool:
         return await self._store.exists(key)
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Add `zarr.abc.store.Store.get_ranges` for concurrent, coalesced multi-range reads from a single key. The method is defined on the `Store` ABC with a default implementation built on `Store.get`, so every store inherits a working version; stores with native multi-range backends (e.g. `FsspecStore`) can override for efficiency. Coalescing knobs (`max_concurrency`, `max_gap_bytes`, `max_coalesced_bytes`) are passed as keyword arguments to `get_ranges`. Failures from underlying fetches surface as a `BaseExceptionGroup` (PEP 654); callers should use `except*` to filter for specific exception types such as `FileNotFoundError`.