Add concurrent chunk fetching for external array links

bendichter · claude · bendichter · commit d50a5376193f · 2026-03-14T09:24:41.000Z
Route remote external array links through zarr + LindiH5ZarrStore instead
of h5py + LindiRemfile. This enables concurrent HTTP range requests via
LindiH5ZarrStore.getitems(), which zarr calls when reading multiple chunks.

The getitems() method separates serial metadata lookup (fast, uses h5py's
B-tree cache) from parallel data fetches (concurrent HTTP requests via
ThreadPoolExecutor, up to 8 at a time by default, instead of N serial ones).

Local external array links still use h5py directly since there's no
concurrency benefit for local I/O.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
@@ -1,9 +1,12 @@
 import json
 import base64
+import time
 from typing import Tuple, Union, List, IO, Any, Dict, Callable
+from concurrent.futures import ThreadPoolExecutor, as_completed
 import numpy as np
 import zarr
 from zarr.storage import Store, MemoryStore
+import requests
 import h5py
 from tqdm import tqdm
 from ._util import (
@@ -150,7 +153,8 @@ def __init__(
         _opts: LindiH5ZarrStoreOpts,
         _url: Union[str, None] = None,
         _entities_to_close: List[Any],
-        _local_cache: Union[LocalCache, None] = None
+        _local_cache: Union[LocalCache, None] = None,
+        _concurrent_max_workers: int = 8
     ):
         """
         Do not call the constructor directly. Instead, use the from_file class
@@ -161,6 +165,7 @@ def __init__(
         self._url = _url
         self._opts = _opts
         self._local_cache = _local_cache
+        self._concurrent_max_workers = _concurrent_max_workers
         self._entities_to_close = _entities_to_close + [self._h5f]
 
         # Some datasets do not correspond to traditional chunked datasets. For
@@ -325,6 +330,97 @@ def __contains__(self, key):
                     return False
             return True
 
+    def getitems(self, keys, *, contexts=None):
+        """Return a dict mapping each resolvable key in `keys` to its bytes.
+
+        Keys that cannot be resolved are omitted. Byte ranges are looked up one
+        key at a time; uncached chunks of an http(s) file are then downloaded,
+        concurrently when there are several. `contexts` is accepted but unused.
+        """
+        url = self._url
+        is_http = url is not None and url.startswith(('http://', 'https://'))
+        use_cache = self._local_cache is not None and url is not None
+        found = {}
+        pending = []  # (key, byte_offset, byte_count) still to be downloaded
+
+        for key in keys:
+            segments = [s for s in key.split("/") if s]
+            if not segments:
+                continue
+            *parent_segments, leaf = segments
+            if leaf in ('.zattrs', '.zgroup', '.zarray'):
+                # Metadata goes through __getitem__; absent keys are omitted
+                try:
+                    found[key] = self[key]
+                except KeyError:
+                    pass
+                continue
+
+            # Anything else is treated as a chunk key
+            parent = "/".join(parent_segments)
+            try:
+                start, nbytes, inline = self._get_chunk_file_bytes_data(parent, leaf)
+            except Exception:
+                continue
+
+            if inline is not None:
+                found[key] = inline
+                continue
+
+            if use_cache:
+                hit = self._local_cache.get_remote_chunk(url=url, offset=start, size=nbytes)
+                if hit is not None:
+                    found[key] = hit
+                    continue
+
+            if is_http:
+                pending.append((key, start, nbytes))
+                continue
+            # Not http(s): read the byte range straight from the open file
+            data = _read_bytes(self._file, start, nbytes)
+            self._try_cache_put(start, nbytes, data)
+            found[key] = data
+
+        if pending:
+            # Pre-resolve the URL once for DANDI auth
+            from ..LindiRemfile.LindiRemfile import _resolve_url
+            resolved_url = _resolve_url(url)
+            if len(pending) == 1:
+                # A lone chunk is fetched inline, skipping thread pool overhead
+                key, start, nbytes = pending[0]
+                data = _fetch_bytes_direct(resolved_url, start, nbytes)
+                self._try_cache_put(start, nbytes, data)
+                found[key] = data
+            else:
+                n_workers = min(len(pending), self._concurrent_max_workers)
+                with ThreadPoolExecutor(max_workers=n_workers) as pool:
+                    jobs = {}
+                    for item in pending:
+                        jobs[pool.submit(_fetch_bytes_direct, resolved_url, item[1], item[2])] = item
+                    for job in as_completed(jobs):
+                        key, start, nbytes = jobs[job]
+                        data = job.result()
+                        self._try_cache_put(start, nbytes, data)
+                        found[key] = data
+
+        return self._apply_padding_to_results(found)
+
+    def _try_cache_put(self, byte_offset, byte_count, data):
+        """Best-effort cache write; a no-op unless both a local cache and a URL are set."""
+        if not (self._local_cache is None or self._url is None):
+            try:
+                self._local_cache.put_remote_chunk(data=data, url=self._url, offset=byte_offset, size=byte_count)
+            except ChunkTooLargeError:
+                pass  # a chunk the cache rejects as too large is simply not cached
+
+    def _apply_padding_to_results(self, results):
+        """Pad, in place, each value for which _get_padded_size gives a size; return `results`."""
+        for key, val in list(results.items()):
+            padded_size = _get_padded_size(self, key, val)
+            if padded_size is not None:
+                results[key] = _pad_chunk(val, padded_size)
+        return results
+
     def __delitem__(self, key):
         raise Exception("Deleting items is not allowed")
 
@@ -889,3 +985,23 @@ def chunk_fname(self):
     @property
     def chunk_bytes(self):
         return self._chunk_bytes
+
+
+def _fetch_bytes_direct(resolved_url: str, offset: int, length: int) -> bytes:
+    """Fetch bytes from a resolved URL via HTTP range request, with retries. Thread-safe."""
+    num_retries = 8
+    headers = {  # identical for every attempt, so built once outside the retry loop
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
+        "Range": f"bytes={offset}-{offset + length - 1}",
+    }
+    for try_num in range(num_retries):
+        try:
+            # (connect, read) timeouts: without them a stalled server blocks this call forever
+            response = requests.get(resolved_url, headers=headers, timeout=(10, 60))
+            response.raise_for_status()
+            return response.content
+        except Exception:
+            if try_num == num_retries - 1:
+                raise
+            time.sleep(0.1 * 2 ** try_num)  # exponential backoff: 0.1s, 0.2s, 0.4s, ...
+    assert False, "unreachable"  # loop always returns or raises
diff --git a/lindi/LindiH5pyFile/LindiH5pyDataset.py b/lindi/LindiH5pyFile/LindiH5pyDataset.py
@@ -12,6 +12,7 @@
 
 if TYPE_CHECKING:
     from .LindiH5pyFile import LindiH5pyFile  # pragma: no cover
+    from ..LindiH5ZarrStore.LindiH5ZarrStore import LindiH5ZarrStore  # pragma: no cover
 
 
 # This is a global list of external hdf5 clients, which are used by
@@ -20,6 +21,11 @@
 # TODO: figure out how to close these clients
 _external_hdf5_clients: Dict[str, h5py.File] = {}
 
+# Cache of LindiH5ZarrStore instances for remote external array links,
+# keyed by URL. Similar to _external_hdf5_clients.
+# TODO: figure out how to close these stores (same issue as _external_hdf5_clients)
+_external_zarr_stores: Dict[str, "LindiH5ZarrStore"] = {}
+
 
 class LindiH5pyDataset(h5py.Dataset):
     def __init__(self, _zarr_array: zarr.Array, _file: "LindiH5pyFile"):
@@ -203,10 +209,17 @@ def _get_item_for_zarr(self, zarr_array: zarr.Array, selection: Any):
                 url = external_array_link.get("url", None)
                 name = external_array_link.get("name", None)
                 if url is not None and name is not None:
-                    client = self._get_external_hdf5_client(url)
-                    dataset = client[name]
-                    assert isinstance(dataset, h5py.Dataset)
-                    return dataset[selection]
+                    is_remote = url.startswith("http://") or url.startswith("https://")
+                    if is_remote:
+                        # Use zarr + LindiH5ZarrStore for concurrent chunk fetching
+                        ext_zarr_array = self._get_external_zarr_array(url, name)
+                        return ext_zarr_array[selection]
+                    else:
+                        # Local files — use h5py directly (no concurrency benefit)
+                        client = self._get_external_hdf5_client(url)
+                        dataset = client[name]
+                        assert isinstance(dataset, h5py.Dataset)
+                        return dataset[selection]
         if self._compound_dtype is not None:
             # Compound dtype
             # In this case we index into the compound dtype using the name of the field
@@ -252,6 +265,21 @@ def _get_external_hdf5_client(self, url: str) -> h5py.File:
             _external_hdf5_clients[url] = h5py.File(ff, "r")
         return _external_hdf5_clients[url]
 
+    def _get_external_zarr_array(self, url: str, name: str) -> zarr.Array:
+        """Open dataset `name` of the file at `url` as a read-only zarr array, with one store cached per URL."""
+        from ..LindiH5ZarrStore.LindiH5ZarrStore import LindiH5ZarrStore
+        from ..LindiH5ZarrStore.LindiH5ZarrStoreOpts import LindiH5ZarrStoreOpts
+
+        store = _external_zarr_stores.get(url)
+        if store is None:
+            # num_dataset_chunks_threshold=None disables external array links,
+            # so every chunk is served through the zarr store
+            opts = LindiH5ZarrStoreOpts(num_dataset_chunks_threshold=None)
+            store = LindiH5ZarrStore.from_file(url, opts=opts, local_cache=self._file._local_cache)
+            _external_zarr_stores[url] = store
+
+        return zarr.open_array(store=store, path=name, mode='r')
+
     @property
     def ref(self):
         if self._readonly:
diff --git a/tests/test_concurrent_external_link.py b/tests/test_concurrent_external_link.py
@@ -0,0 +1,140 @@
+import tempfile
+import numpy as np
+import h5py
+import zarr
+import lindi
+from lindi.LindiH5ZarrStore.LindiH5ZarrStore import LindiH5ZarrStore
+from lindi.LindiH5ZarrStore.LindiH5ZarrStoreOpts import LindiH5ZarrStoreOpts
+
+
+def test_getitems_local_chunks():
+    """Exercise getitems against a local chunked dataset.
+
+    Covers chunk keys, metadata keys, and a key under a missing dataset.
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        h5_path = f"{tmpdir}/test.h5"
+        expected = np.random.randn(100, 10)
+        with h5py.File(h5_path, "w") as h5:
+            h5.create_dataset("dataset1", data=expected, chunks=(20, 10))
+
+        # num_dataset_chunks_threshold=None so chunks are served through the store
+        store_opts = LindiH5ZarrStoreOpts(num_dataset_chunks_threshold=None)
+        with LindiH5ZarrStore.from_file(h5_path, url=h5_path, opts=store_opts) as store:
+            # Sanity check: a plain zarr read returns the written data
+            zarr_arr = zarr.open_array(store=store, path="dataset1", mode="r")
+            np.testing.assert_array_equal(zarr_arr[:], expected)
+
+            # Chunk keys: all three requested chunks come back, and nothing else
+            chunk_keys = ["dataset1/0.0", "dataset1/1.0", "dataset1/2.0"]
+            chunk_results = store.getitems(chunk_keys)
+            assert len(chunk_results) == 3
+            assert all(k in chunk_results for k in chunk_keys)
+
+            # Metadata keys: both entries come back
+            meta_keys = ["dataset1/.zarray", "dataset1/.zattrs"]
+            meta_results = store.getitems(meta_keys)
+            assert len(meta_results) == 2
+            assert all(k in meta_results for k in meta_keys)
+
+            # A key under a non-existent dataset is skipped rather than raising
+            mixed_results = store.getitems(["dataset1/0.0", "nonexistent/0.0"])
+            assert "dataset1/0.0" in mixed_results
+            assert "nonexistent/0.0" not in mixed_results
+
+
+def test_getitems_inline_data():
+    """getitems on a small unchunked dataset, which is expected to be served inline."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        h5_path = f"{tmpdir}/test.h5"
+        small = np.array([1, 2, 3], dtype=np.float64)
+        with h5py.File(h5_path, "w") as h5:
+            h5.create_dataset("small", data=small)
+
+        store_opts = LindiH5ZarrStoreOpts(num_dataset_chunks_threshold=None)
+        with LindiH5ZarrStore.from_file(h5_path, url=h5_path, opts=store_opts) as store:
+            # NOTE(review): presumably this takes the inline-data branch of getitems,
+            # but only the number of returned items is checked here
+            fetched = store.getitems(["small/0"])
+            assert len(fetched) == 1
+
+
+def test_getitems_single_chunk_shortcut():
+    """Test getitems with one chunk key; the file is local, so no HTTP fetch is exercised."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        filename = f"{tmpdir}/test.h5"
+        X = np.random.randn(1000)
+        with h5py.File(filename, "w") as f:
+            f.create_dataset("data", data=X, chunks=(1000,))
+
+        opts = LindiH5ZarrStoreOpts(num_dataset_chunks_threshold=None)
+        with LindiH5ZarrStore.from_file(filename, url=filename, opts=opts) as store:
+            # url is a filesystem path, not http(s): the remote single-chunk path is not reached
+            results = store.getitems(["data/0"])
+            assert "data/0" in results
+
+
+def test_external_array_link_via_zarr_store():
+    """Round-trip a dataset through an external array link that points at a local file."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        h5_path = f"{tmpdir}/test.h5"
+        expected = np.random.randn(50, 12)
+        with h5py.File(h5_path, "w") as h5:
+            h5.create_dataset("dataset1", data=expected, chunks=(10, 6))
+
+        # 10 chunks exceed the threshold of 4, so the reference file system is
+        # expected to hold an external array link instead of per-chunk references
+        link_opts = LindiH5ZarrStoreOpts(num_dataset_chunks_threshold=4)
+        with LindiH5ZarrStore.from_file(
+            h5_path, url=h5_path, opts=link_opts
+        ) as store:
+            rfs = store.to_reference_file_system()
+
+        # Read back through LindiH5pyFile; a local link is read with h5py directly
+        lindi_file = lindi.LindiH5pyFile.from_reference_file_system(rfs)
+        actual = lindi_file["dataset1"][:]
+        np.testing.assert_array_equal(expected, actual)
+
+
+def test_zarr_store_for_external_array():
+    """Read a dataset through zarr from a store opened with
+    num_dataset_chunks_threshold=None, the pattern _get_external_zarr_array
+    uses for remote external array links (here against a local file)."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        h5_path = f"{tmpdir}/test.h5"
+        expected = np.random.randn(200, 10)
+        with h5py.File(h5_path, "w") as h5:
+            h5.create_dataset("dataset1", data=expected, chunks=(20, 10))
+
+        store_opts = LindiH5ZarrStoreOpts(num_dataset_chunks_threshold=None)
+        with LindiH5ZarrStore.from_file(h5_path, opts=store_opts, url=h5_path) as store:
+            zarr_arr = zarr.open_array(store=store, path="dataset1", mode="r")
+
+            # Full read
+            np.testing.assert_array_equal(zarr_arr[:], expected)
+
+            # Partial read whose rows 10:30 span two 20-row chunks
+            np.testing.assert_array_equal(zarr_arr[10:30, 3:7], expected[10:30, 3:7])
+
+
+def test_getitems_empty_keys():
+    """getitems with an empty key list returns an empty dict."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        h5_path = f"{tmpdir}/test.h5"
+        with h5py.File(h5_path, "w") as h5:
+            h5.create_dataset("data", data=np.array([1, 2, 3]))
+
+        store_opts = LindiH5ZarrStoreOpts(num_dataset_chunks_threshold=None)
+        with LindiH5ZarrStore.from_file(h5_path, url=h5_path, opts=store_opts) as store:
+            # No keys requested, so nothing should come back
+            assert store.getitems([]) == {}
+
+
+if __name__ == "__main__":
+    for _run in (
+        test_getitems_local_chunks, test_getitems_inline_data,
+        test_getitems_single_chunk_shortcut, test_external_array_link_via_zarr_store,
+        test_zarr_store_for_external_array, test_getitems_empty_keys,
+    ):
+        _run()
+    print("All tests passed!")