From aa93b8b0a5017b55c986ffbbaff3cbf7579df695 Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Thu, 26 Feb 2026 11:30:42 -0700
Subject: [PATCH 01/22] updates zarr-parser to use obstore list_async instead
 of concurrent_map

---
 virtualizarr/parsers/zarr.py                 | 88 ++++++++++----------
 virtualizarr/tests/test_parsers/test_zarr.py | 64 +++++++-------
 2 files changed, 73 insertions(+), 79 deletions(-)

diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py
index 8bb7cddd..38104ae2 100644
--- a/virtualizarr/parsers/zarr.py
+++ b/virtualizarr/parsers/zarr.py
@@ -6,6 +6,7 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, cast
 
+import numpy as np
 import zarr
 from obspec_utils.registry import ObjectStoreRegistry
 from zarr.api.asynchronous import open_group as open_group_async
@@ -19,8 +20,10 @@
     ManifestGroup,
     ManifestStore,
 )
-from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
-from virtualizarr.vendor.zarr.core.common import _concurrent_map
+from virtualizarr.manifests.manifest import (
+    parse_manifest_index,
+    validate_and_normalize_path_to_uri,
+)
 
 if TYPE_CHECKING:
     import zarr
@@ -90,33 +93,43 @@ async def _handle_scalar_array(
 
 
 async def _build_chunk_mapping(
-    chunk_keys: list[str], zarr_array: ZarrArrayType, path: str, prefix: str
+    zarr_array: ZarrArrayType, path: str, prefix: str
 ) -> dict[str, dict[str, Any]]:
     """
-    Build chunk mapping from a list of chunk keys.
+    Build chunk mapping by listing the object store with obstore.
+
+    Uses obstore's list_async with Arrow output to get chunk paths and sizes
+    in a single Rust-level call, avoiding per-chunk getsize calls.
 
     Parameters
     ----------
-    chunk_keys
-        List of storage keys for chunks.
     zarr_array
         The Zarr array.
     path
         Base path for constructing chunk paths.
     prefix
-        Prefix to strip from chunk keys.
+        Prefix to list and strip from chunk keys.
 
     Returns
     -------
     dict
         Mapping of normalized chunk coordinates to storage locations.
     """
+
+    size_map: dict[str, int] = {}
+    stream = zarr_array.store.store.list_async(prefix=prefix, return_arrow=True)
+    async for batch in stream:
+        size_map.update(
+            zip(batch.column("path").to_pylist(), batch.column("size").to_pylist())
+        )
+
+    # filter out metadata files
+    chunk_keys = [k for k in size_map if not k.split("/")[-1].startswith(".")]
+
     if not chunk_keys:
         return {}
 
-    lengths = await _concurrent_map(
-        [(k,) for k in chunk_keys], zarr_array.store.getsize
-    )
+    lengths = [size_map[k] for k in chunk_keys]
     dict_keys = _normalize_chunk_keys(chunk_keys, prefix)
     paths = [join_url(path, k) for k in chunk_keys]
     offsets = [0] * len(lengths)
@@ -158,24 +171,7 @@ async def get_chunk_mapping(
             scalar_key = f"{prefix}0"
             return await _handle_scalar_array(zarr_array, path, scalar_key)
 
-        # List all keys under the array prefix, filtering out metadata files
-        prefix_keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)]
-        if not prefix_keys:
-            return {}
-
-        metadata_files = {".zarray", ".zattrs", ".zgroup", ".zmetadata"}
-        chunk_keys = []
-        for key_tuple in prefix_keys:
-            key = key_tuple[0]
-            file_name = (
-                key[len(prefix) :]
-                if prefix and key.startswith(prefix)
-                else key.split("/")[-1]
-            )
-            if file_name not in metadata_files:
-                chunk_keys.append(key)
-
-        return await _build_chunk_mapping(chunk_keys, zarr_array, path, prefix)
+        return await _build_chunk_mapping(zarr_array, path, prefix)
 
     def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata:
         """Convert V2 metadata to V3 format."""
@@ -272,12 +268,7 @@ async def get_chunk_mapping(
 
         # List chunk keys under the c/ subdirectory
         prefix = f"{name}/c/" if name else "c/"
-        prefix_keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)]
-        if not prefix_keys:
-            return {}
-
-        chunk_keys = [x[0] for x in prefix_keys]
-        return await _build_chunk_mapping(chunk_keys, zarr_array, path, prefix)
+        return await _build_chunk_mapping(zarr_array, path, prefix)
 
     def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata:
         """Return V3 metadata as-is (no conversion needed)."""
@@ -322,17 +313,28 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
     """
     strategy = get_strategy(zarr_array)
     chunk_map = await strategy.get_chunk_mapping(zarr_array, path)
+    chunk_grid_shape = zarr_array._chunk_grid_shape
 
     if not chunk_map:
-        import math
-
-        if zarr_array.shape and zarr_array.chunks:
-            chunk_grid_shape = tuple(
-                math.ceil(s / c) for s, c in zip(zarr_array.shape, zarr_array.chunks)
-            )
-            return ChunkManifest(chunk_map, shape=chunk_grid_shape)
-
-    return ChunkManifest(chunk_map)
+        return ChunkManifest(chunk_map, shape=chunk_grid_shape)
+
+    # Pre-allocate N-D numpy arrays shaped like the chunk grid.
+    # Empty string paths indicate missing chunks (sparse arrays).
+    paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType())
+    offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
+    lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
+
+    for key, entry in chunk_map.items():
+        idx = parse_manifest_index(key)
+        paths_arr[idx] = entry["path"]
+        offsets_arr[idx] = entry["offset"]
+        lengths_arr[idx] = entry["length"]
+
+    return ChunkManifest.from_arrays(
+        paths=paths_arr,
+        offsets=offsets_arr,
+        lengths=lengths_arr,
+    )
 
 
 def get_metadata(zarr_array: ZarrArrayType) -> ArrayV3Metadata:
diff --git a/virtualizarr/tests/test_parsers/test_zarr.py b/virtualizarr/tests/test_parsers/test_zarr.py
index 27f8537e..20d46dbe 100644
--- a/virtualizarr/tests/test_parsers/test_zarr.py
+++ b/virtualizarr/tests/test_parsers/test_zarr.py
@@ -177,7 +177,9 @@ def test_empty_array_chunk_mapping(tmpdir, zarr_format):
     """Test chunk mapping for arrays with no chunks written yet."""
     import asyncio
 
-    # Create an array but don't write any data
+    from obstore.store import LocalStore as ObsLocalStore
+    from zarr.storage import ObjectStore
+
     filepath = f"{tmpdir}/empty.zarr"
     zarr.create(
         shape=(10, 10),
@@ -188,12 +190,13 @@ def test_empty_array_chunk_mapping(tmpdir, zarr_format):
     )
 
     async def get_chunk_map():
-        zarr_array = await open_array(store=filepath, mode="r")
+        obs_store = ObsLocalStore(prefix=filepath)
+        zarr_store = ObjectStore(store=obs_store)
+        zarr_array = await open_array(store=zarr_store, mode="r")
         strategy = get_strategy(zarr_array)
         return await strategy.get_chunk_mapping(zarr_array, filepath)
 
     chunk_map = asyncio.run(get_chunk_map())
-    # Empty arrays should return empty chunk map
     assert chunk_map == {}
 
 
@@ -306,34 +309,31 @@ def test_build_chunk_manifest_empty_with_shape():
     """Test build_chunk_manifest when chunk_map is empty but array has shape and chunks."""
     import asyncio
 
-    # Create an array but don't write data
-    store = zarr.storage.MemoryStore()
-    zarr.create(shape=(10, 10), chunks=(5, 5), dtype="int8", store=store, zarr_format=3)
+    from obstore.store import MemoryStore as ObsMemoryStore
+    from zarr.storage import ObjectStore
+
+    obs_store = ObsMemoryStore()
+    zarr_store = ObjectStore(store=obs_store)
+    zarr.create(
+        shape=(10, 10), chunks=(5, 5), dtype="int8", store=zarr_store, zarr_format=3
+    )
 
     async def get_manifest():
-        zarr_array = await open_array(store=store, mode="r")
+        zarr_array = await open_array(store=zarr_store, mode="r")
         return await build_chunk_manifest(zarr_array, "test://path")
 
     manifest = asyncio.run(get_manifest())
-    # Should create manifest with proper chunk grid shape even if empty
-    assert manifest.shape_chunk_grid == (2, 2)  # 10/5 = 2 chunks per dimension
+    assert manifest.shape_chunk_grid == (2, 2)
 
 
 @zarr_versions()
 def test_sparse_array_with_missing_chunks(tmpdir, zarr_format):
-    """Test that arrays with some missing chunks (sparse arrays) are handled correctly.
-
-    This test verifies that VirtualiZarr correctly handles the case where some chunks
-    exist but others are missing. Zarr allows this for sparse data, and when chunks
-    are missing, Zarr returns the fill_value for those regions. VirtualiZarr should
-    preserve this sparsity in the manifest rather than generating entries for all
-    possible chunks based on the chunk grid.
-    """
+    """Test that arrays with some missing chunks (sparse arrays) are handled correctly."""
     import asyncio
 
-    from virtualizarr.parsers.zarr import build_chunk_manifest
+    from obstore.store import LocalStore as ObsLocalStore
+    from zarr.storage import ObjectStore
 
-    # Create a zarr array with a 3x3 chunk grid (9 possible chunks)
     filepath = f"{tmpdir}/sparse.zarr"
     arr = zarr.create(
         shape=(30, 30),
@@ -344,36 +344,28 @@ def test_sparse_array_with_missing_chunks(tmpdir, zarr_format):
         fill_value=np.nan,
     )
 
-    # Only write data to some chunks, leaving others missing (sparse)
-    # Write to chunks (0,0), (1,1), and (2,2) - a diagonal pattern
     arr[0:10, 0:10] = 1.0  # chunk 0.0
     arr[10:20, 10:20] = 2.0  # chunk 1.1
     arr[20:30, 20:30] = 3.0  # chunk 2.2
-    # Chunks (0,1), (0,2), (1,0), (1,2), (2,0), (2,1) are intentionally left unwritten
 
     async def get_manifest():
-        zarr_array = await open_array(store=filepath, mode="r")
+        obs_store = ObsLocalStore(prefix=filepath)
+        zarr_store = ObjectStore(store=obs_store)
+        zarr_array = await open_array(store=zarr_store, mode="r")
         return await build_chunk_manifest(zarr_array, filepath)
 
     manifest = asyncio.run(get_manifest())
 
-    # The manifest should only contain the 3 chunks we actually wrote
-    assert len(manifest.dict()) == 3, f"Expected 3 chunks, got {len(manifest.dict())}"
-
-    # Verify the expected chunks are present
-    assert "0.0" in manifest.dict(), "Chunk 0.0 should be present"
-    assert "1.1" in manifest.dict(), "Chunk 1.1 should be present"
-    assert "2.2" in manifest.dict(), "Chunk 2.2 should be present"
+    assert len(manifest.dict()) == 3
+    assert "0.0" in manifest.dict()
+    assert "1.1" in manifest.dict()
+    assert "2.2" in manifest.dict()
 
-    # Verify missing chunks are not in the manifest
     missing_chunks = ["0.1", "0.2", "1.0", "1.2", "2.0", "2.1"]
     for chunk_key in missing_chunks:
-        assert chunk_key not in manifest.dict(), (
-            f"Chunk {chunk_key} should not be present (it's missing/sparse)"
-        )
+        assert chunk_key not in manifest.dict()
 
-    # The chunk grid shape should still reflect the full array dimensions
-    assert manifest.shape_chunk_grid == (3, 3), "Chunk grid should be 3x3"
+    assert manifest.shape_chunk_grid == (3, 3)
 
 
 @zarr_versions()

From 37dff68bb17151e6e3673f4787b23f400e2fb2ee Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Thu, 26 Feb 2026 11:37:34 -0700
Subject: [PATCH 02/22] removes the zarr vendor code

---
 virtualizarr/vendor/__init__.py           |  0
 virtualizarr/vendor/zarr/__init__.py      |  0
 virtualizarr/vendor/zarr/core/__init__.py |  0
 virtualizarr/vendor/zarr/core/common.py   | 34 -----------------------
 4 files changed, 34 deletions(-)
 delete mode 100644 virtualizarr/vendor/__init__.py
 delete mode 100644 virtualizarr/vendor/zarr/__init__.py
 delete mode 100644 virtualizarr/vendor/zarr/core/__init__.py
 delete mode 100644 virtualizarr/vendor/zarr/core/common.py

diff --git a/virtualizarr/vendor/__init__.py b/virtualizarr/vendor/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/virtualizarr/vendor/zarr/__init__.py b/virtualizarr/vendor/zarr/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/virtualizarr/vendor/zarr/core/__init__.py b/virtualizarr/vendor/zarr/core/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/virtualizarr/vendor/zarr/core/common.py b/virtualizarr/vendor/zarr/core/common.py
deleted file mode 100644
index b6363db7..00000000
--- a/virtualizarr/vendor/zarr/core/common.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import asyncio
-from itertools import starmap
-from typing import (
-    Any,
-    Awaitable,
-    Callable,
-    Iterable,
-    TypeVar,
-)
-
-# Vendored directly from Zarr-python V3's private API
-# https://github.com/zarr-developers/zarr-python/blob/458299857141a5470ba3956d8a1607f52ac33857/src/zarr/core/common.py#L53
-T = TypeVar("T", bound=tuple[Any, ...])
-V = TypeVar("V")
-
-
-async def _concurrent_map(
-    items: Iterable[T],
-    func: Callable[..., Awaitable[V]],
-    limit: int | None = None,
-) -> list[V]:
-    if limit is None:
-        return await asyncio.gather(*list(starmap(func, items)))
-
-    else:
-        sem = asyncio.Semaphore(limit)
-
-        async def run(item: tuple[Any]) -> V:
-            async with sem:
-                return await func(*item)
-
-        return await asyncio.gather(
-            *[asyncio.ensure_future(run(item)) for item in items]
-        )

From 2fa25a769915844929dbbeb6c610d2fbd99522a3 Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Thu, 26 Feb 2026 11:48:41 -0700
Subject: [PATCH 03/22] adds arro3-core to zarr group

---
 pyproject.toml               | 5 +++++
 virtualizarr/parsers/zarr.py | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 66b6ce67..0806b986 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,6 +50,8 @@ hdf = [
     "imagecodecs-numcodecs==2024.6.1",
 ]
 
+zarr = ["arro3-core"]
+
 # kerchunk-based parsers
 netcdf3 = [
     "virtualizarr[remote]",
@@ -209,6 +211,9 @@ upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-d
 all = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "all_parsers", "all_writers", "py313"]
 docs = ["docs", "dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"]
 
+[tool.pixi.dependencies]
+pytest = "*"
+
 # Define commands to run within the docs environment
 [tool.pixi.feature.docs.tasks]
 serve-docs = { cmd = "mkdocs serve" }
diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py
index 38104ae2..5bf8a805 100644
--- a/virtualizarr/parsers/zarr.py
+++ b/virtualizarr/parsers/zarr.py
@@ -100,6 +100,7 @@ async def _build_chunk_mapping(
 
     Uses obstore's list_async with Arrow output to get chunk paths and sizes
     in a single Rust-level call, avoiding per-chunk getsize calls.
+    # https://github.com/zarr-developers/VirtualiZarr/issues/891
 
     Parameters
     ----------
@@ -330,6 +331,8 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
         offsets_arr[idx] = entry["offset"]
         lengths_arr[idx] = entry["length"]
 
+    # Construct the python ChunkManifest object's numpy arrays directly from the Arrow arrays, minimizing memory copies (i.e. the opposite of what I did in Pass manifests to icechunk as pyarrow arrays #861)
+    # TODO: I think we still have a arrow->dict->arrow conversion happening
     return ChunkManifest.from_arrays(
         paths=paths_arr,
         offsets=offsets_arr,

From 626d0b921c26c86260e7099bb18d5a37baa04509 Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Thu, 26 Feb 2026 18:06:09 -0700
Subject: [PATCH 04/22] adds _from_arrow method

---
 pyproject.toml                               |   2 +-
 virtualizarr/manifests/manifest.py           |  49 ++++++++
 virtualizarr/parsers/zarr.py                 | 111 ++++++++++++-------
 virtualizarr/tests/test_parsers/test_zarr.py |   8 +-
 4 files changed, 124 insertions(+), 46 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0806b986..0efe6908 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,7 @@ hdf = [
     "imagecodecs-numcodecs==2024.6.1",
 ]
 
-zarr = ["arro3-core"]
+zarr = ["arro3-core", "pyarrow"]
 
 # kerchunk-based parsers
 netcdf3 = [
diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index 250ccc60..d3206f04 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -11,6 +11,7 @@
 from typing import Any, NewType, TypedDict, cast
 
 import numpy as np
+import pyarrow as pa
 
 from virtualizarr.manifests.utils import construct_chunk_pattern, parse_manifest_index
 from virtualizarr.types import ChunkKey
@@ -322,6 +323,54 @@ def from_arrays(
 
         return obj
 
+    @classmethod
+    def _from_arrow(
+        cls,
+        *,
+        chunk_keys: "pa.Array",
+        paths: "pa.Array",
+        sizes: "pa.Array",
+        chunk_grid_shape: tuple[int, ...],
+    ) -> "ChunkManifest":
+        """
+        Create a ChunkManifest directly from PyArrow arrays.
+
+
+        Parameters
+        ----------
+        chunk_keys
+            Normalized dot-separated chunk keys e.g. "1.2", as a PyArrow Utf8 array.
+        paths
+            Full paths to chunks, as a PyArrow Utf8 array.
+        sizes
+            Chunk sizes in bytes, as a PyArrow UInt64 array.
+        chunk_grid_shape
+            Shape of the chunk grid, used to pre-allocate numpy arrays.
+        """
+        import pyarrow.compute as pc
+
+        paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType())
+        offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
+        lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
+
+        if len(chunk_keys) > 0:
+            # split "1.2" into  [1, 2], then reshape to (n_chunks, ndim), use for indexing
+            ndim = len(chunk_grid_shape)
+            indices_flat = pc.cast(
+                pc.split_pattern(chunk_keys, pattern=".").flatten(), pa.int64()
+            ).to_numpy()
+            indices_2d = indices_flat.reshape(-1, ndim)
+            idx_tuple = tuple(indices_2d[:, i] for i in range(ndim))
+
+            paths_arr[idx_tuple] = paths.to_numpy(zero_copy_only=False)
+            lengths_arr[idx_tuple] = sizes.to_numpy()
+
+        return cls.from_arrays(
+            paths=paths_arr,
+            offsets=offsets_arr,
+            lengths=lengths_arr,
+        )
+
     @property
     def ndim_chunk_grid(self) -> int:
         """
diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py
index 5bf8a805..39db18a8 100644
--- a/virtualizarr/parsers/zarr.py
+++ b/virtualizarr/parsers/zarr.py
@@ -26,6 +26,7 @@
 )
 
 if TYPE_CHECKING:
+    import pyarrow as pa
     import zarr
 
 ZarrArrayType = zarr.AsyncArray | zarr.Array
@@ -94,13 +95,12 @@ async def _handle_scalar_array(
 
 async def _build_chunk_mapping(
     zarr_array: ZarrArrayType, path: str, prefix: str
-) -> dict[str, dict[str, Any]]:
+) -> tuple["pa.Array", "pa.Array", "pa.Array"] | None:
     """
     Build chunk mapping by listing the object store with obstore.
 
     Uses obstore's list_async with Arrow output to get chunk paths and sizes
     in a single Rust-level call, avoiding per-chunk getsize calls.
-    # https://github.com/zarr-developers/VirtualiZarr/issues/891
 
     Parameters
     ----------
@@ -113,32 +113,48 @@ async def _build_chunk_mapping(
 
     Returns
     -------
-    dict
-        Mapping of normalized chunk coordinates to storage locations.
+    Tuple of (normalized_keys, full_paths, sizes) as PyArrow arrays, or None if no chunks found.
     """
+    import pyarrow as pa
+    import pyarrow.compute as pc
 
-    size_map: dict[str, int] = {}
+    path_batches = []
+    size_batches = []
     stream = zarr_array.store.store.list_async(prefix=prefix, return_arrow=True)
     async for batch in stream:
-        size_map.update(
-            zip(batch.column("path").to_pylist(), batch.column("size").to_pylist())
+        pa_path_col = pa.array(batch.column("path"))
+        not_metadata = pc.invert(
+            pc.or_(
+                pc.match_substring(pa_path_col, pattern="/."),
+                pc.starts_with(pa_path_col, "."),
+            )
         )
 
-    # filter out metadata files
-    chunk_keys = [k for k in size_map if not k.split("/")[-1].startswith(".")]
+        filtered_paths = pa_path_col.filter(not_metadata)
+        filtered_sizes = pa.array(batch.column("size")).filter(not_metadata)
+        path_batches.append(filtered_paths)
+        size_batches.append(filtered_sizes)
 
-    if not chunk_keys:
-        return {}
+    if not path_batches:
+        return None
 
-    lengths = [size_map[k] for k in chunk_keys]
-    dict_keys = _normalize_chunk_keys(chunk_keys, prefix)
-    paths = [join_url(path, k) for k in chunk_keys]
-    offsets = [0] * len(lengths)
+    all_paths = pa.concat_arrays(path_batches)
+    all_sizes = pa.concat_arrays(size_batches)
 
-    return {
-        key: {"path": p, "offset": offset, "length": length}
-        for key, p, offset, length in zip(dict_keys, paths, offsets, lengths)
-    }
+    if len(all_paths) == 0:
+        return None
+    # normalize: strip prefix, replace / with .
+    stripped = pc.utf8_replace_slice(
+        all_paths, start=0, stop=len(prefix), replacement=""
+    )
+    normalized_keys = pc.replace_substring(stripped, pattern="/", replacement=".")
+
+    # construct full paths
+    full_paths = pc.binary_join_element_wise(
+        pa.scalar(path.rstrip("/")), all_paths, "/"
+    )
+
+    return normalized_keys, full_paths, all_sizes
 
 
 class ZarrVersionStrategy(ABC):
@@ -313,30 +329,43 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
     missing, Zarr will return the fill_value for those regions when the array is read.
     """
     strategy = get_strategy(zarr_array)
-    chunk_map = await strategy.get_chunk_mapping(zarr_array, path)
     chunk_grid_shape = zarr_array._chunk_grid_shape
 
-    if not chunk_map:
-        return ChunkManifest(chunk_map, shape=chunk_grid_shape)
-
-    # Pre-allocate N-D numpy arrays shaped like the chunk grid.
-    # Empty string paths indicate missing chunks (sparse arrays).
-    paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType())
-    offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
-    lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
-
-    for key, entry in chunk_map.items():
-        idx = parse_manifest_index(key)
-        paths_arr[idx] = entry["path"]
-        offsets_arr[idx] = entry["offset"]
-        lengths_arr[idx] = entry["length"]
-
-    # Construct the python ChunkManifest object's numpy arrays directly from the Arrow arrays, minimizing memory copies (i.e. the opposite of what I did in Pass manifests to icechunk as pyarrow arrays #861)
-    # TODO: I think we still have a arrow->dict->arrow conversion happening
-    return ChunkManifest.from_arrays(
-        paths=paths_arr,
-        offsets=offsets_arr,
-        lengths=lengths_arr,
+    # For scalar arrays use _from_arrays
+    chunk_map = await strategy.get_chunk_mapping(zarr_array, path)
+    if zarr_array.shape == ():
+        if not chunk_map:
+            return ChunkManifest(chunk_map, shape=chunk_grid_shape)
+        paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType())
+        offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
+        lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
+        for key, entry in chunk_map.items():
+            idx = parse_manifest_index(key)
+            paths_arr[idx] = entry["path"]
+            offsets_arr[idx] = entry["offset"]
+            lengths_arr[idx] = entry["length"]
+        return ChunkManifest.from_arrays(
+            paths=paths_arr, offsets=offsets_arr, lengths=lengths_arr
+        )
+
+    # for non scalar arrays, use the new _from_arrow method
+    name = _get_array_name(zarr_array)
+    if zarr_array.metadata.zarr_format == 3:
+        prefix = f"{name}/c/" if name else "c/"
+    else:
+        prefix = f"{name}/" if name else ""
+
+    result = await _build_chunk_mapping(zarr_array, path, prefix)
+
+    if result is None:
+        return ChunkManifest({}, shape=chunk_grid_shape)
+
+    normalized_keys, full_paths, sizes = result
+    return ChunkManifest._from_arrow(
+        chunk_keys=normalized_keys,
+        paths=full_paths,
+        sizes=sizes,
+        chunk_grid_shape=chunk_grid_shape,
     )
 
 
diff --git a/virtualizarr/tests/test_parsers/test_zarr.py b/virtualizarr/tests/test_parsers/test_zarr.py
index 20d46dbe..991d6bde 100644
--- a/virtualizarr/tests/test_parsers/test_zarr.py
+++ b/virtualizarr/tests/test_parsers/test_zarr.py
@@ -193,11 +193,11 @@ async def get_chunk_map():
         obs_store = ObsLocalStore(prefix=filepath)
         zarr_store = ObjectStore(store=obs_store)
         zarr_array = await open_array(store=zarr_store, mode="r")
-        strategy = get_strategy(zarr_array)
-        return await strategy.get_chunk_mapping(zarr_array, filepath)
+        manifest = await build_chunk_manifest(zarr_array, filepath)
+        return manifest.dict()
 
-    chunk_map = asyncio.run(get_chunk_map())
-    assert chunk_map == {}
+    result = asyncio.run(get_chunk_map())
+    assert result == {}
 
 
 @SKIP_OLDER_ZARR_PYTHON

From 9d6a312dbff256991fbab8c5ed6437592fe5092b Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Thu, 26 Feb 2026 18:15:48 -0700
Subject: [PATCH 05/22] adds type_checking for pa type hint + import in
 _from_arrow

---
 virtualizarr/manifests/manifest.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index d3206f04..98b08199 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import re
 from collections.abc import (
     Callable,
@@ -8,7 +10,7 @@
     ValuesView,
 )
 from pathlib import PosixPath
-from typing import Any, NewType, TypedDict, cast
+from typing import TYPE_CHECKING, Any, NewType, TypedDict, cast
 
 import numpy as np
 import pyarrow as pa
@@ -16,6 +18,9 @@
 from virtualizarr.manifests.utils import construct_chunk_pattern, parse_manifest_index
 from virtualizarr.types import ChunkKey
 
+if TYPE_CHECKING:
+    import pyarrow as pa
+
 # doesn't guarantee that writers actually handle these
 VALID_URI_PREFIXES = {
     "s3://",
@@ -347,6 +352,7 @@ def _from_arrow(
         chunk_grid_shape
             Shape of the chunk grid, used to pre-allocate numpy arrays.
         """
+        import pyarrow as pa
         import pyarrow.compute as pc
 
         paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType())

From bab147d23cef9919719765384545955ace886b14 Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Thu, 26 Feb 2026 18:17:49 -0700
Subject: [PATCH 06/22] extra import removed

---
 virtualizarr/manifests/manifest.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index 98b08199..6c5d8140 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -13,7 +13,6 @@
 from typing import TYPE_CHECKING, Any, NewType, TypedDict, cast
 
 import numpy as np
-import pyarrow as pa
 
 from virtualizarr.manifests.utils import construct_chunk_pattern, parse_manifest_index
 from virtualizarr.types import ChunkKey

From 17e35cc1baf3466c6a194f8a47654916e939d192 Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Thu, 26 Feb 2026 18:22:47 -0700
Subject: [PATCH 07/22] adds zarr to test-py31* test group

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0efe6908..a700b28b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -203,8 +203,8 @@ run-tests-html-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov
 min-deps = ["dev", "test", "hdf", "hdf5-lib"] # VirtualiZarr/conftest.py using h5py, so the minimum set of dependencies for testing still includes hdf libs
 # Inherit from min-deps to get all the test commands, along with optional dependencies
 test = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"]
-test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py311"] # test against python 3.11
-test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py312"] # test against python 3.12
+test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "py311"] # test against python 3.11
+test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "py312"] # test against python 3.12
 minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "tiff", "py312", "minio"]
 minimum-versions = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "tiff", "hdf5-lib", "minimum-versions"]
 upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev", "py313"]

From 6cbb7c0dd1dc2746b1c17999846d6df876793eaa Mon Sep 17 00:00:00 2001
From: Raphael Hagen <norlandrhagen@gmail.com>
Date: Fri, 27 Feb 2026 09:23:53 -0700
Subject: [PATCH 08/22] Update virtualizarr/manifests/manifest.py

Co-authored-by: Tom Nicholas <tom@earthmover.io>
---
 virtualizarr/manifests/manifest.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index 6c5d8140..4009bc8e 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -331,10 +331,9 @@ def from_arrays(
     def _from_arrow(
         cls,
         *,
-        chunk_keys: "pa.Array",
-        paths: "pa.Array",
-        sizes: "pa.Array",
-        chunk_grid_shape: tuple[int, ...],
+        paths: "pa.StringArray",
+        offsets: "pa.UInt64Array",
+        lengths: "pa.UInt64Array",
     ) -> "ChunkManifest":
         """
         Create a ChunkManifest directly from PyArrow arrays.

From b400a345011ddb50e17b1b20261d1ce7807d9e29 Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Fri, 27 Feb 2026 10:37:03 -0700
Subject: [PATCH 09/22] updates _from_arrow method to have paths, offsets,
 lengths and opt[shape]. Moves all weird arrow reshaping into
 zarr:build_chunk_manifest

---
 virtualizarr/manifests/manifest.py | 56 ++++++++---------
 virtualizarr/parsers/zarr.py       | 98 ++++++++++++++++++++++++++----
 2 files changed, 113 insertions(+), 41 deletions(-)

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index 4009bc8e..f3a11c17 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -334,46 +334,46 @@ def _from_arrow(
         paths: "pa.StringArray",
         offsets: "pa.UInt64Array",
         lengths: "pa.UInt64Array",
+        shape: tuple[int, ...] | None = None,
     ) -> "ChunkManifest":
         """
-        Create a ChunkManifest directly from PyArrow arrays.
+        Create a ChunkManifest from flat 1D PyArrow arrays.
 
+        Avoids intermediate Python dicts by converting Arrow arrays directly
+        to the numpy arrays used internally by ChunkManifest.
 
         Parameters
         ----------
-        chunk_keys
-            Normalized dot-separated chunk keys e.g. "1.2", as a PyArrow Utf8 array.
         paths
-            Full paths to chunks, as a PyArrow Utf8 array.
-        sizes
-            Chunk sizes in bytes, as a PyArrow UInt64 array.
-        chunk_grid_shape
-            Shape of the chunk grid, used to pre-allocate numpy arrays.
+            Full paths to chunks, as a PyArrow StringArray. Nulls represent missing chunks.
+        offsets
+            Byte offsets of chunks, as a PyArrow UInt64Array. Nulls represent missing chunks.
+        lengths
+            Byte lengths of chunks, as a PyArrow UInt64Array. Nulls represent missing chunks.
+        shape
+            Shape to reshape the flat arrays into. If None, arrays are used as-is (1D).
         """
         import pyarrow as pa
         import pyarrow.compute as pc
 
-        paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType())
-        offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
-        lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
-
-        if len(chunk_keys) > 0:
-            # split "1.2" into  [1, 2], then reshape to (n_chunks, ndim), use for indexing
-            ndim = len(chunk_grid_shape)
-            indices_flat = pc.cast(
-                pc.split_pattern(chunk_keys, pattern=".").flatten(), pa.int64()
-            ).to_numpy()
-            indices_2d = indices_flat.reshape(-1, ndim)
-            idx_tuple = tuple(indices_2d[:, i] for i in range(ndim))
-
-            paths_arr[idx_tuple] = paths.to_numpy(zero_copy_only=False)
-            lengths_arr[idx_tuple] = sizes.to_numpy()
-
-        return cls.from_arrays(
-            paths=paths_arr,
-            offsets=offsets_arr,
-            lengths=lengths_arr,
+        paths_np = (
+            pc.if_else(pc.is_null(paths), "", paths)
+            .to_numpy(zero_copy_only=False)
+            .astype(np.dtypes.StringDType())
         )
+        offsets_np = pc.if_else(
+            pc.is_null(offsets), pa.scalar(0, pa.uint64()), offsets
+        ).to_numpy(zero_copy_only=False)
+        lengths_np = pc.if_else(
+            pc.is_null(lengths), pa.scalar(0, pa.uint64()), lengths
+        ).to_numpy(zero_copy_only=False)
+
+        if shape is not None:
+            paths_np = paths_np.reshape(shape)
+            offsets_np = offsets_np.reshape(shape)
+            lengths_np = lengths_np.reshape(shape)
+
+        return cls.from_arrays(paths=paths_np, offsets=offsets_np, lengths=lengths_np)
 
     @property
     def ndim_chunk_grid(self) -> int:
diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py
index 39db18a8..cd3b2c20 100644
--- a/virtualizarr/parsers/zarr.py
+++ b/virtualizarr/parsers/zarr.py
@@ -172,6 +172,15 @@ def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata:
         """Get V3 metadata for the array (converting if necessary)."""
         ...
 
+    @abstractmethod
+    def get_prefix(self, zarr_array: ZarrArrayType) -> str:
+        """Get the storage prefix for chunk listing."""
+        ...
+
+    @abstractmethod
+    def validate(self, zarr_array: ZarrArrayType) -> None:
+        """Validate that the array can be virtualized."""
+
 
 class ZarrV2Strategy(ZarrVersionStrategy):
     """Strategy for handling Zarr V2 arrays."""
@@ -250,6 +259,13 @@ def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata:
 
         return v3_metadata
 
+    def get_prefix(self, zarr_array: ZarrArrayType) -> str:
+        name = _get_array_name(zarr_array)
+        return f"{name}/" if name else ""
+
+    def validate(self, zarr_array: ZarrArrayType) -> None:
+        pass  # no restrictions for V2
+
 
 class ZarrV3Strategy(ZarrVersionStrategy):
     """Strategy for handling Zarr V3 arrays."""
@@ -291,6 +307,23 @@ def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata:
         """Return V3 metadata as-is (no conversion needed)."""
         return zarr_array.metadata  # type: ignore[return-value]
 
+    def get_prefix(self, zarr_array: ZarrArrayType) -> str:
+        name = _get_array_name(zarr_array)
+        return f"{name}/c/" if name else "c/"
+
+    def validate(self, zarr_array: ZarrArrayType) -> None:
+        from zarr.codecs import ShardingCodec
+
+        if any(
+            isinstance(codec, ShardingCodec) for codec in zarr_array.metadata.codecs
+        ):
+            raise NotImplementedError(
+                "Zarr V3 arrays with sharding are not yet supported. "
+                "Sharding stores multiple chunks in a single storage object with non-zero offsets, "
+                "which VirtualiZarr does not currently handle. "
+                "Reading sharded arrays without proper offset handling would result in corrupted data."
+            )
+
 
 def get_strategy(zarr_array: ZarrArrayType) -> ZarrVersionStrategy:
     """
@@ -328,12 +361,15 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
     (sparse arrays), and VirtualiZarr manifests preserve this sparsity. When chunks are
     missing, Zarr will return the fill_value for those regions when the array is read.
     """
+    import pyarrow as pa
+    import pyarrow.compute as pc
+
     strategy = get_strategy(zarr_array)
     chunk_grid_shape = zarr_array._chunk_grid_shape
 
-    # For scalar arrays use _from_arrays
-    chunk_map = await strategy.get_chunk_mapping(zarr_array, path)
+    # scalar arrays go through the dict path instead of the pure arrow bit
     if zarr_array.shape == ():
+        chunk_map = await strategy.get_chunk_mapping(zarr_array, path)
         if not chunk_map:
             return ChunkManifest(chunk_map, shape=chunk_grid_shape)
         paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType())
@@ -348,24 +384,60 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
             paths=paths_arr, offsets=offsets_arr, lengths=lengths_arr
         )
 
-    # for non scalar arrays, use the new _from_arrow method
-    name = _get_array_name(zarr_array)
-    if zarr_array.metadata.zarr_format == 3:
-        prefix = f"{name}/c/" if name else "c/"
-    else:
-        prefix = f"{name}/" if name else ""
+    # check for v3 sharding / prefix update
+    strategy.validate(zarr_array)
+    prefix = strategy.get_prefix(zarr_array)
 
     result = await _build_chunk_mapping(zarr_array, path, prefix)
 
     if result is None:
         return ChunkManifest({}, shape=chunk_grid_shape)
 
-    normalized_keys, full_paths, sizes = result
+    normalized_keys, full_paths, all_lengths = result
+
+    # Incoming: lots of LLM arrow mumbo jumbo for sparse arrays
+
+    # compute flat positions for each listed chunk (C-order)
+    ndim = len(chunk_grid_shape)
+    if ndim == 1:
+        # 1D shortcut: we can bypass in the simple case
+        flat_positions = pc.cast(normalized_keys, pa.int64())
+        total_size = chunk_grid_shape[0]
+    else:
+        # compute C-order strides and dot with per-dimension indices
+        stride = 1
+        strides = []
+        for s in reversed(chunk_grid_shape):
+            strides.insert(0, stride)
+            stride *= s
+        total_size = stride
+
+        flat_positions = pa.repeat(pa.scalar(0, pa.int64()), len(normalized_keys))
+        for dim, dim_stride in enumerate(strides):
+            dim_indices = pc.list_slice(
+                pc.split_pattern(normalized_keys, pattern="."), dim, dim + 1
+            ).flatten()
+            flat_positions = pc.add(
+                flat_positions,
+                pc.multiply(pc.cast(dim_indices, pa.int64()), dim_stride),
+            )
+
+    # scatter listed chunks into dense flat arrow arrays via join on flat index
+    # How will this left join scale? poorly?
+    updates = pa.table(
+        {"idx": flat_positions, "path": full_paths, "length": all_lengths}
+    )
+    dense = (
+        pa.table({"idx": pa.array(range(total_size), type=pa.int64())})
+        .join(updates, "idx", join_type="left outer")
+        .sort_by("idx")
+    )
+
     return ChunkManifest._from_arrow(
-        chunk_keys=normalized_keys,
-        paths=full_paths,
-        sizes=sizes,
-        chunk_grid_shape=chunk_grid_shape,
+        paths=dense["path"].combine_chunks(),
+        offsets=pa.repeat(pa.scalar(0, type=pa.uint64()), total_size),
+        lengths=dense["length"].combine_chunks(),
+        shape=chunk_grid_shape,
     )
 
 

From e22981f939e6064db3ad47ab3c37854c6add4a41 Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Fri, 6 Mar 2026 12:31:15 -0700
Subject: [PATCH 10/22] update releases.md

---
 docs/releases.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/releases.md b/docs/releases.md
index 41789d5c..f89add24 100644
--- a/docs/releases.md
+++ b/docs/releases.md
@@ -38,6 +38,10 @@ This release moves the `ObjectStoreRegistry` to a separate package `obspec_utils
 
 ### New Features
 
+- Improved `ZarrParser` performance.
+  ([#892](https://github.com/zarr-developers/VirtualiZarr/pull/892)).
+  By [Raphael Hagen](https://github.com/norlandrhagen).
+
 - Added `reader_factory` parameter to `HDFParser` to allow customizing how files are read
   ([#844](https://github.com/zarr-developers/VirtualiZarr/pull/844)).
   By [Max Jones](https://github.com/maxrjones).

From fda8ce61e1ba3a24f18796a2bf6ee9ccaed9b622 Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Fri, 6 Mar 2026 13:32:28 -0700
Subject: [PATCH 11/22] mypy

---
 virtualizarr/manifests/manifest.py |  6 +++---
 virtualizarr/parsers/zarr.py       | 12 ++++++++----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index ab157b1c..f0d73155 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -18,7 +18,7 @@
 from virtualizarr.types import ChunkKey
 
 if TYPE_CHECKING:
-    import pyarrow as pa
+    import pyarrow as pa  # type: ignore[import-untyped]
 
 # doesn't guarantee that writers actually handle these
 VALID_URI_PREFIXES = {
@@ -353,8 +353,8 @@ def _from_arrow(
         shape
             Shape to reshape the flat arrays into. If None, arrays are used as-is (1D).
         """
-        import pyarrow as pa
-        import pyarrow.compute as pc
+        import pyarrow as pa  # type: ignore[import-untyped]
+        import pyarrow.compute as pc  # type: ignore[import-untyped]
 
         paths_np = (
             pc.if_else(pc.is_null(paths), "", paths)
diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py
index 510a167f..45bbf87b 100644
--- a/virtualizarr/parsers/zarr.py
+++ b/virtualizarr/parsers/zarr.py
@@ -122,7 +122,9 @@ async def _build_chunk_mapping(
 
     path_batches = []
     size_batches = []
-    stream = zarr_array.store.store.list_async(prefix=prefix, return_arrow=True)
+    stream = cast(ObjectStore, zarr_array.store).store.list_async(
+        prefix=prefix, return_arrow=True
+    )
     async for batch in stream:
         pa_path_col = pa.array(batch.column("path"))
         not_metadata = pc.invert(
@@ -199,7 +201,7 @@ async def get_chunk_mapping(
             scalar_key = f"{prefix}0"
             return await _handle_scalar_array(zarr_array, path, scalar_key)
 
-        return await _build_chunk_mapping(zarr_array, path, prefix)
+        return await _build_chunk_mapping(zarr_array, path, prefix)  # type: ignore[return-value]
 
     def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata:
         """Convert V2 metadata to V3 format."""
@@ -302,7 +304,7 @@ async def get_chunk_mapping(
 
         # List chunk keys under the c/ subdirectory
         prefix = f"{name}/c/" if name else "c/"
-        return await _build_chunk_mapping(zarr_array, path, prefix)
+        return await _build_chunk_mapping(zarr_array, path, prefix)  # type: ignore[return-value]
 
     def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata:
         """Return V3 metadata as-is (no conversion needed)."""
@@ -315,6 +317,8 @@ def get_prefix(self, zarr_array: ZarrArrayType) -> str:
     def validate(self, zarr_array: ZarrArrayType) -> None:
         from zarr.codecs import ShardingCodec
 
+        if not isinstance(zarr_array.metadata, ArrayV3Metadata):
+            return
         if any(
             isinstance(codec, ShardingCodec) for codec in zarr_array.metadata.codecs
         ):
@@ -407,7 +411,7 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
     else:
         # compute C-order strides and dot with per-dimension indices
         stride = 1
-        strides = []
+        strides: list[int] = []
         for s in reversed(chunk_grid_shape):
             strides.insert(0, stride)
             stride *= s

From bbd6a1f52017a101fcb73fde30dcbe8378293a41 Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Fri, 6 Mar 2026 13:35:13 -0700
Subject: [PATCH 12/22] mypy-2

---
 virtualizarr/parsers/zarr.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py
index 45bbf87b..5c06d8d0 100644
--- a/virtualizarr/parsers/zarr.py
+++ b/virtualizarr/parsers/zarr.py
@@ -27,7 +27,7 @@
 )
 
 if TYPE_CHECKING:
-    import pyarrow as pa
+    import pyarrow as pa  # type: ignore[import-untyped]
     import zarr
 
 T = TypeVar("T")
@@ -117,8 +117,8 @@ async def _build_chunk_mapping(
     -------
     Tuple of (normalized_keys, full_paths, sizes) as PyArrow arrays, or None if no chunks found.
     """
-    import pyarrow as pa
-    import pyarrow.compute as pc
+    import pyarrow as pa  # type: ignore[import-untyped]
+    import pyarrow.compute as pc  # type: ignore[import-untyped]
 
     path_batches = []
     size_batches = []
@@ -366,8 +366,8 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
     (sparse arrays), and VirtualiZarr manifests preserve this sparsity. When chunks are
     missing, Zarr will return the fill_value for those regions when the array is read.
     """
-    import pyarrow as pa
-    import pyarrow.compute as pc
+    import pyarrow as pa  # type: ignore[import-untyped]
+    import pyarrow.compute as pc  # type: ignore[import-untyped]
 
     strategy = get_strategy(zarr_array)
     chunk_grid_shape = zarr_array._chunk_grid_shape

From 9cba9e8cbdce0768e754975a36b24c1e619acd96 Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Fri, 6 Mar 2026 13:48:54 -0700
Subject: [PATCH 13/22] update pyproj

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 4a6fa453..cd9e08b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,6 +78,7 @@ all_parsers = [
     "virtualizarr[fits]",
     "virtualizarr[kerchunk_parquet]",
     "virtualizarr[tiff]",
+    "virtualizarr[zarr]"
 ]
 
 # writers

From f50b724dc25ff426818ade7f2e28032bc0e0690c Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Fri, 6 Mar 2026 13:51:44 -0700
Subject: [PATCH 14/22] adds new zarr parser deps and fix to acccessor

---
 pyproject.toml           |  2 ++
 virtualizarr/accessor.py | 22 ++++++++++++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8f21e877..797e1ea8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,6 +82,8 @@ all_parsers = [
 icechunk = [
     "icechunk>=1.1.2",
 ]
+zarr = ["arro3-core", "pyarrow"]
+
 kerchunk = ["fastparquet"]
 
 all_writers = [
diff --git a/virtualizarr/accessor.py b/virtualizarr/accessor.py
index d898a87c..7eadd572 100644
--- a/virtualizarr/accessor.py
+++ b/virtualizarr/accessor.py
@@ -192,6 +192,7 @@ def to_kerchunk(
 
             return None
         elif format == "parquet":
+            import pandas as pd
             from kerchunk.df import refs_to_dataframe
 
             if isinstance(filepath, Path):
@@ -199,14 +200,19 @@ def to_kerchunk(
             elif isinstance(filepath, str):
                 url = filepath
 
-            # refs_to_dataframe is responsible for writing to parquet.
-            # at no point does it create a full in-memory dataframe.
-            refs_to_dataframe(
-                refs,
-                url=url,
-                record_size=record_size,
-                categorical_threshold=categorical_threshold,
-            )
+            # The zarr-parser performance update PR #892 adds pyarrow and arro3-core as deps.
+            # These break the `kerchunk` refs_to_dataframe behavior.
+            # It seems like pyarrow makes pandas default to an ArrowStringArray
+            # which fastparquet cannot zero-copy encode.
+            # TODO: remove once fastparquet or kerchunk handle ArrowStringArray.
+
+            with pd.option_context("future.infer_string", False):
+                refs_to_dataframe(
+                    refs,
+                    url=url,
+                    record_size=record_size,
+                    categorical_threshold=categorical_threshold,
+                )
             return None
         else:
             raise ValueError(f"Unrecognized output format: {format}")

From 4ed82950e66e9a3294c8005eb202eecdda529bee Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Fri, 6 Mar 2026 13:57:40 -0700
Subject: [PATCH 15/22] fix double pyproj def

---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index b179b707..7bbe18d3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -85,7 +85,6 @@ all_parsers = [
 icechunk = [
     "icechunk>=1.1.2",
 ]
-zarr = ["arro3-core", "pyarrow"]
 
 kerchunk = ["fastparquet"]
 

From 9114613b6593a6d96fdb2f4ccf4c0ae02026b91e Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Fri, 6 Mar 2026 14:08:30 -0700
Subject: [PATCH 16/22] adds requires pyarrow decorator to the test_zarr so
 mins deps are ok

---
 virtualizarr/tests/__init__.py                   | 1 +
 virtualizarr/tests/test_parsers/test_kerchunk.py | 6 +++++-
 virtualizarr/tests/test_parsers/test_zarr.py     | 9 +++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py
index c14f048c..b1c030b4 100644
--- a/virtualizarr/tests/__init__.py
+++ b/virtualizarr/tests/__init__.py
@@ -42,3 +42,4 @@ def _importorskip(
 has_dask, requires_dask = _importorskip("dask")
 has_obstore, requires_obstore = _importorskip("obstore")
 has_tiff, requires_tiff = _importorskip("virtual_tiff")
+has_pyarrow, requires_pyarrow = _importorskip("pyarrow")
diff --git a/virtualizarr/tests/test_parsers/test_kerchunk.py b/virtualizarr/tests/test_parsers/test_kerchunk.py
index 0728ee04..73a420ae 100644
--- a/virtualizarr/tests/test_parsers/test_kerchunk.py
+++ b/virtualizarr/tests/test_parsers/test_kerchunk.py
@@ -305,10 +305,14 @@ def test_open_virtual_dataset_existing_kerchunk_refs(
                 ujson.dump(example_reference_dict, json_file)
             parser = KerchunkJSONParser(fs_root="file://")
         if reference_format == "parquet":
+            import pandas as pd
             from kerchunk.df import refs_to_dataframe
 
             ref_filepath = tmp_path / "ref.parquet"
-            refs_to_dataframe(fo=example_reference_dict, url=ref_filepath.as_posix())
+            with pd.option_context("future.infer_string", False):
+                refs_to_dataframe(
+                    fo=example_reference_dict, url=ref_filepath.as_posix()
+                )
             parser = KerchunkParquetParser(fs_root="file://")
         expected_refs = netcdf4_virtual_dataset.vz.to_kerchunk(format="dict")
         with open_virtual_dataset(
diff --git a/virtualizarr/tests/test_parsers/test_zarr.py b/virtualizarr/tests/test_parsers/test_zarr.py
index ce690c5c..5acc7bbb 100644
--- a/virtualizarr/tests/test_parsers/test_zarr.py
+++ b/virtualizarr/tests/test_parsers/test_zarr.py
@@ -20,6 +20,7 @@
     get_strategy,
     join_url,
 )
+from virtualizarr.tests import requires_pyarrow
 
 ZarrArrayType = zarr.AsyncArray | zarr.Array
 
@@ -52,6 +53,7 @@ def zarr_versions(param_name="zarr_format", indirect=False):
     )
 
 
+@requires_pyarrow
 @zarr_versions(param_name="zarr_store", indirect=True)
 class TestOpenVirtualDatasetZarr:
     def test_loadable_variables(self, zarr_store, loadable_variables=["time", "air"]):
@@ -178,6 +180,7 @@ def test_unsupported_zarr_format():
         get_strategy(mock_array)
 
 
+@requires_pyarrow
 @zarr_versions()
 def test_empty_array_chunk_mapping(tmpdir, zarr_format):
     """Test chunk mapping for arrays with no chunks written yet."""
@@ -306,6 +309,7 @@ async def get_meta():
     assert metadata.fill_value is not None
 
 
+@requires_pyarrow
 def test_build_chunk_manifest_empty_with_shape():
     """Test build_chunk_manifest when chunk_map is empty but array has shape and chunks."""
 
@@ -326,6 +330,7 @@ async def get_manifest():
     assert manifest.shape_chunk_grid == (2, 2)
 
 
+@requires_pyarrow
 @zarr_versions()
 def test_sparse_array_with_missing_chunks(tmpdir, zarr_format):
     """Test that arrays with some missing chunks (sparse arrays) are handled correctly."""
@@ -368,6 +373,7 @@ async def get_manifest():
     assert manifest.shape_chunk_grid == (3, 3)
 
 
+@requires_pyarrow
 @zarr_versions()
 def test_parser_roundtrip_matches_xarray(tmpdir, zarr_format):
     """Roundtrip a small dataset through the ZarrParser and compare with xarray."""
@@ -403,6 +409,7 @@ def test_parser_roundtrip_matches_xarray(tmpdir, zarr_format):
             xr.testing.assert_identical(actual, expected)
 
 
+@requires_pyarrow
 @zarr_versions()
 def test_parser_scalar_roundtrip_matches_xarray(tmpdir, zarr_format):
     """Roundtrip a small dataset through the ZarrParser and compare with xarray."""
@@ -463,6 +470,7 @@ async def outer():
     assert result == 42
 
 
+@requires_pyarrow
 @zarr_versions()
 def test_zarr_parser_works_inside_running_event_loop(tmpdir, zarr_format):
     """Test that ZarrParser.__call__ works inside a running event loop (notebook scenario)."""
@@ -490,6 +498,7 @@ async def run_parser_in_loop():
             xr.testing.assert_identical(actual, expected)
 
 
+@requires_pyarrow
 def test_sharded_array_raises_error(tmpdir):
     """Test that attempting to virtualize a sharded Zarr V3 array raises NotImplementedError."""
     filepath = f"{tmpdir}/test_sharded.zarr"

From 31c8ed0b175136f21e2589bee5bd5a89c50bb987 Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Fri, 6 Mar 2026 14:15:51 -0700
Subject: [PATCH 17/22] add strange pyarrow pandas context override to more
 test_kerchunk.py tests

---
 virtualizarr/tests/test_parsers/test_kerchunk.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/tests/test_parsers/test_kerchunk.py b/virtualizarr/tests/test_parsers/test_kerchunk.py
index 73a420ae..2a2863e1 100644
--- a/virtualizarr/tests/test_parsers/test_kerchunk.py
+++ b/virtualizarr/tests/test_parsers/test_kerchunk.py
@@ -164,6 +164,7 @@ def test_kerchunk_parquet_sparse_array(tmp_path, local_registry):
     This tests reading a kerchunk parquet where not all chunks are present,
     which is a common case for sparse arrays.
     """
+    import pandas as pd
     from kerchunk.df import refs_to_dataframe
 
     # Create refs with only one chunk defined (sparse array)
@@ -179,7 +180,8 @@ def test_kerchunk_parquet_sparse_array(tmp_path, local_registry):
     }
 
     ref_filepath = tmp_path / "sparse.parq"
-    refs_to_dataframe(fo=refs, url=str(ref_filepath))
+    with pd.option_context("future.infer_string", False):
+        refs_to_dataframe(fo=refs, url=str(ref_filepath))
 
     parser = KerchunkParquetParser()
     with open_virtual_dataset(
@@ -367,10 +369,12 @@ def test_notimplemented_read_inline_refs_parquet(
 ):
     # Test that parquet references with inlined data raise NotImplementedError
     # https://github.com/zarr-developers/VirtualiZarr/issues/489
+    import pandas as pd
     from kerchunk.df import refs_to_dataframe
 
     ref_filepath = tmp_path / "ref.parquet"
-    refs_to_dataframe(fo=netcdf4_inlined_ref, url=ref_filepath.as_posix())
+    with pd.option_context("future.infer_string", False):
+        refs_to_dataframe(fo=netcdf4_inlined_ref, url=ref_filepath.as_posix())
 
     parser = KerchunkParquetParser()
     with pytest.raises(

From e0ddfc28f36eb6b58506f12b82af49a4518ee7ae Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Fri, 6 Mar 2026 14:24:25 -0700
Subject: [PATCH 18/22] mypy again

---
 virtualizarr/manifests/manifest.py |  6 +++---
 virtualizarr/parsers/zarr.py       | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index f0d73155..c19c4894 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -18,7 +18,7 @@
 from virtualizarr.types import ChunkKey
 
 if TYPE_CHECKING:
-    import pyarrow as pa  # type: ignore[import-untyped]
+    import pyarrow as pa  # type: ignore[import-untyped,import-not-found]
 
 # doesn't guarantee that writers actually handle these
 VALID_URI_PREFIXES = {
@@ -353,8 +353,8 @@ def _from_arrow(
         shape
             Shape to reshape the flat arrays into. If None, arrays are used as-is (1D).
         """
-        import pyarrow as pa  # type: ignore[import-untyped]
-        import pyarrow.compute as pc  # type: ignore[import-untyped]
+        import pyarrow as pa  # type: ignore[import-untyped,import-not-found]
+        import pyarrow.compute as pc  # type: ignore[import-untyped,import-not-found]
 
         paths_np = (
             pc.if_else(pc.is_null(paths), "", paths)
diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py
index 5c06d8d0..44dc2a12 100644
--- a/virtualizarr/parsers/zarr.py
+++ b/virtualizarr/parsers/zarr.py
@@ -27,7 +27,7 @@
 )
 
 if TYPE_CHECKING:
-    import pyarrow as pa  # type: ignore[import-untyped]
+    import pyarrow as pa  # type: ignore[import-untyped,import-not-found]
     import zarr
 
 T = TypeVar("T")
@@ -117,8 +117,8 @@ async def _build_chunk_mapping(
     -------
     Tuple of (normalized_keys, full_paths, sizes) as PyArrow arrays, or None if no chunks found.
     """
-    import pyarrow as pa  # type: ignore[import-untyped]
-    import pyarrow.compute as pc  # type: ignore[import-untyped]
+    import pyarrow as pa  # type: ignore[import-untyped,import-not-found]
+    import pyarrow.compute as pc  # type: ignore[import-untyped,import-not-found]
 
     path_batches = []
     size_batches = []
@@ -366,8 +366,8 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
     (sparse arrays), and VirtualiZarr manifests preserve this sparsity. When chunks are
     missing, Zarr will return the fill_value for those regions when the array is read.
     """
-    import pyarrow as pa  # type: ignore[import-untyped]
-    import pyarrow.compute as pc  # type: ignore[import-untyped]
+    import pyarrow as pa  # type: ignore[import-untyped,import-not-found]
+    import pyarrow.compute as pc  # type: ignore[import-untyped,import-not-found]
 
     strategy = get_strategy(zarr_array)
     chunk_grid_shape = zarr_array._chunk_grid_shape

From d96d5c58ec183d5ffcbae6031496fb1f5ba39e3b Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Fri, 6 Mar 2026 16:35:47 -0700
Subject: [PATCH 19/22] incorporate feedback

---
 virtualizarr/manifests/manifest.py           | 33 +++++------
 virtualizarr/parsers/zarr.py                 | 58 ++++++--------------
 virtualizarr/tests/test_parsers/test_zarr.py | 10 +---
 3 files changed, 35 insertions(+), 66 deletions(-)

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index c19c4894..d1029fb9 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -334,7 +334,7 @@ def _from_arrow(
         paths: "pa.StringArray",
         offsets: "pa.UInt64Array",
         lengths: "pa.UInt64Array",
-        shape: tuple[int, ...] | None = None,
+        shape: tuple[int, ...],
     ) -> "ChunkManifest":
         """
         Create a ChunkManifest from flat 1D PyArrow arrays.
@@ -351,29 +351,30 @@ def _from_arrow(
         lengths
             Byte lengths of chunks, as a PyArrow UInt64Array. Nulls represent missing chunks.
         shape
-            Shape to reshape the flat arrays into. If None, arrays are used as-is (1D).
+            Shape to reshape the flat arrays into.
         """
         import pyarrow as pa  # type: ignore[import-untyped,import-not-found]
         import pyarrow.compute as pc  # type: ignore[import-untyped,import-not-found]
 
-        paths_np = (
-            pc.if_else(pc.is_null(paths), "", paths)
-            .to_numpy(zero_copy_only=False)
-            .astype(np.dtypes.StringDType())
-        )
-        offsets_np = pc.if_else(
+        arrow_paths = pc.if_else(pc.is_null(paths), "", paths)
+        arrow_offsets = pc.if_else(
             pc.is_null(offsets), pa.scalar(0, pa.uint64()), offsets
-        ).to_numpy(zero_copy_only=False)
-        lengths_np = pc.if_else(
+        )
+        arrow_lengths = pc.if_else(
             pc.is_null(lengths), pa.scalar(0, pa.uint64()), lengths
-        ).to_numpy(zero_copy_only=False)
+        )
 
-        if shape is not None:
-            paths_np = paths_np.reshape(shape)
-            offsets_np = offsets_np.reshape(shape)
-            lengths_np = lengths_np.reshape(shape)
+        np_paths = arrow_paths.to_numpy(zero_copy_only=False).astype(
+            np.dtypes.StringDType()
+        )
+        np_offsets = arrow_offsets.to_numpy(zero_copy_only=False)
+        np_lengths = arrow_lengths.to_numpy(zero_copy_only=False)
 
-        return cls.from_arrays(paths=paths_np, offsets=offsets_np, lengths=lengths_np)
+        return cls.from_arrays(
+            paths=np_paths.reshape(shape),
+            offsets=np_offsets.reshape(shape),
+            lengths=np_lengths.reshape(shape),
+        )
 
     @property
     def ndim_chunk_grid(self) -> int:
diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py
index 44dc2a12..4d736c30 100644
--- a/virtualizarr/parsers/zarr.py
+++ b/virtualizarr/parsers/zarr.py
@@ -22,7 +22,6 @@
     ManifestStore,
 )
 from virtualizarr.manifests.manifest import (
-    parse_manifest_index,
     validate_and_normalize_path_to_uri,
 )
 
@@ -370,27 +369,21 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
     import pyarrow.compute as pc  # type: ignore[import-untyped,import-not-found]
 
     strategy = get_strategy(zarr_array)
+    strategy.validate(zarr_array)
     chunk_grid_shape = zarr_array._chunk_grid_shape
 
-    # scalar arrays go through the dict path instead of the pure arrow bit
     if zarr_array.shape == ():
         chunk_map = await strategy.get_chunk_mapping(zarr_array, path)
         if not chunk_map:
             return ChunkManifest(chunk_map, shape=chunk_grid_shape)
-        paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType())
-        offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
-        lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
-        for key, entry in chunk_map.items():
-            idx = parse_manifest_index(key)
-            paths_arr[idx] = entry["path"]
-            offsets_arr[idx] = entry["offset"]
-            lengths_arr[idx] = entry["length"]
-        return ChunkManifest.from_arrays(
-            paths=paths_arr, offsets=offsets_arr, lengths=lengths_arr
+        entry = next(iter(chunk_map.values()))
+        return ChunkManifest._from_arrow(
+            paths=pa.array([entry["path"]], type=pa.string()),
+            offsets=pa.array([entry["offset"]], type=pa.uint64()),
+            lengths=pa.array([entry["length"]], type=pa.uint64()),
+            shape=chunk_grid_shape,
         )
 
-    # check for v3 sharding / prefix update
-    strategy.validate(zarr_array)
     prefix = strategy.get_prefix(zarr_array)
 
     result = await _build_chunk_mapping(zarr_array, path, prefix)
@@ -400,40 +393,21 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
 
     normalized_keys, full_paths, all_lengths = result
 
-    # Incoming: lots of LLM arrow mumbo jumbo for sparse arrays
+    total_size = zarr_array.nchunks
 
-    # compute flat positions for each listed chunk (C-order)
-    ndim = len(chunk_grid_shape)
-    if ndim == 1:
-        # 1D shortcut: we can bypass in the simple case
-        flat_positions = pc.cast(normalized_keys, pa.int64())
-        total_size = chunk_grid_shape[0]
-    else:
-        # compute C-order strides and dot with per-dimension indices
-        stride = 1
-        strides: list[int] = []
-        for s in reversed(chunk_grid_shape):
-            strides.insert(0, stride)
-            stride *= s
-        total_size = stride
-
-        flat_positions = pa.repeat(pa.scalar(0, pa.int64()), len(normalized_keys))
-        for dim, dim_stride in enumerate(strides):
-            dim_indices = pc.list_slice(
-                pc.split_pattern(normalized_keys, pattern="."), dim, dim + 1
-            ).flatten()
-            flat_positions = pc.add(
-                flat_positions,
-                pc.multiply(pc.cast(dim_indices, pa.int64()), dim_stride),
-            )
+    split_keys = pc.split_pattern(normalized_keys, pattern=".")
+    coords = [
+        pc.cast(pc.list_element(split_keys, dim), pa.int64()).to_numpy()
+        for dim in range(zarr_array.ndim)
+    ]
+    flat_positions = pa.array(np.ravel_multi_index(coords, chunk_grid_shape))
 
-    # scatter listed chunks into dense flat arrow arrays via join on flat index
-    # How will this left join scale? poorly?
+    # scatter listed chunks into a dense flat array (nulls = missing chunks)
     updates = pa.table(
         {"idx": flat_positions, "path": full_paths, "length": all_lengths}
     )
     dense = (
-        pa.table({"idx": pa.array(range(total_size), type=pa.int64())})
+        pa.table({"idx": pa.array(np.arange(total_size, dtype=np.int64))})
         .join(updates, "idx", join_type="left outer")
         .sort_by("idx")
     )
diff --git a/virtualizarr/tests/test_parsers/test_zarr.py b/virtualizarr/tests/test_parsers/test_zarr.py
index 5acc7bbb..1330f131 100644
--- a/virtualizarr/tests/test_parsers/test_zarr.py
+++ b/virtualizarr/tests/test_parsers/test_zarr.py
@@ -22,6 +22,8 @@
 )
 from virtualizarr.tests import requires_pyarrow
 
+pytestmark = requires_pyarrow
+
 ZarrArrayType = zarr.AsyncArray | zarr.Array
 
 SKIP_OLDER_ZARR_PYTHON = pytest.mark.skipif(
@@ -53,7 +55,6 @@ def zarr_versions(param_name="zarr_format", indirect=False):
     )
 
 
-@requires_pyarrow
 @zarr_versions(param_name="zarr_store", indirect=True)
 class TestOpenVirtualDatasetZarr:
     def test_loadable_variables(self, zarr_store, loadable_variables=["time", "air"]):
@@ -180,7 +181,6 @@ def test_unsupported_zarr_format():
         get_strategy(mock_array)
 
 
-@requires_pyarrow
 @zarr_versions()
 def test_empty_array_chunk_mapping(tmpdir, zarr_format):
     """Test chunk mapping for arrays with no chunks written yet."""
@@ -309,7 +309,6 @@ async def get_meta():
     assert metadata.fill_value is not None
 
 
-@requires_pyarrow
 def test_build_chunk_manifest_empty_with_shape():
     """Test build_chunk_manifest when chunk_map is empty but array has shape and chunks."""
 
@@ -330,7 +329,6 @@ async def get_manifest():
     assert manifest.shape_chunk_grid == (2, 2)
 
 
-@requires_pyarrow
 @zarr_versions()
 def test_sparse_array_with_missing_chunks(tmpdir, zarr_format):
     """Test that arrays with some missing chunks (sparse arrays) are handled correctly."""
@@ -373,7 +371,6 @@ async def get_manifest():
     assert manifest.shape_chunk_grid == (3, 3)
 
 
-@requires_pyarrow
 @zarr_versions()
 def test_parser_roundtrip_matches_xarray(tmpdir, zarr_format):
     """Roundtrip a small dataset through the ZarrParser and compare with xarray."""
@@ -409,7 +406,6 @@ def test_parser_roundtrip_matches_xarray(tmpdir, zarr_format):
             xr.testing.assert_identical(actual, expected)
 
 
-@requires_pyarrow
 @zarr_versions()
 def test_parser_scalar_roundtrip_matches_xarray(tmpdir, zarr_format):
     """Roundtrip a small dataset through the ZarrParser and compare with xarray."""
@@ -470,7 +466,6 @@ async def outer():
     assert result == 42
 
 
-@requires_pyarrow
 @zarr_versions()
 def test_zarr_parser_works_inside_running_event_loop(tmpdir, zarr_format):
     """Test that ZarrParser.__call__ works inside a running event loop (notebook scenario)."""
@@ -498,7 +493,6 @@ async def run_parser_in_loop():
             xr.testing.assert_identical(actual, expected)
 
 
-@requires_pyarrow
 def test_sharded_array_raises_error(tmpdir):
     """Test that attempting to virtualize a sharded Zarr V3 array raises NotImplementedError."""
     filepath = f"{tmpdir}/test_sharded.zarr"

From 716a0bb89d2a390dd4a2da6e0907ef6b542adc3c Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Mon, 9 Mar 2026 10:11:15 -0600
Subject: [PATCH 20/22] removed seperator normalization and added a method to
 get chunk seperator

---
 virtualizarr/parsers/zarr.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py
index 4d736c30..ca1ac28c 100644
--- a/virtualizarr/parsers/zarr.py
+++ b/virtualizarr/parsers/zarr.py
@@ -146,18 +146,16 @@ async def _build_chunk_mapping(
 
     if len(all_paths) == 0:
         return None
-    # normalize: strip prefix, replace / with .
-    stripped = pc.utf8_replace_slice(
+    stripped_keys = pc.utf8_replace_slice(
         all_paths, start=0, stop=len(prefix), replacement=""
     )
-    normalized_keys = pc.replace_substring(stripped, pattern="/", replacement=".")
 
     # construct full paths
     full_paths = pc.binary_join_element_wise(
         pa.scalar(path.rstrip("/")), all_paths, "/"
     )
 
-    return normalized_keys, full_paths, all_sizes
+    return stripped_keys, full_paths, all_sizes
 
 
 class ZarrVersionStrategy(ABC):
@@ -180,6 +178,11 @@ def get_prefix(self, zarr_array: ZarrArrayType) -> str:
         """Get the storage prefix for chunk listing."""
         ...
 
+    @abstractmethod
+    def get_separator(self, zarr_array: ZarrArrayType) -> str:
+        """Get the chunk key separator for the array."""
+        ...
+
     @abstractmethod
     def validate(self, zarr_array: ZarrArrayType) -> None:
         """Validate that the array can be virtualized."""
@@ -265,6 +268,10 @@ def get_prefix(self, zarr_array: ZarrArrayType) -> str:
         name = _get_array_name(zarr_array)
         return f"{name}/" if name else ""
 
+    def get_separator(self, zarr_array: ZarrArrayType) -> str:
+        # Default for v2 should be "."
+        return zarr_array.metadata.dimension_separator
+
     def validate(self, zarr_array: ZarrArrayType) -> None:
         pass  # no restrictions for V2
 
@@ -313,6 +320,10 @@ def get_prefix(self, zarr_array: ZarrArrayType) -> str:
         name = _get_array_name(zarr_array)
         return f"{name}/c/" if name else "c/"
 
+    def get_separator(self, zarr_array: ZarrArrayType) -> str:
+        # gets chunk separator. Default for v3 should be "/"
+        return zarr_array.metadata.chunk_key_encoding.separator
+
     def validate(self, zarr_array: ZarrArrayType) -> None:
         from zarr.codecs import ShardingCodec
 
@@ -391,11 +402,11 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
     if result is None:
         return ChunkManifest({}, shape=chunk_grid_shape)
 
-    normalized_keys, full_paths, all_lengths = result
+    stripped_keys, full_paths, all_lengths = result
 
     total_size = zarr_array.nchunks
-
-    split_keys = pc.split_pattern(normalized_keys, pattern=".")
+    separator = strategy.get_separator(zarr_array)
+    split_keys = pc.split_pattern(stripped_keys, pattern=separator)
     coords = [
         pc.cast(pc.list_element(split_keys, dim), pa.int64()).to_numpy()
         for dim in range(zarr_array.ndim)

From 5df7705a12e622025f73b5c018744311142f5713 Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Mon, 9 Mar 2026 10:14:06 -0600
Subject: [PATCH 21/22] de-dup pyproj

---
 pyproject.toml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b57d79a0..e7d23fdc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -86,8 +86,6 @@ icechunk = [
     "icechunk>=1.1.2",
 ]
 
-kerchunk = ["fastparquet"]
-zarr = ["arro3-core", "pyarrow"]
 
 kerchunk = ["fastparquet", "pandas"]
 

From 08232a84c98598dd0f0d7cb149c471e3f603903a Mon Sep 17 00:00:00 2001
From: norlandrhagen <norlandrhagen@gmail.com>
Date: Mon, 9 Mar 2026 10:30:50 -0600
Subject: [PATCH 22/22] mypy

---
 virtualizarr/parsers/zarr.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py
index ca1ac28c..41785559 100644
--- a/virtualizarr/parsers/zarr.py
+++ b/virtualizarr/parsers/zarr.py
@@ -11,6 +11,7 @@
 import zarr
 from obspec_utils.registry import ObjectStoreRegistry
 from zarr.api.asynchronous import open_group as open_group_async
+from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding
 from zarr.core.group import GroupMetadata
 from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata
 from zarr.storage import ObjectStore
@@ -179,9 +180,7 @@ def get_prefix(self, zarr_array: ZarrArrayType) -> str:
         ...
 
     @abstractmethod
-    def get_separator(self, zarr_array: ZarrArrayType) -> str:
-        """Get the chunk key separator for the array."""
-        ...
+    def _get_separator(self, zarr_array: ZarrArrayType) -> str: ...
 
     @abstractmethod
     def validate(self, zarr_array: ZarrArrayType) -> None:
@@ -268,9 +267,10 @@ def get_prefix(self, zarr_array: ZarrArrayType) -> str:
         name = _get_array_name(zarr_array)
         return f"{name}/" if name else ""
 
-    def get_separator(self, zarr_array: ZarrArrayType) -> str:
-        # Default for v2 should be "."
-        return zarr_array.metadata.dimension_separator
+    def _get_separator(self, zarr_array: ZarrArrayType) -> str:
+        from typing import cast
+
+        return cast(ArrayV2Metadata, zarr_array.metadata).dimension_separator
 
     def validate(self, zarr_array: ZarrArrayType) -> None:
         pass  # no restrictions for V2
@@ -320,9 +320,11 @@ def get_prefix(self, zarr_array: ZarrArrayType) -> str:
         name = _get_array_name(zarr_array)
         return f"{name}/c/" if name else "c/"
 
-    def get_separator(self, zarr_array: ZarrArrayType) -> str:
-        # gets chunk separator. Default for v3 should be "/"
-        return zarr_array.metadata.chunk_key_encoding.separator
+    def _get_separator(self, zarr_array: ZarrArrayType) -> str:
+        from typing import cast
+
+        metadata = cast(ArrayV3Metadata, zarr_array.metadata)
+        return cast(DefaultChunkKeyEncoding, metadata.chunk_key_encoding).separator
 
     def validate(self, zarr_array: ZarrArrayType) -> None:
         from zarr.codecs import ShardingCodec
@@ -405,7 +407,7 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
     stripped_keys, full_paths, all_lengths = result
 
     total_size = zarr_array.nchunks
-    separator = strategy.get_separator(zarr_array)
+    separator = strategy._get_separator(zarr_array)
     split_keys = pc.split_pattern(stripped_keys, pattern=separator)
     coords = [
         pc.cast(pc.list_element(split_keys, dim), pa.int64()).to_numpy()