From aa93b8b0a5017b55c986ffbbaff3cbf7579df695 Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Thu, 26 Feb 2026 11:30:42 -0700 Subject: [PATCH 01/22] updates zarr-parser to use obstore list_async instead of concurrent_map --- virtualizarr/parsers/zarr.py | 88 ++++++++++---------- virtualizarr/tests/test_parsers/test_zarr.py | 64 +++++++------- 2 files changed, 73 insertions(+), 79 deletions(-) diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index 8bb7cddd..38104ae2 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, cast +import numpy as np import zarr from obspec_utils.registry import ObjectStoreRegistry from zarr.api.asynchronous import open_group as open_group_async @@ -19,8 +20,10 @@ ManifestGroup, ManifestStore, ) -from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri -from virtualizarr.vendor.zarr.core.common import _concurrent_map +from virtualizarr.manifests.manifest import ( + parse_manifest_index, + validate_and_normalize_path_to_uri, +) if TYPE_CHECKING: import zarr @@ -90,33 +93,43 @@ async def _handle_scalar_array( async def _build_chunk_mapping( - chunk_keys: list[str], zarr_array: ZarrArrayType, path: str, prefix: str + zarr_array: ZarrArrayType, path: str, prefix: str ) -> dict[str, dict[str, Any]]: """ - Build chunk mapping from a list of chunk keys. + Build chunk mapping by listing the object store with obstore. + + Uses obstore's list_async with Arrow output to get chunk paths and sizes + in a single Rust-level call, avoiding per-chunk getsize calls. Parameters ---------- - chunk_keys - List of storage keys for chunks. zarr_array The Zarr array. path Base path for constructing chunk paths. prefix - Prefix to strip from chunk keys. + Prefix to list and strip from chunk keys. Returns ------- dict Mapping of normalized chunk coordinates to storage locations. """ + + size_map: dict[str, int] = {} + stream = zarr_array.store.store.list_async(prefix=prefix, return_arrow=True) + async for batch in stream: + size_map.update( + zip(batch.column("path").to_pylist(), batch.column("size").to_pylist()) + ) + + # filter out metadata files + chunk_keys = [k for k in size_map if not k.split("/")[-1].startswith(".")] + if not chunk_keys: return {} - lengths = await _concurrent_map( - [(k,) for k in chunk_keys], zarr_array.store.getsize - ) + lengths = [size_map[k] for k in chunk_keys] dict_keys = _normalize_chunk_keys(chunk_keys, prefix) paths = [join_url(path, k) for k in chunk_keys] offsets = [0] * len(lengths) @@ -158,24 +171,7 @@ async def get_chunk_mapping( scalar_key = f"{prefix}0" return await _handle_scalar_array(zarr_array, path, scalar_key) - # List all keys under the array prefix, filtering out metadata files - prefix_keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)] - if not prefix_keys: - return {} - - metadata_files = {".zarray", ".zattrs", ".zgroup", ".zmetadata"} - chunk_keys = [] - for key_tuple in prefix_keys: - key = key_tuple[0] - file_name = ( - key[len(prefix) :] - if prefix and key.startswith(prefix) - else key.split("/")[-1] - ) - if file_name not in metadata_files: - chunk_keys.append(key) - - return await _build_chunk_mapping(chunk_keys, zarr_array, path, prefix) + return await _build_chunk_mapping(zarr_array, path, prefix) def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata: """Convert V2 metadata to V3 format.""" @@ -272,12 +268,7 @@ async def get_chunk_mapping( # List chunk keys under the c/ subdirectory prefix = f"{name}/c/" if name else "c/" - prefix_keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)] - if not prefix_keys: - return {} - - chunk_keys = [x[0] for x in prefix_keys] - return await _build_chunk_mapping(chunk_keys, zarr_array, path, prefix) + return await _build_chunk_mapping(zarr_array, path, prefix) def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata: """Return V3 metadata as-is (no conversion needed).""" @@ -322,17 +313,28 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan """ strategy = get_strategy(zarr_array) chunk_map = await strategy.get_chunk_mapping(zarr_array, path) + chunk_grid_shape = zarr_array._chunk_grid_shape if not chunk_map: - import math - - if zarr_array.shape and zarr_array.chunks: - chunk_grid_shape = tuple( - math.ceil(s / c) for s, c in zip(zarr_array.shape, zarr_array.chunks) - ) - return ChunkManifest(chunk_map, shape=chunk_grid_shape) - - return ChunkManifest(chunk_map) + return ChunkManifest(chunk_map, shape=chunk_grid_shape) + + # Pre-allocate N-D numpy arrays shaped like the chunk grid. + # Empty string paths indicate missing chunks (sparse arrays). + paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType()) + offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64")) + lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64")) + + for key, entry in chunk_map.items(): + idx = parse_manifest_index(key) + paths_arr[idx] = entry["path"] + offsets_arr[idx] = entry["offset"] + lengths_arr[idx] = entry["length"] + + return ChunkManifest.from_arrays( + paths=paths_arr, + offsets=offsets_arr, + lengths=lengths_arr, + ) def get_metadata(zarr_array: ZarrArrayType) -> ArrayV3Metadata: diff --git a/virtualizarr/tests/test_parsers/test_zarr.py b/virtualizarr/tests/test_parsers/test_zarr.py index 27f8537e..20d46dbe 100644 --- a/virtualizarr/tests/test_parsers/test_zarr.py +++ b/virtualizarr/tests/test_parsers/test_zarr.py @@ -177,7 +177,9 @@ def test_empty_array_chunk_mapping(tmpdir, zarr_format): """Test chunk mapping for arrays with no chunks written yet.""" import asyncio - # Create an array but don't write any data + from obstore.store import LocalStore as ObsLocalStore + from zarr.storage import ObjectStore + filepath = f"{tmpdir}/empty.zarr" zarr.create( shape=(10, 10), @@ -188,12 +190,13 @@ def test_empty_array_chunk_mapping(tmpdir, zarr_format): ) async def get_chunk_map(): - zarr_array = await open_array(store=filepath, mode="r") + obs_store = ObsLocalStore(prefix=filepath) + zarr_store = ObjectStore(store=obs_store) + zarr_array = await open_array(store=zarr_store, mode="r") strategy = get_strategy(zarr_array) return await strategy.get_chunk_mapping(zarr_array, filepath) chunk_map = asyncio.run(get_chunk_map()) - # Empty arrays should return empty chunk map assert chunk_map == {} @@ -306,34 +309,31 @@ def test_build_chunk_manifest_empty_with_shape(): """Test build_chunk_manifest when chunk_map is empty but array has shape and chunks.""" import asyncio - # Create an array but don't write data - store = zarr.storage.MemoryStore() - zarr.create(shape=(10, 10), chunks=(5, 5), dtype="int8", store=store, zarr_format=3) + from obstore.store import MemoryStore as ObsMemoryStore + from zarr.storage import ObjectStore + + obs_store = ObsMemoryStore() + zarr_store = ObjectStore(store=obs_store) + zarr.create( + shape=(10, 10), chunks=(5, 5), dtype="int8", store=zarr_store, zarr_format=3 + ) async def get_manifest(): - zarr_array = await open_array(store=store, mode="r") + zarr_array = await open_array(store=zarr_store, mode="r") return await build_chunk_manifest(zarr_array, "test://path") manifest = asyncio.run(get_manifest()) - # Should create manifest with proper chunk grid shape even if empty - assert manifest.shape_chunk_grid == (2, 2) # 10/5 = 2 chunks per dimension + assert manifest.shape_chunk_grid == (2, 2) @zarr_versions() def test_sparse_array_with_missing_chunks(tmpdir, zarr_format): - """Test that arrays with some missing chunks (sparse arrays) are handled correctly. - - This test verifies that VirtualiZarr correctly handles the case where some chunks - exist but others are missing. Zarr allows this for sparse data, and when chunks - are missing, Zarr returns the fill_value for those regions. VirtualiZarr should - preserve this sparsity in the manifest rather than generating entries for all - possible chunks based on the chunk grid. - """ + """Test that arrays with some missing chunks (sparse arrays) are handled correctly.""" import asyncio - from virtualizarr.parsers.zarr import build_chunk_manifest + from obstore.store import LocalStore as ObsLocalStore + from zarr.storage import ObjectStore - # Create a zarr array with a 3x3 chunk grid (9 possible chunks) filepath = f"{tmpdir}/sparse.zarr" arr = zarr.create( shape=(30, 30), @@ -344,36 +344,28 @@ def test_sparse_array_with_missing_chunks(tmpdir, zarr_format): fill_value=np.nan, ) - # Only write data to some chunks, leaving others missing (sparse) - # Write to chunks (0,0), (1,1), and (2,2) - a diagonal pattern arr[0:10, 0:10] = 1.0 # chunk 0.0 arr[10:20, 10:20] = 2.0 # chunk 1.1 arr[20:30, 20:30] = 3.0 # chunk 2.2 - # Chunks (0,1), (0,2), (1,0), (1,2), (2,0), (2,1) are intentionally left unwritten async def get_manifest(): - zarr_array = await open_array(store=filepath, mode="r") + obs_store = ObsLocalStore(prefix=filepath) + zarr_store = ObjectStore(store=obs_store) + zarr_array = await open_array(store=zarr_store, mode="r") return await build_chunk_manifest(zarr_array, filepath) manifest = asyncio.run(get_manifest()) - # The manifest should only contain the 3 chunks we actually wrote - assert len(manifest.dict()) == 3, f"Expected 3 chunks, got {len(manifest.dict())}" - - # Verify the expected chunks are present - assert "0.0" in manifest.dict(), "Chunk 0.0 should be present" - assert "1.1" in manifest.dict(), "Chunk 1.1 should be present" - assert "2.2" in manifest.dict(), "Chunk 2.2 should be present" + assert len(manifest.dict()) == 3 + assert "0.0" in manifest.dict() + assert "1.1" in manifest.dict() + assert "2.2" in manifest.dict() - # Verify missing chunks are not in the manifest missing_chunks = ["0.1", "0.2", "1.0", "1.2", "2.0", "2.1"] for chunk_key in missing_chunks: - assert chunk_key not in manifest.dict(), ( - f"Chunk {chunk_key} should not be present (it's missing/sparse)" - ) + assert chunk_key not in manifest.dict() - # The chunk grid shape should still reflect the full array dimensions - assert manifest.shape_chunk_grid == (3, 3), "Chunk grid should be 3x3" + assert manifest.shape_chunk_grid == (3, 3) @zarr_versions() From 37dff68bb17151e6e3673f4787b23f400e2fb2ee Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Thu, 26 Feb 2026 11:37:34 -0700 Subject: [PATCH 02/22] removes the zarr vendor code --- virtualizarr/vendor/__init__.py | 0 virtualizarr/vendor/zarr/__init__.py | 0 virtualizarr/vendor/zarr/core/__init__.py | 0 virtualizarr/vendor/zarr/core/common.py | 34 ----------------------- 4 files changed, 34 deletions(-) delete mode 100644 virtualizarr/vendor/__init__.py delete mode 100644 virtualizarr/vendor/zarr/__init__.py delete mode 100644 virtualizarr/vendor/zarr/core/__init__.py delete mode 100644 virtualizarr/vendor/zarr/core/common.py diff --git a/virtualizarr/vendor/__init__.py b/virtualizarr/vendor/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/virtualizarr/vendor/zarr/__init__.py b/virtualizarr/vendor/zarr/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/virtualizarr/vendor/zarr/core/__init__.py b/virtualizarr/vendor/zarr/core/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/virtualizarr/vendor/zarr/core/common.py b/virtualizarr/vendor/zarr/core/common.py deleted file mode 100644 index b6363db7..00000000 --- a/virtualizarr/vendor/zarr/core/common.py +++ /dev/null @@ -1,34 +0,0 @@ -import asyncio -from itertools import starmap -from typing import ( - Any, - Awaitable, - Callable, - Iterable, - TypeVar, -) - -# Vendored directly from Zarr-python V3's private API -# https://github.com/zarr-developers/zarr-python/blob/458299857141a5470ba3956d8a1607f52ac33857/src/zarr/core/common.py#L53 -T = TypeVar("T", bound=tuple[Any, ...]) -V = TypeVar("V") - - -async def _concurrent_map( - items: Iterable[T], - func: Callable[..., Awaitable[V]], - limit: int | None = None, -) -> list[V]: - if limit is None: - return await asyncio.gather(*list(starmap(func, items))) - - else: - sem = asyncio.Semaphore(limit) - - async def run(item: tuple[Any]) -> V: - async with sem: - return await func(*item) - - return await asyncio.gather( - *[asyncio.ensure_future(run(item)) for item in items] - ) From 2fa25a769915844929dbbeb6c610d2fbd99522a3 Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Thu, 26 Feb 2026 11:48:41 -0700 Subject: [PATCH 03/22] adds arro3-core to zarr group --- pyproject.toml | 5 +++++ virtualizarr/parsers/zarr.py | 3 +++ 2 files changed, 8 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 66b6ce67..0806b986 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,8 @@ hdf = [ "imagecodecs-numcodecs==2024.6.1", ] +zarr = ["arro3-core"] + # kerchunk-based parsers netcdf3 = [ "virtualizarr[remote]", @@ -209,6 +211,9 @@ upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-d all = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "all_parsers", "all_writers", "py313"] docs = ["docs", "dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"] +[tool.pixi.dependencies] +pytest = "*" + # Define commands to run within the docs environment [tool.pixi.feature.docs.tasks] serve-docs = { cmd = "mkdocs serve" } diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index 38104ae2..5bf8a805 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -100,6 +100,7 @@ async def _build_chunk_mapping( Uses obstore's list_async with Arrow output to get chunk paths and sizes in a single Rust-level call, avoiding per-chunk getsize calls. + # https://github.com/zarr-developers/VirtualiZarr/issues/891 Parameters ---------- @@ -330,6 +331,8 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan offsets_arr[idx] = entry["offset"] lengths_arr[idx] = entry["length"] + # Construct the python ChunkManifest object's numpy arrays directly from the Arrow arrays, minimizing memory copies (i.e. the opposite of what I did in Pass manifests to icechunk as pyarrow arrays #861) + # TODO: I think we still have a arrow->dict->arrow conversion happening return ChunkManifest.from_arrays( paths=paths_arr, offsets=offsets_arr, From 626d0b921c26c86260e7099bb18d5a37baa04509 Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Thu, 26 Feb 2026 18:06:09 -0700 Subject: [PATCH 04/22] adds _from_arrow method --- pyproject.toml | 2 +- virtualizarr/manifests/manifest.py | 49 ++++++++ virtualizarr/parsers/zarr.py | 111 ++++++++++++------- virtualizarr/tests/test_parsers/test_zarr.py | 8 +- 4 files changed, 124 insertions(+), 46 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0806b986..0efe6908 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ hdf = [ "imagecodecs-numcodecs==2024.6.1", ] -zarr = ["arro3-core"] +zarr = ["arro3-core", "pyarrow"] # kerchunk-based parsers netcdf3 = [ diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 250ccc60..d3206f04 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -11,6 +11,7 @@ from typing import Any, NewType, TypedDict, cast import numpy as np +import pyarrow as pa from virtualizarr.manifests.utils import construct_chunk_pattern, parse_manifest_index from virtualizarr.types import ChunkKey @@ -322,6 +323,54 @@ def from_arrays( return obj + @classmethod + def _from_arrow( + cls, + *, + chunk_keys: "pa.Array", + paths: "pa.Array", + sizes: "pa.Array", + chunk_grid_shape: tuple[int, ...], + ) -> "ChunkManifest": + """ + Create a ChunkManifest directly from PyArrow arrays. + + + Parameters + ---------- + chunk_keys + Normalized dot-separated chunk keys e.g. "1.2", as a PyArrow Utf8 array. + paths + Full paths to chunks, as a PyArrow Utf8 array. + sizes + Chunk sizes in bytes, as a PyArrow UInt64 array. + chunk_grid_shape + Shape of the chunk grid, used to pre-allocate numpy arrays. + """ + import pyarrow.compute as pc + + paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType()) + offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64")) + lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64")) + + if len(chunk_keys) > 0: + # split "1.2" into [1, 2], then reshape to (n_chunks, ndim), use for indexing + ndim = len(chunk_grid_shape) + indices_flat = pc.cast( + pc.split_pattern(chunk_keys, pattern=".").flatten(), pa.int64() + ).to_numpy() + indices_2d = indices_flat.reshape(-1, ndim) + idx_tuple = tuple(indices_2d[:, i] for i in range(ndim)) + + paths_arr[idx_tuple] = paths.to_numpy(zero_copy_only=False) + lengths_arr[idx_tuple] = sizes.to_numpy() + + return cls.from_arrays( + paths=paths_arr, + offsets=offsets_arr, + lengths=lengths_arr, + ) + @property def ndim_chunk_grid(self) -> int: """ diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index 5bf8a805..39db18a8 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -26,6 +26,7 @@ ) if TYPE_CHECKING: + import pyarrow as pa import zarr ZarrArrayType = zarr.AsyncArray | zarr.Array @@ -94,13 +95,12 @@ async def _handle_scalar_array( async def _build_chunk_mapping( zarr_array: ZarrArrayType, path: str, prefix: str -) -> dict[str, dict[str, Any]]: +) -> tuple["pa.Array", "pa.Array", "pa.Array"] | None: """ Build chunk mapping by listing the object store with obstore. Uses obstore's list_async with Arrow output to get chunk paths and sizes in a single Rust-level call, avoiding per-chunk getsize calls. - # https://github.com/zarr-developers/VirtualiZarr/issues/891 Parameters ---------- @@ -113,32 +113,48 @@ async def _build_chunk_mapping( Returns ------- - dict - Mapping of normalized chunk coordinates to storage locations. + Tuple of (normalized_keys, full_paths, sizes) as PyArrow arrays, or None if no chunks found. """ + import pyarrow as pa + import pyarrow.compute as pc - size_map: dict[str, int] = {} + path_batches = [] + size_batches = [] stream = zarr_array.store.store.list_async(prefix=prefix, return_arrow=True) async for batch in stream: - size_map.update( - zip(batch.column("path").to_pylist(), batch.column("size").to_pylist()) + pa_path_col = pa.array(batch.column("path")) + not_metadata = pc.invert( + pc.or_( + pc.match_substring(pa_path_col, pattern="/."), + pc.starts_with(pa_path_col, "."), + ) ) - # filter out metadata files - chunk_keys = [k for k in size_map if not k.split("/")[-1].startswith(".")] + filtered_paths = pa_path_col.filter(not_metadata) + filtered_sizes = pa.array(batch.column("size")).filter(not_metadata) + path_batches.append(filtered_paths) + size_batches.append(filtered_sizes) - if not chunk_keys: - return {} + if not path_batches: + return None - lengths = [size_map[k] for k in chunk_keys] - dict_keys = _normalize_chunk_keys(chunk_keys, prefix) - paths = [join_url(path, k) for k in chunk_keys] - offsets = [0] * len(lengths) + all_paths = pa.concat_arrays(path_batches) + all_sizes = pa.concat_arrays(size_batches) - return { - key: {"path": p, "offset": offset, "length": length} - for key, p, offset, length in zip(dict_keys, paths, offsets, lengths) - } + if len(all_paths) == 0: + return None + # normalize: strip prefix, replace / with . + stripped = pc.utf8_replace_slice( + all_paths, start=0, stop=len(prefix), replacement="" + ) + normalized_keys = pc.replace_substring(stripped, pattern="/", replacement=".") + + # construct full paths + full_paths = pc.binary_join_element_wise( + pa.scalar(path.rstrip("/")), all_paths, "/" + ) + + return normalized_keys, full_paths, all_sizes class ZarrVersionStrategy(ABC): @@ -313,30 +329,43 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan missing, Zarr will return the fill_value for those regions when the array is read. """ strategy = get_strategy(zarr_array) - chunk_map = await strategy.get_chunk_mapping(zarr_array, path) chunk_grid_shape = zarr_array._chunk_grid_shape - if not chunk_map: - return ChunkManifest(chunk_map, shape=chunk_grid_shape) - - # Pre-allocate N-D numpy arrays shaped like the chunk grid. - # Empty string paths indicate missing chunks (sparse arrays). - paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType()) - offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64")) - lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64")) - - for key, entry in chunk_map.items(): - idx = parse_manifest_index(key) - paths_arr[idx] = entry["path"] - offsets_arr[idx] = entry["offset"] - lengths_arr[idx] = entry["length"] - - # Construct the python ChunkManifest object's numpy arrays directly from the Arrow arrays, minimizing memory copies (i.e. the opposite of what I did in Pass manifests to icechunk as pyarrow arrays #861) - # TODO: I think we still have a arrow->dict->arrow conversion happening - return ChunkManifest.from_arrays( - paths=paths_arr, - offsets=offsets_arr, - lengths=lengths_arr, + # For scalar arrays use _from_arrays + chunk_map = await strategy.get_chunk_mapping(zarr_array, path) + if zarr_array.shape == (): + if not chunk_map: + return ChunkManifest(chunk_map, shape=chunk_grid_shape) + paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType()) + offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64")) + lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64")) + for key, entry in chunk_map.items(): + idx = parse_manifest_index(key) + paths_arr[idx] = entry["path"] + offsets_arr[idx] = entry["offset"] + lengths_arr[idx] = entry["length"] + return ChunkManifest.from_arrays( + paths=paths_arr, offsets=offsets_arr, lengths=lengths_arr + ) + + # for non scalar arrays, use the new _from_arrow method + name = _get_array_name(zarr_array) + if zarr_array.metadata.zarr_format == 3: + prefix = f"{name}/c/" if name else "c/" + else: + prefix = f"{name}/" if name else "" + + result = await _build_chunk_mapping(zarr_array, path, prefix) + + if result is None: + return ChunkManifest({}, shape=chunk_grid_shape) + + normalized_keys, full_paths, sizes = result + return ChunkManifest._from_arrow( + chunk_keys=normalized_keys, + paths=full_paths, + sizes=sizes, + chunk_grid_shape=chunk_grid_shape, ) diff --git a/virtualizarr/tests/test_parsers/test_zarr.py b/virtualizarr/tests/test_parsers/test_zarr.py index 20d46dbe..991d6bde 100644 --- a/virtualizarr/tests/test_parsers/test_zarr.py +++ b/virtualizarr/tests/test_parsers/test_zarr.py @@ -193,11 +193,11 @@ async def get_chunk_map(): obs_store = ObsLocalStore(prefix=filepath) zarr_store = ObjectStore(store=obs_store) zarr_array = await open_array(store=zarr_store, mode="r") - strategy = get_strategy(zarr_array) - return await strategy.get_chunk_mapping(zarr_array, filepath) + manifest = await build_chunk_manifest(zarr_array, filepath) + return manifest.dict() - chunk_map = asyncio.run(get_chunk_map()) - assert chunk_map == {} + result = asyncio.run(get_chunk_map()) + assert result == {} @SKIP_OLDER_ZARR_PYTHON From 9d6a312dbff256991fbab8c5ed6437592fe5092b Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Thu, 26 Feb 2026 18:15:48 -0700 Subject: [PATCH 05/22] adds type_checking for pa type hint + import in _from_arrow --- virtualizarr/manifests/manifest.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index d3206f04..98b08199 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import re from collections.abc import ( Callable, @@ -8,7 +10,7 @@ ValuesView, ) from pathlib import PosixPath -from typing import Any, NewType, TypedDict, cast +from typing import TYPE_CHECKING, Any, NewType, TypedDict, cast import numpy as np import pyarrow as pa @@ -16,6 +18,9 @@ from virtualizarr.manifests.utils import construct_chunk_pattern, parse_manifest_index from virtualizarr.types import ChunkKey +if TYPE_CHECKING: + import pyarrow as pa + # doesn't guarantee that writers actually handle these VALID_URI_PREFIXES = { "s3://", @@ -347,6 +352,7 @@ def _from_arrow( chunk_grid_shape Shape of the chunk grid, used to pre-allocate numpy arrays. """ + import pyarrow as pa import pyarrow.compute as pc paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType()) From bab147d23cef9919719765384545955ace886b14 Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Thu, 26 Feb 2026 18:17:49 -0700 Subject: [PATCH 06/22] extra import removed --- virtualizarr/manifests/manifest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 98b08199..6c5d8140 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -13,7 +13,6 @@ from typing import TYPE_CHECKING, Any, NewType, TypedDict, cast import numpy as np -import pyarrow as pa from virtualizarr.manifests.utils import construct_chunk_pattern, parse_manifest_index from virtualizarr.types import ChunkKey From 17e35cc1baf3466c6a194f8a47654916e939d192 Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Thu, 26 Feb 2026 18:22:47 -0700 Subject: [PATCH 07/22] adds zarr to test-py31* test group --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0efe6908..a700b28b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -203,8 +203,8 @@ run-tests-html-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov min-deps = ["dev", "test", "hdf", "hdf5-lib"] # VirtualiZarr/conftest.py using h5py, so the minimum set of dependencies for testing still includes hdf libs # Inherit from min-deps to get all the test commands, along with optional dependencies test = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"] -test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py311"] # test against python 3.11 -test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py312"] # test against python 3.12 +test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "py311"] # test against python 3.11 +test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "py312"] # test against python 3.12 minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "tiff", "py312", "minio"] minimum-versions = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "tiff", "hdf5-lib", "minimum-versions"] upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev", "py313"] From 6cbb7c0dd1dc2746b1c17999846d6df876793eaa Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Fri, 27 Feb 2026 09:23:53 -0700 Subject: [PATCH 08/22] Update virtualizarr/manifests/manifest.py Co-authored-by: Tom Nicholas --- virtualizarr/manifests/manifest.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 6c5d8140..4009bc8e 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -331,10 +331,9 @@ def from_arrays( def _from_arrow( cls, *, - chunk_keys: "pa.Array", - paths: "pa.Array", - sizes: "pa.Array", - chunk_grid_shape: tuple[int, ...], + paths: "pa.StringArray", + offsets: "pa.UInt64Array", + lengths: "pa.UInt64Array", ) -> "ChunkManifest": """ Create a ChunkManifest directly from PyArrow arrays. From b400a345011ddb50e17b1b20261d1ce7807d9e29 Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Fri, 27 Feb 2026 10:37:03 -0700 Subject: [PATCH 09/22] updates _from_arrow method to have paths, offsets, lengths and opt[shape]. Moves all weird arrow reshaping into zarr:build_chunk_manifest --- virtualizarr/manifests/manifest.py | 56 ++++++++--------- virtualizarr/parsers/zarr.py | 98 ++++++++++++++++++++++++++---- 2 files changed, 113 insertions(+), 41 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 4009bc8e..f3a11c17 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -334,46 +334,46 @@ def _from_arrow( paths: "pa.StringArray", offsets: "pa.UInt64Array", lengths: "pa.UInt64Array", + shape: tuple[int, ...] | None = None, ) -> "ChunkManifest": """ - Create a ChunkManifest directly from PyArrow arrays. + Create a ChunkManifest from flat 1D PyArrow arrays. + Avoids intermediate Python dicts by converting Arrow arrays directly + to the numpy arrays used internally by ChunkManifest. Parameters ---------- - chunk_keys - Normalized dot-separated chunk keys e.g. "1.2", as a PyArrow Utf8 array. paths - Full paths to chunks, as a PyArrow Utf8 array. - sizes - Chunk sizes in bytes, as a PyArrow UInt64 array. - chunk_grid_shape - Shape of the chunk grid, used to pre-allocate numpy arrays. + Full paths to chunks, as a PyArrow StringArray. Nulls represent missing chunks. + offsets + Byte offsets of chunks, as a PyArrow UInt64Array. Nulls represent missing chunks. + lengths + Byte lengths of chunks, as a PyArrow UInt64Array. Nulls represent missing chunks. + shape + Shape to reshape the flat arrays into. If None, arrays are used as-is (1D). """ import pyarrow as pa import pyarrow.compute as pc - paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType()) - offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64")) - lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64")) - - if len(chunk_keys) > 0: - # split "1.2" into [1, 2], then reshape to (n_chunks, ndim), use for indexing - ndim = len(chunk_grid_shape) - indices_flat = pc.cast( - pc.split_pattern(chunk_keys, pattern=".").flatten(), pa.int64() - ).to_numpy() - indices_2d = indices_flat.reshape(-1, ndim) - idx_tuple = tuple(indices_2d[:, i] for i in range(ndim)) - - paths_arr[idx_tuple] = paths.to_numpy(zero_copy_only=False) - lengths_arr[idx_tuple] = sizes.to_numpy() - - return cls.from_arrays( - paths=paths_arr, - offsets=offsets_arr, - lengths=lengths_arr, + paths_np = ( + pc.if_else(pc.is_null(paths), "", paths) + .to_numpy(zero_copy_only=False) + .astype(np.dtypes.StringDType()) ) + offsets_np = pc.if_else( + pc.is_null(offsets), pa.scalar(0, pa.uint64()), offsets + ).to_numpy(zero_copy_only=False) + lengths_np = pc.if_else( + pc.is_null(lengths), pa.scalar(0, pa.uint64()), lengths + ).to_numpy(zero_copy_only=False) + + if shape is not None: + paths_np = paths_np.reshape(shape) + offsets_np = offsets_np.reshape(shape) + lengths_np = lengths_np.reshape(shape) + + return cls.from_arrays(paths=paths_np, offsets=offsets_np, lengths=lengths_np) @property def ndim_chunk_grid(self) -> int: diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index 39db18a8..cd3b2c20 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -172,6 +172,15 @@ def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata: """Get V3 metadata for the array (converting if necessary).""" ... + @abstractmethod + def get_prefix(self, zarr_array: ZarrArrayType) -> str: + """Get the storage prefix for chunk listing.""" + ... + + @abstractmethod + def validate(self, zarr_array: ZarrArrayType) -> None: + """Validate that the array can be virtualized.""" + class ZarrV2Strategy(ZarrVersionStrategy): """Strategy for handling Zarr V2 arrays.""" @@ -250,6 +259,13 @@ def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata: return v3_metadata + def get_prefix(self, zarr_array: ZarrArrayType) -> str: + name = _get_array_name(zarr_array) + return f"{name}/" if name else "" + + def validate(self, zarr_array: ZarrArrayType) -> None: + pass # no restrictions for V2 + class ZarrV3Strategy(ZarrVersionStrategy): """Strategy for handling Zarr V3 arrays.""" @@ -291,6 +307,23 @@ def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata: """Return V3 metadata as-is (no conversion needed).""" return zarr_array.metadata # type: ignore[return-value] + def get_prefix(self, zarr_array: ZarrArrayType) -> str: + name = _get_array_name(zarr_array) + return f"{name}/c/" if name else "c/" + + def validate(self, zarr_array: ZarrArrayType) -> None: + from zarr.codecs import ShardingCodec + + if any( + isinstance(codec, ShardingCodec) for codec in zarr_array.metadata.codecs + ): + raise NotImplementedError( + "Zarr V3 arrays with sharding are not yet supported. " + "Sharding stores multiple chunks in a single storage object with non-zero offsets, " + "which VirtualiZarr does not currently handle. " + "Reading sharded arrays without proper offset handling would result in corrupted data." + ) + def get_strategy(zarr_array: ZarrArrayType) -> ZarrVersionStrategy: """ @@ -328,12 +361,15 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan (sparse arrays), and VirtualiZarr manifests preserve this sparsity. When chunks are missing, Zarr will return the fill_value for those regions when the array is read. """ + import pyarrow as pa + import pyarrow.compute as pc + strategy = get_strategy(zarr_array) chunk_grid_shape = zarr_array._chunk_grid_shape - # For scalar arrays use _from_arrays - chunk_map = await strategy.get_chunk_mapping(zarr_array, path) + # scalar arrays go through the dict path instead of the pure arrow bit if zarr_array.shape == (): + chunk_map = await strategy.get_chunk_mapping(zarr_array, path) if not chunk_map: return ChunkManifest(chunk_map, shape=chunk_grid_shape) paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType()) @@ -348,24 +384,60 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan paths=paths_arr, offsets=offsets_arr, lengths=lengths_arr ) - # for non scalar arrays, use the new _from_arrow method - name = _get_array_name(zarr_array) - if zarr_array.metadata.zarr_format == 3: - prefix = f"{name}/c/" if name else "c/" - else: - prefix = f"{name}/" if name else "" + # check for v3 sharding / prefix update + strategy.validate(zarr_array) + prefix = strategy.get_prefix(zarr_array) result = await _build_chunk_mapping(zarr_array, path, prefix) if result is None: return ChunkManifest({}, shape=chunk_grid_shape) - normalized_keys, full_paths, sizes = result + normalized_keys, full_paths, all_lengths = result + + # Incoming: lots of LLM arrow mumbo jumbo for sparse arrays + + # compute flat positions for each listed chunk (C-order) + ndim = len(chunk_grid_shape) + if ndim == 1: + # 1D shortcut: we can bypass in the simple case + flat_positions = pc.cast(normalized_keys, pa.int64()) + total_size = chunk_grid_shape[0] + else: + # compute C-order strides and dot with per-dimension indices + stride = 1 + strides = [] + for s in reversed(chunk_grid_shape): + strides.insert(0, stride) + stride *= s + total_size = stride + + flat_positions = pa.repeat(pa.scalar(0, pa.int64()), len(normalized_keys)) + for dim, dim_stride in enumerate(strides): + dim_indices = pc.list_slice( + pc.split_pattern(normalized_keys, pattern="."), dim, dim + 1 + ).flatten() + flat_positions = pc.add( + flat_positions, + pc.multiply(pc.cast(dim_indices, pa.int64()), dim_stride), + ) + + # scatter listed chunks into dense flat arrow arrays via join on flat index + # How will this left join scale? poorly? + updates = pa.table( + {"idx": flat_positions, "path": full_paths, "length": all_lengths} + ) + dense = ( + pa.table({"idx": pa.array(range(total_size), type=pa.int64())}) + .join(updates, "idx", join_type="left outer") + .sort_by("idx") + ) + return ChunkManifest._from_arrow( - chunk_keys=normalized_keys, - paths=full_paths, - sizes=sizes, - chunk_grid_shape=chunk_grid_shape, + paths=dense["path"].combine_chunks(), + offsets=pa.repeat(pa.scalar(0, type=pa.uint64()), total_size), + lengths=dense["length"].combine_chunks(), + shape=chunk_grid_shape, ) From e22981f939e6064db3ad47ab3c37854c6add4a41 Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Fri, 6 Mar 2026 12:31:15 -0700 Subject: [PATCH 10/22] update releases.md --- docs/releases.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/releases.md b/docs/releases.md index 41789d5c..f89add24 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -38,6 +38,10 @@ This release moves the `ObjectStoreRegistry` to a separate package `obspec_utils ### New Features +- Improved `ZarrParser` performance. + ([#892](https://github.com/zarr-developers/VirtualiZarr/pull/892)). + By [Raphael Hagen](https://github.com/norlandrhagen). + - Added `reader_factory` parameter to `HDFParser` to allow customizing how files are read ([#844](https://github.com/zarr-developers/VirtualiZarr/pull/844)). By [Max Jones](https://github.com/maxrjones). From fda8ce61e1ba3a24f18796a2bf6ee9ccaed9b622 Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Fri, 6 Mar 2026 13:32:28 -0700 Subject: [PATCH 11/22] mypy --- virtualizarr/manifests/manifest.py | 6 +++--- virtualizarr/parsers/zarr.py | 12 ++++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index ab157b1c..f0d73155 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -18,7 +18,7 @@ from virtualizarr.types import ChunkKey if TYPE_CHECKING: - import pyarrow as pa + import pyarrow as pa # type: ignore[import-untyped] # doesn't guarantee that writers actually handle these VALID_URI_PREFIXES = { @@ -353,8 +353,8 @@ def _from_arrow( shape Shape to reshape the flat arrays into. If None, arrays are used as-is (1D). """ - import pyarrow as pa - import pyarrow.compute as pc + import pyarrow as pa # type: ignore[import-untyped] + import pyarrow.compute as pc # type: ignore[import-untyped] paths_np = ( pc.if_else(pc.is_null(paths), "", paths) diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index 510a167f..45bbf87b 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -122,7 +122,9 @@ async def _build_chunk_mapping( path_batches = [] size_batches = [] - stream = zarr_array.store.store.list_async(prefix=prefix, return_arrow=True) + stream = cast(ObjectStore, zarr_array.store).store.list_async( + prefix=prefix, return_arrow=True + ) async for batch in stream: pa_path_col = pa.array(batch.column("path")) not_metadata = pc.invert( @@ -199,7 +201,7 @@ async def get_chunk_mapping( scalar_key = f"{prefix}0" return await _handle_scalar_array(zarr_array, path, scalar_key) - return await _build_chunk_mapping(zarr_array, path, prefix) + return await _build_chunk_mapping(zarr_array, path, prefix) # type: ignore[return-value] def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata: """Convert V2 metadata to V3 format.""" @@ -302,7 +304,7 @@ async def get_chunk_mapping( # List chunk keys under the c/ subdirectory prefix = f"{name}/c/" if name else "c/" - return await _build_chunk_mapping(zarr_array, path, prefix) + return await _build_chunk_mapping(zarr_array, path, prefix) # type: ignore[return-value] def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata: """Return V3 metadata as-is (no conversion needed).""" @@ -315,6 +317,8 @@ def get_prefix(self, zarr_array: ZarrArrayType) -> str: def validate(self, zarr_array: ZarrArrayType) -> None: from zarr.codecs import ShardingCodec + if not isinstance(zarr_array.metadata, ArrayV3Metadata): + return if any( isinstance(codec, ShardingCodec) for codec in zarr_array.metadata.codecs ): @@ -407,7 +411,7 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan else: # compute C-order strides and dot with per-dimension indices stride = 1 - strides = [] + strides: list[int] = [] for s in reversed(chunk_grid_shape): strides.insert(0, stride) stride *= s From bbd6a1f52017a101fcb73fde30dcbe8378293a41 Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Fri, 6 Mar 2026 13:35:13 -0700 Subject: [PATCH 12/22] mypy-2 --- virtualizarr/parsers/zarr.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index 45bbf87b..5c06d8d0 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -27,7 +27,7 @@ ) if TYPE_CHECKING: - import pyarrow as pa + import pyarrow as pa # type: ignore[import-untyped] import zarr T = TypeVar("T") @@ -117,8 +117,8 @@ async def _build_chunk_mapping( ------- Tuple of (normalized_keys, full_paths, sizes) as PyArrow arrays, or None if no chunks found. """ - import pyarrow as pa - import pyarrow.compute as pc + import pyarrow as pa # type: ignore[import-untyped] + import pyarrow.compute as pc # type: ignore[import-untyped] path_batches = [] size_batches = [] @@ -366,8 +366,8 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan (sparse arrays), and VirtualiZarr manifests preserve this sparsity. When chunks are missing, Zarr will return the fill_value for those regions when the array is read. """ - import pyarrow as pa - import pyarrow.compute as pc + import pyarrow as pa # type: ignore[import-untyped] + import pyarrow.compute as pc # type: ignore[import-untyped] strategy = get_strategy(zarr_array) chunk_grid_shape = zarr_array._chunk_grid_shape From 9cba9e8cbdce0768e754975a36b24c1e619acd96 Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Fri, 6 Mar 2026 13:48:54 -0700 Subject: [PATCH 13/22] update pyproj --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 4a6fa453..cd9e08b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,7 @@ all_parsers = [ "virtualizarr[fits]", "virtualizarr[kerchunk_parquet]", "virtualizarr[tiff]", + "virtualizarr[zarr]" ] # writers From f50b724dc25ff426818ade7f2e28032bc0e0690c Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Fri, 6 Mar 2026 13:51:44 -0700 Subject: [PATCH 14/22] adds new zarr parser deps and fix to acccessor --- pyproject.toml | 2 ++ virtualizarr/accessor.py | 22 ++++++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8f21e877..797e1ea8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,6 +82,8 @@ all_parsers = [ icechunk = [ "icechunk>=1.1.2", ] +zarr = ["arro3-core", "pyarrow"] + kerchunk = ["fastparquet"] all_writers = [ diff --git a/virtualizarr/accessor.py b/virtualizarr/accessor.py index d898a87c..7eadd572 100644 --- a/virtualizarr/accessor.py +++ b/virtualizarr/accessor.py @@ -192,6 +192,7 @@ def to_kerchunk( return None elif format == "parquet": + import pandas as pd from kerchunk.df import refs_to_dataframe if isinstance(filepath, Path): @@ -199,14 +200,19 @@ def to_kerchunk( elif isinstance(filepath, str): url = filepath - # refs_to_dataframe is responsible for writing to parquet. - # at no point does it create a full in-memory dataframe. - refs_to_dataframe( - refs, - url=url, - record_size=record_size, - categorical_threshold=categorical_threshold, - ) + # The zarr-parser performance update PR #892 adds pyarrow and arro3-core as deps. + # These break the `kerchunk` refs_to_dataframe behavior. + # It seems like pyarrow makes pandas default to an ArrowStringArray + # which fastparquet cannot zero-copy encode. + # TODO: remove once fastparquet or kerchunk handle ArrowStringArray. + + with pd.option_context("future.infer_string", False): + refs_to_dataframe( + refs, + url=url, + record_size=record_size, + categorical_threshold=categorical_threshold, + ) return None else: raise ValueError(f"Unrecognized output format: {format}") From 4ed82950e66e9a3294c8005eb202eecdda529bee Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Fri, 6 Mar 2026 13:57:40 -0700 Subject: [PATCH 15/22] fix double pyproj def --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b179b707..7bbe18d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -85,7 +85,6 @@ all_parsers = [ icechunk = [ "icechunk>=1.1.2", ] -zarr = ["arro3-core", "pyarrow"] kerchunk = ["fastparquet"] From 9114613b6593a6d96fdb2f4ccf4c0ae02026b91e Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Fri, 6 Mar 2026 14:08:30 -0700 Subject: [PATCH 16/22] adds requires pyarrow decorator to the test_zarr so mins deps are ok --- virtualizarr/tests/__init__.py | 1 + virtualizarr/tests/test_parsers/test_kerchunk.py | 6 +++++- virtualizarr/tests/test_parsers/test_zarr.py | 9 +++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index c14f048c..b1c030b4 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -42,3 +42,4 @@ def _importorskip( has_dask, requires_dask = _importorskip("dask") has_obstore, requires_obstore = _importorskip("obstore") has_tiff, requires_tiff = _importorskip("virtual_tiff") +has_pyarrow, requires_pyarrow = _importorskip("pyarrow") diff --git a/virtualizarr/tests/test_parsers/test_kerchunk.py b/virtualizarr/tests/test_parsers/test_kerchunk.py index 0728ee04..73a420ae 100644 --- a/virtualizarr/tests/test_parsers/test_kerchunk.py +++ b/virtualizarr/tests/test_parsers/test_kerchunk.py @@ -305,10 +305,14 @@ def test_open_virtual_dataset_existing_kerchunk_refs( ujson.dump(example_reference_dict, json_file) parser = KerchunkJSONParser(fs_root="file://") if reference_format == "parquet": + import pandas as pd from kerchunk.df import refs_to_dataframe ref_filepath = tmp_path / "ref.parquet" - refs_to_dataframe(fo=example_reference_dict, url=ref_filepath.as_posix()) + with pd.option_context("future.infer_string", False): + refs_to_dataframe( + fo=example_reference_dict, url=ref_filepath.as_posix() + ) parser = KerchunkParquetParser(fs_root="file://") expected_refs = netcdf4_virtual_dataset.vz.to_kerchunk(format="dict") with open_virtual_dataset( diff --git a/virtualizarr/tests/test_parsers/test_zarr.py b/virtualizarr/tests/test_parsers/test_zarr.py index ce690c5c..5acc7bbb 100644 --- a/virtualizarr/tests/test_parsers/test_zarr.py +++ b/virtualizarr/tests/test_parsers/test_zarr.py @@ -20,6 +20,7 @@ get_strategy, join_url, ) +from virtualizarr.tests import requires_pyarrow ZarrArrayType = zarr.AsyncArray | zarr.Array @@ -52,6 +53,7 @@ def zarr_versions(param_name="zarr_format", indirect=False): ) +@requires_pyarrow @zarr_versions(param_name="zarr_store", indirect=True) class TestOpenVirtualDatasetZarr: def test_loadable_variables(self, zarr_store, loadable_variables=["time", "air"]): @@ -178,6 +180,7 @@ def test_unsupported_zarr_format(): get_strategy(mock_array) +@requires_pyarrow @zarr_versions() def test_empty_array_chunk_mapping(tmpdir, zarr_format): """Test chunk mapping for arrays with no chunks written yet.""" @@ -306,6 +309,7 @@ async def get_meta(): assert metadata.fill_value is not None +@requires_pyarrow def test_build_chunk_manifest_empty_with_shape(): """Test build_chunk_manifest when chunk_map is empty but array has shape and chunks.""" @@ -326,6 +330,7 @@ async def get_manifest(): assert manifest.shape_chunk_grid == (2, 2) +@requires_pyarrow @zarr_versions() def test_sparse_array_with_missing_chunks(tmpdir, zarr_format): """Test that arrays with some missing chunks (sparse arrays) are handled correctly.""" @@ -368,6 +373,7 @@ async def get_manifest(): assert manifest.shape_chunk_grid == (3, 3) +@requires_pyarrow @zarr_versions() def test_parser_roundtrip_matches_xarray(tmpdir, zarr_format): """Roundtrip a small dataset through the ZarrParser and compare with xarray.""" @@ -403,6 +409,7 @@ def test_parser_roundtrip_matches_xarray(tmpdir, zarr_format): xr.testing.assert_identical(actual, expected) +@requires_pyarrow @zarr_versions() def test_parser_scalar_roundtrip_matches_xarray(tmpdir, zarr_format): """Roundtrip a small dataset through the ZarrParser and compare with xarray.""" @@ -463,6 +470,7 @@ async def outer(): assert result == 42 +@requires_pyarrow @zarr_versions() def test_zarr_parser_works_inside_running_event_loop(tmpdir, zarr_format): """Test that ZarrParser.__call__ works inside a running event loop (notebook scenario).""" @@ -490,6 +498,7 @@ async def run_parser_in_loop(): xr.testing.assert_identical(actual, expected) +@requires_pyarrow def test_sharded_array_raises_error(tmpdir): """Test that attempting to virtualize a sharded Zarr V3 array raises NotImplementedError.""" filepath = f"{tmpdir}/test_sharded.zarr" From 31c8ed0b175136f21e2589bee5bd5a89c50bb987 Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Fri, 6 Mar 2026 14:15:51 -0700 Subject: [PATCH 17/22] add strange pyarrow pandas context override to more test_kerchunk.py tests --- virtualizarr/tests/test_parsers/test_kerchunk.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_parsers/test_kerchunk.py b/virtualizarr/tests/test_parsers/test_kerchunk.py index 73a420ae..2a2863e1 100644 --- a/virtualizarr/tests/test_parsers/test_kerchunk.py +++ b/virtualizarr/tests/test_parsers/test_kerchunk.py @@ -164,6 +164,7 @@ def test_kerchunk_parquet_sparse_array(tmp_path, local_registry): This tests reading a kerchunk parquet where not all chunks are present, which is a common case for sparse arrays. """ + import pandas as pd from kerchunk.df import refs_to_dataframe # Create refs with only one chunk defined (sparse array) @@ -179,7 +180,8 @@ def test_kerchunk_parquet_sparse_array(tmp_path, local_registry): } ref_filepath = tmp_path / "sparse.parq" - refs_to_dataframe(fo=refs, url=str(ref_filepath)) + with pd.option_context("future.infer_string", False): + refs_to_dataframe(fo=refs, url=str(ref_filepath)) parser = KerchunkParquetParser() with open_virtual_dataset( @@ -367,10 +369,12 @@ def test_notimplemented_read_inline_refs_parquet( ): # Test that parquet references with inlined data raise NotImplementedError # https://github.com/zarr-developers/VirtualiZarr/issues/489 + import pandas as pd from kerchunk.df import refs_to_dataframe ref_filepath = tmp_path / "ref.parquet" - refs_to_dataframe(fo=netcdf4_inlined_ref, url=ref_filepath.as_posix()) + with pd.option_context("future.infer_string", False): + refs_to_dataframe(fo=netcdf4_inlined_ref, url=ref_filepath.as_posix()) parser = KerchunkParquetParser() with pytest.raises( From e0ddfc28f36eb6b58506f12b82af49a4518ee7ae Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Fri, 6 Mar 2026 14:24:25 -0700 Subject: [PATCH 18/22] mypy again --- virtualizarr/manifests/manifest.py | 6 +++--- virtualizarr/parsers/zarr.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index f0d73155..c19c4894 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -18,7 +18,7 @@ from virtualizarr.types import ChunkKey if TYPE_CHECKING: - import pyarrow as pa # type: ignore[import-untyped] + import pyarrow as pa # type: ignore[import-untyped,import-not-found] # doesn't guarantee that writers actually handle these VALID_URI_PREFIXES = { @@ -353,8 +353,8 @@ def _from_arrow( shape Shape to reshape the flat arrays into. If None, arrays are used as-is (1D). """ - import pyarrow as pa # type: ignore[import-untyped] - import pyarrow.compute as pc # type: ignore[import-untyped] + import pyarrow as pa # type: ignore[import-untyped,import-not-found] + import pyarrow.compute as pc # type: ignore[import-untyped,import-not-found] paths_np = ( pc.if_else(pc.is_null(paths), "", paths) diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index 5c06d8d0..44dc2a12 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -27,7 +27,7 @@ ) if TYPE_CHECKING: - import pyarrow as pa # type: ignore[import-untyped] + import pyarrow as pa # type: ignore[import-untyped,import-not-found] import zarr T = TypeVar("T") @@ -117,8 +117,8 @@ async def _build_chunk_mapping( ------- Tuple of (normalized_keys, full_paths, sizes) as PyArrow arrays, or None if no chunks found. """ - import pyarrow as pa # type: ignore[import-untyped] - import pyarrow.compute as pc # type: ignore[import-untyped] + import pyarrow as pa # type: ignore[import-untyped,import-not-found] + import pyarrow.compute as pc # type: ignore[import-untyped,import-not-found] path_batches = [] size_batches = [] @@ -366,8 +366,8 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan (sparse arrays), and VirtualiZarr manifests preserve this sparsity. When chunks are missing, Zarr will return the fill_value for those regions when the array is read. """ - import pyarrow as pa # type: ignore[import-untyped] - import pyarrow.compute as pc # type: ignore[import-untyped] + import pyarrow as pa # type: ignore[import-untyped,import-not-found] + import pyarrow.compute as pc # type: ignore[import-untyped,import-not-found] strategy = get_strategy(zarr_array) chunk_grid_shape = zarr_array._chunk_grid_shape From d96d5c58ec183d5ffcbae6031496fb1f5ba39e3b Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Fri, 6 Mar 2026 16:35:47 -0700 Subject: [PATCH 19/22] incorporate feedback --- virtualizarr/manifests/manifest.py | 33 +++++------ virtualizarr/parsers/zarr.py | 58 ++++++-------------- virtualizarr/tests/test_parsers/test_zarr.py | 10 +--- 3 files changed, 35 insertions(+), 66 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index c19c4894..d1029fb9 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -334,7 +334,7 @@ def _from_arrow( paths: "pa.StringArray", offsets: "pa.UInt64Array", lengths: "pa.UInt64Array", - shape: tuple[int, ...] | None = None, + shape: tuple[int, ...], ) -> "ChunkManifest": """ Create a ChunkManifest from flat 1D PyArrow arrays. @@ -351,29 +351,30 @@ def _from_arrow( lengths Byte lengths of chunks, as a PyArrow UInt64Array. Nulls represent missing chunks. shape - Shape to reshape the flat arrays into. If None, arrays are used as-is (1D). + Shape to reshape the flat arrays into. """ import pyarrow as pa # type: ignore[import-untyped,import-not-found] import pyarrow.compute as pc # type: ignore[import-untyped,import-not-found] - paths_np = ( - pc.if_else(pc.is_null(paths), "", paths) - .to_numpy(zero_copy_only=False) - .astype(np.dtypes.StringDType()) - ) - offsets_np = pc.if_else( + arrow_paths = pc.if_else(pc.is_null(paths), "", paths) + arrow_offsets = pc.if_else( pc.is_null(offsets), pa.scalar(0, pa.uint64()), offsets - ).to_numpy(zero_copy_only=False) - lengths_np = pc.if_else( + ) + arrow_lengths = pc.if_else( pc.is_null(lengths), pa.scalar(0, pa.uint64()), lengths - ).to_numpy(zero_copy_only=False) + ) - if shape is not None: - paths_np = paths_np.reshape(shape) - offsets_np = offsets_np.reshape(shape) - lengths_np = lengths_np.reshape(shape) + np_paths = arrow_paths.to_numpy(zero_copy_only=False).astype( + np.dtypes.StringDType() + ) + np_offsets = arrow_offsets.to_numpy(zero_copy_only=False) + np_lengths = arrow_lengths.to_numpy(zero_copy_only=False) - return cls.from_arrays(paths=paths_np, offsets=offsets_np, lengths=lengths_np) + return cls.from_arrays( + paths=np_paths.reshape(shape), + offsets=np_offsets.reshape(shape), + lengths=np_lengths.reshape(shape), + ) @property def ndim_chunk_grid(self) -> int: diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index 44dc2a12..4d736c30 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -22,7 +22,6 @@ ManifestStore, ) from virtualizarr.manifests.manifest import ( - parse_manifest_index, validate_and_normalize_path_to_uri, ) @@ -370,27 +369,21 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan import pyarrow.compute as pc # type: ignore[import-untyped,import-not-found] strategy = get_strategy(zarr_array) + strategy.validate(zarr_array) chunk_grid_shape = zarr_array._chunk_grid_shape - # scalar arrays go through the dict path instead of the pure arrow bit if zarr_array.shape == (): chunk_map = await strategy.get_chunk_mapping(zarr_array, path) if not chunk_map: return ChunkManifest(chunk_map, shape=chunk_grid_shape) - paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType()) - offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64")) - lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64")) - for key, entry in chunk_map.items(): - idx = parse_manifest_index(key) - paths_arr[idx] = entry["path"] - offsets_arr[idx] = entry["offset"] - lengths_arr[idx] = entry["length"] - return ChunkManifest.from_arrays( - paths=paths_arr, offsets=offsets_arr, lengths=lengths_arr + entry = next(iter(chunk_map.values())) + return ChunkManifest._from_arrow( + paths=pa.array([entry["path"]], type=pa.string()), + offsets=pa.array([entry["offset"]], type=pa.uint64()), + lengths=pa.array([entry["length"]], type=pa.uint64()), + shape=chunk_grid_shape, ) - # check for v3 sharding / prefix update - strategy.validate(zarr_array) prefix = strategy.get_prefix(zarr_array) result = await _build_chunk_mapping(zarr_array, path, prefix) @@ -400,40 +393,21 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan normalized_keys, full_paths, all_lengths = result - # Incoming: lots of LLM arrow mumbo jumbo for sparse arrays + total_size = zarr_array.nchunks - # compute flat positions for each listed chunk (C-order) - ndim = len(chunk_grid_shape) - if ndim == 1: - # 1D shortcut: we can bypass in the simple case - flat_positions = pc.cast(normalized_keys, pa.int64()) - total_size = chunk_grid_shape[0] - else: - # compute C-order strides and dot with per-dimension indices - stride = 1 - strides: list[int] = [] - for s in reversed(chunk_grid_shape): - strides.insert(0, stride) - stride *= s - total_size = stride - - flat_positions = pa.repeat(pa.scalar(0, pa.int64()), len(normalized_keys)) - for dim, dim_stride in enumerate(strides): - dim_indices = pc.list_slice( - pc.split_pattern(normalized_keys, pattern="."), dim, dim + 1 - ).flatten() - flat_positions = pc.add( - flat_positions, - pc.multiply(pc.cast(dim_indices, pa.int64()), dim_stride), - ) + split_keys = pc.split_pattern(normalized_keys, pattern=".") + coords = [ + pc.cast(pc.list_element(split_keys, dim), pa.int64()).to_numpy() + for dim in range(zarr_array.ndim) + ] + flat_positions = pa.array(np.ravel_multi_index(coords, chunk_grid_shape)) - # scatter listed chunks into dense flat arrow arrays via join on flat index - # How will this left join scale? poorly? + # scatter listed chunks into a dense flat array (nulls = missing chunks) updates = pa.table( {"idx": flat_positions, "path": full_paths, "length": all_lengths} ) dense = ( - pa.table({"idx": pa.array(range(total_size), type=pa.int64())}) + pa.table({"idx": pa.array(np.arange(total_size, dtype=np.int64))}) .join(updates, "idx", join_type="left outer") .sort_by("idx") ) diff --git a/virtualizarr/tests/test_parsers/test_zarr.py b/virtualizarr/tests/test_parsers/test_zarr.py index 5acc7bbb..1330f131 100644 --- a/virtualizarr/tests/test_parsers/test_zarr.py +++ b/virtualizarr/tests/test_parsers/test_zarr.py @@ -22,6 +22,8 @@ ) from virtualizarr.tests import requires_pyarrow +pytestmark = requires_pyarrow + ZarrArrayType = zarr.AsyncArray | zarr.Array SKIP_OLDER_ZARR_PYTHON = pytest.mark.skipif( @@ -53,7 +55,6 @@ def zarr_versions(param_name="zarr_format", indirect=False): ) -@requires_pyarrow @zarr_versions(param_name="zarr_store", indirect=True) class TestOpenVirtualDatasetZarr: def test_loadable_variables(self, zarr_store, loadable_variables=["time", "air"]): @@ -180,7 +181,6 @@ def test_unsupported_zarr_format(): get_strategy(mock_array) -@requires_pyarrow @zarr_versions() def test_empty_array_chunk_mapping(tmpdir, zarr_format): """Test chunk mapping for arrays with no chunks written yet.""" @@ -309,7 +309,6 @@ async def get_meta(): assert metadata.fill_value is not None -@requires_pyarrow def test_build_chunk_manifest_empty_with_shape(): """Test build_chunk_manifest when chunk_map is empty but array has shape and chunks.""" @@ -330,7 +329,6 @@ async def get_manifest(): assert manifest.shape_chunk_grid == (2, 2) -@requires_pyarrow @zarr_versions() def test_sparse_array_with_missing_chunks(tmpdir, zarr_format): """Test that arrays with some missing chunks (sparse arrays) are handled correctly.""" @@ -373,7 +371,6 @@ async def get_manifest(): assert manifest.shape_chunk_grid == (3, 3) -@requires_pyarrow @zarr_versions() def test_parser_roundtrip_matches_xarray(tmpdir, zarr_format): """Roundtrip a small dataset through the ZarrParser and compare with xarray.""" @@ -409,7 +406,6 @@ def test_parser_roundtrip_matches_xarray(tmpdir, zarr_format): xr.testing.assert_identical(actual, expected) -@requires_pyarrow @zarr_versions() def test_parser_scalar_roundtrip_matches_xarray(tmpdir, zarr_format): """Roundtrip a small dataset through the ZarrParser and compare with xarray.""" @@ -470,7 +466,6 @@ async def outer(): assert result == 42 -@requires_pyarrow @zarr_versions() def test_zarr_parser_works_inside_running_event_loop(tmpdir, zarr_format): """Test that ZarrParser.__call__ works inside a running event loop (notebook scenario).""" @@ -498,7 +493,6 @@ async def run_parser_in_loop(): xr.testing.assert_identical(actual, expected) -@requires_pyarrow def test_sharded_array_raises_error(tmpdir): """Test that attempting to virtualize a sharded Zarr V3 array raises NotImplementedError.""" filepath = f"{tmpdir}/test_sharded.zarr" From 716a0bb89d2a390dd4a2da6e0907ef6b542adc3c Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Mon, 9 Mar 2026 10:11:15 -0600 Subject: [PATCH 20/22] removed seperator normalization and added a method to get chunk seperator --- virtualizarr/parsers/zarr.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index 4d736c30..ca1ac28c 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -146,18 +146,16 @@ async def _build_chunk_mapping( if len(all_paths) == 0: return None - # normalize: strip prefix, replace / with . - stripped = pc.utf8_replace_slice( + stripped_keys = pc.utf8_replace_slice( all_paths, start=0, stop=len(prefix), replacement="" ) - normalized_keys = pc.replace_substring(stripped, pattern="/", replacement=".") # construct full paths full_paths = pc.binary_join_element_wise( pa.scalar(path.rstrip("/")), all_paths, "/" ) - return normalized_keys, full_paths, all_sizes + return stripped_keys, full_paths, all_sizes class ZarrVersionStrategy(ABC): @@ -180,6 +178,11 @@ def get_prefix(self, zarr_array: ZarrArrayType) -> str: """Get the storage prefix for chunk listing.""" ... + @abstractmethod + def get_separator(self, zarr_array: ZarrArrayType) -> str: + """Get the chunk key separator for the array.""" + ... + @abstractmethod def validate(self, zarr_array: ZarrArrayType) -> None: """Validate that the array can be virtualized.""" @@ -265,6 +268,10 @@ def get_prefix(self, zarr_array: ZarrArrayType) -> str: name = _get_array_name(zarr_array) return f"{name}/" if name else "" + def get_separator(self, zarr_array: ZarrArrayType) -> str: + # Default for v2 should be "." + return zarr_array.metadata.dimension_separator + def validate(self, zarr_array: ZarrArrayType) -> None: pass # no restrictions for V2 @@ -313,6 +320,10 @@ def get_prefix(self, zarr_array: ZarrArrayType) -> str: name = _get_array_name(zarr_array) return f"{name}/c/" if name else "c/" + def get_separator(self, zarr_array: ZarrArrayType) -> str: + # gets chunk separator. Default for v3 should be "/" + return zarr_array.metadata.chunk_key_encoding.separator + def validate(self, zarr_array: ZarrArrayType) -> None: from zarr.codecs import ShardingCodec @@ -391,11 +402,11 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan if result is None: return ChunkManifest({}, shape=chunk_grid_shape) - normalized_keys, full_paths, all_lengths = result + stripped_keys, full_paths, all_lengths = result total_size = zarr_array.nchunks - - split_keys = pc.split_pattern(normalized_keys, pattern=".") + separator = strategy.get_separator(zarr_array) + split_keys = pc.split_pattern(stripped_keys, pattern=separator) coords = [ pc.cast(pc.list_element(split_keys, dim), pa.int64()).to_numpy() for dim in range(zarr_array.ndim) From 5df7705a12e622025f73b5c018744311142f5713 Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Mon, 9 Mar 2026 10:14:06 -0600 Subject: [PATCH 21/22] de-dup pyproj --- pyproject.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b57d79a0..e7d23fdc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,8 +86,6 @@ icechunk = [ "icechunk>=1.1.2", ] -kerchunk = ["fastparquet"] -zarr = ["arro3-core", "pyarrow"] kerchunk = ["fastparquet", "pandas"] From 08232a84c98598dd0f0d7cb149c471e3f603903a Mon Sep 17 00:00:00 2001 From: norlandrhagen Date: Mon, 9 Mar 2026 10:30:50 -0600 Subject: [PATCH 22/22] mypy --- virtualizarr/parsers/zarr.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index ca1ac28c..41785559 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -11,6 +11,7 @@ import zarr from obspec_utils.registry import ObjectStoreRegistry from zarr.api.asynchronous import open_group as open_group_async +from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.group import GroupMetadata from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata from zarr.storage import ObjectStore @@ -179,9 +180,7 @@ def get_prefix(self, zarr_array: ZarrArrayType) -> str: ... @abstractmethod - def get_separator(self, zarr_array: ZarrArrayType) -> str: - """Get the chunk key separator for the array.""" - ... + def _get_separator(self, zarr_array: ZarrArrayType) -> str: ... @abstractmethod def validate(self, zarr_array: ZarrArrayType) -> None: @@ -268,9 +267,10 @@ def get_prefix(self, zarr_array: ZarrArrayType) -> str: name = _get_array_name(zarr_array) return f"{name}/" if name else "" - def get_separator(self, zarr_array: ZarrArrayType) -> str: - # Default for v2 should be "." - return zarr_array.metadata.dimension_separator + def _get_separator(self, zarr_array: ZarrArrayType) -> str: + from typing import cast + + return cast(ArrayV2Metadata, zarr_array.metadata).dimension_separator def validate(self, zarr_array: ZarrArrayType) -> None: pass # no restrictions for V2 @@ -320,9 +320,11 @@ def get_prefix(self, zarr_array: ZarrArrayType) -> str: name = _get_array_name(zarr_array) return f"{name}/c/" if name else "c/" - def get_separator(self, zarr_array: ZarrArrayType) -> str: - # gets chunk separator. Default for v3 should be "/" - return zarr_array.metadata.chunk_key_encoding.separator + def _get_separator(self, zarr_array: ZarrArrayType) -> str: + from typing import cast + + metadata = cast(ArrayV3Metadata, zarr_array.metadata) + return cast(DefaultChunkKeyEncoding, metadata.chunk_key_encoding).separator def validate(self, zarr_array: ZarrArrayType) -> None: from zarr.codecs import ShardingCodec @@ -405,7 +407,7 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan stripped_keys, full_paths, all_lengths = result total_size = zarr_array.nchunks - separator = strategy.get_separator(zarr_array) + separator = strategy._get_separator(zarr_array) split_keys = pc.split_pattern(stripped_keys, pattern=separator) coords = [ pc.cast(pc.list_element(split_keys, dim), pa.int64()).to_numpy()