Skip to content

Commit 1b72dcf

Browse files
Updates zarr-parser to use obstore list_async instead of concurrent_map (#892)
* updates zarr-parser to use obstore list_async instead of concurrent_map * removes the zarr vendor code * adds arro3-core to zarr group * adds _from_arrow method * adds type_checking for pa type hint + import in _from_arrow * extra import removed * adds zarr to test-py31* test group * Update virtualizarr/manifests/manifest.py Co-authored-by: Tom Nicholas <tom@earthmover.io> * updates _from_arrow method to have paths, offsets, lengths and opt[shape]. Moves all weird arrow reshaping into zarr:build_chunk_manifest * update releases.md * mypy * mypy-2 * update pyproj * adds new zarr parser deps and fix to accessor * fix double pyproj def * adds requires pyarrow decorator to the test_zarr so min deps are ok * add strange pyarrow pandas context override to more test_kerchunk.py tests * mypy again * incorporate feedback * removed separator normalization and added a method to get chunk separator * de-dup pyproj * mypy --------- Co-authored-by: Tom Nicholas <tom@earthmover.io>
1 parent bf30692 commit 1b72dcf

11 files changed

Lines changed: 271 additions & 133 deletions

File tree

docs/releases.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ This release moves the `ObjectStoreRegistry` to a separate package `obspec_utils
4141

4242
### New Features
4343

44+
- Improved `ZarrParser` performance.
45+
([#892](https://github.com/zarr-developers/VirtualiZarr/pull/892)).
46+
By [Raphael Hagen](https://github.com/norlandrhagen).
47+
4448
- Added `reader_factory` parameter to `HDFParser` to allow customizing how files are read
4549
([#844](https://github.com/zarr-developers/VirtualiZarr/pull/844)).
4650
By [Max Jones](https://github.com/maxrjones).

pyproject.toml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ hdf = [
5050
"imagecodecs-numcodecs==2024.6.1",
5151
]
5252

53+
zarr = ["arro3-core", "pyarrow"]
54+
5355
# kerchunk-based parsers
5456
netcdf3 = [
5557
"virtualizarr[remote]",
@@ -76,13 +78,14 @@ all_parsers = [
7678
"virtualizarr[fits]",
7779
"virtualizarr[kerchunk_parquet]",
7880
"virtualizarr[tiff]",
81+
"virtualizarr[zarr]"
7982
]
8083

8184
# writers
8285
icechunk = [
8386
"icechunk>=1.1.2",
8487
]
85-
zarr = ["arro3-core", "pyarrow"]
88+
8689

8790
kerchunk = ["fastparquet", "pandas"]
8891

@@ -203,14 +206,17 @@ run-tests-html-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov
203206
min-deps = ["dev", "test", "hdf", "hdf5-lib"] # VirtualiZarr/conftest.py using h5py, so the minimum set of dependencies for testing still includes hdf libs
204207
# Inherit from min-deps to get all the test commands, along with optional dependencies
205208
test = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"]
206-
test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py311"] # test against python 3.11
207-
test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py312"] # test against python 3.12
209+
test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "py311"] # test against python 3.11
210+
test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "py312"] # test against python 3.12
208211
minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "tiff", "py312", "minio"]
209212
minimum-versions = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "tiff", "hdf5-lib", "minimum-versions"]
210213
upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev", "py313"]
211214
all = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "all_parsers", "all_writers", "py313"]
212215
docs = ["docs", "dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"]
213216

217+
[tool.pixi.dependencies]
218+
pytest = "*"
219+
214220
# Define commands to run within the docs environment
215221
[tool.pixi.feature.docs.tasks]
216222
serve-docs = { cmd = "mkdocs serve" }

virtualizarr/manifests/manifest.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
import re
24
from collections.abc import (
35
Callable,
@@ -8,13 +10,16 @@
810
ValuesView,
911
)
1012
from pathlib import PosixPath
11-
from typing import Any, NewType, TypedDict, cast
13+
from typing import TYPE_CHECKING, Any, NewType, TypedDict, cast
1214

1315
import numpy as np
1416

1517
from virtualizarr.manifests.utils import construct_chunk_pattern, parse_manifest_index
1618
from virtualizarr.types import ChunkKey
1719

20+
if TYPE_CHECKING:
21+
import pyarrow as pa # type: ignore[import-untyped,import-not-found]
22+
1823
# doesn't guarantee that writers actually handle these
1924
VALID_URI_PREFIXES = {
2025
"s3://",
@@ -322,6 +327,55 @@ def from_arrays(
322327

323328
return obj
324329

330+
@classmethod
331+
def _from_arrow(
332+
cls,
333+
*,
334+
paths: "pa.StringArray",
335+
offsets: "pa.UInt64Array",
336+
lengths: "pa.UInt64Array",
337+
shape: tuple[int, ...],
338+
) -> "ChunkManifest":
339+
"""
340+
Create a ChunkManifest from flat 1D PyArrow arrays.
341+
342+
Avoids intermediate Python dicts by converting Arrow arrays directly
343+
to the numpy arrays used internally by ChunkManifest.
344+
345+
Parameters
346+
----------
347+
paths
348+
Full paths to chunks, as a PyArrow StringArray. Nulls represent missing chunks.
349+
offsets
350+
Byte offsets of chunks, as a PyArrow UInt64Array. Nulls represent missing chunks.
351+
lengths
352+
Byte lengths of chunks, as a PyArrow UInt64Array. Nulls represent missing chunks.
353+
shape
354+
Shape to reshape the flat arrays into.
355+
"""
356+
import pyarrow as pa # type: ignore[import-untyped,import-not-found]
357+
import pyarrow.compute as pc # type: ignore[import-untyped,import-not-found]
358+
359+
arrow_paths = pc.if_else(pc.is_null(paths), "", paths)
360+
arrow_offsets = pc.if_else(
361+
pc.is_null(offsets), pa.scalar(0, pa.uint64()), offsets
362+
)
363+
arrow_lengths = pc.if_else(
364+
pc.is_null(lengths), pa.scalar(0, pa.uint64()), lengths
365+
)
366+
367+
np_paths = arrow_paths.to_numpy(zero_copy_only=False).astype(
368+
np.dtypes.StringDType()
369+
)
370+
np_offsets = arrow_offsets.to_numpy(zero_copy_only=False)
371+
np_lengths = arrow_lengths.to_numpy(zero_copy_only=False)
372+
373+
return cls.from_arrays(
374+
paths=np_paths.reshape(shape),
375+
offsets=np_offsets.reshape(shape),
376+
lengths=np_lengths.reshape(shape),
377+
)
378+
325379
@property
326380
def ndim_chunk_grid(self) -> int:
327381
"""

0 commit comments

Comments
 (0)