diff --git a/conftest.py b/conftest.py index e1c13dd8e..3749a7c5e 100644 --- a/conftest.py +++ b/conftest.py @@ -256,8 +256,17 @@ def netcdf4_file_with_2d_coords(tmp_path: Path) -> str: def netcdf4_virtual_dataset(netcdf4_file): """Create a virtual dataset from a NetCDF4 file.""" from virtualizarr import open_virtual_dataset - - with open_virtual_dataset(netcdf4_file, loadable_variables=[]) as ds: + from virtualizarr.parsers import HDFParser + from virtualizarr.tests.utils import obstore_local + + store = obstore_local(file_url=netcdf4_file) + parser = HDFParser() + with open_virtual_dataset( + file_url=netcdf4_file, + object_store=store, + parser=parser, + loadable_variables=[], + ) as ds: yield ds diff --git a/docs/api.rst b/docs/api.rst index e700cd661..6e25b5bd4 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -13,7 +13,7 @@ User API Reading ------- -.. currentmodule:: virtualizarr.backend +.. currentmodule:: virtualizarr.xarray .. autosummary:: :nosignatures: :toctree: generated/ diff --git a/pyproject.toml b/pyproject.toml index cff9413ba..a8740a0f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,3 @@ - [project] name = "virtualizarr" description = "Create virtual Zarr stores from archival data using xarray API" @@ -28,7 +27,8 @@ dependencies = [ "numcodecs>=0.15.1", "ujson", "packaging", - "zarr>=3.0.2", + "zarr>=3.0.8", + "obstore>=0.5.1", ] # Dependency sets under optional-dependencies are available via PyPI @@ -40,25 +40,16 @@ remote = [ "aiohttp", "s3fs", ] -obstore = [ - "obstore>=0.5.1", -] -# non-kerchunk-based readers + +# non-kerchunk-based parsers hdf = [ - "virtualizarr[remote]", "h5py", "hdf5plugin", "imagecodecs", "imagecodecs-numcodecs==2024.6.1", - "obstore>=0.5.1", ] -# kerchunk-based readers -hdf5 = [ - "virtualizarr[remote]", - "kerchunk>=0.2.8", - "h5py", -] +# kerchunk-based parsers netcdf3 = [ "virtualizarr[remote]", "kerchunk>=0.2.8", @@ -69,11 +60,17 @@ fits = [ "kerchunk>=0.2.8", "astropy", ] -all_readers = [ 
+kerchunk_parquet = [ + "virtualizarr[remote]", + "fastparquet", +] + +# kerchunk +all_parsers = [ "virtualizarr[hdf]", - "virtualizarr[hdf5]", "virtualizarr[netcdf3]", "virtualizarr[fits]", + "virtualizarr[kerchunk_parquet]", ] # writers @@ -183,14 +180,14 @@ run-tests-html-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov # Define which features and groups to include in different pixi (similar to conda) environments) [tool.pixi.environments] -min-deps = ["dev", "test", "hdf", "hdf5", "hdf5-lib"] # VirtualiZarr/conftest.py using h5py, so the minimum set of dependencies for testing still includes hdf libs +min-deps = ["dev", "test", "hdf", "hdf5-lib"] # VirtualiZarr/conftest.py using h5py, so the minimum set of dependencies for testing still includes hdf libs # Inherit from min-deps to get all the test commands, along with optional dependencies -test = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore"] -test-py311 = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "py311"] # test against python 3.11 -test-py312 = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "py312"] # test against python 3.12 -minio = ["dev", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "py312", "minio"] -upstream = ["dev", "test", "hdf", "hdf5", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev"] -all = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "all_readers", "all_writers"] +test = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib"] +test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "py311"] # test against python 3.11 +test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", 
"fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "py312"] # test against python 3.12 +minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "py312", "minio"] +upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev"] +all = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk","kerchunk_parquet", "hdf5-lib", "all_parsers", "all_writers"] docs = ["docs"] # Define commands to run within the docs environment @@ -201,8 +198,8 @@ readthedocs = "rm -rf $READTHEDOCS_OUTPUT/html && cp -r docs/_build/html $READTH # Define commands to run within the docs environment [tool.pixi.feature.minio.tasks] -run-tests = { cmd = "pytest virtualizarr/tests/test_manifests/test_store.py virtualizarr/tests/test_readers/test_hdf/test_hdf_manifest_store.py --run-minio-tests --run-network-tests --verbose" } -run-tests-xml-cov = { cmd = "pytest virtualizarr/tests/test_manifests/test_store.py virtualizarr/tests/test_readers/test_hdf/test_hdf_manifest_store.py --run-minio-tests --run-network-tests --verbose --cov-report=xml" } +run-tests = { cmd = "pytest virtualizarr/tests/test_manifests/test_store.py virtualizarr/tests/test_parsers/test_hdf/test_hdf_manifest_store.py --run-minio-tests --run-network-tests --verbose" } +run-tests-xml-cov = { cmd = "pytest virtualizarr/tests/test_manifests/test_store.py virtualizarr/tests/test_parsers/test_hdf/test_hdf_manifest_store.py --run-minio-tests --run-network-tests --verbose --cov-report=xml" } [tool.setuptools_scm] fallback_version = "9999" diff --git a/virtualizarr/__init__.py b/virtualizarr/__init__.py index 290fdc0da..24f0a7581 100644 --- a/virtualizarr/__init__.py +++ b/virtualizarr/__init__.py @@ -4,8 +4,8 @@ VirtualiZarrDatasetAccessor, VirtualiZarrDataTreeAccessor, ) -from virtualizarr.backend import open_virtual_dataset, open_virtual_mfdataset from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.xarray import 
open_virtual_dataset, open_virtual_mfdataset try: __version__ = _version("virtualizarr") diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py deleted file mode 100644 index 4b9c69cd0..000000000 --- a/virtualizarr/backend.py +++ /dev/null @@ -1,407 +0,0 @@ -import os -import warnings -from collections.abc import Iterable, Mapping -from concurrent.futures import Executor -from enum import Enum, auto -from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Literal, - Optional, - Sequence, - cast, -) - -import xarray as xr -from xarray import DataArray, Dataset, Index, combine_by_coords -from xarray.backends.common import _find_absolute_paths -from xarray.core.types import NestedSequence -from xarray.structure.combine import _infer_concat_order_from_positions, _nested_combine - -from virtualizarr.parallel import get_executor -from virtualizarr.readers import ( - DMRPPVirtualBackend, - FITSVirtualBackend, - HDFVirtualBackend, - KerchunkVirtualBackend, - NetCDF3VirtualBackend, - TIFFVirtualBackend, - ZarrVirtualBackend, -) -from virtualizarr.readers.api import VirtualBackend -from virtualizarr.utils import _FsspecFSFromFilepath - -if TYPE_CHECKING: - from xarray.core.types import ( - CombineAttrsOptions, - CompatOptions, - JoinOptions, - ) - - -# TODO add entrypoint to allow external libraries to add to this mapping -VIRTUAL_BACKENDS = { - "kerchunk": KerchunkVirtualBackend, - "zarr": ZarrVirtualBackend, - "dmrpp": DMRPPVirtualBackend, - "hdf5": HDFVirtualBackend, - "netcdf4": HDFVirtualBackend, # note this is the same as for hdf5 - # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends) - "netcdf3": NetCDF3VirtualBackend, - "tiff": TIFFVirtualBackend, - "fits": FITSVirtualBackend, -} - - -class AutoName(Enum): - # Recommended by official Python docs for auto naming: - # https://docs.python.org/3/library/enum.html#using-automatic-values - def 
_generate_next_value_(name, start, count, last_values): - return name - - -class FileType(AutoName): - netcdf3 = auto() - netcdf4 = auto() # NOTE: netCDF4 is a subset of hdf5 - hdf4 = auto() - hdf5 = auto() - grib = auto() - tiff = auto() - fits = auto() - dmrpp = auto() - kerchunk = auto() - zarr = auto() - - -def automatically_determine_filetype( - *, - filepath: str, - reader_options: Optional[dict[str, Any]] = {}, -) -> FileType: - """ - Attempt to automatically infer the correct reader for this filetype. - - Uses magic bytes and file / directory suffixes. - """ - - # TODO this should ideally handle every filetype that we have a reader for, not just kerchunk - - # TODO how do we handle kerchunk json / parquet here? - if Path(filepath).suffix == ".zarr": - return FileType.zarr - - # Read magic bytes from local or remote file - fpath = _FsspecFSFromFilepath( - filepath=filepath, reader_options=reader_options - ).open_file() - magic_bytes = fpath.read(8) - fpath.close() - - if magic_bytes.startswith(b"CDF"): - filetype = FileType.netcdf3 - elif magic_bytes.startswith(b"\x0e\x03\x13\x01"): - raise NotImplementedError("HDF4 formatted files not supported") - elif magic_bytes.startswith(b"\x89HDF"): - filetype = FileType.hdf5 - elif magic_bytes.startswith(b"GRIB"): - filetype = FileType.grib - elif magic_bytes.startswith(b"II*"): - filetype = FileType.tiff - elif magic_bytes.startswith(b"SIMPLE"): - filetype = FileType.fits - else: - raise NotImplementedError( - f"Unrecognised file based on header bytes: {magic_bytes}" - ) - - return filetype - - -def open_virtual_dataset( - filepath: str, - *, - filetype: FileType | str | None = None, - group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - cftime_variables: Iterable[str] | None = None, - indexes: Mapping[str, Index] | None = None, - virtual_backend_kwargs: dict | None = None, - reader_options: dict | None = 
None, - backend: type[VirtualBackend] | None = None, -) -> Dataset: - """ - Open a file or store as an xarray.Dataset wrapping virtualized zarr arrays. - - Some variables can be opened as loadable lazy numpy arrays. This can be controlled explicitly using the ``loadable_variables`` keyword argument. - By default this will be the same variables which `xarray.open_dataset` would create indexes for: i.e. one-dimensional coordinate variables whose - name matches the name of their only dimension (also known as "dimension coordinates"). - Pandas indexes will also now be created by default for these loadable variables, but this can be controlled by passing a value for the ``indexes`` keyword argument. - To avoid creating any xarray indexes pass ``indexes={}``. - - Parameters - ---------- - filepath - File path to open as a set of virtualized zarr arrays. - filetype - Type of file to be opened. Used to determine which kerchunk file format backend to use. - Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'dmrpp', 'kerchunk'}. - If not provided will attempt to automatically infer the correct filetype from header bytes. - group - Path to the HDF5/netCDF4 group in the given file to open. Given as a str, supported by filetypes “netcdf4”, “hdf5”, and "dmrpp". - drop_variables - Variables in the file to drop before returning. - loadable_variables - Variables in the file to open as lazy numpy/dask arrays instead of instances of `ManifestArray`. - Default is to open all variables as virtual variables (i.e. as ManifestArrays). - decode_times - Bool that is passed into Xarray's open_dataset. Allows time to be decoded into a datetime object. - indexes - Indexes to use on the returned xarray Dataset. - Default is None, which will read any 1D coordinate data to create in-memory Pandas indexes. - To avoid creating any indexes, pass indexes={}. - virtual_backend_kwargs - Dictionary of keyword arguments passed down to this reader. 
Allows passing arguments specific to certain readers. - reader_options - Dict passed into Kerchunk file readers, to allow reading from remote filesystems. - Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments. - - Returns - ------- - vds - An xarray Dataset containing instances of virtual_array_cls for each variable, or normal lazily indexed arrays for each variable in loadable_variables. - """ - - if cftime_variables is not None: - # It seems like stacklevel=2 is req to surface this warning. - warnings.warn( - "cftime_variables is deprecated and will be ignored. Pass decode_times=True and loadable_variables=['time'] to decode time values to datetime objects.", - DeprecationWarning, - stacklevel=2, - ) - - if reader_options is None: - reader_options = {} - - if backend and filetype: - raise ValueError("Cannot pass both a filetype and an explicit VirtualBackend") - - if filetype is None: - filetype = automatically_determine_filetype( - filepath=filepath, reader_options=reader_options - ) - elif isinstance(filetype, str): - # if filetype is a user defined string, convert to FileType - filetype = FileType(filetype.lower()) - elif not isinstance(filetype, FileType): - raise ValueError("Filetype must be a valid string or FileType") - - if backend: - backend_cls = backend - else: - backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower()) # type: ignore - - if backend_cls is None: - raise NotImplementedError(f"Unsupported file type: {filetype.name}") - - vds = backend_cls.open_virtual_dataset( - filepath, - group=group, - drop_variables=drop_variables, - loadable_variables=loadable_variables, - decode_times=decode_times, - indexes=indexes, - virtual_backend_kwargs=virtual_backend_kwargs, - reader_options=reader_options, - ) - - return vds - - -def open_virtual_mfdataset( - paths: str - | os.PathLike - | Sequence[str | os.PathLike] - | "NestedSequence[str | os.PathLike]", - concat_dim: ( - str - | 
DataArray - | Index - | Sequence[str] - | Sequence[DataArray] - | Sequence[Index] - | None - ) = None, - compat: "CompatOptions" = "no_conflicts", - preprocess: Callable[[Dataset], Dataset] | None = None, - data_vars: Literal["all", "minimal", "different"] | list[str] = "all", - coords="different", - combine: Literal["by_coords", "nested"] = "by_coords", - parallel: Literal["dask", "lithops", False] | Executor = False, - join: "JoinOptions" = "outer", - attrs_file: str | os.PathLike | None = None, - combine_attrs: "CombineAttrsOptions" = "override", - **kwargs, -) -> Dataset: - """ - Open multiple files as a single virtual dataset. - - If combine='by_coords' then the function ``combine_by_coords`` is used to combine - the datasets into one before returning the result, and if combine='nested' then - ``combine_nested`` is used. The filepaths must be structured according to which - combining function is used, the details of which are given in the documentation for - ``combine_by_coords`` and ``combine_nested``. By default ``combine='by_coords'`` - will be used. Global attributes from the ``attrs_file`` are used - for the combined dataset. - - Parameters - ---------- - paths - Same as in xarray.open_mfdataset - concat_dim - Same as in xarray.open_mfdataset - compat - Same as in xarray.open_mfdataset - preprocess - Same as in xarray.open_mfdataset - data_vars - Same as in xarray.open_mfdataset - coords - Same as in xarray.open_mfdataset - combine - Same as in xarray.open_mfdataset - parallel : "dask", "lithops", False, or instance of a subclass of ``concurrent.futures.Executor`` - Specify whether the open and preprocess steps of this function will be - performed in parallel using lithops, dask.delayed, or any executor compatible - with the ``concurrent.futures`` interface, or in serial. - Default is False, which will execute these steps in serial. 
- join - Same as in xarray.open_mfdataset - attrs_file - Same as in xarray.open_mfdataset - combine_attrs - Same as in xarray.open_mfdataset - **kwargs : optional - Additional arguments passed on to :py:func:`virtualizarr.open_virtual_dataset`. For an - overview of some of the possible options, see the documentation of - :py:func:`virtualizarr.open_virtual_dataset`. - - Returns - ------- - xarray.Dataset - - Notes - ----- - The results of opening each virtual dataset in parallel are sent back to the client process, so must not be too large. - """ - - # TODO this is practically all just copied from xarray.open_mfdataset - an argument for writing a virtualizarr engine for xarray? - - # TODO list kwargs passed to open_virtual_dataset explicitly in docstring? - - paths = cast(NestedSequence[str], _find_absolute_paths(paths)) - - if not paths: - raise OSError("no files to open") - - paths1d: list[str] - if combine == "nested": - if isinstance(concat_dim, str | DataArray) or concat_dim is None: - concat_dim = [concat_dim] # type: ignore[assignment] - - # This creates a flat list which is easier to iterate over, whilst - # encoding the originally-supplied structure as "ids". - # The "ids" are not used at all if combine='by_coords`. - combined_ids_paths = _infer_concat_order_from_positions(paths) - ids, paths1d = ( - list(combined_ids_paths.keys()), - list(combined_ids_paths.values()), - ) - elif concat_dim is not None: - raise ValueError( - "When combine='by_coords', passing a value for `concat_dim` has no " - "effect. 
To manually combine along a specific dimension you should " - "instead specify combine='nested' along with a value for `concat_dim`.", - ) - else: - paths1d = paths # type: ignore[assignment] - - # TODO this refactored preprocess and executor logic should be upstreamed into xarray - see https://github.com/pydata/xarray/pull/9932 - - if preprocess: - # TODO we could reexpress these using functools.partial but then we would hit this lithops bug: https://github.com/lithops-cloud/lithops/issues/1428 - - def _open_and_preprocess(path: str) -> xr.Dataset: - ds = open_virtual_dataset(path, **kwargs) - return preprocess(ds) - - open_func = _open_and_preprocess - else: - - def _open(path: str) -> xr.Dataset: - return open_virtual_dataset(path, **kwargs) - - open_func = _open - - executor = get_executor(parallel=parallel) - with executor() as exec: - # wait for all the workers to finish, and send their resulting virtual datasets back to the client for concatenation there - virtual_datasets = list( - exec.map( - open_func, - paths1d, - ) - ) - - # TODO add file closers - - # Combine all datasets, closing them in case of a ValueError - try: - if combine == "nested": - # Combined nested list by successive concat and merge operations - # along each dimension, using structure given by "ids" - combined_vds = _nested_combine( - virtual_datasets, - concat_dims=concat_dim, - compat=compat, - data_vars=data_vars, - coords=coords, - ids=ids, - join=join, - combine_attrs=combine_attrs, - ) - elif combine == "by_coords": - # Redo ordering from coordinates, ignoring how they were ordered - # previously - combined_vds = combine_by_coords( - virtual_datasets, - compat=compat, - data_vars=data_vars, - coords=coords, - join=join, - combine_attrs=combine_attrs, - ) - else: - raise ValueError( - f"{combine} is an invalid option for the keyword argument ``combine``" - ) - except ValueError: - for vds in virtual_datasets: - vds.close() - raise - - # 
combined_vds.set_close(partial(_multi_file_closer, closers)) - - # read global attributes from the attrs_file or from the first dataset - if attrs_file is not None: - if isinstance(attrs_file, os.PathLike): - attrs_file = cast(str, os.fspath(attrs_file)) - combined_vds.attrs = virtual_datasets[paths1d.index(attrs_file)].attrs - - # TODO should we just immediately close everything? - # TODO If loadable_variables is eager then we should have already read everything we're ever going to read into memory at this point - - return combined_vds diff --git a/virtualizarr/manifests/store.py b/virtualizarr/manifests/store.py index 6331618f1..7dd6c5a4e 100644 --- a/virtualizarr/manifests/store.py +++ b/virtualizarr/manifests/store.py @@ -1,9 +1,9 @@ from __future__ import annotations import pickle -from collections.abc import AsyncGenerator, Iterable +from collections.abc import AsyncGenerator, Iterable, Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, TypeAlias from urllib.parse import urlparse from zarr.abc.store import ( @@ -13,23 +13,21 @@ Store, SuffixByteRequest, ) -from zarr.core.buffer import Buffer, default_buffer_prototype -from zarr.core.buffer.core import BufferPrototype +from zarr.core.buffer import Buffer, BufferPrototype, default_buffer_prototype +from zarr.core.common import BytesLike -from virtualizarr.manifests.array import ManifestArray from virtualizarr.manifests.group import ManifestGroup from virtualizarr.vendor.zarr.core.metadata import dict_to_buffer if TYPE_CHECKING: - from collections.abc import AsyncGenerator, Iterable, Mapping - from typing import Any - - import xarray as xr from obstore.store import ( ObjectStore, # type: ignore[import-not-found] ) - from zarr.core.buffer import BufferPrototype - from zarr.core.common import BytesLike + + StoreDict: TypeAlias = dict[str, ObjectStore] + + import xarray as xr + __all__ = ["ManifestStore"] @@ -51,23 +49,12 @@ class StoreRequest: 
"""The key within the store to request.""" -async def list_dir_from_manifest_arrays( - arrays: Mapping[str, ManifestArray], prefix: str -) -> AsyncGenerator[str]: - """Create the expected results for Zarr's `store.list_dir()` from an Xarray DataArrray or Dataset - - Parameters - ---------- - arrays : Mapping[str, ManifestArrays] - prefix : str - - Returns - ------- - AsyncIterator[str] +def get_store_prefix(url: str) -> str: """ - # TODO shouldn't this just accept a ManifestGroup instead? - # Start with expected group level metadata - raise NotImplementedError + Get a logical prefix to use for a url in an ObjectStoreRegistry + """ + scheme, netloc, *_ = urlparse(url) + return "" if scheme in {"", "file"} else f"{scheme}://{netloc}" def get_zarr_metadata(manifest_group: ManifestGroup, key: str) -> Buffer: @@ -109,12 +96,20 @@ def parse_manifest_index(key: str, chunk_key_encoding: str = ".") -> tuple[int, Parameters ---------- - key : str - chunk_key_encoding : str + key + The key in the Zarr store to parse. + chunk_key_encoding + The chunk key separator used in the Zarr store. Returns ------- - tuple containing chunk indexes + tuple containing chunk indexes. + + Raises + ------ + NotImplementedError + Raised if the key ends with "c", indicating a scalar array, which is not yet supported. 
+ """ if key.endswith("c"): # Scalar arrays hold the data in the "c" key @@ -127,40 +122,6 @@ def parse_manifest_index(key: str, chunk_key_encoding: str = ".") -> tuple[int, return tuple(int(ind) for ind in parts[1].split(chunk_key_encoding)) -def _find_bucket_region(bucket_name: str) -> str: - import requests - - resp = requests.head(f"https://{bucket_name}.s3.amazonaws.com") - region = resp.headers.get("x-amz-bucket-region") - if not region: - raise ValueError( - f"Unable to automatically determine region for bucket {bucket_name}" - ) - return region - - -def default_object_store(filepath: str) -> ObjectStore: - import obstore as obs - - parsed = urlparse(filepath) - - if parsed.scheme in ["", "file"]: - return obs.store.LocalStore() - if parsed.scheme == "s3": - bucket = parsed.netloc - return obs.store.S3Store( - bucket=bucket, - client_options={"allow_http": True}, - skip_signature=True, - virtual_hosted_style_request=False, - region=_find_bucket_region(bucket), - ) - if parsed.scheme in ["http", "https"]: - base_url = f"{parsed.scheme}://{parsed.netloc}" - return obs.store.HTTPStore.from_url(base_url) - raise NotImplementedError(f"{parsed.scheme} is not yet supported") - - class ObjectStoreRegistry: """ ObjectStoreRegistry maps the URL scheme and netloc to ObjectStore instances. This register allows @@ -177,49 +138,53 @@ def __init__(self, stores: dict[str, ObjectStore] | None = None): raise TypeError(f"expected ObjectStore class, got {store!r}") self._stores = stores - def register_store(self, url: str, store: ObjectStore): + def register_store(self, prefix: str, store: ObjectStore): """ - Register a store using the given url + Register a store using the given prefix If a store with the same key existed before, it is replaced + + Parameters + ---------- + prefix + A url to identify the appropriate object_store instance. If the url is contained in the + prefix of multiple stores in the registry, the store with the longer prefix is chosen. 
""" - parsed = urlparse(url) - scheme = parsed.scheme or "file" - self._stores[f"{scheme}://{parsed.netloc}"] = store + self._stores[prefix] = store def get_store(self, url: str) -> ObjectStore: """ - Get a suitable store for the provided URL. For example: - - - URL with scheme file:/// or no scheme will return the default LocalFS store - - URL with scheme s3://bucket/ will return the S3 store - - If no `ObjectStore` is found for the `url`, ad-hoc discovery may be executed depending on the - `url`. An `ObjectStore` may be lazily created and registered. + Get a registered store for the provided URL. Parameters ---------- url - A url to identify the appropriate object_store instance based on the URL scheme and netloc. + A url to identify the appropriate object_store instance. If the url is contained in the + prefix of multiple stores in the registry, the store with the longest prefix is chosen. Returns ------- ObjectStore + + Raises + ------ + ValueError + If no store is registered for the provided URL or its prefixes. """ - parsed = urlparse(url) - store = self._stores.get(f"{parsed.scheme}://{parsed.netloc}") - if not store: - store = default_object_store(url) - self.register_store(url, store) - return store + prefixes = filter(url.startswith, self._stores) + + if (longest_prefix := max(prefixes, default=None, key=len)) is None: + raise ValueError(f"No store registered for any prefix of {url!r}") + + return self._stores[longest_prefix] class ManifestStore(Store): """ - A read-only Zarr store that uses obstore to access data on AWS, GCP, Azure. The requests - from the Zarr API are redirected using the :class:`virtualizarr.manifests.ManifestGroup` containing - multiple :class:`virtualizarr.manifests.ManifestArray`, - allowing for virtually interfacing with underlying data in other file format. + A read-only Zarr store that uses obstore to read data from inside arbitrary files on AWS, GCP, Azure, or a local filesystem. 
+ + The requests from the Zarr API are redirected using the :class:`virtualizarr.manifests.ManifestGroup` containing + multiple :class:`virtualizarr.manifests.ManifestArray`, allowing for virtually interfacing with underlying data in other file formats. Parameters ---------- @@ -307,16 +272,22 @@ async def get( path = manifest._paths[*chunk_indexes] offset = manifest._offsets[*chunk_indexes] length = manifest._lengths[*chunk_indexes] - + # Get the configured object store instance that matches the path + store = self._store_registry.get_store(path) + if not store: + raise ValueError( + f"Could not find a store to use for {path} in the store registry" + ) # Truncate path to match Obstore expectations key = urlparse(path).path + if hasattr(store, "prefix") and store.prefix: + # strip the prefix from key + key = key.removeprefix(str(store.prefix)) # Transform the input byte range to account for the chunk location in the file chunk_end_exclusive = offset + length byte_range = _transform_byte_range( byte_range, chunk_start=offset, chunk_end_exclusive=chunk_end_exclusive ) - # Get the configured object store instance that matches the path - store = self._store_registry.get_store(path) # Actually get the bytes try: bytes = await store.get_range_async( @@ -418,6 +389,11 @@ def to_virtual_dataset( from virtualizarr.xarray import construct_virtual_dataset + if loadable_variables and self._store_registry._stores is None: + raise ValueError( + f"ManifestStore contains an empty store registry, but {loadable_variables} were provided as loadable variables. Must provide an ObjectStore instance in order to load variables." 
+ ) + return construct_virtual_dataset( manifest_store=self, group=group, diff --git a/virtualizarr/parallel.py b/virtualizarr/parallel.py index e5c736057..dcaf6c993 100644 --- a/virtualizarr/parallel.py +++ b/virtualizarr/parallel.py @@ -13,7 +13,7 @@ def get_executor( - parallel: Literal["dask", "lithops"] | Executor | Literal[False], + parallel: Literal["dask", "lithops"] | type[Executor] | Literal[False], ) -> type[Executor]: """Get an executor that follows the concurrent.futures.Executor ABC API.""" @@ -38,7 +38,7 @@ class SerialExecutor(Executor): concurrent.futures.Executor interface. Useful as a default and for debugging. """ - def __init__(self): + def __init__(self) -> None: # Track submitted futures to maintain interface compatibility self._futures: list[Future] = [] @@ -124,7 +124,7 @@ class DaskDelayedExecutor(Executor): This executor mimics the concurrent.futures.Executor interface but uses Dask's delayed computation model. """ - def __init__(self): + def __init__(self) -> None: """Initialize the Dask Delayed Executor.""" # Track submitted futures @@ -224,7 +224,7 @@ class LithopsEagerFunctionExecutor(Executor): Only required because lithops doesn't follow the concurrent.futures.Executor API, see https://github.com/lithops-cloud/lithops/issues/1427. 
""" - def __init__(self, **kwargs): + def __init__(self, **kwargs) -> None: import lithops # type: ignore[import-untyped] # Create Lithops client with optional configuration diff --git a/virtualizarr/parsers/__init__.py b/virtualizarr/parsers/__init__.py new file mode 100644 index 000000000..ae9ba493d --- /dev/null +++ b/virtualizarr/parsers/__init__.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from typing import Protocol, runtime_checkable + +from obstore.store import ObjectStore + +from virtualizarr.manifests import ManifestStore +from virtualizarr.parsers.dmrpp import DMRPPParser +from virtualizarr.parsers.fits import FITSParser +from virtualizarr.parsers.hdf.hdf import HDFParser +from virtualizarr.parsers.kerchunk_json import KerchunkJSONParser +from virtualizarr.parsers.kerchunk_parquet import KerchunkParquetParser +from virtualizarr.parsers.netcdf3 import NetCDF3Parser +from virtualizarr.parsers.zarr import ZarrParser + +__all__ = [ + "DMRPPParser", + "FITSParser", + "HDFParser", + "NetCDF3Parser", + "KerchunkJSONParser", + "KerchunkParquetParser", + "ZarrParser", +] + + +@runtime_checkable +class Parser(Protocol): + def __call__( + self, + file_url: str, + object_store: ObjectStore, + ) -> ManifestStore: ... + + """ + Parse the contents of a given file to produce a ManifestStore. + + Effectively maps the contents of the file (e.g. metadata, compression codecs, chunk byte offsets) to the Zarr data model. + + Parameters + ---------- + file_url + The URI or path to the input file (e.g., "s3://bucket/file.nc"). + object_store + An obstore ObjectStore instance for accessing the file specified in the `file_url` parameter. + + Returns + ------- + ManifestStore + A ManifestStore which provides a Zarr representation of the parsed file. 
+ """ diff --git a/virtualizarr/readers/dmrpp.py b/virtualizarr/parsers/dmrpp.py similarity index 67% rename from virtualizarr/readers/dmrpp.py rename to virtualizarr/parsers/dmrpp.py index fa5a1e702..3bdeffc1b 100644 --- a/virtualizarr/readers/dmrpp.py +++ b/virtualizarr/parsers/dmrpp.py @@ -1,73 +1,79 @@ +import io import warnings -from collections.abc import Mapping from pathlib import Path -from typing import Any, Hashable, Iterable, Optional +from typing import Any, Iterable from xml.etree import ElementTree as ET import numpy as np -from xarray import Coordinates, Dataset, Index, Variable - -from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri +from obstore.store import ObjectStore + +from virtualizarr.manifests import ( + ChunkManifest, + ManifestArray, + ManifestGroup, + ManifestStore, +) +from virtualizarr.manifests.store import ObjectStoreRegistry, get_store_prefix from virtualizarr.manifests.utils import create_v3_array_metadata -from virtualizarr.readers.api import VirtualBackend +from virtualizarr.parsers.utils import encode_cf_fill_value from virtualizarr.types import ChunkKey -from virtualizarr.utils import _FsspecFSFromFilepath +from virtualizarr.utils import ObstoreReader -class DMRPPVirtualBackend(VirtualBackend): - @staticmethod - def open_virtual_dataset( - filepath: str, +class DMRPPParser: + def __init__( + self, group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - indexes: Mapping[str, Index] | None = None, - virtual_backend_kwargs: Optional[dict] = None, - reader_options: Optional[dict] = None, - ) -> Dataset: - if virtual_backend_kwargs: - raise NotImplementedError( - "DMR++ reader does not understand any virtual_backend_kwargs" - ) + skip_variables: Iterable[str] | None = None, + ): + """ + Instantiate a parser with parser-specific parameters 
that can be used in the __call__ method. - _drop_vars: list[Hashable] = ( - [] if drop_variables is None else list(drop_variables) - ) + Parameters + ---------- + group + The group within the file to be used as the Zarr root group for the ManifestStore. + skip_variables + Variables in the file that will be ignored when creating the ManifestStore. + """ - # TODO: whilst this keeps backwards-compatible behaviour for the `loadable_variables` kwarg, - # it probably has to change, see https://github.com/zarr-developers/VirtualiZarr/pull/477/#issuecomment-2744448626 - if loadable_variables is None or indexes is None: - warnings.warn( - "The default value of the `loadable_variables` kwarg may attempt to load data from the referenced virtual chunks." - "As this is unlikely to be the desired behaviour when opening a DMR++ file, `loadable_variables` has been overridden, and set to `loadable_variables=[]`." - "To silence this warning pass `loadable_variables` explicitly.", - UserWarning, - ) - loadable_variables = [] - indexes = {} + self.group = group + self.skip_variables = skip_variables - if loadable_variables != [] or decode_times or indexes is None: - raise NotImplementedError( - "Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files." - ) + def __call__( + self, + file_url: str, + object_store: ObjectStore, + ) -> ManifestStore: + """ + Parse the metadata and byte offsets from a given file to product a + VirtualiZarr ManifestStore. - filepath = validate_and_normalize_path_to_uri( - filepath, fs_root=Path.cwd().as_uri() - ) + Parameters + ---------- + file_url + The URI or path to the input file (e.g., "s3://bucket/file.dmrpp"). + object_store + An obstore ObjectStore instance for accessing the file specified in the `file_url` parameter. 
- fpath = _FsspecFSFromFilepath( - filepath=filepath, reader_options=reader_options - ).open_file() + Returns + ------- + ManifestStore + A ManifestStore that provides a Zarr representation of the parsed file. + """ + reader = ObstoreReader(store=object_store, path=file_url) + file_bytes = reader.readall() + stream = io.BytesIO(file_bytes) parser = DMRParser( - root=ET.parse(fpath).getroot(), - data_filepath=filepath.removesuffix(".dmrpp"), + root=ET.parse(stream).getroot(), + data_filepath=file_url.removesuffix(".dmrpp"), + skip_variables=self.skip_variables, ) - vds = parser.parse_dataset(group=group, indexes=indexes) - - return vds.drop_vars(_drop_vars) + manifest_store = parser.parse_dataset( + object_store=object_store, group=self.group + ) + return manifest_store class DMRParser: @@ -105,11 +111,16 @@ class DMRParser: # Default zlib compression value _DEFAULT_ZLIB_VALUE = 6 # Encoding keys that should be removed from attributes and placed in xarray encoding dict - _ENCODING_KEYS = {"_FillValue", "missing_value", "scale_factor", "add_offset"} + # _ENCODING_KEYS = {"_FillValue", "missing_value", "scale_factor", "add_offset"} root: ET.Element data_filepath: str - def __init__(self, root: ET.Element, data_filepath: Optional[str] = None): + def __init__( + self, + root: ET.Element, + data_filepath: str | None = None, + skip_variables: Iterable[str] | None = None, + ): """ Initialize the DMRParser with the given DMR++ file contents and source data file path. @@ -125,74 +136,68 @@ def __init__(self, root: ET.Element, data_filepath: Optional[str] = None): self.data_filepath = ( data_filepath if data_filepath is not None else self.root.attrib["name"] ) + self.skip_variables = skip_variables or () - def parse_dataset(self, group=None, indexes: Mapping[str, Index] = {}) -> Dataset: + def parse_dataset( + self, + object_store: ObjectStore, + group: str | None = None, + ) -> ManifestStore: """ - Parses the given file and creates a virtual xr.Dataset with ManifestArrays. 
+ Parses the given file and creates a ManifestStore. Parameters ---------- - group : str - The group to parse. If None, and no groups are present, the dataset is parsed. - If None and groups are present, the first group is parsed. - - indexes : Mapping[str, Index], default is {} - Indexes to use on the returned xarray Dataset. - Default is {} which will avoid creating any indexes + group + The group to parse. Ignored if no groups are present, and the entire + dataset is parsed. If `None` or "/", and groups are present, the first group + is parsed. If not `None` or "/", and no groups are present, a UserWarning + is issued indicating that the group will be ignored. Returns ------- - An xr.Dataset wrapping virtualized zarr arrays. + ManifestStore Examples -------- Open a sample DMR++ file and parse the dataset - - >>> import requests - >>> r = requests.get("https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5.dmrpp") - >>> parser = DMRParser(r.text) - >>> vds = parser.parse_dataset() - >>> vds - Size: 4MB - Dimensions: (phony_dim_0: 100, phony_dim_1: 100, phony_dim_2: 100) - Dimensions without coordinates: phony_dim_0, phony_dim_1, phony_dim_2 - Data variables: - d_8_chunks (phony_dim_0, phony_dim_1, phony_dim_2) float32 4MB ManifestA... - - >>> vds2 = open_virtual_dataset("https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5.dmrpp", filetype="dmrpp") - >>> vds2 - Size: 4MB - Dimensions: (phony_dim_0: 100, phony_dim_1: 100, phony_dim_2: 100) - Dimensions without coordinates: phony_dim_0, phony_dim_1, phony_dim_2 - Data variables: - d_8_chunks (phony_dim_0, phony_dim_1, phony_dim_2) float32 4MB ManifestA... 
""" - group_tags = self.root.findall("dap:Group", self._NS) - if group is not None: - group = Path(group) - if not group.is_absolute(): - group = Path("/") / group - if len(group_tags) == 0: - warnings.warn("No groups found in DMR++ file; ignoring group parameter") - else: - all_groups = self._split_groups(self.root) - if group in all_groups: - return self._parse_dataset(all_groups[group], indexes) - else: - raise ValueError(f"Group {group} not found in DMR++ file") - return self._parse_dataset(self.root, indexes) + group = group or "/" + ngroups = len(self.root.findall("dap:Group", self._NS)) + + if ngroups == 0 and group != "/": + warnings.warn( + f"No groups in DMR++ file {self.data_filepath!r}; " + f"ignoring group parameter {group!r}" + ) + + group_path = Path("/") if ngroups == 0 else Path("/") / group.removeprefix("/") + dataset_element = self._split_groups(self.root).get(group_path) + + if dataset_element is None: + raise ValueError( + f"Group {group_path} not found in DMR++ file {self.data_filepath!r}" + ) + + manifest_group = self._parse_dataset(dataset_element) + registry = ObjectStoreRegistry( + {get_store_prefix(self.data_filepath): object_store} + ) + + return ManifestStore(store_registry=registry, group=manifest_group) def find_node_fqn(self, fqn: str) -> ET.Element: """ Find the element in the root element by converting the fully qualified name to an xpath query. E.g. fqn = "/a/b" --> root.find("./*[@name='a']/*[@name='b']") + See more about OPeNDAP fully qualified names (FQN) here: https://docs.opendap.org/index.php/DAP4:_Specification_Volume_1#Fully_Qualified_Names Parameters ---------- - fqn : str - The fully qualified name of an element. E.g. "/a/b" + fqn + The fully qualified name of an element. For example, "/a/b". 
Returns ------- @@ -206,12 +211,14 @@ def find_node_fqn(self, fqn: str) -> ET.Element: """ if fqn == "/": return self.root + elements = fqn.strip("/").split("/") # /a/b/ --> ['a', 'b'] xpath_segments = [f"*[@name='{element}']" for element in elements] - xpath_query = "./" + "/".join(xpath_segments) # "./[*[@name='a']/*[@name='b']" - element = self.root.find(xpath_query, self._NS) - if element is None: + xpath_query = "/".join([".", *xpath_segments]) # "./[*[@name='a']/*[@name='b']" + + if (element := self.root.find(xpath_query, self._NS)) is None: raise ValueError(f"Path {fqn} not found in provided root") + return element def _split_groups(self, root: ET.Element) -> dict[Path, ET.Element]: @@ -253,8 +260,9 @@ def _split_groups_recursive( return group_dict def _parse_dataset( - self, root: ET.Element, indexes: Mapping[str, Index] = {} - ) -> Dataset: + self, + root: ET.Element, + ) -> ManifestGroup: """ Parse the dataset using the root element of the DMR++ file. @@ -265,33 +273,15 @@ def _parse_dataset( Returns ------- - xr.Dataset + ManifestGroup """ - # Dimension names and sizes - dims: dict[str, int] = {} - dimension_tags = self._find_dimension_tags(root) - for dim in dimension_tags: - dims.update(self._parse_dim(dim)) - # Data variables and coordinates - coord_names: set[str] = set() - coord_tags = root.findall( - ".//dap:Attribute[@name='coordinates']/dap:Value", self._NS - ) - for c in coord_tags: - if c.text is not None: - coord_names.update(c.text.split(" ")) - # Separate and parse coords + data variables - coord_vars: dict[str, Variable] = {} - data_vars: dict[str, Variable] = {} + + manifest_dict: dict[str, ManifestArray] = {} for var_tag in self._find_var_tags(root): - variable = self._parse_variable(var_tag) - # Either coordinates are explicitly defined or 1d variable with same name as dimension is a coordinate - if var_tag.attrib["name"] in coord_names or ( - len(variable.dims) == 1 and variable.dims[0] == var_tag.attrib["name"] - ): - 
coord_vars[var_tag.attrib["name"]] = variable - else: - data_vars[var_tag.attrib["name"]] = variable + if var_tag.attrib["name"] not in self.skip_variables: + variable = self._parse_variable(var_tag) + manifest_dict[var_tag.attrib["name"]] = variable + # Attributes attrs: dict[str, str] = {} # Look for an attribute tag called "HDF5_GLOBAL" and unpack it @@ -302,10 +292,10 @@ def _parse_dataset( root.extend(hdf5_global_attrs) for attr_tag in root.iterfind("dap:Attribute", self._NS): attrs.update(self._parse_attribute(attr_tag)) - return Dataset( - data_vars=data_vars, - coords=Coordinates(coords=coord_vars, indexes=indexes), - attrs=attrs, + + return ManifestGroup( + arrays=manifest_dict, + attributes=attrs, ) def _find_var_tags(self, root: ET.Element) -> list[ET.Element]: @@ -376,7 +366,7 @@ def _find_dimension_tags(self, root: ET.Element) -> list[ET.Element]: dimension_tags.append(dimension_tag) return dimension_tags - def _parse_variable(self, var_tag: ET.Element) -> Variable: + def _parse_variable(self, var_tag: ET.Element) -> ManifestArray: """ Parse a variable from a DMR++ tag. @@ -387,7 +377,7 @@ def _parse_variable(self, var_tag: ET.Element) -> Variable: Returns ------- - xr.Variable + ManifestArray """ # Dimension info @@ -405,6 +395,7 @@ def _parse_variable(self, var_tag: ET.Element) -> Variable: shape: tuple[int, ...] 
= tuple(dims.values()) chunks_shape = shape chunks_tag = var_tag.find("dmrpp:chunks", self._NS) + array_fill_value = np.array(0).astype(dtype)[()] if chunks_tag is not None: # Chunks chunk_dim_text = chunks_tag.findtext( @@ -415,6 +406,9 @@ def _parse_variable(self, var_tag: ET.Element) -> Variable: chunks_shape = tuple(map(int, chunk_dim_text.split())) else: chunks_shape = shape + if "fillValue" in chunks_tag.attrib: + fillValue_attrib = chunks_tag.attrib["fillValue"] + array_fill_value = np.array(fillValue_attrib).astype(dtype)[()] chunkmanifest = self._parse_chunks(chunks_tag, chunks_shape) # Filters codecs = self._parse_filters(chunks_tag, dtype) @@ -422,19 +416,20 @@ def _parse_variable(self, var_tag: ET.Element) -> Variable: attrs: dict[str, Any] = {} for attr_tag in var_tag.iterfind("dap:Attribute", self._NS): attrs.update(self._parse_attribute(attr_tag)) - # Fill value is placed in zarr array's fill_value and variable encoding and removed from attributes - encoding = {k: attrs.get(k) for k in self._ENCODING_KEYS if k in attrs} - fill_value = attrs.pop("_FillValue", None) - # create ManifestArray + if "_FillValue" in attrs: + encoded_cf_fill_value = encode_cf_fill_value(attrs["_FillValue"], dtype) + attrs["_FillValue"] = encoded_cf_fill_value + metadata = create_v3_array_metadata( shape=shape, data_type=dtype, chunk_shape=chunks_shape, - fill_value=fill_value, codecs=codecs, + dimension_names=dims, + attributes=attrs, + fill_value=array_fill_value, ) - marr = ManifestArray(metadata=metadata, chunkmanifest=chunkmanifest) - return Variable(dims=dims.keys(), data=marr, attrs=attrs, encoding=encoding) + return ManifestArray(metadata=metadata, chunkmanifest=chunkmanifest) def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]: """ diff --git a/virtualizarr/parsers/fits.py b/virtualizarr/parsers/fits.py new file mode 100644 index 000000000..4304267dd --- /dev/null +++ b/virtualizarr/parsers/fits.py @@ -0,0 +1,74 @@ +from pathlib import Path +from 
typing import Iterable, Optional + +from obstore.store import ObjectStore + +from virtualizarr.manifests import ManifestStore +from virtualizarr.manifests.store import ObjectStoreRegistry, get_store_prefix +from virtualizarr.translators.kerchunk import manifestgroup_from_kerchunk_refs +from virtualizarr.types.kerchunk import KerchunkStoreRefs + + +class FITSParser: + def __init__( + self, + group: str | None = None, + skip_variables: Iterable[str] | None = None, + reader_options: Optional[dict] = None, + ): + """ + Instantiate a parser with parser-specific parameters that can be used in the + `__call__` method. + + Parameters + ---------- + group + The group within the file to be used as the Zarr root group for the ManifestStore. + skip_variables + Variables in the file that will be ignored when creating the ManifestStore. + reader_options + Configuration options used internally for kerchunk's fsspec backend. + """ + + self.group = group + self.skip_variables = skip_variables + self.reader_options = reader_options + + def __call__( + self, + file_url: str, + object_store: ObjectStore, + ) -> ManifestStore: + """ + Parse the contents of a FITS file to produce a ManifestStore. + + Parameters + ---------- + file_url + The URI or path to the input file (e.g., "s3://bucket/file.fits"). + object_store + An obstore ObjectStore instance for accessing the file specified in the `file_url` parameter. + + Returns + ------- + ManifestStore + A ManifestStore which provides a Zarr representation of the parsed file. 
+ """ + + from kerchunk.fits import process_file + + # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 + refs = KerchunkStoreRefs( + {"refs": process_file(file_url, **self.reader_options)} + ) + + manifestgroup = manifestgroup_from_kerchunk_refs( + refs, + group=self.group, + skip_variables=self.skip_variables, + fs_root=Path.cwd().as_uri(), + ) + + registry = ObjectStoreRegistry({get_store_prefix(file_url): object_store}) + + return ManifestStore(group=manifestgroup, store_registry=registry) diff --git a/virtualizarr/tests/test_readers/__init__.py b/virtualizarr/parsers/hdf/__init__.py similarity index 100% rename from virtualizarr/tests/test_readers/__init__.py rename to virtualizarr/parsers/hdf/__init__.py diff --git a/virtualizarr/readers/hdf/filters.py b/virtualizarr/parsers/hdf/filters.py similarity index 100% rename from virtualizarr/readers/hdf/filters.py rename to virtualizarr/parsers/hdf/filters.py diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py new file mode 100644 index 000000000..a5e43bd9c --- /dev/null +++ b/virtualizarr/parsers/hdf/hdf.py @@ -0,0 +1,326 @@ +from __future__ import annotations + +import math +from typing import ( + TYPE_CHECKING, + Iterable, +) + +import numpy as np + +from virtualizarr.codecs import numcodec_config_to_configurable +from virtualizarr.manifests import ( + ChunkEntry, + ChunkManifest, + ManifestArray, + ManifestGroup, + ManifestStore, +) +from virtualizarr.manifests.store import ObjectStoreRegistry, get_store_prefix +from virtualizarr.manifests.utils import create_v3_array_metadata +from virtualizarr.parsers.hdf.filters import codecs_from_dataset +from virtualizarr.parsers.utils import encode_cf_fill_value +from virtualizarr.types import ChunkKey +from virtualizarr.utils import ObstoreReader, soft_import + +h5py = soft_import("h5py", "reading hdf files", strict=False) + + +if TYPE_CHECKING: + from h5py import Dataset as H5Dataset + 
from h5py import Group as H5Group + from obstore.store import ObjectStore + + +def _construct_manifest_array( + filepath: str, + dataset: H5Dataset, + group: str, +) -> ManifestArray: + """ + Construct a ManifestArray from an h5py dataset + + Parameters + ---------- + filepath + The path of the hdf5 file. + dataset + An h5py dataset. + group + Name of the group containing this h5py.Dataset. + + Returns + ------- + ManifestArray + """ + chunks = dataset.chunks or dataset.shape + codecs = codecs_from_dataset(dataset) + attrs = _extract_attrs(dataset) + dtype = dataset.dtype + + # Temporarily disable use CF->Codecs - TODO re-enable in subsequent PR. + # cfcodec = cfcodec_from_dataset(dataset) + # if cfcodec: + # codecs.insert(0, cfcodec["codec"]) + # dtype = cfcodec["target_dtype"] + # attrs.pop("scale_factor", None) + # attrs.pop("add_offset", None) + # else: + # dtype = dataset.dtype + + if "_FillValue" in attrs: + encoded_cf_fill_value = encode_cf_fill_value(attrs["_FillValue"], dtype) + attrs["_FillValue"] = encoded_cf_fill_value + + codec_configs = [ + numcodec_config_to_configurable(codec.get_config()) for codec in codecs + ] + + fill_value = dataset.fillvalue.item() + dims = tuple(_dataset_dims(dataset, group=group)) + metadata = create_v3_array_metadata( + shape=dataset.shape, + data_type=dtype, + chunk_shape=chunks, + fill_value=fill_value, + codecs=codec_configs, + dimension_names=dims, + attributes=attrs, + ) + manifest = _dataset_chunk_manifest(filepath, dataset) + return ManifestArray(metadata=metadata, chunkmanifest=manifest) + + +def _construct_manifest_group( + filepath: str, + reader: ObstoreReader, + *, + group: str | None = None, + drop_variables: Iterable[str] | None = None, +) -> ManifestGroup: + """ + Construct a virtual Group from a HDF dataset. 
+ """ + + import h5py + + with h5py.File(reader, mode="r") as f: + if not isinstance(g := f.get(group or "/"), h5py.Group): + raise ValueError(f"Group {group!r} is not an HDF Group") + + # Several of our test fixtures which use xr.tutorial data have + # non coord dimensions serialized using big endian dtypes which are not + # yet supported in zarr-python v3. We'll drop these variables for the + # moment until big endian support is included upstream. + + non_coordinate_dimension_vars = _find_non_coord_dimension_vars(group=g) + drop_variables = set(drop_variables or ()) | set(non_coordinate_dimension_vars) + group_name = str(g.name) # NOTE: this will always include leading "/" + arrays = { + key: _construct_manifest_array(filepath, dataset, group_name) + for key in g.keys() + if key not in drop_variables and isinstance(dataset := g[key], h5py.Dataset) + } + attributes = _extract_attrs(g) + + return ManifestGroup(arrays=arrays, attributes=attributes) + + +class HDFParser: + def __init__( + self, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + ): + self.group = group + self.drop_variables = drop_variables + + def __call__( + self, + file_url: str, + object_store: ObjectStore, + ) -> ManifestStore: + reader = ObstoreReader(store=object_store, path=file_url) + manifest_group = _construct_manifest_group( + filepath=file_url, + reader=reader, + group=self.group, + drop_variables=self.drop_variables, + ) + registry = ObjectStoreRegistry({get_store_prefix(file_url): object_store}) + # Convert to a manifest store + return ManifestStore(store_registry=registry, group=manifest_group) + + +def _dataset_chunk_manifest( + filepath: str, + dataset: H5Dataset, +) -> ChunkManifest: + """ + Generate ChunkManifest for HDF5 dataset. 
+ + Parameters + ---------- + filepath + The path of the HDF5 file + dataset + h5py dataset for which to create a ChunkManifest + + Returns + ------- + ChunkManifest + A Virtualizarr ChunkManifest + """ + dsid = dataset.id + if dataset.chunks is None: + if dsid.get_offset() is None: + chunk_manifest = ChunkManifest(entries={}, shape=dataset.shape) + else: + key_list = [0] * (len(dataset.shape) or 1) + key = ".".join(map(str, key_list)) + + chunk_entry: ChunkEntry = ChunkEntry.with_validation( # type: ignore[attr-defined] + path=filepath, offset=dsid.get_offset(), length=dsid.get_storage_size() + ) + chunk_key = ChunkKey(key) + chunk_entries = {chunk_key: chunk_entry} + chunk_manifest = ChunkManifest(entries=chunk_entries) + else: + num_chunks = dsid.get_num_chunks() + if num_chunks == 0: + chunk_manifest = ChunkManifest(entries={}, shape=dataset.shape) + else: + shape = tuple( + math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks) + ) + paths = np.empty(shape, dtype=np.dtypes.StringDType) + offsets = np.empty(shape, dtype=np.uint64) + lengths = np.empty(shape, dtype=np.uint64) + + def get_key(blob): + return tuple(a // b for a, b in zip(blob.chunk_offset, dataset.chunks)) + + def add_chunk_info(blob): + key = get_key(blob) + paths[key] = filepath + offsets[key] = blob.byte_offset + lengths[key] = blob.size + + has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) + if has_chunk_iter: + dsid.chunk_iter(add_chunk_info) + else: + for index in range(num_chunks): + add_chunk_info(dsid.get_chunk_info(index)) + + chunk_manifest = ChunkManifest.from_arrays( + paths=paths, # type: ignore + offsets=offsets, + lengths=lengths, + ) + return chunk_manifest + + +def _dataset_dims(dataset: H5Dataset, group: str = "/") -> list[str]: + """ + Get a list of dimension scale names attached to input HDF5 dataset. + + This is required by the xarray package to work with Zarr arrays. Only + one dimension scale per dataset dimension is allowed. 
If dataset is + dimension scale, it will be considered as the dimension to itself. + + Parameters + ---------- + dataset + An h5py dataset. + group + Name of the group we are pulling these dimensions from (default: the root + group "/"). Required for removing subgroup prefixes. + + Returns + ------- + list[str] + List with HDF5 path names of dimension scales attached to input + dataset. + """ + import h5py + + dims: list[str] = [] + + for n in range(len(dataset.shape)): + if (num_scales := len(dataset.dims[n])) == 1: + dims.append(str(dataset.dims[n][0].name)) + elif h5py.h5ds.is_scale(dataset.id): + dims.append(str(dataset.name)) + elif num_scales > 1: + raise ValueError( + f"{dataset.name} has {num_scales} dimension scales attached to " + f"dimension #{n}; require exactly 1" + ) + elif num_scales == 0: + # Some HDF5 files do not have dimension scales. + # If this is the case, `num_scales` will be 0. + # In this case, we mimic netCDF4 and assign phony dimension names. + # See https://github.com/fsspec/kerchunk/issues/41 + dims.append(f"phony_dim_{n}") + + return [dim.removeprefix(group).removeprefix("/") for dim in dims] + + +def _extract_attrs(h5obj: H5Dataset | H5Group): + """ + Extract attributes from an HDF5 group or dataset. + + Parameters + ---------- + h5obj + An h5py group or dataset. + """ + _HIDDEN_ATTRS = { + "REFERENCE_LIST", + "CLASS", + "DIMENSION_LIST", + "NAME", + "_Netcdf4Dimid", + "_Netcdf4Coordinates", + "_nc3_strict", + "_NCProperties", + } + attrs = {} + for n, v in h5obj.attrs.items(): + if n in _HIDDEN_ATTRS: + continue + if n == "_FillValue": + v = v + # Fix some attribute values to avoid JSON encoding exceptions... 
+ if isinstance(v, bytes): + v = v.decode("utf-8") or " " + elif isinstance(v, (np.ndarray, np.number, np.bool_)): + if v.dtype.kind == "S": + v = v.astype(str) + elif v.size == 1: + v = v.flatten()[0] + if isinstance(v, (np.ndarray, np.number, np.bool_)): + v = v.tolist() + else: + v = v.tolist() + elif isinstance(v, h5py._hl.base.Empty): + v = "" + if v == "DIMENSION_SCALE": + continue + attrs[n] = v + return attrs + + +def _find_non_coord_dimension_vars(group: H5Group) -> list[str]: + dimension_names = [] + non_coordinate_dimension_variables = [] + for name, obj in group.items(): + if "_Netcdf4Dimid" in obj.attrs: + dimension_names.append(name) + for name, obj in group.items(): + if type(obj) is h5py.Dataset: + if obj.id.get_storage_size() == 0 and name in dimension_names: + non_coordinate_dimension_variables.append(name) + + return non_coordinate_dimension_variables diff --git a/virtualizarr/parsers/kerchunk_json.py b/virtualizarr/parsers/kerchunk_json.py new file mode 100644 index 000000000..f0c76dfcd --- /dev/null +++ b/virtualizarr/parsers/kerchunk_json.py @@ -0,0 +1,88 @@ +from collections.abc import Iterable + +import ujson +from obstore.store import ObjectStore + +from virtualizarr.manifests import ManifestStore +from virtualizarr.manifests.store import ObjectStoreRegistry, get_store_prefix +from virtualizarr.translators.kerchunk import manifestgroup_from_kerchunk_refs +from virtualizarr.utils import ObstoreReader + + +class KerchunkJSONParser: + def __init__( + self, + group: str | None = None, + fs_root: str | None = None, + skip_variables: Iterable[str] | None = None, + store_registry: ObjectStoreRegistry | None = None, + ): + """ + Instantiate a parser with parser-specific parameters that can be used in the + `__call__` method. + + Parameters + ---------- + group + The group within the file to be used as the Zarr root group for the ManifestStore. + fs_root + The qualifier to be used for kerchunk references containing relative paths. 
+ skip_variables + Variables in the file that will be ignored when creating the ManifestStore. + store_registry + A user defined ObjectStoreRegistry to be used for reading data for kerchunk + references contain paths to multiple locations. + """ + + self.group = group + self.fs_root = fs_root + self.skip_variables = skip_variables + self.store_registry = store_registry + + def __call__( + self, + file_url: str, + object_store: ObjectStore, + ) -> ManifestStore: + """ + Parse the metadata and byte offsets from a given file to produce a + VirtualiZarr ManifestStore. + + Parameters + ---------- + file_url + The URI or path to the input file (e.g., "s3://bucket/kerchunk.json"). + object_store + An obstore ObjectStore instance for accessing the file specified in the + `file_url` parameter. + + Returns + ------- + ManifestStore + A ManifestStore that provides a Zarr representation of the parsed file. + """ + + reader = ObstoreReader(store=object_store, path=file_url) + + reader.seek(0) + content = reader.readall().decode() + refs = ujson.loads(content) + if self.store_registry is None: + unique_paths = { + v[0] + for v in refs["refs"].values() + if isinstance(v, list) and isinstance(v[0], str) + } + stores = {} + for path in unique_paths: + stores[get_store_prefix(path)] = object_store + registry = ObjectStoreRegistry(stores=stores) + else: + registry = self.store_registry + manifestgroup = manifestgroup_from_kerchunk_refs( + refs, + group=self.group, + fs_root=self.fs_root, + skip_variables=self.skip_variables, + ) + return ManifestStore(group=manifestgroup, store_registry=registry) diff --git a/virtualizarr/parsers/kerchunk_parquet.py b/virtualizarr/parsers/kerchunk_parquet.py new file mode 100644 index 000000000..2473bac47 --- /dev/null +++ b/virtualizarr/parsers/kerchunk_parquet.py @@ -0,0 +1,151 @@ +from __future__ import annotations + +import io +from collections.abc import Iterable +from dataclasses import dataclass, field +from typing import TYPE_CHECKING + +from 
virtualizarr.manifests import ManifestStore +from virtualizarr.manifests.store import ObjectStoreRegistry, get_store_prefix +from virtualizarr.translators.kerchunk import manifestgroup_from_kerchunk_refs +from virtualizarr.types.kerchunk import ( + KerchunkStoreRefs, +) + +if TYPE_CHECKING: + from typing import TypeAlias + + import fsspec + import fsspec.core + import fsspec.spec + from obstore.store import ObjectStore + + # See pangeo_forge_recipes.storage + OpenFileType: TypeAlias = ( + fsspec.core.OpenFile | fsspec.spec.AbstractBufferedFile | io.IOBase + ) + + +class KerchunkParquetParser: + def __init__( + self, + group: str | None = None, + fs_root: str | None = None, + skip_variables: Iterable[str] | None = None, + reader_options: dict | None = None, + ): + """ + Instantiate a parser with parser-specific parameters that can be used in the + `__call__` method. + + Parameters + ---------- + group + The group within the file to be used as the Zarr root group for the ManifestStore. + fs_root + The qualifier to be used for kerchunk references containing relative paths. + skip_variables + Variables in the file that will be ignored when creating the ManifestStore. + reader_options + Configuration options used internally for the fsspec backend. + """ + + self.group = group + self.fs_root = fs_root + self.skip_variables = skip_variables + self.reader_options = reader_options + + def __call__( + self, + file_url: str, + object_store: ObjectStore, + ) -> ManifestStore: + """ + Parse the metadata and byte offsets from a given file to product a + VirtualiZarr ManifestStore. + + Parameters + ---------- + file_url + The URI or path to the input parquet directory (e.g., "s3://bucket/file.parq"). + object_store + An obstore ObjectStore instance for accessing the file specified in the + `file_url` parameter. + + Returns + ------- + ManifestStore + A ManifestStore which provides a Zarr representation of the parsed file. 
+ """ + + # The kerchunk .parquet storage format isn't actually a parquet, but a + # directory that contains named parquets for each group/variable. + fs = _FsspecFSFromFilepath(file_url, self.reader_options) + from fsspec.implementations.reference import LazyReferenceMapper + + lrm = LazyReferenceMapper(file_url, fs.fs) + + # build reference dict from KV pairs in LazyReferenceMapper + # is there a better / more performant way to extract this? + array_refs = {k: lrm[k] for k in lrm.keys()} + full_reference = {"refs": array_refs} + refs = KerchunkStoreRefs(full_reference) + + registry = ObjectStoreRegistry({get_store_prefix(file_url): object_store}) + manifestgroup = manifestgroup_from_kerchunk_refs( + refs, + group=self.group, + fs_root=self.fs_root, + skip_variables=self.skip_variables, + ) + + return ManifestStore(group=manifestgroup, store_registry=registry) + + +@dataclass +class _FsspecFSFromFilepath: + """Class to create fsspec Filesystem from input filepath. + + Attributes + ---------- + filepath + Input filepath + reader_options + dict containing kwargs to pass to file opener, by default {} + fs + The fsspec filesystem object, created in the `__post_init__` method. + + """ + + filepath: str + reader_options: dict | None = field(default_factory=dict) + fs: fsspec.AbstractFileSystem = field(init=False) + + def open_file(self) -> OpenFileType: + """Calls `open` on `fsspec.Filesystem` instantiation using `self.filepath` as an input. 
+ + Returns + ------- + OpenFileType + file opened with fsspec + """ + return self.fs.open(self.filepath) + + def read_bytes(self, bytes: int) -> bytes: + with self.open_file() as of: + return of.read(bytes) + + def get_mapper(self): + """Returns a mapper for use with Zarr""" + return self.fs.get_mapper(self.filepath) + + def __post_init__(self) -> None: + """Initialize the fsspec filesystem object""" + import fsspec + from upath import UPath + + upath = UPath(self.filepath) + self.reader_options = self.reader_options or {} + storage_options = self.reader_options.get("storage_options", {}) + + self.fs = fsspec.filesystem(upath.protocol, **storage_options) diff --git a/virtualizarr/parsers/netcdf3.py b/virtualizarr/parsers/netcdf3.py new file mode 100644 index 000000000..fef01b566 --- /dev/null +++ b/virtualizarr/parsers/netcdf3.py @@ -0,0 +1,74 @@ +from collections.abc import Iterable +from pathlib import Path + +from obstore.store import ObjectStore + +from virtualizarr.manifests import ManifestStore +from virtualizarr.manifests.store import ObjectStoreRegistry, get_store_prefix +from virtualizarr.translators.kerchunk import manifestgroup_from_kerchunk_refs + + +class NetCDF3Parser: + def __init__( + self, + group: str | None = None, + skip_variables: Iterable[str] | None = None, + reader_options: dict | None = None, + ): + """ + Instantiate a parser with parser-specific parameters that can be used in the + `__call__` method. + + Parameters + ---------- + group + The group within the file to be used as the Zarr root group for the ManifestStore. + skip_variables + Variables in the file that will be ignored when creating the ManifestStore. + reader_options + Configuration options used internally for the kerchunk's fsspec backend. 
+ """ + + self.group = group + self.skip_variables = skip_variables + self.reader_options = reader_options or {} + + def __call__( + self, + file_url: str, + object_store: ObjectStore, + ) -> ManifestStore: + """ + Parse the metadata and byte offsets from a given file to product a VirtualiZarr ManifestStore. + + Parameters + ---------- + file_url + The URI or path to the input file (e.g., "s3://bucket/file.nc"). + object_store + An obstore ObjectStore instance for accessing the file specified in the + `file_url` parameter. + + Returns + ------- + ManifestStore + A ManifestStore that provides a Zarr representation of the parsed file. + """ + + from kerchunk.netCDF3 import NetCDF3ToZarr + + # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 + refs = NetCDF3ToZarr( + file_url, inline_threshold=0, **self.reader_options + ).translate() + + manifestgroup = manifestgroup_from_kerchunk_refs( + refs, + group=self.group, + skip_variables=self.skip_variables, + fs_root=Path.cwd().as_uri(), + ) + + registry = ObjectStoreRegistry({get_store_prefix(file_url): object_store}) + + return ManifestStore(group=manifestgroup, store_registry=registry) diff --git a/virtualizarr/parsers/tiff.py b/virtualizarr/parsers/tiff.py new file mode 100644 index 000000000..77e03b6ca --- /dev/null +++ b/virtualizarr/parsers/tiff.py @@ -0,0 +1,75 @@ +from collections.abc import Iterable +from pathlib import Path + +from obstore.store import ObjectStore + +from virtualizarr.manifests import ManifestStore +from virtualizarr.manifests.store import ObjectStoreRegistry, get_store_prefix +from virtualizarr.translators.kerchunk import manifestgroup_from_kerchunk_refs +from virtualizarr.types.kerchunk import KerchunkStoreRefs + + +class Parser: + def __init__( + self, + group: str | None = None, + skip_variables: Iterable[str] | None = None, + remote_options: dict | None = None, + ): + """ + Instantiate a parser with parser-specific parameters that can 
be used in the + `__call__` method. + + Parameters + ---------- + group + The group within the file to be used as the Zarr root group for the ManifestStore. + skip_variables + Variables in the file that will be ignored when creating the ManifestStore. + remote_options + Configuration options used internally for kerchunk's fsspec backend + """ + + self.group = group + self.skip_variables = skip_variables + self.remote_options = remote_options or {} + + def __call__( + self, + file_url: str, + object_store: ObjectStore, + ) -> ManifestStore: + """ + Parse the metadata and byte offsets from a given file to produce a VirtualiZarr ManifestStore. + + Parameters + ---------- + file_url + The URI or path to the input file (e.g., "s3://bucket/file.tiff"). + object_store + An obstore ObjectStore instance for accessing the file specified in the + `file_url` parameter. + + Returns + ------- + ManifestStore + A ManifestStore which provides a Zarr representation of the parsed file. + """ + + from kerchunk.tiff import tiff_to_zarr + + # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 + refs = KerchunkStoreRefs( + {"refs": tiff_to_zarr(file_url, **self.remote_options)} + ) + + manifestgroup = manifestgroup_from_kerchunk_refs( + refs, + group=self.group, + skip_variables=self.skip_variables, + fs_root=Path.cwd().as_uri(), + ) + + registry = ObjectStoreRegistry({get_store_prefix(file_url): object_store}) + + return ManifestStore(group=manifestgroup, store_registry=registry) diff --git a/virtualizarr/parsers/utils.py b/virtualizarr/parsers/utils.py new file mode 100644 index 000000000..e5781d2cd --- /dev/null +++ b/virtualizarr/parsers/utils.py @@ -0,0 +1,40 @@ +import numpy as np +from xarray.backends.zarr import FillValueCoder + +FillValueType = ( + int + | float + | bool + | complex + | str + | np.integer + | np.floating + | np.bool_ + | np.complexfloating + | bytes  # For fixed-length string storage + | tuple[bytes, int]  #
Structured type +) + + +def encode_cf_fill_value( + fill_value: np.ndarray | np.generic, + target_dtype: np.dtype, +) -> FillValueType: + """ + Convert a fill value into one properly encoded for a target dtype. + + Parameters + ---------- + fill_value + An ndarray or value. + target_dtype + The target dtype of the ManifestArray that will use `fill_value` as its fill value. + """ + if isinstance(fill_value, (np.ndarray, np.generic)): + if isinstance(fill_value, np.ndarray) and fill_value.size > 1: + raise ValueError("Expected a scalar") + fillvalue = fill_value.item() + else: + fillvalue = fill_value + encoded_fillvalue = FillValueCoder.encode(fillvalue, target_dtype) + return encoded_fillvalue diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/parsers/zarr.py similarity index 62% rename from virtualizarr/readers/zarr.py rename to virtualizarr/parsers/zarr.py index 0cea7a16f..47e82035b 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -1,17 +1,16 @@ from __future__ import annotations import asyncio +from collections.abc import Iterable from pathlib import Path # noqa from typing import ( Any, Hashable, - Iterable, - Mapping, - Optional, ) +from urllib.parse import urlparse import numpy as np -from xarray import Dataset, Index +import obstore from zarr.api.asynchronous import open_group as open_group_async from zarr.core.metadata import ArrayV3Metadata @@ -20,9 +19,10 @@ ManifestArray, ManifestGroup, ManifestStore, + ObjectStoreRegistry, ) from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri # noqa -from virtualizarr.readers.api import VirtualBackend +from virtualizarr.manifests.store import get_store_prefix from virtualizarr.vendor.zarr.core.common import _concurrent_map FillValueT = bool | str | float | int | list | None @@ -111,22 +111,20 @@ async def _construct_manifest_array(zarr_array: zarr.AsyncArray[Any], filepath: async def _construct_manifest_group( filepath: str, + store: 
zarr.storage.ObjectStore | zarr.storage.LocalStore, *, - reader_options: Optional[dict] = None, - drop_variables: str | Iterable[str] | None = None, + skip_variables: str | Iterable[str] | None = None, group: str | None = None, ): - reader_options = reader_options or {} zarr_group = await open_group_async( - filepath, - storage_options=reader_options.get("storage_options"), + store=store, path=group, mode="r", ) zarr_array_keys = [key async for key in zarr_group.array_keys()] - _drop_vars: list[Hashable] = [] if drop_variables is None else list(drop_variables) + _drop_vars: list[Hashable] = [] if skip_variables is None else list(skip_variables) zarr_arrays = await asyncio.gather( *[zarr_group.getitem(var) for var in zarr_array_keys if var not in _drop_vars] @@ -145,52 +143,71 @@ async def _construct_manifest_group( return ManifestGroup(manifest_dict, attributes=zarr_group.attrs) -def _construct_manifest_store( - filepath: str, - *, - reader_options: Optional[dict] = None, - drop_variables: str | Iterable[str] | None = None, - group: str | None = None, -) -> ManifestStore: - import asyncio - - manifest_group = asyncio.run( - _construct_manifest_group( - filepath=filepath, - group=group, - drop_variables=drop_variables, - reader_options=reader_options, - ) - ) - return ManifestStore(manifest_group) - - -class ZarrVirtualBackend(VirtualBackend): - @staticmethod - def open_virtual_dataset( - filepath: str, +class ZarrParser: + def __init__( + self, group: str | None = None, - drop_variables: str | Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - indexes: Mapping[str, Index] | None = None, - virtual_backend_kwargs: Optional[dict] = None, - reader_options: Optional[dict] = None, - ) -> Dataset: + skip_variables: Iterable[str] | None = None, + ): + """ + Instantiate a parser with parser-specific parameters that can be used in the + `__call__` method. 
+ + Parameters + ---------- + group + The group within the file to be used as the Zarr root group for the + ManifestStore (default: the file's root group). + skip_variables + Variables in the file that will be ignored when creating the ManifestStore + (default: `None`, do not ignore any variables). + """ + + self.group = group + self.skip_variables = skip_variables + + def __call__( + self, + file_url: str, + object_store: obstore.store.ObjectStore, + ) -> ManifestStore: + """ + Parse the metadata and byte offsets from a given Zarr store to produce a VirtualiZarr ManifestStore. + + Parameters + ---------- + file_url + The URI or path to the input Zarr store (e.g., "s3://bucket/store.zarr"). + object_store + An obstore ObjectStore instance for accessing the directory specified in the + `file_url` parameter. + + Returns + ------- + ManifestStore: A ManifestStore which provides a Zarr representation of the parsed file. + """ + filepath = validate_and_normalize_path_to_uri( - filepath, fs_root=Path.cwd().as_uri() + file_url, fs_root=Path.cwd().as_uri() ) - - manifest_store = _construct_manifest_store( - filepath=filepath, - group=group, - drop_variables=drop_variables, - reader_options=reader_options, + import asyncio + + # Temporary handling of local paths with Zarr LocalStore + # until zarr-python adopts obstore LocalStore + zarr_store: zarr.storage.LocalStore | zarr.storage.ObjectStore + if isinstance(object_store, obstore.store.LocalStore): + parsed = urlparse(filepath) + zarr_store = zarr.storage.LocalStore(parsed.path) + else: + zarr_store = zarr.storage.ObjectStore(store=object_store) + manifest_group = asyncio.run( + _construct_manifest_group( + store=zarr_store, + filepath=file_url, + group=self.group, + skip_variables=self.skip_variables, + ) ) + registry = ObjectStoreRegistry({get_store_prefix(file_url): object_store}) - ds = manifest_store.to_virtual_dataset( - loadable_variables=loadable_variables, - decode_times=decode_times, - indexes=indexes, - ) - 
return ds + return ManifestStore(store_registry=registry, group=manifest_group) diff --git a/virtualizarr/readers/__init__.py b/virtualizarr/readers/__init__.py deleted file mode 100644 index 3d887844c..000000000 --- a/virtualizarr/readers/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -from virtualizarr.readers.dmrpp import DMRPPVirtualBackend -from virtualizarr.readers.fits import FITSVirtualBackend -from virtualizarr.readers.hdf import HDFVirtualBackend -from virtualizarr.readers.hdf5 import HDF5VirtualBackend -from virtualizarr.readers.kerchunk import KerchunkVirtualBackend -from virtualizarr.readers.netcdf3 import NetCDF3VirtualBackend -from virtualizarr.readers.tiff import TIFFVirtualBackend -from virtualizarr.readers.zarr import ( - ZarrVirtualBackend, -) - -__all__ = [ - "DMRPPVirtualBackend", - "FITSVirtualBackend", - "HDFVirtualBackend", - "HDF5VirtualBackend", - "KerchunkVirtualBackend", - "NetCDF3VirtualBackend", - "TIFFVirtualBackend", - "ZarrVirtualBackend", -] diff --git a/virtualizarr/readers/api.py b/virtualizarr/readers/api.py deleted file mode 100644 index 37317c8c1..000000000 --- a/virtualizarr/readers/api.py +++ /dev/null @@ -1,33 +0,0 @@ -from abc import ABC -from collections.abc import Iterable, Mapping -from typing import Optional - -import xarray as xr - - -class VirtualBackend(ABC): - @staticmethod - def open_virtual_dataset( - filepath: str, - group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - indexes: Mapping[str, xr.Index] | None = None, - virtual_backend_kwargs: Optional[dict] = None, - reader_options: Optional[dict] = None, - ) -> xr.Dataset: - raise NotImplementedError() - - @staticmethod - def open_virtual_datatree( - path: str, - group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - indexes: Mapping[str, xr.Index] | None 
= None, - virtual_backend_kwargs: Optional[dict] = None, - reader_options: Optional[dict] = None, - ) -> xr.DataTree: - raise NotImplementedError() diff --git a/virtualizarr/readers/fits.py b/virtualizarr/readers/fits.py deleted file mode 100644 index 9704c43c4..000000000 --- a/virtualizarr/readers/fits.py +++ /dev/null @@ -1,64 +0,0 @@ -from pathlib import Path -from typing import Hashable, Iterable, Mapping, Optional - -from xarray import Dataset, Index - -from virtualizarr.readers.api import ( - VirtualBackend, -) -from virtualizarr.translators.kerchunk import ( - extract_group, - virtual_vars_and_metadata_from_kerchunk_refs, -) -from virtualizarr.types.kerchunk import KerchunkStoreRefs -from virtualizarr.xarray import construct_fully_virtual_dataset - - -class FITSVirtualBackend(VirtualBackend): - @staticmethod - def open_virtual_dataset( - filepath: str, - group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - indexes: Mapping[str, Index] | None = None, - virtual_backend_kwargs: Optional[dict] = None, - reader_options: Optional[dict] = None, - ) -> Dataset: - from kerchunk.fits import process_file - - if virtual_backend_kwargs: - raise NotImplementedError( - "FITS reader does not understand any virtual_backend_kwargs" - ) - - _drop_vars: list[Hashable] = ( - [] if drop_variables is None else list(drop_variables) - ) - - # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 - refs = KerchunkStoreRefs({"refs": process_file(filepath, **reader_options)}) - - # both group=None and group='' mean to read root group - if group: - refs = extract_group(refs, group) - - # TODO This wouldn't work until either you had an xarray backend for FITS installed, or issue #124 is implemented to load data from ManifestArrays directly - if loadable_variables or indexes: - raise NotImplementedError( - "Cannot load variables 
or indexes from FITS files as there is no xarray backend engine for FITS" - ) - - virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( - refs, - fs_root=Path.cwd().as_uri(), - ) - - vds = construct_fully_virtual_dataset( - virtual_vars=virtual_vars, - coord_names=coord_names, - attrs=attrs, - ) - - return vds.drop_vars(_drop_vars) diff --git a/virtualizarr/readers/hdf/__init__.py b/virtualizarr/readers/hdf/__init__.py deleted file mode 100644 index 9239ad80f..000000000 --- a/virtualizarr/readers/hdf/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .hdf import ( - HDFVirtualBackend, -) - -__all__ = [ - "HDFVirtualBackend", -] diff --git a/virtualizarr/readers/hdf/hdf.py b/virtualizarr/readers/hdf/hdf.py deleted file mode 100644 index 5a02585bf..000000000 --- a/virtualizarr/readers/hdf/hdf.py +++ /dev/null @@ -1,430 +0,0 @@ -from __future__ import annotations - -import math -from pathlib import Path -from typing import ( - TYPE_CHECKING, - Iterable, - List, - Mapping, - Optional, - Tuple, - Union, -) - -import numpy as np -import xarray as xr -from xarray.backends.zarr import FillValueCoder - -from virtualizarr.codecs import numcodec_config_to_configurable -from virtualizarr.manifests import ( - ChunkEntry, - ChunkManifest, - ManifestArray, - ManifestGroup, - ManifestStore, -) -from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri -from virtualizarr.manifests.store import ObjectStoreRegistry, default_object_store -from virtualizarr.manifests.utils import create_v3_array_metadata -from virtualizarr.readers.api import VirtualBackend -from virtualizarr.readers.hdf.filters import codecs_from_dataset -from virtualizarr.types import ChunkKey -from virtualizarr.utils import soft_import - -h5py = soft_import("h5py", "For reading hdf files", strict=False) - - -if TYPE_CHECKING: - from h5py import Dataset as H5Dataset - from h5py import Group as H5Group - from obstore.store import ObjectStore - -FillValueType = Union[ - int, 
- float, - bool, - complex, - str, - np.integer, - np.floating, - np.bool_, - np.complexfloating, - bytes, # For fixed-length string storage - Tuple[bytes, int], # Structured type -] - - -class HDFVirtualBackend(VirtualBackend): - @staticmethod - def _construct_manifest_array( - path: str, - dataset: H5Dataset, - group: str, - ) -> ManifestArray: - """ - Construct a ManifestArray from an h5py dataset - Parameters - ---------- - path - The path of the hdf5 file. - dataset - An h5py dataset. - group - Name of the group containing this h5py.Dataset. - Returns - ------- - ManifestArray - """ - chunks = dataset.chunks if dataset.chunks else dataset.shape - codecs = codecs_from_dataset(dataset) - attrs = HDFVirtualBackend._extract_attrs(dataset) - dtype = dataset.dtype - - # Temporarily disable use CF->Codecs - TODO re-enable in subsequent PR. - # cfcodec = cfcodec_from_dataset(dataset) - # if cfcodec: - # codecs.insert(0, cfcodec["codec"]) - # dtype = cfcodec["target_dtype"] - # attrs.pop("scale_factor", None) - # attrs.pop("add_offset", None) - # else: - # dtype = dataset.dtype - - if "_FillValue" in attrs: - encoded_cf_fill_value = HDFVirtualBackend._encode_cf_fill_value( - attrs["_FillValue"], dtype - ) - attrs["_FillValue"] = encoded_cf_fill_value - - codec_configs = [ - numcodec_config_to_configurable(codec.get_config()) for codec in codecs - ] - - fill_value = dataset.fillvalue.item() - dims = tuple(HDFVirtualBackend._dataset_dims(dataset, group=group)) - metadata = create_v3_array_metadata( - shape=dataset.shape, - data_type=dtype, - chunk_shape=chunks, - fill_value=fill_value, - codecs=codec_configs, - dimension_names=dims, - attributes=attrs, - ) - - manifest = HDFVirtualBackend._dataset_chunk_manifest(path, dataset) - return ManifestArray(metadata=metadata, chunkmanifest=manifest) - - @staticmethod - def _construct_manifest_group( - store: ObjectStore, - filepath: str, - *, - group: str | None = None, - drop_variables: Optional[Iterable[str]] = None, - ) -> 
ManifestGroup: - """ - Construct a virtual Group from a HDF dataset. - """ - from virtualizarr.utils import ObstoreReader - - if drop_variables is None: - drop_variables = [] - - reader = ObstoreReader(store=store, path=filepath) - f = h5py.File(reader, mode="r") - - if group is not None and group != "": - g = f[group] - group_name = group - if not isinstance(g, h5py.Group): - raise ValueError("The provided group is not an HDF group") - else: - g = f["/"] - group_name = "/" - - manifest_dict = {} - # Several of our test fixtures which use xr.tutorial data have - # non coord dimensions serialized using big endian dtypes which are not - # yet supported in zarr-python v3. We'll drop these variables for the - # moment until big endian support is included upstream.) - - non_coordinate_dimension_vars = ( - HDFVirtualBackend._find_non_coord_dimension_vars(group=g) - ) - drop_variables = list(set(list(drop_variables) + non_coordinate_dimension_vars)) - attrs = HDFVirtualBackend._extract_attrs(g) - for key in g.keys(): - if key not in drop_variables: - if isinstance(g[key], h5py.Dataset): - variable = HDFVirtualBackend._construct_manifest_array( - path=filepath, - dataset=g[key], - group=group_name, - ) - if variable is not None: - manifest_dict[key] = variable - return ManifestGroup(arrays=manifest_dict, attributes=attrs) - - @staticmethod - def _create_manifest_store( - filepath: str, - *, - store: ObjectStore | None = None, - group: str | None = None, - drop_variables: Iterable[str] | None = None, - ) -> ManifestStore: - # Create a group containing dataset level metadata and all the manifest arrays - if not store: - store = default_object_store(filepath) # type: ignore - manifest_group = HDFVirtualBackend._construct_manifest_group( - store=store, - filepath=filepath, - group=group, - drop_variables=drop_variables, - ) - registry = ObjectStoreRegistry({filepath: store}) - # Convert to a manifest store - return ManifestStore(store_registry=registry, group=manifest_group) - 
- @staticmethod - def open_virtual_dataset( - filepath: str, - group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - indexes: Mapping[str, xr.Index] | None = None, - virtual_backend_kwargs: Optional[dict] = None, - reader_options: Optional[dict] = None, - ) -> xr.Dataset: - if h5py is None: - raise ImportError("h5py is required for using the HDFVirtualBackend") - if virtual_backend_kwargs: - raise NotImplementedError( - "HDF reader does not understand any virtual_backend_kwargs" - ) - - filepath = validate_and_normalize_path_to_uri( - filepath, fs_root=Path.cwd().as_uri() - ) - - _drop_vars: Iterable[str] = ( - [] if drop_variables is None else list(drop_variables) - ) - - manifest_store = HDFVirtualBackend._create_manifest_store( - filepath=filepath, - drop_variables=_drop_vars, - group=group, - ) - ds = manifest_store.to_virtual_dataset( - loadable_variables=loadable_variables, - decode_times=decode_times, - indexes=indexes, - ) - return ds - - @staticmethod - def _dataset_chunk_manifest( - path: str, - dataset: H5Dataset, - ) -> ChunkManifest: - """ - Generate ChunkManifest for HDF5 dataset. 
- - Parameters - ---------- - path - The path of the HDF5 file - dataset - h5py dataset for which to create a ChunkManifest - - Returns - ------- - ChunkManifest - A Virtualizarr ChunkManifest - """ - dsid = dataset.id - if dataset.chunks is None: - if dsid.get_offset() is None: - chunk_manifest = ChunkManifest(entries={}, shape=dataset.shape) - else: - key_list = [0] * (len(dataset.shape) or 1) - key = ".".join(map(str, key_list)) - - chunk_entry: ChunkEntry = ChunkEntry.with_validation( # type: ignore[attr-defined] - path=path, offset=dsid.get_offset(), length=dsid.get_storage_size() - ) - chunk_key = ChunkKey(key) - chunk_entries = {chunk_key: chunk_entry} - chunk_manifest = ChunkManifest(entries=chunk_entries) - else: - num_chunks = dsid.get_num_chunks() - if num_chunks == 0: - chunk_manifest = ChunkManifest(entries={}, shape=dataset.shape) - else: - shape = tuple( - math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks) - ) - paths = np.empty(shape, dtype=np.dtypes.StringDType) # type: ignore - offsets = np.empty(shape, dtype=np.uint64) - lengths = np.empty(shape, dtype=np.uint64) - - def get_key(blob): - return tuple( - [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)] - ) - - def add_chunk_info(blob): - key = get_key(blob) - paths[key] = path - offsets[key] = blob.byte_offset - lengths[key] = blob.size - - has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) - if has_chunk_iter: - dsid.chunk_iter(add_chunk_info) - else: - for index in range(num_chunks): - add_chunk_info(dsid.get_chunk_info(index)) - - chunk_manifest = ChunkManifest.from_arrays( - paths=paths, # type: ignore - offsets=offsets, - lengths=lengths, - ) - return chunk_manifest - - @staticmethod - def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]: - """ - Get a list of dimension scale names attached to input HDF5 dataset. - - This is required by the xarray package to work with Zarr arrays. Only - one dimension scale per dataset dimension is allowed. 
If dataset is - dimension scale, it will be considered as the dimension to itself. - - Parameters - ---------- - dataset - An h5py dataset. - group - Name of the group we are pulling these dimensions from. Required for potentially removing subgroup prefixes. - - Returns - ------- - list[str] - List with HDF5 path names of dimension scales attached to input - dataset. - """ - dims = list() - rank = len(dataset.shape) - if rank: - for n in range(rank): - num_scales = len(dataset.dims[n]) # type: ignore - if num_scales == 1: - dims.append(dataset.dims[n][0].name[1:]) # type: ignore - elif h5py.h5ds.is_scale(dataset.id): - dims.append(dataset.name[1:]) - elif num_scales > 1: - raise ValueError( - f"{dataset.name}: {len(dataset.dims[n])} " # type: ignore - f"dimension scales attached to dimension #{n}" - ) - elif num_scales == 0: - # Some HDF5 files do not have dimension scales. - # If this is the case, `num_scales` will be 0. - # In this case, we mimic netCDF4 and assign phony dimension names. - # See https://github.com/fsspec/kerchunk/issues/41 - dims.append(f"phony_dim_{n}") - - if not group.endswith("/"): - group += "/" - - return [dim.removeprefix(group) for dim in dims] - - @staticmethod - def _extract_attrs(h5obj: Union[H5Dataset, H5Group]): - """ - Extract attributes from an HDF5 group or dataset. - - Parameters - ---------- - h5obj - An h5py group or dataset. - """ - _HIDDEN_ATTRS = { - "REFERENCE_LIST", - "CLASS", - "DIMENSION_LIST", - "NAME", - "_Netcdf4Dimid", - "_Netcdf4Coordinates", - "_nc3_strict", - "_NCProperties", - } - attrs = {} - for n, v in h5obj.attrs.items(): - if n in _HIDDEN_ATTRS: - continue - if n == "_FillValue": - v = v - # Fix some attribute values to avoid JSON encoding exceptions... 
- if isinstance(v, bytes): - v = v.decode("utf-8") or " " - elif isinstance(v, (np.ndarray, np.number, np.bool_)): - if v.dtype.kind == "S": - v = v.astype(str) - elif v.size == 1: - v = v.flatten()[0] - if isinstance(v, (np.ndarray, np.number, np.bool_)): - v = v.tolist() - else: - v = v.tolist() - elif isinstance(v, h5py._hl.base.Empty): - v = "" - if v == "DIMENSION_SCALE": - continue - attrs[n] = v - return attrs - - @staticmethod - def _find_non_coord_dimension_vars(group: H5Group) -> List[str]: - dimension_names = [] - non_coordinate_dimension_variables = [] - for name, obj in group.items(): - if "_Netcdf4Dimid" in obj.attrs: - dimension_names.append(name) - for name, obj in group.items(): - if type(obj) is h5py.Dataset: - if obj.id.get_storage_size() == 0 and name in dimension_names: - non_coordinate_dimension_variables.append(name) - - return non_coordinate_dimension_variables - - @staticmethod - def _encode_cf_fill_value( - fill_value: Union[np.ndarray, np.generic], - target_dtype: np.dtype, - ) -> FillValueType: - """ - Convert the _FillValue attribute from an HDF5 group or dataset into - one properly encoded for the target dtype. - - Parameters - ---------- - fill_value - An ndarray or value. 
- target_dtype - The target dtype of the ManifestArray that will use the _FillValue - """ - if isinstance(fill_value, (np.ndarray, np.generic)): - if isinstance(fill_value, np.ndarray) and fill_value.size > 1: - raise ValueError("Expected a scalar") - fillvalue = fill_value.item() - else: - fillvalue = fill_value - encoded_fillvalue = FillValueCoder.encode(fillvalue, target_dtype) - return encoded_fillvalue diff --git a/virtualizarr/readers/hdf5.py b/virtualizarr/readers/hdf5.py deleted file mode 100644 index 786b68810..000000000 --- a/virtualizarr/readers/hdf5.py +++ /dev/null @@ -1,69 +0,0 @@ -from pathlib import Path -from typing import Hashable, Iterable, Mapping, Optional - -from xarray import Dataset, Index - -from virtualizarr.readers.api import VirtualBackend -from virtualizarr.translators.kerchunk import ( - extract_group, - virtual_vars_and_metadata_from_kerchunk_refs, -) -from virtualizarr.xarray import ( - construct_fully_virtual_dataset, - construct_virtual_dataset, -) - - -class HDF5VirtualBackend(VirtualBackend): - @staticmethod - def open_virtual_dataset( - filepath: str, - group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - indexes: Mapping[str, Index] | None = None, - virtual_backend_kwargs: Optional[dict] = None, - reader_options: Optional[dict] = None, - ) -> Dataset: - from kerchunk.hdf import SingleHdf5ToZarr - - if virtual_backend_kwargs: - raise NotImplementedError( - "HDF5 reader does not understand any virtual_backend_kwargs" - ) - - _drop_vars: list[Hashable] = ( - [] if drop_variables is None else list(drop_variables) - ) - - refs = SingleHdf5ToZarr( - filepath, inline_threshold=0, **reader_options - ).translate() - - # both group=None and group='' mean to read root group - if group: - refs = extract_group(refs, group) - - virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( - refs, - 
fs_root=Path.cwd().as_uri(), - ) - - fully_virtual_dataset = construct_fully_virtual_dataset( - virtual_vars=virtual_vars, - coord_names=coord_names, - attrs=attrs, - ) - - vds = construct_virtual_dataset( - fully_virtual_ds=fully_virtual_dataset, - filepath=filepath, - group=group, - loadable_variables=loadable_variables, - reader_options=reader_options, - indexes=indexes, - decode_times=decode_times, - ) - - return vds.drop_vars(_drop_vars) diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py deleted file mode 100644 index 6e2e78193..000000000 --- a/virtualizarr/readers/kerchunk.py +++ /dev/null @@ -1,95 +0,0 @@ -import warnings -from typing import Hashable, Iterable, Mapping, Optional - -import ujson -from xarray import Dataset, Index - -from virtualizarr.readers.api import VirtualBackend -from virtualizarr.translators.kerchunk import dataset_from_kerchunk_refs -from virtualizarr.types.kerchunk import ( - KerchunkStoreRefs, -) -from virtualizarr.utils import _FsspecFSFromFilepath - - -class KerchunkVirtualBackend(VirtualBackend): - @staticmethod - def open_virtual_dataset( - filepath: str, - group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - indexes: Mapping[str, Index] | None = None, - virtual_backend_kwargs: Optional[dict] = None, - reader_options: Optional[dict] = None, - ) -> Dataset: - """Reads existing kerchunk references (in JSON or parquet) format.""" - - if virtual_backend_kwargs is None: - virtual_backend_kwargs = {} - - _drop_vars: list[Hashable] = ( - [] if drop_variables is None else list(drop_variables) - ) - - fs_root = virtual_backend_kwargs.pop("fs_root", None) - - if virtual_backend_kwargs: - raise NotImplementedError( - f"Kerchunk reader does not understand any of the virtual_backend_kwargs {virtual_backend_kwargs}" - ) - - if group: - raise NotImplementedError() - - if loadable_variables or indexes or 
decode_times: - raise NotImplementedError() - - # TODO: whilst this keeps backwards-compatible behaviour for the `loadable_variables`` kwarg, - # it probably has to change, see https://github.com/zarr-developers/VirtualiZarr/pull/477/#issuecomment-2744448626 - if loadable_variables is None or indexes is None: - warnings.warn( - "The default value of the `loadable_variables` kwarg may attempt to load data from the referenced virtual chunks." - "As this is unlikely to be the desired behaviour when opening a Kerchunk file, `loadable_variables` has been overridden, and set to `loadable_variables=[]`." - "To silence this warning pass `loadable_variables` explicitly.", - UserWarning, - ) - loadable_variables = [] - indexes = {} - - fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options) - - # The kerchunk .parquet storage format isn't actually a parquet, but a directory that contains named parquets for each group/variable. - if fs.filepath.endswith(".parquet") and fs.fs.isfile( - f"{fs.filepath}/.zmetadata" - ): - from fsspec.implementations.reference import LazyReferenceMapper - - lrm = LazyReferenceMapper(filepath, fs.fs) - - # build reference dict from KV pairs in LazyReferenceMapper - # is there a better / more performant way to extract this? - array_refs = {k: lrm[k] for k in lrm.keys()} - - full_reference = {"refs": array_refs} - - vds = dataset_from_kerchunk_refs( - KerchunkStoreRefs(full_reference), fs_root=fs_root - ) - - # JSON has no magic bytes, but the Kerchunk version 1 spec starts with 'version': - # https://fsspec.github.io/kerchunk/spec.html - elif fs.read_bytes(9).startswith(b'{"version'): - with fs.open_file() as of: - refs = ujson.load(of) - - vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(refs), fs_root=fs_root) - - else: - raise ValueError( - "The input Kerchunk reference did not seem to be in Kerchunk's JSON or Parquet spec: https://fsspec.github.io/kerchunk/spec.html. 
If your Kerchunk generated references are saved in parquet format, make sure the file extension is `.parquet`. The Kerchunk format autodetection is quite flaky, so if your reference matches the Kerchunk spec feel free to open an issue: https://github.com/zarr-developers/VirtualiZarr/issues" - ) - - # TODO would be more efficient to drop these before converting them into ManifestArrays, i.e. drop them from the kerchunk refs dict - return vds.drop_vars(_drop_vars) diff --git a/virtualizarr/readers/netcdf3.py b/virtualizarr/readers/netcdf3.py deleted file mode 100644 index c1917ed59..000000000 --- a/virtualizarr/readers/netcdf3.py +++ /dev/null @@ -1,68 +0,0 @@ -from pathlib import Path -from typing import Hashable, Iterable, Mapping, Optional - -from xarray import Dataset, Index - -from virtualizarr.readers.api import VirtualBackend -from virtualizarr.translators.kerchunk import ( - virtual_vars_and_metadata_from_kerchunk_refs, -) -from virtualizarr.xarray import ( - construct_fully_virtual_dataset, - construct_virtual_dataset, -) - - -class NetCDF3VirtualBackend(VirtualBackend): - @staticmethod - def open_virtual_dataset( - filepath: str, - group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - indexes: Mapping[str, Index] | None = None, - virtual_backend_kwargs: Optional[dict] = None, - reader_options: Optional[dict] = None, - ) -> Dataset: - from kerchunk.netCDF3 import NetCDF3ToZarr - - if virtual_backend_kwargs: - raise NotImplementedError( - "netcdf3 reader does not understand any virtual_backend_kwargs" - ) - - _drop_vars: list[Hashable] = ( - [] if drop_variables is None else list(drop_variables) - ) - - refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate() - - # both group=None and group='' mean to read root group - if group: - raise ValueError( - "group kwarg passed, but netCDF3 files can't have multiple groups!" 
- ) - - virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( - refs, - fs_root=Path.cwd().as_uri(), - ) - - fully_virtual_dataset = construct_fully_virtual_dataset( - virtual_vars=virtual_vars, - coord_names=coord_names, - attrs=attrs, - ) - - vds = construct_virtual_dataset( - fully_virtual_ds=fully_virtual_dataset, - filepath=filepath, - group=group, - loadable_variables=loadable_variables, - reader_options=reader_options, - indexes=indexes, - decode_times=decode_times, - ) - - return vds.drop_vars(_drop_vars) diff --git a/virtualizarr/readers/tiff.py b/virtualizarr/readers/tiff.py deleted file mode 100644 index f4dbcfe73..000000000 --- a/virtualizarr/readers/tiff.py +++ /dev/null @@ -1,79 +0,0 @@ -import warnings -from pathlib import Path -from typing import Hashable, Iterable, Mapping, Optional - -from xarray import Dataset, Index - -from virtualizarr.readers.api import VirtualBackend -from virtualizarr.translators.kerchunk import ( - extract_group, - virtual_vars_and_metadata_from_kerchunk_refs, -) -from virtualizarr.types.kerchunk import KerchunkStoreRefs -from virtualizarr.xarray import ( - construct_fully_virtual_dataset, - construct_virtual_dataset, -) - - -class TIFFVirtualBackend(VirtualBackend): - @staticmethod - def open_virtual_dataset( - filepath: str, - group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - indexes: Mapping[str, Index] | None = None, - virtual_backend_kwargs: Optional[dict] = None, - reader_options: Optional[dict] = None, - ) -> Dataset: - if virtual_backend_kwargs: - raise NotImplementedError( - "TIFF reader does not understand any virtual_backend_kwargs" - ) - - from kerchunk.tiff import tiff_to_zarr - - if reader_options is None: - reader_options = {} - - reader_options.pop("storage_options", {}) - warnings.warn( - "storage_options have been dropped from reader_options as they are not supported by 
kerchunk.tiff.tiff_to_zarr", - UserWarning, - ) - - _drop_vars: list[Hashable] = ( - [] if drop_variables is None else list(drop_variables) - ) - - # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 - refs = KerchunkStoreRefs({"refs": tiff_to_zarr(filepath, **reader_options)}) - - # both group=None and group='' mean to read root group - if group: - refs = extract_group(refs, group) - - virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( - refs, - fs_root=Path.cwd().as_uri(), - ) - - fully_virtual_dataset = construct_fully_virtual_dataset( - virtual_vars=virtual_vars, - coord_names=coord_names, - attrs=attrs, - ) - - vds = construct_virtual_dataset( - fully_virtual_ds=fully_virtual_dataset, - filepath=filepath, - group=group, - loadable_variables=loadable_variables, - reader_options=reader_options, - indexes=indexes, - decode_times=decode_times, - ) - - return vds.drop_vars(_drop_vars) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 58682e32f..f809214f1 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -3,9 +3,6 @@ import pytest from packaging.version import Version -from virtualizarr.readers import HDF5VirtualBackend -from virtualizarr.readers.hdf import HDFVirtualBackend - requires_network = pytest.mark.network requires_minio = pytest.mark.minio @@ -43,8 +40,3 @@ def _importorskip( has_zarr_python, requires_zarr_python = _importorskip("zarr") has_dask, requires_dask = _importorskip("dask") has_obstore, requires_obstore = _importorskip("obstore") - -parametrize_over_hdf_backends = pytest.mark.parametrize( - "hdf_backend", - [HDF5VirtualBackend, HDFVirtualBackend] if has_kerchunk else [HDFVirtualBackend], -) diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py deleted file mode 100644 index 7fff68aec..000000000 --- a/virtualizarr/tests/test_backend.py +++ /dev/null @@ -1,593 +0,0 @@ 
-import functools -from collections.abc import Mapping -from concurrent.futures import ThreadPoolExecutor -from pathlib import Path -from unittest.mock import patch - -import numpy as np -import pytest -import xarray as xr -import xarray.testing as xrt -from xarray import Dataset, open_dataset -from xarray.core.indexes import Index - -from virtualizarr import open_virtual_dataset, open_virtual_mfdataset -from virtualizarr.backend import ( - FileType, - VirtualBackend, - automatically_determine_filetype, -) -from virtualizarr.manifests import ManifestArray -from virtualizarr.readers import HDF5VirtualBackend -from virtualizarr.readers.hdf import HDFVirtualBackend -from virtualizarr.tests import ( - has_astropy, - parametrize_over_hdf_backends, - requires_dask, - requires_hdf5plugin, - requires_imagecodecs, - requires_lithops, - requires_network, - requires_s3fs, - requires_scipy, -) - - -@requires_scipy -def test_automatically_determine_filetype_netcdf3_netcdf4(): - # test the NetCDF3 vs NetCDF4 automatic file type selection - - ds = xr.Dataset({"a": (["x"], [0, 1])}) - netcdf3_file_path = "/tmp/netcdf3.nc" - netcdf4_file_path = "/tmp/netcdf4.nc" - - # write two version of NetCDF - ds.to_netcdf(netcdf3_file_path, engine="scipy", format="NETCDF3_CLASSIC") - ds.to_netcdf(netcdf4_file_path, engine="h5netcdf") - - assert FileType("netcdf3") == automatically_determine_filetype( - filepath=netcdf3_file_path - ) - assert FileType("hdf5") == automatically_determine_filetype( - filepath=netcdf4_file_path - ) - - -@pytest.mark.parametrize( - "filetype,headerbytes", - [ - ("netcdf3", b"CDF"), - ("hdf5", b"\x89HDF"), - ("grib", b"GRIB"), - ("tiff", b"II*"), - ("fits", b"SIMPLE"), - ], -) -def test_valid_filetype_bytes(tmp_path, filetype, headerbytes): - filepath = tmp_path / "file.abc" - with open(filepath, "wb") as f: - f.write(headerbytes) - assert FileType(filetype) == automatically_determine_filetype(filepath=filepath) - - -def test_notimplemented_filetype(tmp_path): - for 
headerbytes in [b"JUNK", b"\x0e\x03\x13\x01"]: - filepath = tmp_path / "file.abc" - with open(filepath, "wb") as f: - f.write(headerbytes) - with pytest.raises(NotImplementedError): - automatically_determine_filetype(filepath=filepath) - - -def test_FileType(): - # tests if FileType converts user supplied strings to correct filetype - assert "netcdf3" == FileType("netcdf3").name - assert "netcdf4" == FileType("netcdf4").name - assert "hdf4" == FileType("hdf4").name - assert "hdf5" == FileType("hdf5").name - assert "grib" == FileType("grib").name - assert "tiff" == FileType("tiff").name - assert "fits" == FileType("fits").name - with pytest.raises(ValueError): - FileType(None) - - -@parametrize_over_hdf_backends -class TestOpenVirtualDatasetIndexes: - @pytest.mark.xfail(reason="not yet implemented") - def test_specify_no_indexes(self, netcdf4_file, hdf_backend): - vds = open_virtual_dataset(netcdf4_file, backend=hdf_backend, indexes={}) - assert vds.indexes == {} - - @requires_hdf5plugin - @requires_imagecodecs - def test_create_default_indexes_for_loadable_variables( - self, netcdf4_file, hdf_backend - ): - loadable_variables = ["time", "lat"] - - with ( - open_virtual_dataset( - netcdf4_file, - indexes=None, - backend=hdf_backend, - loadable_variables=loadable_variables, - ) as vds, - open_dataset(netcdf4_file, decode_times=True) as ds, - ): - # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 - assert index_mappings_equal(vds.xindexes, ds[loadable_variables].xindexes) - - -def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, Index]): - # Check if the mappings have the same keys - if set(indexes1.keys()) != set(indexes2.keys()): - return False - - # Check if the values for each key are identical - for key in indexes1.keys(): - index1 = indexes1[key] - index2 = indexes2[key] - - if not index1.equals(index2): - return False - 
- return True - - -@requires_hdf5plugin -@requires_imagecodecs -@parametrize_over_hdf_backends -def test_cftime_index(tmp_path: Path, hdf_backend: type[VirtualBackend]): - """Ensure a virtual dataset contains the same indexes as an Xarray dataset""" - # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168 - ds = xr.Dataset( - data_vars={ - "tasmax": (["time", "lat", "lon"], np.random.rand(2, 18, 36)), - }, - coords={ - "time": np.array(["2023-01-01", "2023-01-02"], dtype="datetime64[ns]"), - "lat": np.arange(-90, 90, 10), - "lon": np.arange(-180, 180, 10), - }, - attrs={"attr1_key": "attr1_val"}, - ) - ds.to_netcdf(str(tmp_path / "tmp.nc")) - - with open_virtual_dataset( - str(tmp_path / "tmp.nc"), - loadable_variables=["time", "lat", "lon"], - backend=hdf_backend, - ) as vds: - # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 - assert index_mappings_equal(vds.xindexes, ds.xindexes) - assert list(ds.coords) == list(vds.coords) - assert vds.sizes == ds.sizes - assert vds.attrs == ds.attrs - - -@parametrize_over_hdf_backends -class TestOpenVirtualDatasetAttrs: - def test_drop_array_dimensions(self, netcdf4_file, hdf_backend): - # regression test for GH issue #150 - vds = open_virtual_dataset(netcdf4_file, backend=hdf_backend) - assert "_ARRAY_DIMENSIONS" not in vds["air"].attrs - - def test_coordinate_variable_attrs_preserved(self, netcdf4_file, hdf_backend): - # regression test for GH issue #155 - vds = open_virtual_dataset(netcdf4_file, backend=hdf_backend) - assert vds["lat"].attrs == { - "standard_name": "latitude", - "long_name": "Latitude", - "units": "degrees_north", - "axis": "Y", - } - - -@parametrize_over_hdf_backends -class TestDetermineCoords: - def test_infer_one_dimensional_coords(self, netcdf4_file, hdf_backend): - with open_virtual_dataset(netcdf4_file, backend=hdf_backend) as vds: - assert 
set(vds.coords) == {"time", "lat", "lon"} - - def test_var_attr_coords(self, netcdf4_file_with_2d_coords, hdf_backend): - with open_virtual_dataset( - netcdf4_file_with_2d_coords, backend=hdf_backend - ) as vds: - expected_dimension_coords = ["ocean_time", "s_rho"] - expected_2d_coords = ["lon_rho", "lat_rho", "h"] - expected_1d_non_dimension_coords = ["Cs_r"] - expected_scalar_coords = ["hc", "Vtransform"] - expected_coords = ( - expected_dimension_coords - + expected_2d_coords - + expected_1d_non_dimension_coords - + expected_scalar_coords - ) - assert set(vds.coords) == set(expected_coords) - - -@requires_network -@requires_s3fs -class TestReadFromS3: - @pytest.mark.parametrize( - "indexes", - [ - None, - pytest.param({}, marks=pytest.mark.xfail(reason="not implemented")), - ], - ids=["None index", "empty dict index"], - ) - @parametrize_over_hdf_backends - def test_anon_read_s3(self, indexes, hdf_backend): - """Parameterized tests for empty vs supplied indexes and filetypes.""" - # TODO: Switch away from this s3 url after minIO is implemented. 
- fpath = "s3://carbonplan-share/virtualizarr/local.nc" - with open_virtual_dataset( - fpath, - indexes=indexes, - reader_options={"storage_options": {"anon": True}}, - backend=hdf_backend, - ) as vds: - assert vds.sizes == {"time": 2920, "lat": 25, "lon": 53} - - assert isinstance(vds["air"].data, ManifestArray) - for name in ["time", "lat", "lon"]: - assert isinstance(vds[name].data, np.ndarray) - - -@requires_network -@parametrize_over_hdf_backends -class TestReadFromURL: - @pytest.mark.parametrize( - "filetype, url", - [ - ( - "grib", - "https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib", - ), - pytest.param( - "netcdf3", - "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc", - marks=pytest.mark.xfail( - reason="Big endian not yet supported by zarr-python 3.0" - ), # https://github.com/zarr-developers/zarr-python/issues/2324 - ), - ( - "netcdf4", - "https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc", - ), - pytest.param( - "hdf4", - "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf", - marks=pytest.mark.skip(reason="often times out"), - ), - pytest.param( - "hdf5", - "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5", - marks=pytest.mark.skip(reason="often times out"), - ), - # https://github.com/zarr-developers/VirtualiZarr/issues/159 - # ("hdf5", "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5"), - pytest.param( - "tiff", - "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif", - marks=pytest.mark.xfail(reason="not yet implemented"), - ), - pytest.param( - "fits", - "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits", - marks=[ - pytest.mark.skipif( - not has_astropy, reason="package astropy is not available" - ), - 
pytest.mark.xfail( - reason="Big endian not yet supported by zarr-python 3.0" - ), # https://github.com/zarr-developers/zarr-python/issues/2324 - ], - ), - ( - "jpg", - "https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg", - ), - ], - ) - def test_read_from_url(self, hdf_backend, filetype, url): - if filetype == "netcdf3": - pytest.importorskip("scipy") - if filetype in ["grib", "jpg", "hdf4"]: - with pytest.raises(NotImplementedError): - open_virtual_dataset(url, reader_options={}) - elif filetype == "hdf5": - with open_virtual_dataset( - url, - group="science/LSAR/GCOV/grids/frequencyA", - drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], - reader_options={}, - backend=hdf_backend, - ) as vds: - assert isinstance(vds, xr.Dataset) - else: - with open_virtual_dataset(url) as vds: - assert isinstance(vds, xr.Dataset) - - @pytest.mark.skip(reason="often times out, as nisar file is 200MB") - def test_virtualizarr_vs_local_nisar(self, hdf_backend): - import fsspec - - # Open group directly from locally cached file with xarray - url = "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5" - tmpfile = fsspec.open_local( - f"filecache::{url}", filecache=dict(cache_storage="/tmp", same_names=True) - ) - assert isinstance(tmpfile, str) # make type-checkers happy - hdf_group = "science/LSAR/GCOV/grids/frequencyA" - - with ( - xr.open_dataset( - tmpfile, - engine="h5netcdf", - group=hdf_group, - drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], - phony_dims="access", - ) as dsXR, - # save group reference file via virtualizarr, then open with engine="kerchunk" - open_virtual_dataset( - tmpfile, - group=hdf_group, - drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], - backend=hdf_backend, - ) as vds, - ): - tmpref = "/tmp/cmip6.json" - vds.virtualize.to_kerchunk(tmpref, 
format="json") - - with xr.open_dataset(tmpref, engine="kerchunk") as dsV: - # xrt.assert_identical(dsXR, dsV) #Attribute order changes - xrt.assert_equal(dsXR, dsV) - - -@parametrize_over_hdf_backends -class TestOpenVirtualDatasetHDFGroup: - def test_open_empty_group(self, empty_netcdf4_file, hdf_backend): - with open_virtual_dataset(empty_netcdf4_file, backend=hdf_backend) as vds: - assert isinstance(vds, xr.Dataset) - expected = Dataset() - xrt.assert_identical(vds, expected) - - def test_open_subgroup( - self, netcdf4_file_with_data_in_multiple_groups, hdf_backend - ): - with open_virtual_dataset( - netcdf4_file_with_data_in_multiple_groups, - group="subgroup", - backend=hdf_backend, - ) as vds: - assert list(vds.variables) == ["bar"] - assert isinstance(vds["bar"].data, ManifestArray) - assert vds["bar"].shape == (2,) - - @pytest.mark.parametrize("group", ["", None]) - def test_open_root_group( - self, - netcdf4_file_with_data_in_multiple_groups, - hdf_backend, - group, - ): - with open_virtual_dataset( - netcdf4_file_with_data_in_multiple_groups, - group=group, - backend=hdf_backend, - ) as vds: - assert list(vds.variables) == ["foo"] - assert isinstance(vds["foo"].data, ManifestArray) - assert vds["foo"].shape == (3,) - - -@requires_hdf5plugin -@requires_imagecodecs -class TestLoadVirtualDataset: - @parametrize_over_hdf_backends - @pytest.mark.parametrize( - "loadable_variables, expected_loadable_variables", - [ - ([], []), - (["time"], ["time"]), - (["air", "time"], ["air", "time"]), - (None, ["lat", "lon", "time"]), - ], - ) - def test_loadable_variables( - self, netcdf4_file, hdf_backend, loadable_variables, expected_loadable_variables - ): - with ( - open_virtual_dataset( - netcdf4_file, - loadable_variables=loadable_variables, - backend=hdf_backend, - ) as vds, - xr.open_dataset(netcdf4_file, decode_times=True) as ds, - ): - assert set(vds.variables) == set(ds.variables) - assert set(vds.coords) == set(ds.coords) - - virtual_variables = { - name: var - 
for name, var in vds.variables.items() - if isinstance(var.data, ManifestArray) - } - actual_loadable_variables = { - name: var - for name, var in vds.variables.items() - if not isinstance(var.data, ManifestArray) - } - - assert set(actual_loadable_variables) == set(expected_loadable_variables) - - for var in virtual_variables.values(): - assert isinstance(var.data, ManifestArray) - - for name, var in ds.variables.items(): - if name in actual_loadable_variables: - xrt.assert_identical(vds.variables[name], ds.variables[name]) - - def test_explicit_filetype(self, netcdf4_file): - with pytest.raises(ValueError): - open_virtual_dataset(netcdf4_file, filetype="unknown") - - with pytest.raises(ValueError): - open_virtual_dataset(netcdf4_file, filetype=ManifestArray) # type: ignore - - with pytest.raises(NotImplementedError): - open_virtual_dataset(netcdf4_file, filetype="grib") - - with open_virtual_dataset(netcdf4_file, filetype="netCDF4"): - pass - - def test_explicit_filetype_and_backend(self, netcdf4_file): - with pytest.raises(ValueError): - open_virtual_dataset( - netcdf4_file, filetype="hdf", backend=HDFVirtualBackend - ) - - @parametrize_over_hdf_backends - def test_group_kwarg(self, hdf5_groups_file, hdf_backend): - if hdf_backend == HDFVirtualBackend: - with pytest.raises(KeyError, match="doesn't exist"): - with open_virtual_dataset( - hdf5_groups_file, group="doesnt_exist", backend=hdf_backend - ): - pass - if hdf_backend == HDF5VirtualBackend: - with pytest.raises(ValueError, match="not found in"): - with open_virtual_dataset( - hdf5_groups_file, group="doesnt_exist", backend=hdf_backend - ): - pass - - vars_to_load = ["air", "time"] - with ( - open_virtual_dataset( - hdf5_groups_file, - group="test/group", - loadable_variables=vars_to_load, - backend=hdf_backend, - ) as vds, - xr.open_dataset(hdf5_groups_file, group="test/group") as full_ds, - ): - for name in full_ds.variables: - if name in vars_to_load: - xrt.assert_identical(vds.variables[name], 
full_ds.variables[name]) - - @pytest.mark.xfail(reason="patches a function which no longer exists") - @patch("virtualizarr.translators.kerchunk.read_kerchunk_references_from_file") - def test_open_virtual_dataset_passes_expected_args( - self, mock_read_kerchunk, netcdf4_file - ): - reader_options = {"option1": "value1", "option2": "value2"} - with open_virtual_dataset(netcdf4_file, reader_options=reader_options): - pass - args = { - "filepath": netcdf4_file, - "filetype": None, - "group": None, - "reader_options": reader_options, - } - mock_read_kerchunk.assert_called_once_with(**args) - - @parametrize_over_hdf_backends - def test_open_dataset_with_empty(self, hdf5_empty, hdf_backend): - with open_virtual_dataset(hdf5_empty, backend=hdf_backend) as vds: - assert vds.empty.dims == () - assert vds.empty.attrs == {"empty": "true"} - - @parametrize_over_hdf_backends - def test_open_dataset_with_scalar(self, hdf5_scalar, hdf_backend): - with open_virtual_dataset(hdf5_scalar, backend=hdf_backend) as vds: - assert vds.scalar.dims == () - assert vds.scalar.attrs == {"scalar": "true"} - - -preprocess_func = functools.partial( - xr.Dataset.rename_vars, - air="nair", -) - - -@requires_hdf5plugin -@requires_imagecodecs -@parametrize_over_hdf_backends -class TestOpenVirtualMFDataset: - @pytest.mark.parametrize("invalid_parallel_kwarg", ["ray", Dataset]) - def test_invalid_parallel_kwarg( - self, netcdf4_files_factory, invalid_parallel_kwarg, hdf_backend - ): - filepath1, filepath2 = netcdf4_files_factory() - - with pytest.raises(ValueError, match="Unrecognized argument"): - open_virtual_mfdataset( - [filepath1, filepath2], - combine="nested", - concat_dim="time", - backend=hdf_backend, - parallel=invalid_parallel_kwarg, - ) - - @pytest.mark.parametrize( - "parallel", - [ - False, - ThreadPoolExecutor, - pytest.param("dask", marks=requires_dask), - pytest.param("lithops", marks=requires_lithops), - ], - ) - @pytest.mark.parametrize( - "preprocess", - [ - None, - preprocess_func, 
- ], - ) - def test_parallel_open( - self, netcdf4_files_factory, hdf_backend, parallel, preprocess - ): - filepath1, filepath2 = netcdf4_files_factory() - vds1 = open_virtual_dataset(filepath1, backend=hdf_backend) - vds2 = open_virtual_dataset(filepath2, backend=hdf_backend) - - expected_vds = xr.concat([vds1, vds2], dim="time") - if preprocess: - expected_vds = preprocess_func(expected_vds) - - # test combine nested, which doesn't use in-memory indexes - combined_vds = open_virtual_mfdataset( - [filepath1, filepath2], - combine="nested", - concat_dim="time", - backend=hdf_backend, - parallel=parallel, - preprocess=preprocess, - ) - xrt.assert_identical(combined_vds, expected_vds) - - # test combine by coords using in-memory indexes - combined_vds = open_virtual_mfdataset( - [filepath1, filepath2], - combine="by_coords", - backend=hdf_backend, - parallel=parallel, - preprocess=preprocess, - ) - xrt.assert_identical(combined_vds, expected_vds) - - # test combine by coords again using in-memory indexes but for a glob - file_glob = Path(filepath1).parent.glob("air*.nc") - combined_vds = open_virtual_mfdataset( - file_glob, - combine="by_coords", - backend=hdf_backend, - parallel=parallel, - preprocess=preprocess, - ) - xrt.assert_identical(combined_vds, expected_vds) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index af61df828..afc8f2ca5 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -10,19 +10,18 @@ from conftest import ARRAYBYTES_CODEC, ZLIB_CODEC from virtualizarr import open_virtual_dataset -from virtualizarr.backend import VirtualBackend -from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.manifests import ChunkManifest, ManifestArray, ManifestStore +from virtualizarr.manifests.utils import create_v3_array_metadata +from virtualizarr.parsers import HDFParser, ZarrParser from virtualizarr.tests import ( has_fastparquet, 
has_icechunk, has_kerchunk, - parametrize_over_hdf_backends, requires_kerchunk, requires_zarr_python, ) -from virtualizarr.translators.kerchunk import ( - dataset_from_kerchunk_refs, -) +from virtualizarr.tests.utils import obstore_local +from virtualizarr.translators.kerchunk import manifestgroup_from_kerchunk_refs RoundtripFunction: TypeAlias = Callable[ Concatenate[xr.Dataset | xr.DataTree, Path, ...], xr.Dataset | xr.DataTree @@ -36,8 +35,13 @@ def test_kerchunk_roundtrip_in_memory_no_concat(array_v3_metadata): "0.1": {"path": "/foo.nc", "offset": 200, "length": 100}, } manifest = ChunkManifest(entries=chunks_dict) + metadata = create_v3_array_metadata( + shape=(2, 4), + chunk_shape=(2, 4), + data_type=np.dtype("float32"), + ) marr = ManifestArray( - metadata=array_v3_metadata(shape=(2, 4), chunks=(2, 4)), + metadata=metadata, chunkmanifest=manifest, ) vds = xr.Dataset({"a": (["x", "y"], marr)}) @@ -45,8 +49,10 @@ def test_kerchunk_roundtrip_in_memory_no_concat(array_v3_metadata): # Use accessor to write it out to kerchunk reference dict ds_refs = vds.virtualize.to_kerchunk(format="dict") - # Use dataset_from_kerchunk_refs to reconstruct the dataset - roundtrip = dataset_from_kerchunk_refs(ds_refs) + # reconstruct the dataset + manifestgroup = manifestgroup_from_kerchunk_refs(ds_refs) + manifeststore = ManifestStore(group=manifestgroup) + roundtrip = manifeststore.to_virtual_dataset(loadable_variables=[]) # Assert equal to original dataset xrt.assert_equal(roundtrip, vds) @@ -65,9 +71,10 @@ def test_kerchunk_roundtrip_in_memory_no_concat(array_v3_metadata): ), ], ) -@parametrize_over_hdf_backends def test_numpy_arrays_to_inlined_kerchunk_refs( - netcdf4_file, inline_threshold, vars_to_inline, hdf_backend + netcdf4_file, + inline_threshold, + vars_to_inline, ): from kerchunk.hdf import SingleHdf5ToZarr @@ -77,8 +84,13 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( ).translate() # loading the variables should produce same result as inlining them using 
kerchunk + store = obstore_local(netcdf4_file) + parser = HDFParser() with open_virtual_dataset( - netcdf4_file, loadable_variables=vars_to_inline, backend=hdf_backend + file_url=netcdf4_file, + object_store=store, + parser=parser, + loadable_variables=vars_to_inline, ) as vds: refs = vds.virtualize.to_kerchunk(format="dict") @@ -163,11 +175,17 @@ def test_zarr_roundtrip( tmp_path, roundtrip_func: RoundtripFunction, ): - air_zarr_path = tmp_path / "air_temperature.zarr" + air_zarr_path = str(tmp_path / "air_temperature.zarr") + store = obstore_local(file_url=air_zarr_path) + parser = ZarrParser() with xr.tutorial.open_dataset("air_temperature", decode_times=False) as ds: # TODO: for now we will save as Zarr V3. Later we can parameterize it for V2. ds.to_zarr(air_zarr_path, zarr_format=3, consolidated=False) - with open_virtual_dataset(str(air_zarr_path)) as vds: + with open_virtual_dataset( + file_url=air_zarr_path, + object_store=store, + parser=parser, + ) as vds: roundtrip = roundtrip_func(vds, tmp_path, decode_times=False) # assert all_close to original dataset @@ -177,22 +195,23 @@ def test_zarr_roundtrip( for coord in ds.coords: assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs - @parametrize_over_hdf_backends def test_roundtrip_no_concat( self, tmp_path, roundtrip_func: RoundtripFunction, - hdf_backend: type[VirtualBackend], ): - air_nc_path = tmp_path / "air.nc" + air_nc_path = str(tmp_path / "air.nc") # set up example xarray dataset with xr.tutorial.open_dataset("air_temperature", decode_times=False) as ds: # save it to disk as netCDF (in temporary directory) ds.to_netcdf(air_nc_path) - + store = obstore_local(air_nc_path) + parser = HDFParser() # use open_dataset_via_kerchunk to read it as references - with open_virtual_dataset(str(air_nc_path), backend=hdf_backend) as vds: + with open_virtual_dataset( + file_url=air_nc_path, object_store=store, parser=parser + ) as vds: roundtrip = roundtrip_func(vds, tmp_path, decode_times=False) # assert 
all_close to original dataset xrt.assert_allclose(roundtrip, ds) @@ -204,13 +223,11 @@ def test_roundtrip_no_concat( for coord in ds.coords: assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs - @parametrize_over_hdf_backends @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])]) def test_kerchunk_roundtrip_concat( self, tmp_path: Path, roundtrip_func: RoundtripFunction, - hdf_backend: type[VirtualBackend], decode_times: bool, time_vars: list[str], ): @@ -223,22 +240,26 @@ def test_kerchunk_roundtrip_concat( ds2 = ds.isel(time=slice(1460, None)) # save it to disk as netCDF (in temporary directory) - air1_nc_path = tmp_path / "air1.nc" - air2_nc_path = tmp_path / "air2.nc" + air1_nc_path = str(tmp_path / "air1.nc") + air2_nc_path = str(tmp_path / "air2.nc") ds1.to_netcdf(air1_nc_path) ds2.to_netcdf(air2_nc_path) # use open_dataset_via_kerchunk to read it as references + parser = HDFParser() + store = obstore_local(str(air1_nc_path)) with ( open_virtual_dataset( - str(air1_nc_path), + file_url=air1_nc_path, + object_store=store, + parser=parser, loadable_variables=time_vars, - backend=hdf_backend, ) as vds1, open_virtual_dataset( - str(air2_nc_path), + file_url=air2_nc_path, + object_store=store, + parser=parser, loadable_variables=time_vars, - backend=hdf_backend, ) as vds2, ): if not decode_times: @@ -270,26 +291,25 @@ def test_kerchunk_roundtrip_concat( == ds.time.encoding["calendar"] ) - @parametrize_over_hdf_backends def test_non_dimension_coordinates( self, tmp_path: Path, roundtrip_func: RoundtripFunction, - hdf_backend: type[VirtualBackend], ): # regression test for GH issue #105 - if hdf_backend: - pytest.xfail("To fix coordinate behavior with HDF reader") - # set up example xarray dataset containing non-dimension coordinate variables ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6.0).reshape(2, 3))}) # save it to disk as netCDF (in temporary directory) - nc_path = tmp_path / "non_dim_coords.nc" + nc_path = 
str(tmp_path / "non_dim_coords.nc") ds.to_netcdf(nc_path) - with open_virtual_dataset(str(nc_path), backend=hdf_backend) as vds: + store = obstore_local(nc_path) + parser = HDFParser() + with open_virtual_dataset( + file_url=nc_path, object_store=store, parser=parser + ) as vds: assert "lat" in vds.coords assert "coordinates" not in vds.attrs @@ -337,7 +357,6 @@ def test_datetime64_dtype_fill_value( assert roundtrip.a.attrs == vds.a.attrs -@parametrize_over_hdf_backends @pytest.mark.parametrize( "roundtrip_func", [roundtrip_as_in_memory_icechunk] if has_icechunk else [] ) @@ -347,7 +366,6 @@ def test_datetime64_dtype_fill_value( def test_datatree_roundtrip( tmp_path: Path, roundtrip_func: RoundtripFunction, - hdf_backend: type[VirtualBackend], decode_times: bool, time_vars: list[str], inherit: bool, @@ -359,24 +377,28 @@ def test_datatree_roundtrip( ds2 = ds.isel(time=slice(1460, None)) # save it to disk as netCDF (in temporary directory) - air1_nc_path = tmp_path / "air1.nc" - air2_nc_path = tmp_path / "air2.nc" + air1_nc_path = str(tmp_path / "air1.nc") + air2_nc_path = str(tmp_path / "air2.nc") ds1.to_netcdf(air1_nc_path) ds2.to_netcdf(air2_nc_path) + store = obstore_local(file_url=air1_nc_path) + parser = HDFParser() # use open_dataset_via_kerchunk to read it as references with ( open_virtual_dataset( - str(air1_nc_path), + file_url=air1_nc_path, + object_store=store, + parser=parser, loadable_variables=time_vars, decode_times=decode_times, - backend=hdf_backend, ) as vds1, open_virtual_dataset( - str(air2_nc_path), + file_url=air2_nc_path, + object_store=store, + parser=parser, loadable_variables=time_vars, decode_times=decode_times, - backend=hdf_backend, ) as vds2, ): if not decode_times or not time_vars: @@ -433,32 +455,47 @@ def test_datatree_roundtrip( ) -@parametrize_over_hdf_backends -def test_open_scalar_variable(tmp_path: Path, hdf_backend: type[VirtualBackend]): +def test_open_scalar_variable(tmp_path: Path): # regression test for GH issue #100 - 
nc_path = tmp_path / "scalar.nc" + nc_path = str(tmp_path / "scalar.nc") ds = xr.Dataset(data_vars={"a": 0}) ds.to_netcdf(nc_path) - with open_virtual_dataset(str(nc_path), backend=hdf_backend) as vds: + store = obstore_local(nc_path) + parser = HDFParser() + with open_virtual_dataset( + file_url=nc_path, + object_store=store, + parser=parser, + ) as vds: assert vds["a"].shape == () -@parametrize_over_hdf_backends class TestPathsToURIs: - def test_convert_absolute_paths_to_uris(self, netcdf4_file, hdf_backend): - with open_virtual_dataset(netcdf4_file, backend=hdf_backend) as vds: + def test_convert_absolute_paths_to_uris(self, netcdf4_file): + store = obstore_local(file_url=netcdf4_file) + parser = HDFParser() + with open_virtual_dataset( + file_url=netcdf4_file, + object_store=store, + parser=parser, + ) as vds: expected_path = Path(netcdf4_file).as_uri() manifest = vds["air"].data.manifest.dict() path = manifest["0.0.0"]["path"] assert path == expected_path - def test_convert_relative_paths_to_uris(self, netcdf4_file, hdf_backend): + def test_convert_relative_paths_to_uris(self, netcdf4_file): relative_path = relpath(netcdf4_file) - - with open_virtual_dataset(relative_path, backend=hdf_backend) as vds: + store = obstore_local(relative_path) + parser = HDFParser() + with open_virtual_dataset( + file_url=relative_path, + object_store=store, + parser=parser, + ) as vds: expected_path = Path(netcdf4_file).as_uri() manifest = vds["air"].data.manifest.dict() path = manifest["0.0.0"]["path"] diff --git a/virtualizarr/tests/test_manifests/test_store.py b/virtualizarr/tests/test_manifests/test_store.py index f1a114f04..b5e3fed0d 100644 --- a/virtualizarr/tests/test_manifests/test_store.py +++ b/virtualizarr/tests/test_manifests/test_store.py @@ -6,6 +6,7 @@ import numpy as np import pytest +from obstore.store import MemoryStore from zarr.abc.store import ( OffsetByteRequest, RangeByteRequest, @@ -21,13 +22,12 @@ ManifestStore, ObjectStoreRegistry, ) -from 
virtualizarr.manifests.store import default_object_store +from virtualizarr.manifests.store import get_store_prefix from virtualizarr.manifests.utils import create_v3_array_metadata from virtualizarr.tests import ( requires_hdf5plugin, requires_imagecodecs, requires_minio, - requires_network, requires_obstore, ) @@ -126,49 +126,6 @@ def s3_store(minio_bucket): ) -@requires_obstore -@requires_minio -def test_default_object_store_s3(minio_bucket): - from obstore.store import S3Store - - filepath = f"s3://{minio_bucket['bucket']}/data/data.tmp" - store = default_object_store( - filepath, - ) - assert isinstance(store, S3Store) - - -@requires_obstore -@requires_minio -def test_default_object_store_http(minio_bucket): - from obstore.store import HTTPStore - - filepath = minio_bucket["endpoint"] - store = default_object_store( - filepath, - ) - assert isinstance(store, HTTPStore) - - -@requires_obstore -def test_default_object_store_local(tmpdir): - from obstore.store import LocalStore - - filepath = f"{tmpdir}/data.tmp" - store = default_object_store(filepath) - assert isinstance(store, LocalStore) - - -@requires_network -@requires_obstore -def test_default_region_raises(): - file = "s3://cworthy/oae-efficiency-atlas/data/experiments/000/01/alk-forcing.000-1999-01.pop.h.0347-01.nc" - with pytest.raises( - ValueError, match="Unable to automatically determine region for bucket*" - ): - default_object_store(file) - - @requires_obstore class TestManifestStore: def test_manifest_store_properties(self, local_store): @@ -279,7 +236,10 @@ class TestToVirtualXarray: ], ) def test_single_group_to_dataset( - self, manifest_array, loadable_variables, expected_loadable_variables + self, + manifest_array, + loadable_variables, + expected_loadable_variables, ): marr1 = manifest_array( shape=(3, 2, 5), chunks=(1, 2, 1), dimension_names=["x", "y", "t"] @@ -287,6 +247,16 @@ def test_single_group_to_dataset( marr2 = manifest_array(shape=(3, 2), chunks=(1, 2), dimension_names=["x", "y"]) 
marr3 = manifest_array(shape=(5,), chunks=(5,), dimension_names=["t"]) + paths1 = list({v["path"] for v in marr1.manifest.values()}) + paths2 = list({v["path"] for v in marr2.manifest.values()}) + paths3 = list({v["path"] for v in marr2.manifest.values()}) + unique_paths = list(set(paths1 + paths2 + paths3)) + stores = {} + for path in unique_paths: + store = MemoryStore() + stores[get_store_prefix(path)] = store + store_registry = ObjectStoreRegistry(stores=stores) + manifest_group = ManifestGroup( arrays={ "T": marr1, # data variable @@ -296,7 +266,7 @@ def test_single_group_to_dataset( attributes={"coordinates": "elevation t", "ham": "eggs"}, ) - manifest_store = ManifestStore(manifest_group) + manifest_store = ManifestStore(manifest_group, store_registry=store_registry) vds = manifest_store.to_virtual_dataset(loadable_variables=loadable_variables) assert set(vds.variables) == set(["T", "elevation", "t"]) diff --git a/virtualizarr/tests/test_parsers/__init__.py b/virtualizarr/tests/test_parsers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_parsers/conftest.py similarity index 100% rename from virtualizarr/tests/test_readers/conftest.py rename to virtualizarr/tests/test_parsers/conftest.py diff --git a/virtualizarr/tests/test_readers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py similarity index 74% rename from virtualizarr/tests/test_readers/test_dmrpp.py rename to virtualizarr/tests/test_parsers/test_dmrpp.py index f45783cc3..4945696cf 100644 --- a/virtualizarr/tests/test_readers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -1,5 +1,6 @@ import os import textwrap +from contextlib import nullcontext from pathlib import Path from xml.etree import ElementTree as ET @@ -8,15 +9,17 @@ import xarray as xr import xarray.testing as xrt -from virtualizarr import open_virtual_dataset from virtualizarr.manifests.manifest import 
ChunkManifest -from virtualizarr.readers.dmrpp import DMRParser +from virtualizarr.parsers import DMRPPParser, HDFParser +from virtualizarr.parsers.dmrpp import DMRParser from virtualizarr.tests import requires_network +from virtualizarr.tests.utils import obstore_local, obstore_s3 +from virtualizarr.xarray import open_virtual_dataset urls = [ ( - "https://its-live-data.s3-us-west-2.amazonaws.com/test-space/cloud-experiments/dmrpp/20240826090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc", - "https://its-live-data.s3-us-west-2.amazonaws.com/test-space/cloud-experiments/dmrpp/20240826090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc.dmrpp", + "s3://its-live-data/test-space/cloud-experiments/dmrpp/20240826090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc", + "s3://its-live-data/test-space/cloud-experiments/dmrpp/20240826090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc.dmrpp", ) # TODO: later add MUR, SWOT, TEMPO and others by using kerchunk JSON to read refs (rather than reading the whole netcdf file) ] @@ -179,11 +182,44 @@ def dmrparser(dmrpp_xml_str: str, tmp_path: Path, filename="test.nc") -> DMRPars @requires_network @pytest.mark.parametrize("data_url, dmrpp_url", urls) -@pytest.mark.skip(reason="Fill_val mismatch") def test_NASA_dmrpp(data_url, dmrpp_url): - result = open_virtual_dataset(dmrpp_url, filetype="dmrpp", loadable_variables=[]) - expected = open_virtual_dataset(data_url, loadable_variables=[]) - xr.testing.assert_identical(result, expected) + store = obstore_s3( + file_url=dmrpp_url, + region="us-west-2", + ) + + with ( + open_virtual_dataset( + file_url=dmrpp_url, + object_store=store, + parser=DMRPPParser(), + loadable_variables=[], + ) as actual, + open_virtual_dataset( + file_url=data_url, + object_store=store, + parser=HDFParser(), + loadable_variables=[], + ) as expected, + ): + xr.testing.assert_identical(actual, expected) + + +@requires_network +@pytest.mark.parametrize("data_url, dmrpp_url", urls) +def 
test_NASA_dmrpp_load(data_url, dmrpp_url): + store = obstore_s3( + file_url=dmrpp_url, + region="us-west-2", + ) + + parser = DMRPPParser() + manifest_store = parser(file_url=dmrpp_url, object_store=store) + + with xr.open_dataset( + manifest_store, engine="zarr", consolidated=False, zarr_format=3 + ) as ds: + assert ds.load() @pytest.mark.parametrize( @@ -230,31 +266,58 @@ def test_split_groups(tmp_path, dmrpp_xml_str_key, group_path): assert result_tags == expected_tags -def test_parse_dataset(tmp_path): +@pytest.mark.parametrize( + "group,warns", + [ + pytest.param(None, False, id="None"), + pytest.param("/", False, id="/"), + pytest.param("/no-such-group", True, id="/no-such-group"), + ], +) +def test_parse_dataset_basic(group: str | None, warns: bool, tmp_path: Path): basic_dmrpp = dmrparser(DMRPP_XML_STRINGS["basic"], tmp_path=tmp_path) + store = obstore_local(file_url=basic_dmrpp.data_filepath) + + with nullcontext() if warns else pytest.raises(BaseException, match="DID NOT WARN"): + with pytest.warns(UserWarning, match=f"ignoring group parameter {group!r}"): + ms = basic_dmrpp.parse_dataset(object_store=store, group=group) + + vds = ms.to_virtual_dataset() - vds = basic_dmrpp.parse_dataset() assert vds.sizes == {"x": 720, "y": 1440, "z": 3} assert vds.data_vars.keys() == {"data", "mask"} assert vds.data_vars["data"].dims == ("x", "y") assert vds.attrs == {"Conventions": "CF-1.6", "title": "Sample Dataset"} assert vds.coords.keys() == {"x", "y", "z"} + +def test_parse_dataset_nested(tmp_path: Path): nested_groups_dmrpp = dmrparser( DMRPP_XML_STRINGS["nested_groups"], tmp_path=tmp_path ) + store = obstore_local(file_url=nested_groups_dmrpp.data_filepath) + + vds_root_implicit = nested_groups_dmrpp.parse_dataset( + object_store=store + ).to_virtual_dataset(loadable_variables=[]) + vds_root = nested_groups_dmrpp.parse_dataset( + group="/", object_store=store + ).to_virtual_dataset(loadable_variables=[]) - vds_root_implicit = nested_groups_dmrpp.parse_dataset() 
- vds_root = nested_groups_dmrpp.parse_dataset(group="/") xrt.assert_identical(vds_root_implicit, vds_root) assert vds_root.sizes == {"a": 10, "b": 10} assert vds_root.coords.keys() == {"a", "b"} - vds_g1 = nested_groups_dmrpp.parse_dataset(group="/group1") + vds_g1 = nested_groups_dmrpp.parse_dataset( + group="/group1", object_store=store + ).to_virtual_dataset(loadable_variables=[]) assert vds_g1.sizes == {"x": 720, "y": 1440} assert vds_g1.coords.keys() == {"x", "y"} - vds_g2 = nested_groups_dmrpp.parse_dataset(group="/group1/group2") + vds_g2 = nested_groups_dmrpp.parse_dataset( + group="/group1/group2", object_store=store + ).to_virtual_dataset(loadable_variables=[]) + assert vds_g2.sizes == {"x": 720, "y": 1440} assert vds_g2.data_vars.keys() == {"area"} assert vds_g2.data_vars["area"].dims == ("x", "y") @@ -291,25 +354,17 @@ def test_parse_variable(tmp_path): basic_dmrpp = dmrparser(DMRPP_XML_STRINGS["basic"], tmp_path=tmp_path) var = basic_dmrpp._parse_variable(basic_dmrpp.find_node_fqn("/data")) - assert var.dtype == "float32" - assert var.dims == ("x", "y") + assert var.metadata.dtype == "float32" + assert var.metadata.dimension_names == ("x", "y") assert var.shape == (720, 1440) - assert var.data.metadata.to_dict()["chunk_grid"]["configuration"][ - "chunk_shape" - ] == (360, 720) - assert var.data.metadata.fill_value == -32768 - assert var.encoding == { - "add_offset": 298.15, - "scale_factor": 0.001, - "_FillValue": -32768, - } - assert var.attrs == { - "long_name": "analysed sea surface temperature", - "items": [1, 2, 3], - "coordinates": "x y z", - "add_offset": 298.15, - "scale_factor": 0.001, - } + assert var.chunks == (360, 720) + # _FillValue is encoded for array dtype + assert var.metadata.attributes["_FillValue"] == "AAAAAAAA4MA=" + assert var.metadata.attributes["add_offset"] == 298.15 + assert var.metadata.attributes["scale_factor"] == 0.001 + assert var.metadata.attributes["long_name"] == "analysed sea surface temperature" + assert 
var.metadata.attributes["items"] == [1, 2, 3] + assert var.metadata.attributes["coordinates"] == "x y z" @pytest.mark.parametrize( @@ -428,43 +483,53 @@ def test_absolute_path_to_dmrpp_file_containing_relative_path( self, basic_dmrpp_temp_filepath: Path, ): - vds = open_virtual_dataset( - str(basic_dmrpp_temp_filepath), loadable_variables=[], filetype="dmrpp" - ) - path = vds["x"].data.manifest["0"]["path"] - - # by convention, if dmrpp file path is {PATH}.nc.dmrpp, the data filepath should be {PATH}.nc - # and the manifest should only contain absolute file URIs - expected_datafile_path_uri = basic_dmrpp_temp_filepath.as_uri().removesuffix( - ".dmrpp" - ) - assert path == expected_datafile_path_uri + store = obstore_local(file_url=basic_dmrpp_temp_filepath.as_posix()) + parser = DMRPPParser() + with open_virtual_dataset( + file_url=basic_dmrpp_temp_filepath.as_posix(), + object_store=store, + parser=parser, + loadable_variables=[], + ) as vds: + path = vds["x"].data.manifest["0"]["path"] + + # by convention, if dmrpp file path is {PATH}.nc.dmrpp, the data filepath should be {PATH}.nc + # and the manifest should only contain absolute file URIs + expected_datafile_path_uri = ( + basic_dmrpp_temp_filepath.as_uri().removesuffix(".dmrpp") + ) + assert path == expected_datafile_path_uri def test_relative_path_to_dmrpp_file(self, basic_dmrpp_temp_filepath: Path): # test that if a user supplies a relative path to a DMR++ file we still get an absolute path in the manifest relative_dmrpp_filepath = os.path.relpath( str(basic_dmrpp_temp_filepath), start=os.getcwd() ) - - vds = open_virtual_dataset( - relative_dmrpp_filepath, loadable_variables=[], filetype="dmrpp" - ) - path = vds["x"].data.manifest["0"]["path"] - - # by convention, if dmrpp file path is {PATH}.nc.dmrpp, the data filepath should be {PATH}.nc - expected_datafile_path_uri = basic_dmrpp_temp_filepath.as_uri().removesuffix( - ".dmrpp" - ) - assert path == expected_datafile_path_uri - - 
-@pytest.mark.parametrize("drop_variables", [["mask"], ["data", "mask"]]) -def test_drop_variables(basic_dmrpp_temp_filepath: Path, drop_variables): - vds = open_virtual_dataset( - str(basic_dmrpp_temp_filepath), + store = obstore_local(file_url=relative_dmrpp_filepath) + parser = DMRPPParser() + with open_virtual_dataset( + file_url=relative_dmrpp_filepath, + object_store=store, + parser=parser, + loadable_variables=[], + ) as vds: + path = vds["x"].data.manifest["0"]["path"] + + # # by convention, if dmrpp file path is {PATH}.nc.dmrpp, the data filepath should be {PATH}.nc + expected_datafile_path_uri = ( + basic_dmrpp_temp_filepath.as_uri().removesuffix(".dmrpp") + ) + assert path == expected_datafile_path_uri + + +@pytest.mark.parametrize("skip_variables", [["mask"], ["data", "mask"]]) +def test_skip_variables(basic_dmrpp_temp_filepath: Path, skip_variables): + store = obstore_local(file_url=basic_dmrpp_temp_filepath.as_posix()) + parser = DMRPPParser(skip_variables=skip_variables) + with open_virtual_dataset( + file_url=basic_dmrpp_temp_filepath.as_posix(), + object_store=store, + parser=parser, loadable_variables=[], - filetype="dmrpp", - drop_variables=drop_variables, - ) - - assert all(var not in vds for var in drop_variables) + ) as vds: + assert all(var not in vds for var in skip_variables) diff --git a/virtualizarr/tests/test_parsers/test_fits.py b/virtualizarr/tests/test_parsers/test_fits.py new file mode 100644 index 000000000..3b4d86b1b --- /dev/null +++ b/virtualizarr/tests/test_parsers/test_fits.py @@ -0,0 +1,31 @@ +import pytest +from xarray import Dataset + +from virtualizarr import open_virtual_dataset +from virtualizarr.parsers import FITSParser +from virtualizarr.tests import requires_kerchunk, requires_network +from virtualizarr.tests.utils import obstore_s3 + +pytest.importorskip("astropy") + + +@requires_kerchunk +@requires_network +@pytest.mark.xfail( + reason="Big endian not yet supported by zarr-python 3.0" +) # 
https://github.com/zarr-developers/zarr-python/issues/2324 +def test_open_hubble_data(): + # data from https://registry.opendata.aws/hst/ + file_url = "s3://stpubdata/hst/public/f05i/f05i0201m/f05i0201m_a1f.fits" + store = obstore_s3(file_url=file_url, region="us-west-2") + parser = FITSParser(reader_options={"storage_options": {"anon": True}}) + with open_virtual_dataset( + file_url=file_url, + object_store=store, + parser=parser, + ) as vds: + assert isinstance(vds, Dataset) + assert list(vds.variables) == ["PRIMARY"] + var = vds["PRIMARY"].variable + assert var.sizes == {"y": 17, "x": 589} + assert var.dtype == ">i4" diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py new file mode 100644 index 000000000..4a48572d7 --- /dev/null +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py @@ -0,0 +1,251 @@ +import h5py # type: ignore +import numpy as np +import pytest + +from virtualizarr import open_virtual_dataset +from virtualizarr.parsers import HDFParser +from virtualizarr.tests import ( + requires_hdf5plugin, + requires_imagecodecs, +) +from virtualizarr.tests.utils import obstore_local + + +@requires_hdf5plugin +@requires_imagecodecs +class TestDatasetChunkManifest: + @pytest.mark.xfail( + reason="Tutorial data non coord dimensions are serialized with big endidan types and internally dropped" + ) + def test_empty_chunks(self, empty_chunks_hdf5_file): + store = obstore_local(file_url=empty_chunks_hdf5_file) + parser = HDFParser() + manifest_store = parser(file_url=empty_chunks_hdf5_file, object_store=store) + assert manifest_store._group.arrays["data"].shape == (0,) + + def test_empty_dataset(self, empty_dataset_hdf5_file): + store = obstore_local(file_url=empty_dataset_hdf5_file) + parser = HDFParser() + manifest_store = parser(file_url=empty_dataset_hdf5_file, object_store=store) + assert manifest_store._group.arrays["data"].shape == (0,) + + def test_no_chunking(self, 
no_chunks_hdf5_file): + store = obstore_local(file_url=no_chunks_hdf5_file) + parser = HDFParser() + manifest_store = parser(file_url=no_chunks_hdf5_file, object_store=store) + assert manifest_store._group.arrays["data"].manifest.shape_chunk_grid == (1, 1) + + def test_chunked(self, chunked_hdf5_file): + store = obstore_local(file_url=chunked_hdf5_file) + parser = HDFParser() + manifest_store = parser(file_url=chunked_hdf5_file, object_store=store) + assert manifest_store._group.arrays["data"].manifest.shape_chunk_grid == (2, 2) + + def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_file): + store = obstore_local(file_url=chunked_roundtrip_hdf5_file) + parser = HDFParser() + manifest_store = parser( + file_url=chunked_roundtrip_hdf5_file, object_store=store + ) + assert manifest_store._group.arrays["var2"].manifest.shape_chunk_grid == (2, 8) + + +@requires_hdf5plugin +@requires_imagecodecs +class TestDatasetDims: + def test_single_dimension_scale(self, single_dimension_scale_hdf5_file): + store = obstore_local(file_url=single_dimension_scale_hdf5_file) + parser = HDFParser() + manifest_store = parser( + file_url=single_dimension_scale_hdf5_file, object_store=store + ) + assert manifest_store._group.arrays["data"].metadata.dimension_names == ("x",) + + def test_is_dimension_scale(self, is_scale_hdf5_file): + store = obstore_local(file_url=is_scale_hdf5_file) + parser = HDFParser() + manifest_store = parser(file_url=is_scale_hdf5_file, object_store=store) + assert manifest_store._group.arrays["data"].metadata.dimension_names == ( + "data", + ) + + def test_multiple_dimension_scales(self, multiple_dimension_scales_hdf5_file): + store = obstore_local(file_url=multiple_dimension_scales_hdf5_file) + parser = HDFParser() + with pytest.raises(ValueError, match="dimension scales attached"): + parser(file_url=multiple_dimension_scales_hdf5_file, object_store=store) + + def test_no_dimension_scales(self, no_chunks_hdf5_file): + store = 
obstore_local(file_url=no_chunks_hdf5_file) + parser = HDFParser() + manifest_store = parser(file_url=no_chunks_hdf5_file, object_store=store) + assert manifest_store._group.arrays["data"].metadata.dimension_names == ( + "phony_dim_0", + "phony_dim_1", + ) + + +@requires_hdf5plugin +@requires_imagecodecs +class TestDatasetToManifestArray: + def test_chunked_dataset(self, chunked_dimensions_netcdf4_file): + store = obstore_local(file_url=chunked_dimensions_netcdf4_file) + parser = HDFParser() + manifest_store = parser( + file_url=chunked_dimensions_netcdf4_file, object_store=store + ) + assert manifest_store._group.arrays["data"].chunks == (50, 50) + + def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file): + store = obstore_local(file_url=single_dimension_scale_hdf5_file) + parser = HDFParser() + manifest_store = parser( + file_url=single_dimension_scale_hdf5_file, object_store=store + ) + assert manifest_store._group.arrays["data"].chunks == (2,) + + def test_dataset_attributes(self, string_attributes_hdf5_file): + store = obstore_local(file_url=string_attributes_hdf5_file) + parser = HDFParser() + manifest_store = parser( + file_url=string_attributes_hdf5_file, object_store=store + ) + metadata = manifest_store._group.arrays["data"].metadata + assert metadata.attributes["attribute_name"] == "attribute_name" + + def test_scalar_fill_value(self, scalar_fill_value_hdf5_file): + store = obstore_local(file_url=scalar_fill_value_hdf5_file) + parser = HDFParser() + manifest_store = parser( + file_url=scalar_fill_value_hdf5_file, object_store=store + ) + metadata = manifest_store._group.arrays["data"].metadata + assert metadata.fill_value == 42 + + def test_cf_fill_value(self, cf_fill_value_hdf5_file): + f = h5py.File(cf_fill_value_hdf5_file) + ds = f["data"] + if ds.dtype.kind in "S": + pytest.xfail("Investigate fixed-length binary encoding in Zarr v3") + if ds.dtype.names: + pytest.xfail("To fix, structured dtype fill value encoding for Zarr parser") + 
store = obstore_local(file_url=cf_fill_value_hdf5_file) + parser = HDFParser() + manifest_store = parser(file_url=cf_fill_value_hdf5_file, object_store=store) + metadata = manifest_store._group.arrays["data"].metadata + assert "_FillValue" in metadata.attributes + + def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file): + store = obstore_local(file_url=cf_array_fill_value_hdf5_file) + parser = HDFParser() + manifest_store = parser( + file_url=cf_array_fill_value_hdf5_file, object_store=store + ) + metadata = manifest_store._group.arrays["data"].metadata + assert not isinstance(metadata.attributes["_FillValue"], np.ndarray) + + +@requires_hdf5plugin +@requires_imagecodecs +class TestExtractAttributes: + def test_root_attribute(self, root_attributes_hdf5_file): + store = obstore_local(file_url=root_attributes_hdf5_file) + parser = HDFParser() + manifest_store = parser(file_url=root_attributes_hdf5_file, object_store=store) + assert ( + manifest_store._group.metadata.attributes["attribute_name"] + == "attribute_name" + ) + + def test_multiple_attributes(self, string_attributes_hdf5_file): + store = obstore_local(file_url=string_attributes_hdf5_file) + parser = HDFParser() + manifest_store = parser( + file_url=string_attributes_hdf5_file, object_store=store + ) + metadata = manifest_store._group.arrays["data"].metadata + assert len(metadata.attributes.keys()) == 2 + + +@requires_hdf5plugin +@requires_imagecodecs +class TestManifestGroupFromHDF: + def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): + store = obstore_local(file_url=chunked_dimensions_netcdf4_file) + parser = HDFParser() + manifest_store = parser( + file_url=chunked_dimensions_netcdf4_file, object_store=store + ) + assert len(manifest_store._group.arrays) == 3 + + def test_nested_groups_are_ignored(self, nested_group_hdf5_file): + store = obstore_local(file_url=nested_group_hdf5_file) + parser = HDFParser(group="group") + manifest_store = 
parser(file_url=nested_group_hdf5_file, object_store=store) + assert len(manifest_store._group.arrays) == 1 + + def test_drop_variables(self, multiple_datasets_hdf5_file): + store = obstore_local(file_url=multiple_datasets_hdf5_file) + parser = HDFParser(drop_variables=["data2"]) + manifest_store = parser( + file_url=multiple_datasets_hdf5_file, object_store=store + ) + assert "data2" not in manifest_store._group.arrays.keys() + + def test_dataset_in_group(self, group_hdf5_file): + store = obstore_local(file_url=group_hdf5_file) + parser = HDFParser(group="group") + manifest_store = parser(file_url=group_hdf5_file, object_store=store) + assert len(manifest_store._group.arrays) == 1 + + def test_non_group_error(self, group_hdf5_file): + store = obstore_local(file_url=group_hdf5_file) + parser = HDFParser(group="group/data") + with pytest.raises(ValueError): + parser(file_url=group_hdf5_file, object_store=store) + + +@requires_hdf5plugin +@requires_imagecodecs +class TestOpenVirtualDataset: + def test_coord_names( + self, + root_coordinates_hdf5_file, + ): + store = obstore_local(file_url=root_coordinates_hdf5_file) + parser = HDFParser() + with open_virtual_dataset( + file_url=root_coordinates_hdf5_file, + object_store=store, + parser=parser, + ) as vds: + assert set(vds.coords) == {"lat", "lon"} + + @pytest.mark.xfail(reason="Requires Zarr v3 big endian dtype support") + def test_big_endian( + self, + big_endian_dtype_hdf5_file, + ): + store = obstore_local(file_url=big_endian_dtype_hdf5_file) + parser = HDFParser() + with open_virtual_dataset( + file_url=big_endian_dtype_hdf5_file, + object_store=store, + parser=parser, + ) as vds: + print(vds) + + +@requires_hdf5plugin +@requires_imagecodecs +@pytest.mark.parametrize("group", [None, "/", "subgroup", "subgroup/", "/subgroup/"]) +def test_subgroup_variable_names(netcdf4_file_with_data_in_multiple_groups, group): + # regression test for GH issue #364 + store = 
obstore_local(file_url=netcdf4_file_with_data_in_multiple_groups) + parser = HDFParser(group=group) + with open_virtual_dataset( + file_url=netcdf4_file_with_data_in_multiple_groups, + object_store=store, + parser=parser, + ) as vds: + assert list(vds.dims) == ["dim_0"] diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf_filters.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf_filters.py similarity index 99% rename from virtualizarr/tests/test_readers/test_hdf/test_hdf_filters.py rename to virtualizarr/tests/test_parsers/test_hdf/test_hdf_filters.py index b4eff29cf..6062fd308 100644 --- a/virtualizarr/tests/test_readers/test_hdf/test_hdf_filters.py +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf_filters.py @@ -11,7 +11,7 @@ warnings.warn("imagecodecs is required for HDF reader") -from virtualizarr.readers.hdf.filters import ( +from virtualizarr.parsers.hdf.filters import ( _filter_to_codec, cfcodec_from_dataset, codecs_from_dataset, diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf_integration.py similarity index 67% rename from virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py rename to virtualizarr/tests/test_parsers/test_hdf/test_hdf_integration.py index eaa8d537d..8b1a8adc6 100644 --- a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf_integration.py @@ -2,38 +2,37 @@ import xarray as xr import xarray.testing as xrt -import virtualizarr -from virtualizarr.readers.hdf import HDFVirtualBackend +from virtualizarr import open_virtual_dataset +from virtualizarr.parsers import HDFParser from virtualizarr.tests import ( requires_hdf5plugin, requires_icechunk, requires_imagecodecs, requires_kerchunk, ) -from virtualizarr.tests.test_integration import ( - roundtrip_as_in_memory_icechunk, -) +from virtualizarr.tests.test_integration import roundtrip_as_in_memory_icechunk +from 
virtualizarr.tests.utils import obstore_local @requires_kerchunk @requires_hdf5plugin @requires_imagecodecs class TestIntegration: - @pytest.mark.xfail( - reason="0 time start is being interpreted as fillvalue see issues/280" - ) def test_filters_h5netcdf_roundtrip( self, tmp_path, filter_encoded_roundtrip_hdf5_file ): + store = obstore_local(file_url=filter_encoded_roundtrip_hdf5_file) + parser = HDFParser() with ( xr.open_dataset( filter_encoded_roundtrip_hdf5_file, decode_times=True ) as ds, - virtualizarr.open_virtual_dataset( - filter_encoded_roundtrip_hdf5_file, + open_virtual_dataset( + file_url=filter_encoded_roundtrip_hdf5_file, + object_store=store, + parser=parser, loadable_variables=["time"], cftime_variables=["time"], - backend=HDFVirtualBackend, ) as vds, ): kerchunk_file = str(tmp_path / "kerchunk.json") @@ -47,10 +46,14 @@ def test_filters_netcdf4_roundtrip( self, tmp_path, filter_encoded_roundtrip_netcdf4_file ): filepath = filter_encoded_roundtrip_netcdf4_file["filepath"] + store = obstore_local(file_url=filepath) + parser = HDFParser() with ( xr.open_dataset(filepath) as ds, - virtualizarr.open_virtual_dataset( - filepath, backend=HDFVirtualBackend + open_virtual_dataset( + file_url=filepath, + object_store=store, + parser=parser, ) as vds, ): kerchunk_file = str(tmp_path / "kerchunk.json") @@ -59,10 +62,14 @@ def test_filters_netcdf4_roundtrip( xrt.assert_equal(ds, roundtrip) def test_filter_and_cf_roundtrip(self, tmp_path, filter_and_cf_roundtrip_hdf5_file): + store = obstore_local(file_url=filter_and_cf_roundtrip_hdf5_file) + parser = HDFParser() with ( xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) as ds, - virtualizarr.open_virtual_dataset( - filter_and_cf_roundtrip_hdf5_file, backend=HDFVirtualBackend + open_virtual_dataset( + file_url=filter_and_cf_roundtrip_hdf5_file, + object_store=store, + parser=parser, ) as vds, ): kerchunk_file = str(tmp_path / "filter_cf_kerchunk.json") @@ -75,10 +82,14 @@ def test_filter_and_cf_roundtrip(self, 
tmp_path, filter_and_cf_roundtrip_hdf5_fi ) def test_non_coord_dim_roundtrip(self, tmp_path, non_coord_dim): + store = obstore_local(file_url=non_coord_dim) + parser = HDFParser() with ( xr.open_dataset(non_coord_dim) as ds, - virtualizarr.open_virtual_dataset( - non_coord_dim, backend=HDFVirtualBackend + open_virtual_dataset( + file_url=non_coord_dim, + object_store=store, + parser=parser, ) as vds, ): kerchunk_file = str(tmp_path / "kerchunk.json") @@ -88,14 +99,18 @@ def test_non_coord_dim_roundtrip(self, tmp_path, non_coord_dim): @requires_icechunk def test_cf_fill_value_roundtrip(self, tmp_path, cf_fill_value_hdf5_file): + store = obstore_local(file_url=cf_fill_value_hdf5_file) + parser = HDFParser() with xr.open_dataset(cf_fill_value_hdf5_file, engine="h5netcdf") as ds: if ds["data"].dtype in [float, object]: pytest.xfail( "TODO: fix handling fixed-length and structured type fill value" - " encoding in xarray zarr backend." + " encoding in xarray zarr parser." ) - with virtualizarr.open_virtual_dataset( - cf_fill_value_hdf5_file, backend=HDFVirtualBackend + with open_virtual_dataset( + file_url=cf_fill_value_hdf5_file, + object_store=store, + parser=parser, ) as vds: roundtrip = roundtrip_as_in_memory_icechunk( vds, tmp_path, decode_times=False diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf_manifest_store.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf_manifest_store.py new file mode 100644 index 000000000..aea932f00 --- /dev/null +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf_manifest_store.py @@ -0,0 +1,113 @@ +from pathlib import Path +from urllib.parse import urlparse + +import numpy as np +import pytest +import xarray as xr + +from virtualizarr.manifests import ManifestArray +from virtualizarr.parsers import HDFParser +from virtualizarr.tests import ( + requires_hdf5plugin, + requires_minio, + requires_obstore, +) +from virtualizarr.tests.utils import obstore_local + + +@pytest.fixture(name="basic_ds") +def basic_ds(): + x 
= np.arange(100) + y = np.arange(100) + temperature = 0.1 * x[:, None] + 0.1 * y[None, :] + ds = xr.Dataset( + {"temperature": (["x", "y"], temperature)}, + coords={"x": np.arange(100), "y": np.arange(100)}, + ) + return ds + + +@requires_hdf5plugin +class TestHDFManifestStore: + def test_roundtrip_simple_virtualdataset(self, tmpdir, basic_ds): + "Roundtrip a dataset to/from NetCDF with the HDF reader and ManifestStore" + + filepath = f"{tmpdir}/basic_ds_roundtrip.nc" + basic_ds.to_netcdf(filepath, engine="h5netcdf") + store = obstore_local(file_url=filepath) + parser = HDFParser() + manifest_store = parser( + file_url=filepath, + object_store=store, + ) + with xr.open_dataset( + manifest_store, engine="zarr", consolidated=False, zarr_format=3 + ) as rountripped_ds: + xr.testing.assert_allclose(basic_ds, rountripped_ds) + + def test_rountrip_partial_chunk_virtualdataset(self, tmpdir, basic_ds): + "Roundtrip a dataset to/from NetCDF with the HDF reader and ManifestStore with a single partial chunk" + + filepath = f"{tmpdir}/basic_ds_roundtrip.nc" + encoding = { + "temperature": {"chunksizes": (90, 90), "original_shape": (100, 100)} + } + basic_ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + store = obstore_local(file_url=filepath) + parser = HDFParser() + manifest_store = parser( + file_url=filepath, + object_store=store, + ) + with xr.open_dataset( + manifest_store, engine="zarr", consolidated=False, zarr_format=3 + ) as rountripped_ds: + xr.testing.assert_allclose(basic_ds, rountripped_ds) + + def test_rountrip_simple_virtualdataset_default_store(self, tmpdir, basic_ds): + "Roundtrip a dataset to/from NetCDF with the HDF reader and ManifestStore" + + filepath = f"{tmpdir}/basic_ds_roundtrip.nc" + basic_ds.to_netcdf(filepath, engine="h5netcdf") + store = obstore_local(file_url=filepath) + parser = HDFParser() + manifest_store = parser( + file_url=filepath, + object_store=store, + ) + with xr.open_dataset( + manifest_store, engine="zarr", 
consolidated=False, zarr_format=3 + ) as rountripped_ds: + xr.testing.assert_allclose(basic_ds, rountripped_ds) + + @requires_minio + @requires_obstore + def test_store(self, minio_bucket, chunked_roundtrip_hdf5_s3_file): + import obstore as obs + + parsed = urlparse(chunked_roundtrip_hdf5_s3_file) + path_without_file = str(Path(parsed.path).parent) + parsed_without_file = parsed._replace(path=path_without_file) + url_without_file = parsed_without_file.geturl() + + s3store = obs.store.from_url( + url_without_file, + config={ + "virtual_hosted_style_request": False, + "skip_signature": True, + "endpoint_url": "http://localhost:9000", + }, + client_options={"allow_http": True}, + ) + parser = HDFParser() + manifest_store = parser( + file_url=chunked_roundtrip_hdf5_s3_file, object_store=s3store + ) + + with manifest_store.to_virtual_dataset() as vds: + assert vds.dims == {"phony_dim_0": 5} + assert isinstance(vds["data"].data, ManifestArray) + with xr.open_dataset( + manifest_store, engine="zarr", consolidated=False, zarr_format=3 + ) as ds: + assert ds.load() diff --git a/virtualizarr/tests/test_parsers/test_kerchunk.py b/virtualizarr/tests/test_parsers/test_kerchunk.py new file mode 100644 index 000000000..0dbc088e6 --- /dev/null +++ b/virtualizarr/tests/test_parsers/test_kerchunk.py @@ -0,0 +1,345 @@ +from pathlib import Path +from typing import Any, Callable, Generator, Optional + +import numpy as np +import pytest +import ujson +import xarray as xr +import xarray.testing as xrt + +from virtualizarr.manifests import ManifestArray +from virtualizarr.parsers import KerchunkJSONParser, KerchunkParquetParser +from virtualizarr.tests import has_fastparquet, requires_kerchunk +from virtualizarr.tests.utils import obstore_local +from virtualizarr.xarray import open_virtual_dataset + + +def gen_ds_refs( + zgroup: str | None = None, + zarray: str | None = None, + zattrs: str | None = None, + chunks: dict[str, list[str | int]] | None = None, +): + if zgroup is None: + 
zgroup = '{"zarr_format":2}' + if zarray is None: + zarray = '{"chunks":[2,3],"compressor":null,"dtype":" Generator[ + Callable[[Optional[Any], Optional[Any], Optional[Any], Optional[Any]], str], + None, + None, +]: + """ + Fixture which defers creation of the references file until the parameters zgroup etc. are known. + """ + + def _refs_file(zgroup=None, zarray=None, zattrs=None, chunks=None) -> str: + refs = gen_ds_refs(zgroup=zgroup, zarray=zarray, zattrs=zattrs, chunks=chunks) + file_url = tmp_path / "refs.json" + + with open(file_url, "w") as json_file: + ujson.dump(refs, json_file) + + return str(file_url) + + yield _refs_file + + +def test_dataset_from_df_refs(refs_file_factory): + refs_file = refs_file_factory() + store = obstore_local(file_url=refs_file) + parser = KerchunkJSONParser() + with open_virtual_dataset( + file_url=refs_file, object_store=store, parser=parser + ) as vds: + assert "a" in vds + vda = vds["a"] + assert isinstance(vda.data, ManifestArray) + assert vda.dims == ("x", "y") + assert vda.shape == (2, 3) + assert vda.chunks == (2, 3) + assert vda.dtype == np.dtype("i4" diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf.py deleted file mode 100644 index 1cf84e731..000000000 --- a/virtualizarr/tests/test_readers/test_hdf/test_hdf.py +++ /dev/null @@ -1,242 +0,0 @@ -import h5py # type: ignore -import numpy as np -import pytest -from obstore.store import LocalStore - -from virtualizarr import open_virtual_dataset -from virtualizarr.readers.hdf import HDFVirtualBackend -from virtualizarr.tests import ( - requires_hdf5plugin, - requires_imagecodecs, -) - - -@requires_hdf5plugin -@requires_imagecodecs -class TestDatasetChunkManifest: - def test_empty_chunks(self, empty_chunks_hdf5_file): - f = h5py.File(empty_chunks_hdf5_file) - ds = f["data"] - manifest = HDFVirtualBackend._dataset_chunk_manifest( - path=empty_chunks_hdf5_file, dataset=ds - ) - assert manifest.shape_chunk_grid 
== (0,) - - def test_empty_dataset(self, empty_dataset_hdf5_file): - f = h5py.File(empty_dataset_hdf5_file) - ds = f["data"] - manifest = HDFVirtualBackend._dataset_chunk_manifest( - path=empty_dataset_hdf5_file, dataset=ds - ) - assert manifest.shape_chunk_grid == (0,) - - def test_no_chunking(self, no_chunks_hdf5_file): - f = h5py.File(no_chunks_hdf5_file) - ds = f["data"] - manifest = HDFVirtualBackend._dataset_chunk_manifest( - path=no_chunks_hdf5_file, dataset=ds - ) - assert manifest.shape_chunk_grid == (1, 1) - - def test_chunked(self, chunked_hdf5_file): - f = h5py.File(chunked_hdf5_file) - ds = f["data"] - manifest = HDFVirtualBackend._dataset_chunk_manifest( - path=chunked_hdf5_file, dataset=ds - ) - assert manifest.shape_chunk_grid == (2, 2) - - def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_file): - f = h5py.File(chunked_roundtrip_hdf5_file) - ds = f["var2"] - manifest = HDFVirtualBackend._dataset_chunk_manifest( - path=chunked_roundtrip_hdf5_file, dataset=ds - ) - assert manifest.shape_chunk_grid == (2, 8) - - -@requires_hdf5plugin -@requires_imagecodecs -class TestDatasetDims: - def test_single_dimension_scale(self, single_dimension_scale_hdf5_file): - f = h5py.File(single_dimension_scale_hdf5_file) - ds = f["data"] - dims = HDFVirtualBackend._dataset_dims(ds) - assert dims[0] == "x" - - def test_is_dimension_scale(self, is_scale_hdf5_file): - f = h5py.File(is_scale_hdf5_file) - ds = f["data"] - dims = HDFVirtualBackend._dataset_dims(ds) - assert dims[0] == "data" - - def test_multiple_dimension_scales(self, multiple_dimension_scales_hdf5_file): - f = h5py.File(multiple_dimension_scales_hdf5_file) - ds = f["data"] - with pytest.raises(ValueError, match="dimension scales attached"): - HDFVirtualBackend._dataset_dims(ds) - - def test_no_dimension_scales(self, no_chunks_hdf5_file): - f = h5py.File(no_chunks_hdf5_file) - ds = f["data"] - dims = HDFVirtualBackend._dataset_dims(ds) - assert dims == ["phony_dim_0", "phony_dim_1"] - - 
-@requires_hdf5plugin -@requires_imagecodecs -class TestDatasetToManifestArray: - def test_chunked_dataset(self, chunked_dimensions_netcdf4_file): - f = h5py.File(chunked_dimensions_netcdf4_file) - ds = f["data"] - ma = HDFVirtualBackend._construct_manifest_array( - chunked_dimensions_netcdf4_file, ds, group="" - ) - assert ma.chunks == (50, 50) - - def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file): - f = h5py.File(single_dimension_scale_hdf5_file) - ds = f["data"] - ma = HDFVirtualBackend._construct_manifest_array( - single_dimension_scale_hdf5_file, ds, group="" - ) - assert ma.chunks == (2,) - - def test_dataset_attributes(self, string_attributes_hdf5_file): - f = h5py.File(string_attributes_hdf5_file) - ds = f["data"] - ma = HDFVirtualBackend._construct_manifest_array( - string_attributes_hdf5_file, ds, group="" - ) - assert ma.metadata.attributes["attribute_name"] == "attribute_name" - - def test_scalar_fill_value(self, scalar_fill_value_hdf5_file): - f = h5py.File(scalar_fill_value_hdf5_file) - ds = f["data"] - ma = HDFVirtualBackend._construct_manifest_array( - scalar_fill_value_hdf5_file, ds, group="" - ) - assert ma.metadata.fill_value == 42 - - def test_cf_fill_value(self, cf_fill_value_hdf5_file): - f = h5py.File(cf_fill_value_hdf5_file) - ds = f["data"] - if ds.dtype.kind in "S": - pytest.xfail("Investigate fixed-length binary encoding in Zarr v3") - if ds.dtype.names: - pytest.xfail( - "To fix, structured dtype fill value encoding for Zarr backend" - ) - ma = HDFVirtualBackend._construct_manifest_array( - cf_fill_value_hdf5_file, ds, group="" - ) - assert "_FillValue" in ma.metadata.attributes - - def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file): - f = h5py.File(cf_array_fill_value_hdf5_file) - ds = f["data"] - ma = HDFVirtualBackend._construct_manifest_array( - cf_array_fill_value_hdf5_file, ds, group="" - ) - assert not isinstance(ma.metadata.attributes["_FillValue"], np.ndarray) - - -@requires_hdf5plugin 
-@requires_imagecodecs -class TestExtractAttributes: - def test_string_attribute(self, string_attributes_hdf5_file): - f = h5py.File(string_attributes_hdf5_file) - ds = f["data"] - attrs = HDFVirtualBackend._extract_attrs(ds) - assert attrs["attribute_name"] == "attribute_name" - - def test_root_attribute(self, root_attributes_hdf5_file): - f = h5py.File(root_attributes_hdf5_file) - attrs = HDFVirtualBackend._extract_attrs(f) - assert attrs["attribute_name"] == "attribute_name" - - def test_multiple_attributes(self, string_attributes_hdf5_file): - f = h5py.File(string_attributes_hdf5_file) - ds = f["data"] - attrs = HDFVirtualBackend._extract_attrs(ds) - assert len(attrs.keys()) == 2 - - -@requires_hdf5plugin -@requires_imagecodecs -class TestManifestGroupFromHDF: - def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): - store = LocalStore() - manifest_group = HDFVirtualBackend._construct_manifest_group( - store=store, - filepath=chunked_dimensions_netcdf4_file, - ) - assert len(manifest_group.arrays) == 3 - - def test_nested_groups_are_ignored(self, nested_group_hdf5_file): - store = LocalStore() - manifest_group = HDFVirtualBackend._construct_manifest_group( - store=store, - filepath=nested_group_hdf5_file, - group="group", - ) - assert len(manifest_group.arrays) == 1 - - def test_drop_variables(self, multiple_datasets_hdf5_file): - store = LocalStore() - manifest_group = HDFVirtualBackend._construct_manifest_group( - store=store, - filepath=multiple_datasets_hdf5_file, - drop_variables=["data2"], - ) - assert "data2" not in manifest_group.arrays.keys() - - def test_dataset_in_group(self, group_hdf5_file): - store = LocalStore() - manifest_group = HDFVirtualBackend._construct_manifest_group( - store=store, - filepath=group_hdf5_file, - group="group", - ) - assert len(manifest_group.arrays) == 1 - - def test_non_group_error(self, group_hdf5_file): - store = LocalStore() - with pytest.raises(ValueError): - 
HDFVirtualBackend._construct_manifest_group( - store=store, - filepath=group_hdf5_file, - group="group/data", - ) - - -@requires_hdf5plugin -@requires_imagecodecs -class TestOpenVirtualDataset: - def test_coord_names( - self, - root_coordinates_hdf5_file, - ): - vds = HDFVirtualBackend.open_virtual_dataset(root_coordinates_hdf5_file) - assert set(vds.coords) == {"lat", "lon"} - - @pytest.mark.xfail(reason="Requires Zarr v3 big endian dtype support") - def test_big_endian( - self, - big_endian_dtype_hdf5_file, - ): - vds = HDFVirtualBackend.open_virtual_dataset(big_endian_dtype_hdf5_file) - print(vds) - - -@requires_hdf5plugin -@requires_imagecodecs -@pytest.mark.parametrize("group", [None, "subgroup", "subgroup/"]) -def test_subgroup_variable_names(netcdf4_file_with_data_in_multiple_groups, group): - # regression test for GH issue #364 - vds = open_virtual_dataset( - netcdf4_file_with_data_in_multiple_groups, - group=group, - backend=HDFVirtualBackend, - ) - assert list(vds.dims) == ["dim_0"] diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf_manifest_store.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf_manifest_store.py deleted file mode 100644 index 9cad29a5d..000000000 --- a/virtualizarr/tests/test_readers/test_hdf/test_hdf_manifest_store.py +++ /dev/null @@ -1,102 +0,0 @@ -import numpy as np -import pytest -import xarray as xr - -from virtualizarr.manifests import ManifestArray -from virtualizarr.readers.hdf import HDFVirtualBackend -from virtualizarr.tests import ( - requires_hdf5plugin, - requires_minio, - requires_network, - requires_obstore, -) - - -@pytest.fixture(name="basic_ds") -def basic_ds(): - x = np.arange(100) - y = np.arange(100) - temperature = 0.1 * x[:, None] + 0.1 * y[None, :] - ds = xr.Dataset( - {"temperature": (["x", "y"], temperature)}, - coords={"x": np.arange(100), "y": np.arange(100)}, - ) - return ds - - -@requires_hdf5plugin -@requires_obstore -class TestHDFManifestStore: - def 
test_rountrip_simple_virtualdataset(self, tmpdir, basic_ds): - "Roundtrip a dataset to/from NetCDF with the HDF reader and ManifestStore" - - filepath = f"{tmpdir}/basic_ds_roundtrip.nc" - basic_ds.to_netcdf(filepath, engine="h5netcdf") - store = HDFVirtualBackend._create_manifest_store( - filepath=filepath, - ) - rountripped_ds = xr.open_dataset( - store, engine="zarr", consolidated=False, zarr_format=3 - ) - xr.testing.assert_allclose(basic_ds, rountripped_ds) - - def test_rountrip_partial_chunk_virtualdataset(self, tmpdir, basic_ds): - "Roundtrip a dataset to/from NetCDF with the HDF reader and ManifestStore with a single partial chunk" - - filepath = f"{tmpdir}/basic_ds_roundtrip.nc" - encoding = { - "temperature": {"chunksizes": (90, 90), "original_shape": (100, 100)} - } - basic_ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) - store = HDFVirtualBackend._create_manifest_store( - filepath=filepath, - ) - rountripped_ds = xr.open_dataset( - store, engine="zarr", consolidated=False, zarr_format=3 - ) - xr.testing.assert_allclose(basic_ds, rountripped_ds) - - def test_rountrip_simple_virtualdataset_default_store(self, tmpdir, basic_ds): - "Roundtrip a dataset to/from NetCDF with the HDF reader and ManifestStore" - - filepath = f"{tmpdir}/basic_ds_roundtrip.nc" - basic_ds.to_netcdf(filepath, engine="h5netcdf") - store = HDFVirtualBackend._create_manifest_store(filepath=filepath) - rountripped_ds = xr.open_dataset( - store, engine="zarr", consolidated=False, zarr_format=3 - ) - xr.testing.assert_allclose(basic_ds, rountripped_ds) - - @requires_minio - @requires_obstore - def test_store(self, minio_bucket, chunked_roundtrip_hdf5_s3_file): - import obstore as obs - - s3store = obs.store.S3Store( - bucket=minio_bucket["bucket"], - config={ - "endpoint": minio_bucket["endpoint"], - "virtual_hosted_style_request": False, - "skip_signature": True, - }, - client_options={"allow_http": True}, - ) - store = HDFVirtualBackend._create_manifest_store( - 
filepath=chunked_roundtrip_hdf5_s3_file, - store=s3store, - ) - vds = store.to_virtual_dataset() - assert vds.sizes == {"phony_dim_0": 5} - assert isinstance(vds["data"].data, ManifestArray) - - @requires_network - @requires_obstore - def test_default_store(self): - store = HDFVirtualBackend._create_manifest_store( - filepath="s3://carbonplan-share/virtualizarr/local.nc", - ) - vds = store.to_virtual_dataset() - assert vds.sizes == {"time": 2920, "lat": 25, "lon": 53} - assert isinstance(vds["air"].data, ManifestArray) - for name in ["time", "lat", "lon"]: - assert isinstance(vds[name].data, np.ndarray) diff --git a/virtualizarr/tests/test_readers/test_kerchunk.py b/virtualizarr/tests/test_readers/test_kerchunk.py deleted file mode 100644 index 591810bca..000000000 --- a/virtualizarr/tests/test_readers/test_kerchunk.py +++ /dev/null @@ -1,271 +0,0 @@ -from pathlib import Path -from typing import Any, Callable, Generator, Optional - -import numpy as np -import pytest -import ujson - -from virtualizarr.backend import open_virtual_dataset -from virtualizarr.manifests import ManifestArray -from virtualizarr.tests import has_fastparquet, requires_kerchunk - - -def gen_ds_refs( - zgroup: str | None = None, - zarray: str | None = None, - zattrs: str | None = None, - chunks: dict[str, list[str | int]] | None = None, -): - if zgroup is None: - zgroup = '{"zarr_format":2}' - if zarray is None: - zarray = '{"chunks":[2,3],"compressor":null,"dtype":" Generator[ - Callable[[Optional[Any], Optional[Any], Optional[Any], Optional[Any]], str], - None, - None, -]: - """ - Fixture which defers creation of the references file until the parameters zgroup etc. are known. 
- """ - - def _refs_file(zgroup=None, zarray=None, zattrs=None, chunks=None) -> str: - refs = gen_ds_refs(zgroup=zgroup, zarray=zarray, zattrs=zattrs, chunks=chunks) - filepath = tmp_path / "refs.json" - - with open(filepath, "w") as json_file: - ujson.dump(refs, json_file) - - return str(filepath) - - yield _refs_file - - -def test_dataset_from_df_refs(refs_file_factory): - refs_file = refs_file_factory() - - vds = open_virtual_dataset(refs_file, filetype="kerchunk") - - assert "a" in vds - vda = vds["a"] - assert isinstance(vda.data, ManifestArray) - assert vda.dims == ("x", "y") - assert vda.shape == (2, 3) - assert vda.chunks == (2, 3) - assert vda.dtype == np.dtype(" xr.Dataset: ) -def test_fsspec_openfile_from_path(tmp_path: pathlib.Path, dataset: xr.Dataset) -> None: - f = tmp_path / "dataset.nc" - dataset.to_netcdf(f) - - result = _FsspecFSFromFilepath(filepath=f.as_posix()).open_file() - assert isinstance(result, fsspec.implementations.local.LocalFileOpener) - - -@requires_scipy -def test_fsspec_openfile_memory(dataset: xr.Dataset): - fs = fsspec.filesystem("memory") - with contextlib.redirect_stderr(None): - # Suppress "Exception ignored in: " - with fs.open("dataset.nc", mode="wb") as f: - dataset.to_netcdf(f, engine="h5netcdf") - - result = _FsspecFSFromFilepath(filepath="memory://dataset.nc").open_file() - with result: - assert isinstance(result, fsspec.implementations.memory.MemoryFile) - - def test_copy_and_replace_metadata(array_v3_metadata): old_metadata = array_v3_metadata( shape=(10, 10), diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 104d0ff44..b665deff7 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -1,18 +1,27 @@ +import functools +from collections.abc import Mapping +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path from typing import Callable import numpy as np import pytest import xarray as xr -from xarray import open_dataset 
+import xarray.testing as xrt +from xarray import Dataset, open_dataset +from xarray.core.indexes import Index -from virtualizarr import open_virtual_dataset -from virtualizarr.backend import VirtualBackend +from virtualizarr import open_virtual_dataset, open_virtual_mfdataset from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.parsers import HDFParser from virtualizarr.tests import ( - parametrize_over_hdf_backends, + requires_dask, requires_hdf5plugin, requires_imagecodecs, + requires_lithops, + requires_network, ) +from virtualizarr.tests.utils import obstore_http, obstore_local, obstore_s3 def test_wrapping(array_v3_metadata): @@ -180,24 +189,25 @@ def test_concat_dim_coords_along_existing_dim(self, array_v3_metadata): @requires_hdf5plugin @requires_imagecodecs -@parametrize_over_hdf_backends class TestCombine: def test_combine_by_coords( self, netcdf4_files_factory: Callable[[], tuple[str, str]], - hdf_backend: type[VirtualBackend], ): filepath1, filepath2 = netcdf4_files_factory() - + store = obstore_local(file_url=filepath1) + parser = HDFParser() with ( open_virtual_dataset( - filepath1, - backend=hdf_backend, + file_url=filepath1, + object_store=store, + parser=parser, loadable_variables=["time", "lat", "lon"], ) as vds1, open_virtual_dataset( - filepath2, - backend=hdf_backend, + file_url=filepath2, + object_store=store, + parser=parser, loadable_variables=["time", "lat", "lon"], ) as vds2, ): @@ -212,28 +222,33 @@ def test_combine_by_coords( def test_2d_combine_by_coords( self, netcdf4_files_factory_2d: Callable[[], tuple[str, str, str, str]], - hdf_backend: type[VirtualBackend], ): filepath1, filepath2, filepath3, filepath4 = netcdf4_files_factory_2d() + store = obstore_local(file_url=filepath1) + parser = HDFParser() with ( open_virtual_dataset( - filepath1, - backend=hdf_backend, + file_url=filepath1, + object_store=store, + parser=parser, loadable_variables=["time", "lat", "lon"], ) as vds1, open_virtual_dataset( - 
filepath2, - backend=hdf_backend, + file_url=filepath2, + object_store=store, + parser=parser, loadable_variables=["time", "lat", "lon"], ) as vds2, open_virtual_dataset( - filepath3, - backend=hdf_backend, + file_url=filepath3, + object_store=store, + parser=parser, loadable_variables=["time", "lat", "lon"], ) as vds3, open_virtual_dataset( - filepath4, - backend=hdf_backend, + file_url=filepath4, + object_store=store, + parser=parser, loadable_variables=["time", "lat", "lon"], ) as vds4, ): @@ -254,28 +269,33 @@ def test_2d_combine_by_coords( def test_2d_combine_nested( self, netcdf4_files_factory_2d: Callable[[], tuple[str, str, str, str]], - hdf_backend: type[VirtualBackend], ): filepath1, filepath2, filepath3, filepath4 = netcdf4_files_factory_2d() + store = obstore_local(file_url=filepath1) + parser = HDFParser() with ( open_virtual_dataset( - filepath1, - backend=hdf_backend, + file_url=filepath1, + object_store=store, + parser=parser, loadable_variables=["time", "lat", "lon"], ) as vds1, open_virtual_dataset( - filepath2, - backend=hdf_backend, + file_url=filepath2, + object_store=store, + parser=parser, loadable_variables=["time", "lat", "lon"], ) as vds2, open_virtual_dataset( - filepath3, - backend=hdf_backend, + file_url=filepath3, + object_store=store, + parser=parser, loadable_variables=["time", "lat", "lon"], ) as vds3, open_virtual_dataset( - filepath4, - backend=hdf_backend, + file_url=filepath4, + object_store=store, + parser=parser, loadable_variables=["time", "lat", "lon"], ) as vds4, ): @@ -301,13 +321,17 @@ def test_2d_combine_nested( def test_combine_by_coords_keeping_manifestarrays( self, netcdf4_files_factory: Callable[[], tuple[str, str]], - hdf_backend: type[VirtualBackend], ): filepath1, filepath2 = netcdf4_files_factory() - + store = obstore_local(file_url=filepath1) + parser = HDFParser() with ( - open_virtual_dataset(filepath1, backend=hdf_backend) as vds1, - open_virtual_dataset(filepath2, backend=hdf_backend) as vds2, + 
open_virtual_dataset( + file_url=filepath1, object_store=store, parser=parser + ) as vds1, + open_virtual_dataset( + file_url=filepath2, object_store=store, parser=parser + ) as vds2, ): combined_vds = xr.combine_by_coords([vds2, vds1]) @@ -316,17 +340,22 @@ def test_combine_by_coords_keeping_manifestarrays( assert isinstance(combined_vds["lon"].data, ManifestArray) -@parametrize_over_hdf_backends class TestRenamePaths: - def test_rename_to_str(self, netcdf4_file, hdf_backend): - with open_virtual_dataset(netcdf4_file, backend=hdf_backend) as vds: + def test_rename_to_str(self, netcdf4_file): + store = obstore_local(netcdf4_file) + parser = HDFParser() + with open_virtual_dataset( + file_url=netcdf4_file, + object_store=store, + parser=parser, + ) as vds: renamed_vds = vds.virtualize.rename_paths("s3://bucket/air.nc") assert ( renamed_vds["air"].data.manifest.dict()["0.0.0"]["path"] == "s3://bucket/air.nc" ) - def test_rename_using_function(self, netcdf4_file, hdf_backend): + def test_rename_using_function(self, netcdf4_file): def local_to_s3_url(old_local_path: str) -> str: from pathlib import Path @@ -334,27 +363,38 @@ def local_to_s3_url(old_local_path: str) -> str: filename = Path(old_local_path).name return str(new_s3_bucket_url + filename) - with open_virtual_dataset(netcdf4_file, backend=hdf_backend) as vds: + store = obstore_local(netcdf4_file) + parser = HDFParser() + with open_virtual_dataset( + file_url=netcdf4_file, + object_store=store, + parser=parser, + ) as vds: renamed_vds = vds.virtualize.rename_paths(local_to_s3_url) assert ( renamed_vds["air"].data.manifest.dict()["0.0.0"]["path"] == "s3://bucket/air.nc" ) - def test_invalid_type(self, netcdf4_file, hdf_backend): - with open_virtual_dataset(netcdf4_file, backend=hdf_backend) as vds: + def test_invalid_type(self, netcdf4_file): + store = obstore_local(netcdf4_file) + parser = HDFParser() + with open_virtual_dataset( + file_url=netcdf4_file, object_store=store, parser=parser + ) as vds: with 
pytest.raises(TypeError): vds.virtualize.rename_paths(["file1.nc", "file2.nc"]) @requires_hdf5plugin @requires_imagecodecs - def test_mixture_of_manifestarrays_and_numpy_arrays( - self, netcdf4_file, hdf_backend - ): + def test_mixture_of_manifestarrays_and_numpy_arrays(self, netcdf4_file): + store = obstore_local(netcdf4_file) + parser = HDFParser() with open_virtual_dataset( - netcdf4_file, + file_url=netcdf4_file, + object_store=store, + parser=parser, loadable_variables=["lat", "lon"], - backend=hdf_backend, ) as vds: renamed_vds = vds.virtualize.rename_paths("s3://bucket/air.nc") assert ( @@ -367,12 +407,474 @@ def test_mixture_of_manifestarrays_and_numpy_arrays( @requires_hdf5plugin @requires_imagecodecs def test_nbytes(simple_netcdf4): - with open_virtual_dataset(simple_netcdf4) as vds: + store = obstore_local(simple_netcdf4) + parser = HDFParser() + with open_virtual_dataset( + file_url=simple_netcdf4, + object_store=store, + parser=parser, + ) as vds: assert vds.virtualize.nbytes == 32 assert vds.nbytes == 48 - with open_virtual_dataset(simple_netcdf4, loadable_variables=["foo"]) as vds: + with open_virtual_dataset( + file_url=simple_netcdf4, + object_store=store, + parser=parser, + loadable_variables=["foo"], + ) as vds: assert vds.virtualize.nbytes == 48 with open_dataset(simple_netcdf4) as ds: assert ds.virtualize.nbytes == ds.nbytes + + +class TestOpenVirtualDatasetIndexes: + @pytest.mark.xfail(reason="not yet implemented") + def test_specify_no_indexes(self, netcdf4_file): + object_store = obstore_local(file_url=netcdf4_file) + parser = HDFParser() + with open_virtual_dataset( + file_url=netcdf4_file, object_store=object_store, parser=parser, indexes={} + ) as vds: + assert vds.indexes == {} + + @requires_hdf5plugin + @requires_imagecodecs + def test_create_default_indexes_for_loadable_variables(self, netcdf4_file): + loadable_variables = ["time", "lat"] + + object_store = obstore_local(file_url=netcdf4_file) + parser = HDFParser() + with ( + 
open_virtual_dataset( + file_url=netcdf4_file, + object_store=object_store, + parser=parser, + indexes=None, + loadable_variables=loadable_variables, + ) as vds, + open_dataset(netcdf4_file, decode_times=True) as ds, + ): + # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 + assert index_mappings_equal(vds.xindexes, ds[loadable_variables].xindexes) + + +def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, Index]): + # Check if the mappings have the same keys + if set(indexes1.keys()) != set(indexes2.keys()): + return False + + # Check if the values for each key are identical + for key in indexes1.keys(): + index1 = indexes1[key] + index2 = indexes2[key] + + if not index1.equals(index2): + return False + + return True + + +@requires_hdf5plugin +@requires_imagecodecs +def test_cftime_index(tmp_path: Path): + """Ensure a virtual dataset contains the same indexes as an Xarray dataset""" + # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168 + filepath = str(tmp_path / "tmp.nc") + ds = xr.Dataset( + data_vars={ + "tasmax": (["time", "lat", "lon"], np.random.rand(2, 18, 36)), + }, + coords={ + "time": np.array(["2023-01-01", "2023-01-02"], dtype="datetime64[ns]"), + "lat": np.arange(-90, 90, 10), + "lon": np.arange(-180, 180, 10), + }, + attrs={"attr1_key": "attr1_val"}, + ) + ds.to_netcdf(filepath) + + object_store = obstore_local(file_url=filepath) + parser = HDFParser() + with open_virtual_dataset( + file_url=filepath, + object_store=object_store, + parser=parser, + loadable_variables=["time", "lat", "lon"], + ) as vds: + # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 + assert index_mappings_equal(vds.xindexes, ds.xindexes) + assert list(ds.coords) == list(vds.coords) + 
assert vds.dims == ds.dims + assert vds.attrs == ds.attrs + + +class TestOpenVirtualDatasetAttrs: + def test_drop_array_dimensions(self, netcdf4_file): + object_store = obstore_local(file_url=netcdf4_file) + parser = HDFParser() + # regression test for GH issue #150 + vds = open_virtual_dataset( + file_url=netcdf4_file, + object_store=object_store, + parser=parser, + ) + assert "_ARRAY_DIMENSIONS" not in vds["air"].attrs + + def test_coordinate_variable_attrs_preserved(self, netcdf4_file): + # regression test for GH issue #155 + object_store = obstore_local(file_url=netcdf4_file) + parser = HDFParser() + with open_virtual_dataset( + file_url=netcdf4_file, + object_store=object_store, + parser=parser, + ) as vds: + assert vds["lat"].attrs == { + "standard_name": "latitude", + "long_name": "Latitude", + "units": "degrees_north", + "axis": "Y", + } + + +class TestDetermineCoords: + def test_infer_one_dimensional_coords(self, netcdf4_file): + object_store = obstore_local(file_url=netcdf4_file) + parser = HDFParser() + with open_virtual_dataset( + file_url=netcdf4_file, + object_store=object_store, + parser=parser, + ) as vds: + assert set(vds.coords) == {"time", "lat", "lon"} + + def test_var_attr_coords(self, netcdf4_file_with_2d_coords): + object_store = obstore_local(file_url=netcdf4_file_with_2d_coords) + parser = HDFParser() + with open_virtual_dataset( + file_url=netcdf4_file_with_2d_coords, + object_store=object_store, + parser=parser, + ) as vds: + expected_dimension_coords = ["ocean_time", "s_rho"] + expected_2d_coords = ["lon_rho", "lat_rho", "h"] + expected_1d_non_dimension_coords = ["Cs_r"] + expected_scalar_coords = ["hc", "Vtransform"] + expected_coords = ( + expected_dimension_coords + + expected_2d_coords + + expected_1d_non_dimension_coords + + expected_scalar_coords + ) + assert set(vds.coords) == set(expected_coords) + + +@requires_network +class TestReadRemote: + @pytest.mark.parametrize( + "indexes", + [ + None, + pytest.param({}, 
marks=pytest.mark.xfail(reason="not implemented")), + ], + ids=["None index", "empty dict index"], + ) + def test_anon_read_s3(self, indexes): + """Parameterized tests for empty vs supplied indexes and filetypes.""" + # TODO: Switch away from this s3 url after minIO is implemented. + filepath = "s3://carbonplan-share/virtualizarr/local.nc" + object_store = obstore_s3(file_url=filepath, region="us-west-2") + parser = HDFParser() + with open_virtual_dataset( + file_url=filepath, + object_store=object_store, + indexes=indexes, + parser=parser, + ) as vds: + assert vds.dims == {"time": 2920, "lat": 25, "lon": 53} + + assert isinstance(vds["air"].data, ManifestArray) + for name in ["time", "lat", "lon"]: + assert isinstance(vds[name].data, np.ndarray) + + @pytest.mark.skip(reason="often times out, as nisar file is 200MB") + def test_virtualizarr_vs_local_nisar(self): + # Open group directly from locally cached file with xarray + url = "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5" + hdf_group = "science/LSAR/GCOV/grids/frequencyA" + store = obstore_http(file_url=url) + drop_variables = ["listOfCovarianceTerms", "listOfPolarizations"] + parser = HDFParser(group=hdf_group, drop_variables=drop_variables) + with ( + xr.open_dataset( + url, + engine="h5netcdf", + group=hdf_group, + drop_variables=drop_variables, + phony_dims="access", + ) as dsXR, + # save group reference file via virtualizarr, then open with engine="kerchunk" + open_virtual_dataset( + file_url=url, + object_store=store, + parser=parser, + ) as vds, + ): + tmpref = "/tmp/cmip6.json" + vds.virtualize.to_kerchunk(tmpref, format="json") + + with xr.open_dataset(tmpref, engine="kerchunk") as dsV: + # xrt.assert_identical(dsXR, dsV) #Attribute order changes + xrt.assert_equal(dsXR, dsV) + + +class TestOpenVirtualDatasetHDFGroup: + def test_open_empty_group(self, 
empty_netcdf4_file): + object_store = obstore_local(file_url=empty_netcdf4_file) + parser = HDFParser() + with open_virtual_dataset( + file_url=empty_netcdf4_file, + object_store=object_store, + parser=parser, + ) as vds: + assert isinstance(vds, xr.Dataset) + expected = Dataset() + xrt.assert_identical(vds, expected) + + def test_open_subgroup(self, netcdf4_file_with_data_in_multiple_groups): + object_store = obstore_local(file_url=netcdf4_file_with_data_in_multiple_groups) + parser = HDFParser(group="subgroup") + with open_virtual_dataset( + file_url=netcdf4_file_with_data_in_multiple_groups, + object_store=object_store, + parser=parser, + ) as vds: + assert list(vds.variables) == ["bar"] + assert isinstance(vds["bar"].data, ManifestArray) + assert vds["bar"].shape == (2,) + + @pytest.mark.parametrize("group", ["", None]) + def test_open_root_group( + self, + netcdf4_file_with_data_in_multiple_groups, + group, + ): + object_store = obstore_local(file_url=netcdf4_file_with_data_in_multiple_groups) + parser = HDFParser(group=group) + with open_virtual_dataset( + file_url=netcdf4_file_with_data_in_multiple_groups, + object_store=object_store, + parser=parser, + ) as vds: + assert list(vds.variables) == ["foo"] + assert isinstance(vds["foo"].data, ManifestArray) + assert vds["foo"].shape == (3,) + + +@requires_hdf5plugin +@requires_imagecodecs +class TestLoadVirtualDataset: + @pytest.mark.parametrize( + "loadable_variables, expected_loadable_variables", + [ + ([], []), + (["time"], ["time"]), + (["air", "time"], ["air", "time"]), + (None, ["lat", "lon", "time"]), + ], + ) + def test_loadable_variables( + self, netcdf4_file, loadable_variables, expected_loadable_variables + ): + object_store = obstore_local(file_url=netcdf4_file) + parser = HDFParser() + with ( + open_virtual_dataset( + file_url=netcdf4_file, + object_store=object_store, + loadable_variables=loadable_variables, + parser=parser, + ) as vds, + xr.open_dataset(netcdf4_file, decode_times=True) as ds, + ): 
+ assert set(vds.variables) == set(ds.variables) + assert set(vds.coords) == set(ds.coords) + + virtual_variables = { + name: var + for name, var in vds.variables.items() + if isinstance(var.data, ManifestArray) + } + actual_loadable_variables = { + name: var + for name, var in vds.variables.items() + if not isinstance(var.data, ManifestArray) + } + + assert set(actual_loadable_variables) == set(expected_loadable_variables) + + for var in virtual_variables.values(): + assert isinstance(var.data, ManifestArray) + + for name, var in ds.variables.items(): + if name in actual_loadable_variables: + xrt.assert_identical(vds.variables[name], ds.variables[name]) + + def test_group_kwarg_not_a_group(self, hdf5_groups_file): + object_store = obstore_local(file_url=hdf5_groups_file) + parser = HDFParser(group="doesnt_exist") + with pytest.raises(ValueError, match="not an HDF Group"): + with open_virtual_dataset( + file_url=hdf5_groups_file, + object_store=object_store, + parser=parser, + ): + pass + + def test_group_kwarg(self, hdf5_groups_file): + object_store = obstore_local(file_url=hdf5_groups_file) + parser = HDFParser(group="test/group") + vars_to_load = ["air", "time"] + with ( + open_virtual_dataset( + file_url=hdf5_groups_file, + object_store=object_store, + loadable_variables=vars_to_load, + parser=parser, + ) as vds, + xr.open_dataset(hdf5_groups_file, group="test/group") as full_ds, + ): + for name in full_ds.variables: + if name in vars_to_load: + xrt.assert_identical(vds.variables[name], full_ds.variables[name]) + + def test_open_dataset_with_empty(self, hdf5_empty): + object_store = obstore_local(file_url=hdf5_empty) + parser = HDFParser() + with open_virtual_dataset( + file_url=hdf5_empty, object_store=object_store, parser=parser + ) as vds: + assert vds.empty.dims == () + assert vds.empty.attrs == {"empty": "true"} + + def test_open_dataset_with_scalar(self, hdf5_scalar): + object_store = obstore_local(file_url=hdf5_scalar) + parser = HDFParser() + with 
open_virtual_dataset( + file_url=hdf5_scalar, object_store=object_store, parser=parser + ) as vds: + assert vds.scalar.dims == () + assert vds.scalar.attrs == {"scalar": "true"} + + +preprocess_func = functools.partial( + xr.Dataset.rename_vars, + air="nair", +) + + +@requires_hdf5plugin +@requires_imagecodecs +class TestOpenVirtualMFDataset: + @pytest.mark.parametrize("invalid_parallel_kwarg", ["ray", Dataset]) + def test_invalid_parallel_kwarg( + self, + netcdf4_files_factory, + invalid_parallel_kwarg, + ): + filepath1, filepath2 = netcdf4_files_factory() + store = obstore_local(file_url=filepath1) + parser = HDFParser() + with pytest.raises(ValueError, match="Unrecognized argument"): + open_virtual_mfdataset( + [filepath1, filepath2], + object_store=store, + parser=parser, + combine="nested", + concat_dim="time", + parallel=invalid_parallel_kwarg, + ) + + @pytest.mark.parametrize( + "parallel", + [ + False, + ThreadPoolExecutor, + pytest.param("dask", marks=requires_dask), + pytest.param("lithops", marks=requires_lithops), + ], + ) + @pytest.mark.parametrize( + "preprocess", + [ + None, + preprocess_func, + ], + ) + def test_parallel_open(self, netcdf4_files_factory, parallel, preprocess): + if parallel == "lithops": + pytest.xfail( + "TODO - investigate intermittent test failures with lithops executor" + ) + filepath1, filepath2 = netcdf4_files_factory() + store = obstore_local(file_url=filepath1) + parser = HDFParser() + with ( + open_virtual_dataset( + file_url=filepath1, object_store=store, parser=parser + ) as vds1, + open_virtual_dataset( + file_url=filepath2, + object_store=store, + parser=parser, + ) as vds2, + ): + expected_vds = xr.concat([vds1, vds2], dim="time") + if preprocess: + expected_vds = preprocess_func(expected_vds) + + # test combine nested, which doesn't use in-memory indexes + combined_vds = open_virtual_mfdataset( + [filepath1, filepath2], + object_store=store, + parser=parser, + combine="nested", + concat_dim="time", + 
parallel=parallel, + preprocess=preprocess, + ) + xrt.assert_identical(combined_vds, expected_vds) + + # test combine by coords using in-memory indexes + combined_vds = open_virtual_mfdataset( + [filepath1, filepath2], + object_store=store, + parser=parser, + combine="by_coords", + parallel=parallel, + preprocess=preprocess, + ) + xrt.assert_identical(combined_vds, expected_vds) + + # test combine by coords again using in-memory indexes but for a glob + file_glob = Path(filepath1).parent.glob("air*.nc") + combined_vds = open_virtual_mfdataset( + file_glob, + object_store=store, + parser=parser, + combine="by_coords", + parallel=parallel, + preprocess=preprocess, + ) + xrt.assert_identical(combined_vds, expected_vds) + + +def test_drop_variables(netcdf4_file): + store = obstore_local(netcdf4_file) + parser = HDFParser() + with open_virtual_dataset( + file_url=netcdf4_file, object_store=store, parser=parser, drop_variables=["air"] + ) as vds: + assert "air" not in vds.variables diff --git a/virtualizarr/tests/utils.py b/virtualizarr/tests/utils.py new file mode 100644 index 000000000..a52ca2e31 --- /dev/null +++ b/virtualizarr/tests/utils.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import os +from pathlib import Path +from urllib.parse import urlparse + +from obstore.store import LocalStore, ObjectStore, from_url + + +def obstore_local(file_url: str) -> ObjectStore: + path = Path(file_url) + store = LocalStore(prefix=path.parent) + return store + + +def obstore_s3(file_url: str, region: str) -> ObjectStore: + parsed = urlparse(file_url) + bucket = parsed.netloc + key_prefix = os.path.dirname(parsed.path.lstrip("/")) + base_path = f"s3://{bucket}/{key_prefix}" + store = from_url(url=base_path, region=region, skip_signature=True) + return store + + +def obstore_http(file_url: str) -> ObjectStore: + parsed = urlparse(file_url) + key_prefix = os.path.dirname(parsed.path.lstrip("/")) + base_path = f"{parsed.scheme}://{parsed.netloc}/{key_prefix}" + store = 
from_url(url=base_path) + return store diff --git a/virtualizarr/translators/kerchunk.py b/virtualizarr/translators/kerchunk.py index 20cde52a7..219246e92 100644 --- a/virtualizarr/translators/kerchunk.py +++ b/virtualizarr/translators/kerchunk.py @@ -1,10 +1,10 @@ -from typing import Any, Mapping, MutableMapping, cast +from __future__ import annotations + +from collections.abc import Iterable +from typing import cast import numpy as np from numcodecs.abc import Codec -from xarray import Dataset -from xarray.core.indexes import Index -from xarray.core.variable import Variable from zarr.core.common import JSON from zarr.core.metadata import ArrayV3Metadata from zarr.core.metadata.v2 import ArrayV2Metadata @@ -12,7 +12,11 @@ from virtualizarr.codecs import ( numcodec_config_to_configurable, ) -from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.manifests import ( + ChunkManifest, + ManifestArray, + ManifestGroup, +) from virtualizarr.manifests.manifest import ChunkEntry, ChunkKey from virtualizarr.manifests.utils import create_v3_array_metadata from virtualizarr.types.kerchunk import ( @@ -20,7 +24,6 @@ KerchunkStoreRefs, ) from virtualizarr.utils import determine_chunk_grid_shape -from virtualizarr.xarray import separate_coords def to_kerchunk_json(v2_metadata: ArrayV2Metadata) -> str: @@ -43,7 +46,7 @@ def to_kerchunk_json(v2_metadata: ArrayV2Metadata) -> str: return json.dumps(zarray_dict, separators=(",", ":"), cls=NumpyEncoder) -def from_kerchunk_refs(decoded_arr_refs_zarray) -> "ArrayV3Metadata": +def from_kerchunk_refs(decoded_arr_refs_zarray, zattrs) -> "ArrayV3Metadata": """ Convert a decoded zarr array (.zarray) reference to an ArrayV3Metadata object. 
This function processes the given decoded Zarr array reference dictionary, @@ -86,41 +89,63 @@ def from_kerchunk_refs(decoded_arr_refs_zarray) -> "ArrayV3Metadata": numcodec_configs = [ numcodec_config_to_configurable(config) for config in codec_configs ] + dimension_names = decoded_arr_refs_zarray["dimension_names"] return create_v3_array_metadata( chunk_shape=tuple(decoded_arr_refs_zarray["chunks"]), data_type=dtype, codecs=numcodec_configs, fill_value=fill_value, shape=tuple(decoded_arr_refs_zarray["shape"]), + dimension_names=dimension_names, + attributes=zattrs, ) -def virtual_vars_and_metadata_from_kerchunk_refs( - vds_refs: KerchunkStoreRefs, - drop_variables: list[str] | None = None, +def manifestgroup_from_kerchunk_refs( + refs: KerchunkStoreRefs, + group: str | None = None, fs_root: str | None = None, -) -> tuple[Mapping[str, Variable], dict[str, Any], list[str]]: + skip_variables: Iterable[str] | None = None, +) -> ManifestGroup: """ - Parses all useful information from a set kerchunk references (for a single group). + Construct a ManifestGroup from a dictionary of kerchunk references. Parameters ---------- - drop_variables - Variables in the file to not bother generating chunk metadata for. + refs + The Kerchunk references, as a dictionary. + group + Default is to build a store from the root group. fs_root The root of the fsspec filesystem on which these references were generated. Required if any paths are relative in order to turn them into absolute paths (which virtualizarr requires). + skip_variables + Variables to ignore when creating the ManifestGroup. + + Returns + ------- + ManifestGroup + ManifestGroup representation of the virtual chunk references. 
""" + # both group=None and group='' mean to read root group + if group: + refs = extract_group(refs, group) + + arr_names = find_var_names(refs) + if skip_variables: + arr_names = [var for var in arr_names if var not in skip_variables] + + # TODO support iterating over multiple nested groups + marrs = { + arr_name: manifestarray_from_kerchunk_refs(refs, arr_name, fs_root=fs_root) + for arr_name in arr_names + } - virtual_vars = virtual_vars_from_kerchunk_refs( - vds_refs, - drop_variables=drop_variables, - fs_root=fs_root, - ) - ds_attrs = fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) - coord_names = ds_attrs.pop("coordinates", []) + # TODO probably need to parse the group-level attributes more here + attributes = fully_decode_arr_refs(refs["refs"]).get(".zattrs", {}) - return virtual_vars, ds_attrs, coord_names + manifestgroup = ManifestGroup(arrays=marrs, attributes=attributes) + return manifestgroup def extract_group(vds_refs: KerchunkStoreRefs, group: str) -> KerchunkStoreRefs: @@ -162,79 +187,21 @@ def extract_group(vds_refs: KerchunkStoreRefs, group: str) -> KerchunkStoreRefs: return KerchunkStoreRefs(vds_refs) -def virtual_vars_from_kerchunk_refs( - refs: KerchunkStoreRefs, - drop_variables: list[str] | None = None, - fs_root: str | None = None, -) -> dict[str, Variable]: - """ - Translate a store-level kerchunk reference dict into aaset of xarray Variables containing virtualized arrays. - - Parameters - ---------- - drop_variables - Variables in the file to drop before returning. 
- """ - - var_names = find_var_names(refs) - if drop_variables is None: - drop_variables = [] - var_names_to_keep = [ - var_name for var_name in var_names if var_name not in drop_variables - ] - - vars = { - var_name: variable_from_kerchunk_refs(refs, var_name, fs_root=fs_root) - for var_name in var_names_to_keep - } - return vars - - -def dataset_from_kerchunk_refs( - refs: KerchunkStoreRefs, - drop_variables: list[str] = [], - indexes: MutableMapping[str, Index] | None = None, - fs_root: str | None = None, -) -> Dataset: - """ - Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays. - - drop_variables - Variables in the file to drop before returning. - """ - - vars = virtual_vars_from_kerchunk_refs(refs, drop_variables, fs_root=fs_root) - ds_attrs = fully_decode_arr_refs(refs["refs"]).get(".zattrs", {}) - coord_names = ds_attrs.pop("coordinates", []) - - if indexes is None: - indexes = {} - data_vars, coords = separate_coords(vars, indexes, coord_names) - - vds = Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) - - return vds - - -def variable_from_kerchunk_refs( +def manifestarray_from_kerchunk_refs( refs: KerchunkStoreRefs, var_name: str, fs_root: str | None = None, -) -> Variable: - """Create a single xarray Variable by reading specific keys of a kerchunk references dict.""" +) -> ManifestArray: + """Create a single ManifestArray by reading specific keys of a kerchunk references dict.""" arr_refs = extract_array_refs(refs, var_name) + + # TODO probably need to update internals of this to use ArrayV3Metadata more neatly chunk_dict, metadata, zattrs = parse_array_refs(arr_refs) # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs - dims = zattrs.pop("_ARRAY_DIMENSIONS") if chunk_dict: manifest = manifest_from_kerchunk_chunk_dict(chunk_dict, fs_root=fs_root) - varr = ManifestArray(metadata=metadata, 
chunkmanifest=manifest) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) elif len(metadata.shape) != 0: # empty variables don't have physical chunks, but zarray shows that the variable # is at least 1D @@ -244,14 +211,14 @@ def variable_from_kerchunk_refs( metadata.chunks, ) manifest = ChunkManifest(entries={}, shape=shape) - varr = ManifestArray(metadata=metadata, chunkmanifest=manifest) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) else: # This means we encountered a scalar variable of dimension 0, # very likely that it actually has no numeric value and its only purpose # is to communicate dataset attributes. - varr = metadata.fill_value + marr = metadata.fill_value - return Variable(data=varr, dims=dims, attrs=zattrs) + return marr def manifest_from_kerchunk_chunk_dict( @@ -299,7 +266,7 @@ def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]: found_var_names = [] for key in refs.keys(): # has to capture "foo/.zarray", but ignore ".zgroup", ".zattrs", and "subgroup/bar/.zarray" - # TODO this might be a sign that we should introduce a KerchunkGroupRefs type and cut down the references before getting to this point... + # TODO this might be a sign that we should introduce a KerchunkGroupRefs type and cut down the references before getting to this point... 
if key not in (".zgroup", ".zattrs", ".zmetadata"): first_part, second_part, *_ = key.split("/") if second_part == ".zarray": @@ -336,8 +303,11 @@ def extract_array_refs( def parse_array_refs( arr_refs: KerchunkArrRefs, ) -> tuple[dict, ArrayV3Metadata, dict[str, JSON]]: - metadata = from_kerchunk_refs(arr_refs.pop(".zarray")) zattrs = arr_refs.pop(".zattrs", {}) + dims = zattrs.pop("_ARRAY_DIMENSIONS") + zarray = arr_refs.pop(".zarray") + zarray["dimension_names"] = dims + metadata = from_kerchunk_refs(zarray, zattrs) chunk_dict = arr_refs return chunk_dict, metadata, zattrs diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index 3b263acf8..98fa21a38 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -2,7 +2,7 @@ import importlib import io -from dataclasses import dataclass, field +import os from typing import TYPE_CHECKING, Any, Iterable, Optional, Union from urllib.parse import urlparse @@ -14,7 +14,6 @@ if TYPE_CHECKING: import fsspec.core import fsspec.spec - import upath from obstore import ReadableFile from obstore.store import ObjectStore @@ -31,12 +30,19 @@ def __init__(self, store: ObjectStore, path: str) -> None: import obstore as obs parsed = urlparse(path) + if hasattr(store, "prefix") and store.prefix: + filepath = os.path.basename(parsed.path) + else: + filepath = parsed.path - self._reader = obs.open_reader(store, parsed.path) + self._reader = obs.open_reader(store, filepath) def read(self, size: int, /) -> bytes: return self._reader.read(size).to_bytes() + def readall(self) -> bytes: + return self._reader.read().to_bytes() + def seek(self, offset: int, whence: int = 0, /): # TODO: Check on default for whence return self._reader.seek(offset, whence) @@ -45,64 +51,6 @@ def tell(self) -> int: return self._reader.tell() -@dataclass -class _FsspecFSFromFilepath: - """Class to create fsspec Filesystem from input filepath. 
- - - - Attributes - ---------- - filepath : str - Input filepath - reader_options : dict, optional - dict containing kwargs to pass to file opener, by default {} - fs : fsspec.AbstractFileSystem - The fsspec filesystem object, created in __post_init__ - upath : upath.core.UPath - The upath object, created in __post_init__ - """ - - filepath: str - reader_options: Optional[dict] = field(default_factory=dict) - fs: fsspec.AbstractFileSystem = field(init=False) - upath: upath.core.UPath = field(init=False) - - def open_file(self) -> OpenFileType: - """Calls `.open` on fsspec.Filesystem instantiation using self.filepath as an input. - - Returns - ------- - OpenFileType - file opened with fsspec - """ - return self.fs.open(self.filepath) - - def read_bytes(self, bytes: int) -> bytes: - with self.open_file() as of: - return of.read(bytes) - - def get_mapper(self): - """Returns a mapper for use with Zarr""" - return self.fs.get_mapper(self.filepath) - - def __post_init__(self) -> None: - """Initialize the fsspec filesystem object""" - import fsspec - from upath import UPath - - if not isinstance(self.filepath, UPath): - upath = UPath(self.filepath) - - self.upath = upath - self.protocol = upath.protocol - - self.reader_options = self.reader_options or {} - storage_options = self.reader_options.get("storage_options", {}) # type: ignore - - self.fs = fsspec.filesystem(self.protocol, **storage_options) - - def check_for_collisions( drop_variables: Iterable[str] | None, loadable_variables: Iterable[str] | None, diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 36b810b27..766fbf8ca 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -1,16 +1,256 @@ -from collections.abc import Iterable, Mapping +from __future__ import annotations + +import os +from collections.abc import Callable, Iterable, Mapping, MutableMapping, Sequence +from concurrent.futures import Executor +from pathlib import Path from typing import ( + TYPE_CHECKING, Any, Hashable, - 
MutableMapping, + Literal, Optional, + cast, ) import xarray as xr import xarray.indexes +from obstore.store import ObjectStore +from xarray import DataArray, Dataset, Index, combine_by_coords +from xarray.backends.common import _find_absolute_paths +from xarray.core.types import NestedSequence +from xarray.structure.combine import _infer_concat_order_from_positions, _nested_combine from virtualizarr.manifests import ManifestStore -from virtualizarr.utils import _FsspecFSFromFilepath +from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri +from virtualizarr.parallel import get_executor +from virtualizarr.parsers import Parser + +if TYPE_CHECKING: + from xarray.core.types import ( + CombineAttrsOptions, + CompatOptions, + JoinOptions, + ) + + +def open_virtual_dataset( + file_url: str, + object_store: ObjectStore, + parser: Parser, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + cftime_variables: Iterable[str] | None = None, + indexes: Mapping[str, xr.Index] | None = None, +) -> xr.Dataset: + filepath = validate_and_normalize_path_to_uri(file_url, fs_root=Path.cwd().as_uri()) + + manifest_store = parser( + file_url=filepath, + object_store=object_store, + ) + + ds = manifest_store.to_virtual_dataset( + loadable_variables=loadable_variables, + decode_times=decode_times, + indexes=indexes, + ) + return ds.drop_vars(list(drop_variables or ())) + + +def open_virtual_mfdataset( + paths: ( + str + | os.PathLike + | Sequence[str | os.PathLike] + | NestedSequence[str | os.PathLike] + ), + object_store: ObjectStore, + parser: Parser, + concat_dim: ( + str + | DataArray + | Index + | Sequence[str] + | Sequence[DataArray] + | Sequence[Index] + | None + ) = None, + compat: "CompatOptions" = "no_conflicts", + preprocess: Callable[[Dataset], Dataset] | None = None, + data_vars: Literal["all", "minimal", "different"] | list[str] = "all", + coords="different", + combine: 
Literal["by_coords", "nested"] = "by_coords", + parallel: Literal["dask", "lithops", False] | type[Executor] = False, + join: "JoinOptions" = "outer", + attrs_file: str | os.PathLike | None = None, + combine_attrs: "CombineAttrsOptions" = "override", + **kwargs, +) -> Dataset: + """ + Open multiple files as a single virtual dataset. + + If combine='by_coords' then the function ``combine_by_coords`` is used to combine + the datasets into one before returning the result, and if combine='nested' then + ``combine_nested`` is used. The filepaths must be structured according to which + combining function is used, the details of which are given in the documentation for + ``combine_by_coords`` and ``combine_nested``. By default ``combine='by_coords'`` + will be used. Global attributes from the ``attrs_file`` are used + for the combined dataset. + + Parameters + ---------- + paths + Same as in xarray.open_mfdataset + concat_dim + Same as in xarray.open_mfdataset + compat + Same as in xarray.open_mfdataset + preprocess + Same as in xarray.open_mfdataset + data_vars + Same as in xarray.open_mfdataset + coords + Same as in xarray.open_mfdataset + combine + Same as in xarray.open_mfdataset + parallel : "dask", "lithops", False, or type of subclass of ``concurrent.futures.Executor`` + Specify whether the open and preprocess steps of this function will be + performed in parallel using lithops, dask.delayed, or any executor compatible + with the ``concurrent.futures`` interface, or in serial. + Default is False, which will execute these steps in serial. + join + Same as in xarray.open_mfdataset + attrs_file + Same as in xarray.open_mfdataset + combine_attrs + Same as in xarray.open_mfdataset + **kwargs : optional + Additional arguments passed on to :py:func:`virtualizarr.open_virtual_dataset`. For an + overview of some of the possible options, see the documentation of + :py:func:`virtualizarr.open_virtual_dataset`. 
+ + Returns + ------- + xarray.Dataset + + Notes + ----- + The results of opening each virtual dataset in parallel are sent back to the client process, so must not be too large. + """ + + # TODO this is practically all just copied from xarray.open_mfdataset - an argument for writing a virtualizarr engine for xarray? + + # TODO list kwargs passed to open_virtual_dataset explicitly in docstring? + + paths = cast(NestedSequence[str], _find_absolute_paths(paths)) + + if not paths: + raise OSError("no files to open") + + paths1d: list[str] + if combine == "nested": + if isinstance(concat_dim, str | DataArray) or concat_dim is None: + concat_dim = [concat_dim] # type: ignore[assignment] + + # This creates a flat list which is easier to iterate over, whilst + # encoding the originally-supplied structure as "ids". + # The "ids" are not used at all if combine='by_coords`. + combined_ids_paths = _infer_concat_order_from_positions(paths) + ids, paths1d = ( + list(combined_ids_paths.keys()), + list(combined_ids_paths.values()), + ) + elif concat_dim is not None: + raise ValueError( + "When combine='by_coords', passing a value for `concat_dim` has no " + "effect. 
To manually combine along a specific dimension you should " + "instead specify combine='nested' along with a value for `concat_dim`.", + ) + else: + paths1d = paths # type: ignore[assignment] + + # TODO this refactored preprocess and executor logic should be upstreamed into xarray - see https://github.com/pydata/xarray/pull/9932 + + if preprocess: + # TODO we could reexpress these using functools.partial but then we would hit this lithops bug: https://github.com/lithops-cloud/lithops/issues/1428 + + def _open_and_preprocess(path: str) -> xr.Dataset: + ds = open_virtual_dataset( + file_url=path, object_store=object_store, parser=parser, **kwargs + ) + return preprocess(ds) + + open_func = _open_and_preprocess + else: + + def _open(path: str) -> xr.Dataset: + return open_virtual_dataset( + file_url=path, object_store=object_store, parser=parser, **kwargs + ) + + open_func = _open + + executor = get_executor(parallel=parallel) + with executor() as exec: + # wait for all the workers to finish, and send their resulting virtual datasets back to the client for concatenation there + virtual_datasets = list( + exec.map( + open_func, + paths1d, + ) + ) + + # TODO add file closers + + # Combine all datasets, closing them in case of a ValueError + try: + if combine == "nested": + # Combined nested list by successive concat and merge operations + # along each dimension, using structure given by "ids" + combined_vds = _nested_combine( + virtual_datasets, + concat_dims=concat_dim, + compat=compat, + data_vars=data_vars, + coords=coords, + ids=ids, + join=join, + combine_attrs=combine_attrs, + ) + elif combine == "by_coords": + # Redo ordering from coordinates, ignoring how they were ordered + # previously + combined_vds = combine_by_coords( + virtual_datasets, + compat=compat, + data_vars=data_vars, + coords=coords, + join=join, + combine_attrs=combine_attrs, + ) + else: + raise ValueError( + f"{combine} is an invalid option for the keyword argument ``combine``" + ) + except 
ValueError: + for vds in virtual_datasets: + vds.close() + raise + + # combined_vds.set_close(partial(_multi_file_closer, closers)) + + # read global attributes from the attrs_file or from the first dataset + if attrs_file is not None: + if isinstance(attrs_file, os.PathLike): + attrs_file = cast(str, os.fspath(attrs_file)) + combined_vds.attrs = virtual_datasets[paths1d.index(attrs_file)].attrs + + # TODO should we just immediately close everything? + # TODO If loadable_variables is eager then we should have already read everything we're ever going to read into memory at this point + + return combined_vds def construct_fully_virtual_dataset( @@ -36,10 +276,7 @@ def construct_fully_virtual_dataset( def construct_virtual_dataset( - manifest_store: ManifestStore | None = None, - # TODO remove filepath option once all readers use ManifestStore approach - fully_virtual_ds: xr.Dataset | None = None, - filepath: str | None = None, + manifest_store: ManifestStore, group: str | None = None, loadable_variables: Iterable[Hashable] | None = None, decode_times: bool | None = None, @@ -47,54 +284,32 @@ def construct_virtual_dataset( reader_options: Optional[dict] = None, ) -> xr.Dataset: """ - Construct a fully or partly virtual dataset from a ManifestStore (or filepath for backwards compatibility), + Construct a fully or partly virtual dataset from a ManifestStore containing the contents of one group. - Accepts EITHER manifest_store OR fully_virtual_ds and filepath. The latter option should be removed once all readers use ManifestStore approach. 
""" if indexes is not None: raise NotImplementedError() - if manifest_store: - if group: - raise NotImplementedError( - "ManifestStore does not yet support nested groups" - ) - else: - manifestgroup = manifest_store._group - - fully_virtual_ds = manifestgroup.to_virtual_dataset() - - with xr.open_zarr( - manifest_store, - group=group, - consolidated=False, - zarr_format=3, - chunks=None, - decode_times=decode_times, - ) as loadable_ds: - return replace_virtual_with_loadable_vars( - fully_virtual_ds, loadable_ds, loadable_variables - ) + if group: + raise NotImplementedError("ManifestStore does not yet support nested groups") else: - # TODO pre-ManifestStore codepath, remove once all readers use ManifestStore approach - - fpath = _FsspecFSFromFilepath( - filepath=filepath, # type: ignore[arg-type] - reader_options=reader_options, - ).open_file() - - with xr.open_dataset( - fpath, # type: ignore[arg-type] - group=group, - decode_times=decode_times, - ) as loadable_ds: - return replace_virtual_with_loadable_vars( - fully_virtual_ds, # type: ignore[arg-type] - loadable_ds, - loadable_variables, - ) + manifestgroup = manifest_store._group + + fully_virtual_ds = manifestgroup.to_virtual_dataset() + + with xr.open_zarr( + manifest_store, + group=group, + consolidated=False, + zarr_format=3, + chunks=None, + decode_times=decode_times, + ) as loadable_ds: + return replace_virtual_with_loadable_vars( + fully_virtual_ds, loadable_ds, loadable_variables + ) def replace_virtual_with_loadable_vars(