diff --git a/conftest.py b/conftest.py
index c858cac8..f696e66c 100644
--- a/conftest.py
+++ b/conftest.py
@@ -339,6 +339,15 @@ def hdf5_empty(tmp_path: Path) -> str:
     return str(filepath)
 
 
+@pytest.fixture
+def hdf5_missing_value(tmp_path: Path) -> str:
+    """Create an HDF5 file containing a dataset with no values written."""
+    filepath = tmp_path / "compact_lowlevel.h5"
+    with h5py.File(filepath, "w") as f:
+        f.create_dataset("my_dataset", shape=(10,), dtype="int64")
+    return str(filepath)
+
+
 @pytest.fixture
 def hdf5_scalar(tmp_path: Path) -> str:
     """Create an HDF5 file with a scalar dataset."""
diff --git a/docs/releases.md b/docs/releases.md
index 2b0a29ba..75841d2e 100644
--- a/docs/releases.md
+++ b/docs/releases.md
@@ -16,6 +16,10 @@
 
 ### Internal changes
 
+- Migrates the bulk of the DMR++ parser to `pydap`.
+  ([#902](https://github.com/zarr-developers/VirtualiZarr/pull/902)).
+  By [Miguel Jimenez-Urias](https://github.com/Mikejmnez).
+
 ## v2.5.0 (23rd March 2026)
 
 Brings `region`-writing support in `.to_icechunk()`, a `ZarrParser` with orders of magnitude better performance, more FAQ docs, and various bugfixes.
@@ -69,6 +73,7 @@ Brings `region`-writing support in `.to_icechunk()`, a `ZarrParser` with orders
 - Fix `ZarrParser` to use public attribute instead of private one
   ([#916](https://github.com/zarr-developers/VirtualiZarr/pull/916)). By [Max Jones](https://github.com/maxrjones).
 
+
 ### Documentation
 
 - Added FAQ answer comparing the Kerchunk and Icechunk serialization formats. ([#818](https://github.com/zarr-developers/VirtualiZarr/pull/818)).
diff --git a/pyproject.toml b/pyproject.toml
index a99ac862..ee304e3f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ remote = [
     "requests",
     "aiohttp",
     "s3fs",
+    "pydap>=3.5.9",
 ]
 
 # non-kerchunk-based parsers
@@ -110,6 +111,7 @@ upstream = [
     'kerchunk @ git+https://github.com/fsspec/kerchunk',
     'icechunk @ git+https://github.com/earth-mover/icechunk#subdirectory=icechunk-python',
     'virtual_tiff @ git+https://github.com/virtual-zarr/virtual-tiff',
+    'pydap @ git+https://github.com/pydap/pydap.git',
 ]
 docs = [
     "mkdocs-material[imaging]>=9.6.14",
@@ -262,6 +264,7 @@ module = [
     "ujson",
     "zarr",
     "requests",
+    "pydap.*",
 ]
 ignore_missing_imports = true
 
diff --git a/virtualizarr/parsers/dmrpp.py b/virtualizarr/parsers/dmrpp.py
index 53495c5f..78b2fd09 100644
--- a/virtualizarr/parsers/dmrpp.py
+++ b/virtualizarr/parsers/dmrpp.py
@@ -1,13 +1,13 @@
 import io
 import warnings
 from pathlib import Path
-from typing import Any, Iterable
+from typing import Iterable
 from xml.etree import ElementTree as ET
 
-import numpy as np
 from obspec_utils.protocols import ReadableStore
 from obspec_utils.readers import EagerStoreReader
 from obspec_utils.registry import ObjectStoreRegistry
+from pydap.parsers.dmr import DMRPPParser as _DMRPPParser
 
 from virtualizarr.manifests import (
     ChunkManifest,
@@ -17,7 +17,6 @@
 )
 from virtualizarr.manifests.utils import create_v3_array_metadata
 from virtualizarr.parsers.utils import encode_cf_fill_value
-from virtualizarr.types import ChunkKey
 
 
 class DMRPPParser:
@@ -66,9 +65,15 @@ def __call__(
         file_bytes = reader.readall()
         stream = io.BytesIO(file_bytes)
 
+        url = (
+            url.removesuffix(".dap.dmrpp")
+            if url.endswith(".dap.dmrpp")
+            else url.removesuffix(".dmrpp")
+        )
+
         parser = DMRParser(
             root=ET.parse(stream).getroot(),
-            data_filepath=url.removesuffix(".dmrpp"),
+            data_filepath=url,
             skip_variables=self.skip_variables,
         )
         manifest_store = parser.parse_dataset(object_store=store, group=self.group)
@@ -90,27 +95,7 @@ class DMRParser:
         "dap": "http://xml.opendap.org/ns/DAP/4.0#",
         "dmrpp": "http://xml.opendap.org/dap/dmrpp/1.0.0#",
     }
-    # DAP data types to numpy data types
-    _DAP_NP_DTYPE = {
-        "Byte": "uint8",
-        "UByte": "uint8",
-        "Int8": "int8",
-        "UInt8": "uint8",
-        "Int16": "int16",
-        "UInt16": "uint16",
-        "Int32": "int32",
-        "UInt32": "uint32",
-        "Int64": "int64",
-        "UInt64": "uint64",
-        "Url": "object",
-        "Float32": "float32",
-        "Float64": "float64",
-        "String": "object",
-    }
-    # Default zlib compression value
-    _DEFAULT_ZLIB_VALUE = 6
-    # Encoding keys that should be removed from attributes and placed in xarray encoding dict
-    # _ENCODING_KEYS = {"_FillValue", "missing_value", "scale_factor", "add_offset"}
+
 
     root: ET.Element
     data_filepath: str
@@ -132,57 +117,21 @@ def __init__(
             If None, the data file path is taken from the DMR++ file.
         """
         self.root = root
-        self._validation_issues: list[str] = []
-        data_filepath_from_root = self._get_attrib(self.root, "name", required=True)
-        assert data_filepath_from_root is not None  # required=True guarantees non-None
         self.data_filepath = (
-            data_filepath if data_filepath is not None else data_filepath_from_root
+            data_filepath if data_filepath is not None else self.root.attrib["name"]
        )
         self.skip_variables = skip_variables or ()
+        self._validation_issues: list[str] = []
 
-    def _get_attrib(
-        self, element: ET.Element, attrib_name: str, required: bool = False
-    ) -> str | None:
-        """
-        Safely get an attribute from an XML element, logging validation issues.
-
-        Parameters
-        ----------
-        element
-            The XML element to get the attribute from.
-        attrib_name
-            The name of the attribute to get.
-        required
-            If True, raises a ValueError when the attribute is missing. If False,
-            returns None and logs the issue.
-
-        Returns
-        -------
-        str | None
-            The attribute value if found, None otherwise.
-
-        Raises
-        ------
-        ValueError
-            If required is True and the attribute is not found.
-        """
-        if attrib_name in element.attrib:
-            return element.attrib[attrib_name]
-
-        element_info = (
-            element.tag
-            if "name" not in element.attrib
-            else f"{element.tag}[@name='{element.attrib['name']}']"
-        )
-        issue_msg = (
-            f"Missing required attribute '{attrib_name}' in element: {element_info}"
+    def dmrparser(self) -> _DMRPPParser:
+        """Exposes the pydap ``_DMRPPParser`` for external use (avoids breaking changes)."""
+        parser = _DMRPPParser(
+            root=self.root,
+            data_filepath=self.data_filepath,
+            skip_variables=self.skip_variables,
         )
-        self._validation_issues.append(issue_msg)
-
-        if required:
-            raise ValueError(issue_msg)
-
-        return None
+        self._validation_issues = parser._validation_issues
+        return parser
 
     def parse_dataset(
         self,
@@ -218,437 +167,48 @@ def parse_dataset(
         )
         group_path = Path("/") if ngroups == 0 else Path("/") / group.removeprefix("/")
 
-        dataset_element = self._split_groups(self.root).get(group_path)
+
+        dataset_element = self.dmrparser()._split_groups(self.root).get(group_path)
         if dataset_element is None:
             raise ValueError(
                 f"Group {group_path} not found in DMR++ file {self.data_filepath!r}"
             )
-        manifest_group = self._parse_dataset(dataset_element)
-        registry: ObjectStoreRegistry = ObjectStoreRegistry()
-        registry.register(self.data_filepath, object_store)
-
-        return ManifestStore(registry=registry, group=manifest_group)
-
-    def find_node_fqn(self, fqn: str) -> ET.Element:
-        """
-        Find the element in the root element by converting the fully qualified name to an xpath query.
-
-        E.g. fqn = "/a/b" --> root.find("./*[@name='a']/*[@name='b']")
-
-        See more about OPeNDAP fully qualified names (FQN) here: https://docs.opendap.org/index.php/DAP4:_Specification_Volume_1#Fully_Qualified_Names
-
-        Parameters
-        ----------
-        fqn
-            The fully qualified name of an element. For example, "/a/b".
-
-        Returns
-        -------
-        ET.Element
-            The matching node found within the root element.
-
-        Raises
-        ------
-        ValueError
-            If the fully qualified name is not found in the root element.
-        """
-        if fqn == "/":
-            return self.root
-
-        elements = fqn.strip("/").split("/")  # /a/b/ --> ['a', 'b']
-        xpath_segments = [f"*[@name='{element}']" for element in elements]
-        xpath_query = "/".join([".", *xpath_segments])  # "./[*[@name='a']/*[@name='b']"
-
-        if (element := self.root.find(xpath_query, self._NS)) is None:
-            raise ValueError(f"Path {fqn} not found in provided root")
-
-        return element
-
-    def _split_groups(self, root: ET.Element) -> dict[Path, ET.Element]:
-        """
-        Split the input element into several ET.Elements by name.
-        E.g. {"/": , "left": , "right": }
-
-        Parameters
-        ----------
-        root : ET.Element
-            The root element of the DMR file.
-
-        Returns
-        -------
-        dict[Path, ET.Element]
-        """
-        all_groups: dict[Path, ET.Element] = {}
-        dataset_tags = [
-            d for d in root if d.tag != "{" + self._NS["dap"] + "}" + "Group"
-        ]
-        if len(dataset_tags) > 0:
-            all_groups[Path("/")] = ET.Element(root.tag, root.attrib)
-            all_groups[Path("/")].extend(dataset_tags)
-        all_groups.update(self._split_groups_recursive(root, Path("/")))
-        return all_groups
-
-    def _split_groups_recursive(
-        self, root: ET.Element, current_path=Path("")
-    ) -> dict[Path, ET.Element]:
-        group_dict: dict[Path, ET.Element] = {}
-        for g in root.iterfind("dap:Group", self._NS):
-            group_name = self._get_attrib(g, "name", required=True)
-            if group_name is None:
-                continue
-            new_path = current_path / Path(group_name)
-            dataset_tags = [
-                d for d in g if d.tag != "{" + self._NS["dap"] + "}" + "Group"
-            ]
-            group_dict[new_path] = ET.Element(g.tag, g.attrib)
-            group_dict[new_path].extend(dataset_tags)
-            group_dict.update(self._split_groups_recursive(g, new_path))
-        return group_dict
-
-    def _parse_dataset(
-        self,
-        root: ET.Element,
-    ) -> ManifestGroup:
-        """
-        Parse the dataset using the root element of the DMR++ file.
-
-        Parameters
-        ----------
-        root : ET.Element
-            The root element of the DMR++ file.
-
-        Returns
-        -------
-        ManifestGroup
-        """
+        # get two dictionaries containing the relevant metadata
+        vars_dict, attrs = self.dmrparser()._parse_dataset(dataset_element)
         manifest_dict: dict[str, ManifestArray] = {}
-        for var_tag in self._find_var_tags(root):
-            var_name = self._get_attrib(var_tag, "name")
-            if var_name and var_name not in self.skip_variables:
-                try:
-                    variable = self._parse_variable(var_tag)
-                    manifest_dict[var_name] = variable
-                except (UnboundLocalError, ValueError):
-                    warnings.warn(
-                        f"This DMRpp contains the variable {var_name} that could not"
-                        " be parsed. 
Consider adding it to the list of skipped " - "variables, or opening an issue to help resolve this" - ) - - # Attributes - attrs: dict[str, str] = {} - # Look for an attribute tag called "HDF5_GLOBAL" and unpack it - hdf5_global_attrs = root.find("dap:Attribute[@name='HDF5_GLOBAL']", self._NS) - if hdf5_global_attrs is not None: - # Remove the container attribute and add its children to the root dataset - root.remove(hdf5_global_attrs) - root.extend(hdf5_global_attrs) - for attr_tag in root.iterfind("dap:Attribute", self._NS): - attrs.update(self._parse_attribute(attr_tag)) - - return ManifestGroup( - arrays=manifest_dict, - attributes=attrs, - ) - - def _find_var_tags(self, root: ET.Element) -> list[ET.Element]: - """ - Find all variable tags in the DMR++ file. Also known as array tags. - Tags are labeled with the DAP data type. E.g. , , - - Parameters - ---------- - root : ET.Element - The root element of the DMR++ file. - - Returns - ------- - list[ET.Element] - """ - vars_tags: list[ET.Element] = [] - for dap_dtype in self._DAP_NP_DTYPE: - vars_tags += root.findall(f"dap:{dap_dtype}", self._NS) - return vars_tags - - def _parse_dim(self, root: ET.Element, dim_index: int = 0) -> dict[str, int]: - """ - Parse single or tag - - If the tag has no name attribute, it is a phony dimension. E.g. --> {"phony_dim_0": 300} - If the tag has both name and size attributes, it is a regular dimension. E.g. --> {"lat": 1447} - - Parameters - ---------- - root : ET.Element - The root element Dim/Dimension tag - dim_index : int - Index of the dimension, used for naming phony dimensions - - Returns - ------- - dict - E.g. {"time": 1, "lat": 1447, "lon": 2895}, {"phony_dim_0": 300}, {"time": None, "lat": None, "lon": None} - """ - size_attr = self._get_attrib(root, "size") - name_attr = self._get_attrib(root, "name") - - if size_attr is not None: - size = int(size_attr) - if name_attr is not None: - return {Path(name_attr).name: size} - else: - return {f"phony_dim_{dim_index}": size} - - raise ValueError("Not enough information to parse Dim/Dimension tag") - - def _find_dimension_tags(self, root: ET.Element) -> list[ET.Element]: - """ - Find the all tags with dimension information. - - First attempts to find Dimension tags, then falls back to Dim tags. - If Dim tags are found, the fully qualified name is used to find the corresponding Dimension tag. - If Dim tags have no name attribute, they are phony dimensions and used directly. - Parameters - ---------- - root : ET.Element - An ElementTree Element from a DMR++ file. - - Returns - ------- - list[ET.Element] - """ - dimension_tags = root.findall("dap:Dimension", self._NS) - if not dimension_tags: - # Dim tags contain a fully qualified name that references a Dimension tag elsewhere in the DMR++ - # or they are phony dimensions (have size but no name) - dim_tags = root.findall("dap:Dim", self._NS) - for d in dim_tags: - dim_name = self._get_attrib(d, "name") - if dim_name is not None: - dimension_tag = self.find_node_fqn(dim_name) - if dimension_tag is not None: - dimension_tags.append(dimension_tag) - else: - # Phony dimension - use the Dim tag directly - dimension_tags.append(d) - return dimension_tags - - def _parse_variable(self, var_tag: ET.Element) -> ManifestArray: - """ - Parse a variable from a DMR++ tag. - - Parameters - ---------- - var_tag : ET.Element - An ElementTree Element representing a variable in the DMR++ file. Will have DAP dtype as tag. E.g. 
- - Returns - ------- - ManifestArray - """ - - # Dimension info - dims: dict[str, int] = {} - dimension_tags = self._find_dimension_tags(var_tag) - for dim_index, dim in enumerate(dimension_tags): - dims.update(self._parse_dim(dim, dim_index=dim_index)) - # convert DAP dtype to numpy dtype - dtype = np.dtype( - self._DAP_NP_DTYPE[var_tag.tag.removeprefix("{" + self._NS["dap"] + "}")] - ) - # Chunks and Filters - shape: tuple[int, ...] = tuple(dims.values()) - chunks_shape = shape - chunks_tag = var_tag.find("dmrpp:chunks", self._NS) - array_fill_value = np.array(0).astype(dtype)[()] - if chunks_tag is not None: - # Chunks - chunk_dim_text = chunks_tag.findtext( - "dmrpp:chunkDimensionSizes", namespaces=self._NS + for var in vars_dict.keys(): + chunkmanifest = vars_dict[var].pop("chunkmanifest", None) + meta = dict( + [ + (k, v) + for k, v in vars_dict[var].items() + if k not in ["Maps", "fqn_dims"] + ] ) - if chunk_dim_text is not None: - # 1 1447 2895 -> (1, 1447, 2895) - chunks_shape = tuple(map(int, chunk_dim_text.split())) - else: - chunks_shape = shape - fill_value = self._get_attrib(chunks_tag, "fillValue") - if fill_value is not None: - array_fill_value = np.array(fill_value).astype(dtype)[()] - if chunks_shape: - chunkmanifest = self._parse_chunks(chunks_tag, chunks_shape) - else: - chunkmanifest = ChunkManifest(entries={}, shape=array_fill_value.shape) - # Filters - codecs = self._parse_filters(chunks_tag, dtype) - - # Attributes - attrs: dict[str, Any] = {} - for attr_tag in var_tag.iterfind("dap:Attribute", self._NS): - attrs.update(self._parse_attribute(attr_tag)) - if "_FillValue" in attrs: - encoded_cf_fill_value = encode_cf_fill_value(attrs["_FillValue"], dtype) - attrs["_FillValue"] = encoded_cf_fill_value - - metadata = create_v3_array_metadata( - shape=shape, - data_type=dtype, - chunk_shape=chunks_shape, - codecs=codecs, - dimension_names=dims, - attributes=attrs, - fill_value=array_fill_value, - ) - return ManifestArray(metadata=metadata, chunkmanifest=chunkmanifest) - - def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]: - """ - Parse an attribute from a DMR++ attr tag. Converts the attribute value to a native python type. - Raises an exception if nested attributes are passed. Container attributes must be unwrapped in the parent function. - - Parameters - ---------- - attr_tag : ET.Element - An ElementTree Element with an tag. - - Returns - ------- - dict - """ - attr: dict[str, Any] = {} - values = [] - attr_type = self._get_attrib(attr_tag, "type") - attr_name = self._get_attrib(attr_tag, "name", required=True) - - if attr_name is None: - return {} - - if attr_type == "Container": - # DMR++ build information that is not part of the dataset - if attr_name == "build_dmrpp_metadata": - return {} - else: - warnings.warn( - "This DMRpp contains a nested attribute " - f"{attr_name}. 
Nested attributes cannot " - "be assigned to a variable or dataset and will be dropped" + if "_FillValue" in meta["attributes"]: + encoded_cf_fill_value = encode_cf_fill_value( + meta["attributes"]["_FillValue"], meta["data_type"] ) - return {} - - if attr_type is None: - return {} - - dtype = np.dtype(self._DAP_NP_DTYPE[attr_type]) - # if multiple Value tags are present, store as "key": "[v1, v2, ...]" - for value_tag in attr_tag: - # cast attribute to native python type using dmr provided dtype - val = ( - dtype.type(value_tag.text).item() - if dtype != np.object_ - else value_tag.text - ) - # "*" may represent nan values in DMR++ - if val == "*": - val = np.nan - values.append(val) - attr[attr_name] = values[0] if len(values) == 1 else values - return attr - - def _parse_filters( - self, chunks_tag: ET.Element, dtype: np.dtype - ) -> list[dict] | None: - """ - Parse filters from a DMR++ chunks tag. - - Parameters - ---------- - chunks_tag : ET.Element - An ElementTree Element with a tag. + meta["attributes"]["_FillValue"] = encoded_cf_fill_value - dtype : np.dtype - The numpy dtype of the variable. - - Returns - ------- - list[dict] | None - E.g. [{"id": "shuffle", "elementsize": 4}, {"id": "zlib", "level": 4}] - """ - compression_type = self._get_attrib(chunks_tag, "compressionType") - if compression_type is not None: - filters: list[dict] = [] - # shuffle deflate --> ["shuffle", "deflate"] - compression_types = compression_type.split(" ") - for c in compression_types: - if c == "shuffle": - filters.append( - { - "name": "numcodecs.shuffle", - "configuration": {"elementsize": dtype.itemsize}, - } - ) - elif c == "deflate": - deflate_level = self._get_attrib(chunks_tag, "deflateLevel") - level = ( - int(deflate_level) - if deflate_level is not None - else self._DEFAULT_ZLIB_VALUE - ) - filters.append( - { - "name": "numcodecs.zlib", - "configuration": { - "level": level, - }, - } - ) - return filters - return None - - def _parse_chunks( - self, chunks_tag: ET.Element, chunks_shape: tuple[int, ...] - ) -> ChunkManifest: - """ - Parse the chunk manifest from a DMR++ chunks tag. - - Parameters - ---------- - chunks_tag : ET.Element - An ElementTree Element with a tag. + if "inline" in meta: + raise NotImplementedError( + "Reading inlined reference data is currently not supported. " + "See https://github.com/zarr-developers/VirtualiZarr/issues/489", + ) + meta.pop("inline", None) - chunks_shape : tuple - Chunk sizes for each dimension. E.g. 
(1, 1447, 2895) + chunkmanifest = ChunkManifest(chunkmanifest) + metadata = create_v3_array_metadata(**meta) + manifest_dict[var] = ManifestArray( + metadata=metadata, chunkmanifest=chunkmanifest + ) + manifest_group = ManifestGroup(arrays=manifest_dict, attributes=attrs) + registry: ObjectStoreRegistry = ObjectStoreRegistry() + registry.register(self.data_filepath, object_store) - Returns - ------- - ChunkManifest - """ - chunkmanifest: dict[ChunkKey, object] = {} - default_num: list[int] = ( - [0 for i in range(len(chunks_shape))] if chunks_shape else [0] - ) - chunk_key_template = ".".join(["{}" for i in range(len(default_num))]) - for chunk_tag in chunks_tag.iterfind("dmrpp:chunk", self._NS): - chunk_num = default_num - chunk_position = self._get_attrib(chunk_tag, "chunkPositionInArray") - if chunk_position is not None: - # "[0,1023,10235]" -> ["0","1023","10235"] - chunk_pos = chunk_position[1:-1].split(",") - # [0,1023,10235] // [1, 1023, 2047] -> [0,1,5] - chunk_num = [ - int(chunk_pos[i]) // chunks_shape[i] - for i in range(len(chunks_shape)) - ] - # [0,1,5] -> "0.1.5" - chunk_key = ChunkKey(chunk_key_template.format(*chunk_num)) - offset = self._get_attrib(chunk_tag, "offset", required=True) - n_bytes = self._get_attrib(chunk_tag, "nBytes", required=True) - if offset is not None and n_bytes is not None: - chunkmanifest[chunk_key] = { - "path": self.data_filepath, - "offset": int(offset), - "length": int(n_bytes), - } - return ChunkManifest(entries=chunkmanifest) + return ManifestStore(registry=registry, group=manifest_group) diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py index 19bbda8e..9d4f8625 100644 --- a/virtualizarr/tests/test_parsers/test_dmrpp.py +++ b/virtualizarr/tests/test_parsers/test_dmrpp.py @@ -1,10 +1,9 @@ -import platform import textwrap from contextlib import nullcontext -from pathlib import Path from xml.etree import ElementTree as ET import pytest +import requests import xarray as xr import xarray.testing as xrt from obspec_utils.registry import ObjectStoreRegistry @@ -435,60 +434,7 @@ def test_NASA_dmrpp_load(data_url, dmrpp_url): assert ds.load() -@pytest.mark.parametrize( - "fqn_path, expected_xpath", - [ - ("/", "."), - ("/air", "./*[@name='air']"), - ], -) -def test_find_node_fqn_simple(netcdf4_file, fqn_path, expected_xpath): - parser_instance = dmrparser( - DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file - ) - result = parser_instance.find_node_fqn(fqn_path) - expected = parser_instance.root.find(expected_xpath, parser_instance._NS) - assert result == expected - - -def test_find_node_fqn_grouped(hdf5_groups_file): - parser_instance = dmrparser( - DMRPP_XML_STRINGS["hdf5_groups_file"], filepath=hdf5_groups_file - ) - result = parser_instance.find_node_fqn("/test/group/air") - expected = parser_instance.root.find( - "./*[@name='test']/*[@name='group']/*[@name='air']", parser_instance._NS - ) - assert result == expected - - -@pytest.mark.parametrize( - "group_path", - [ - ("/"), - ("/test"), - ("/test/group"), - ], -) -def test_split_groups(hdf5_groups_file, group_path): - dmrpp_instance = dmrparser( - DMRPP_XML_STRINGS["hdf5_groups_file"], filepath=hdf5_groups_file - ) - - # get all tags in a dataset (so all tags excluding nested groups) - dataset_tags = lambda x: [ - d for d in x if d.tag != "{" + dmrpp_instance._NS["dap"] + "}" + "Group" - ] - # check that contents of the split groups dataset match contents of the original dataset - result_tags = dataset_tags( - 
dmrpp_instance._split_groups(dmrpp_instance.root)[Path(group_path)] - ) - expected_tags = dataset_tags(dmrpp_instance.find_node_fqn(group_path)) - assert result_tags == expected_tags - - @pytest.mark.xfail( - platform.system() == "Linux", reason="See https://github.com/zarr-developers/VirtualiZarr/issues/904.", ) @pytest.mark.parametrize( @@ -558,157 +504,6 @@ def test_parse_dataset_nested(hdf5_groups_file): assert vds_g2.data_vars["air"].dims == ("time", "lat", "lon") -def test_parse_variable(netcdf4_file): - parser = dmrparser(DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file) - - var = parser._parse_variable(parser.find_node_fqn("/air")) - assert var.metadata.dtype.to_native_dtype() == "int16" - assert var.metadata.dimension_names == ("time", "lat", "lon") - assert var.shape == (2920, 25, 53) - assert var.chunks == (2920, 25, 53) - # _FillValue is encoded for array dtype - assert var.metadata.attributes["scale_factor"] == 0.01 - assert ( - var.metadata.attributes["long_name"] - == "4xDaily Air temperature at sigma level 995" - ) - - -@pytest.mark.parametrize( - "attr_path, expected", - [ - ("air/long_name", {"long_name": "4xDaily Air temperature at sigma level 995"}), - ("air/scale_factor", {"scale_factor": 0.01}), - ], -) -def test_parse_attribute(netcdf4_file, attr_path, expected): - parser = dmrparser(DMRPP_XML_STRINGS["netcdf4_file"], filepath=netcdf4_file) - - result = parser._parse_attribute(parser.find_node_fqn(attr_path)) - assert result == expected - - -def test_dmrpp_empty_scalar_warns_container(fill_value_scalar_no_chunks_nc4_url): - parsed_dmrpp = dmrparser( - DMRPP_XML_STRINGS["fill_value_scalar_no_chunks_nc4_url"], - filepath=fill_value_scalar_no_chunks_nc4_url, - ) - store = obstore_local(url=f"file://{parsed_dmrpp.data_filepath}") - with pytest.warns(UserWarning): - parsed_vds = parsed_dmrpp.parse_dataset(object_store=store) - vds_g1 = parsed_vds.to_virtual_dataset() - assert vds_g1["data"].attrs == {"_FillValue": -999} - - -def test_dmrpp_phony_dim_naming(): - dmrpp_xml_str = textwrap.dedent( - """\ - - - - - - - NaN - - - - - - - """ - ) - parser = dmrparser(dmrpp_xml_str, filepath="file:///phony_test.nc") - var = parser._parse_variable(parser.find_node_fqn("/data")) - assert var.metadata.dimension_names == ("phony_dim_0", "phony_dim_1") - assert var.shape == (10, 20) - - -def test_dmrpp_validation_issues_accumulation(): - dmrpp_xml_str = textwrap.dedent( - """\ - - - - - - - 1.0 - - - - - - - """ - ) - parser = dmrparser(dmrpp_xml_str, filepath="file:///validation_test.nc") - parser._parse_dataset(parser.root) - assert len(parser._validation_issues) > 0 - assert any( - "Missing required attribute 'name'" in issue - for issue in parser._validation_issues - ) - - -def test_dmrpp_get_attrib_with_missing_optional(): - dmrpp_xml_str = textwrap.dedent( - """\ - - - - - """ - ) - parser = dmrparser(dmrpp_xml_str, filepath="file:///test.nc") - dimension = parser.root.find("dap:Dimension", parser._NS) - result = parser._get_attrib(dimension, "nonexistent") - assert result is None - assert len(parser._validation_issues) == 1 - - -def test_dmrpp_get_attrib_with_required_missing(): - dmrpp_xml_str = textwrap.dedent( - """\ - - - - - """ - ) - parser = dmrparser(dmrpp_xml_str, filepath="file:///test.nc") - dimension = parser.root.find("dap:Dimension", parser._NS) - with pytest.raises(ValueError, match="Missing required attribute 'nonexistent'"): - parser._get_attrib(dimension, "nonexistent", required=True) - - -def test_dmrpp_mixed_named_and_unnamed_dimensions(): - 
dmrpp_xml_str = textwrap.dedent(
-        """\
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        NaN
-        
-        
-        
-        
-        
-        
-        """
-    )
-    parser = dmrparser(dmrpp_xml_str, filepath="file:///mixed_test.nc")
-    var = parser._parse_variable(parser.find_node_fqn("/data"))
-    assert var.metadata.dimension_names == ("time", "phony_dim_1", "lat")
-    assert var.shape == (10, 20, 30)
-
-
 def test_dmrpp_simple(dmrpp_xml_simple):
     """Test parsing a simple valid DMR++ XML creates virtual chunk manifests."""
     parser = dmrparser(dmrpp_xml_simple, filepath="file:///simple.nc")
@@ -767,3 +562,21 @@ def test_dmrpp_missing_attrib_validation(dmrpp_xml_with_missing_attrib):
     # Verify manifest store was still created (parser continues despite issues)
     assert manifest_store is not None
     assert manifest_store._group is not None
+
+
+@requires_network
+def test_inline_values_not_implemented(hdf5_missing_value):
+    """Reading DMR++ chunk data inlined in the document is unsupported and should raise."""
+    dmrpp_file = (
+        "http://test.opendap.org/opendap/data/dmrpp/compact_lowlevel.h5.dmrpp.file"
+    )
+    session = requests.Session()
+    dmrpp = session.get(dmrpp_file).content.decode()
+    parser = dmrparser(dmrpp, filepath=f"file://{hdf5_missing_value}")
+    store = obstore_local(url=parser.data_filepath)
+
+    with pytest.raises(
+        NotImplementedError,
+        match="Reading inlined reference data is currently not supported",
+    ):
+        parser.parse_dataset(object_store=store)
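
Reviewer note, not part of the patch: a minimal usage sketch of the parser after this migration, assembled only from pieces that appear in the diff and its tests (`DMRParser`, `parse_dataset`, `ManifestStore.to_virtual_dataset`). The local paths and the bare `LocalStore()` construction are illustrative assumptions, not anything this PR adds.

    from xml.etree import ElementTree as ET

    from obstore.store import LocalStore  # assumed; the tests wrap this in an obstore_local() helper

    from virtualizarr.parsers.dmrpp import DMRParser

    # Hypothetical paths -- substitute a real DMR++ sidecar and its data file.
    dmrpp_path = "/tmp/example.nc.dmrpp"
    data_url = "file:///tmp/example.nc"

    # Build the parser from the DMR++ XML root, pointing data_filepath at the
    # underlying data file (DMRPPParser.__call__ does the same after stripping
    # the ".dmrpp" / ".dap.dmrpp" suffix from the URL it was given).
    root = ET.parse(dmrpp_path).getroot()
    parser = DMRParser(root=root, data_filepath=data_url)

    # parse_dataset() now delegates the XML walking to pydap's DMRPPParser and
    # only assembles the ChunkManifests / ManifestArrays on the VirtualiZarr side.
    manifest_store = parser.parse_dataset(object_store=LocalStore())

    # The result is a ManifestStore of chunk references; no array data is read.
    vds = manifest_store.to_virtual_dataset()
    print(vds)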