zarr-developers
diff --git a/‎docs/api/parsers/dmrpp.md‎
Lines changed: 19 additions & 0 deletions b/‎docs/api/parsers/dmrpp.md‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎virtualizarr/parsers/dmrpp.py‎
Lines changed: 122 additions & 41 deletions b/‎virtualizarr/parsers/dmrpp.py‎
Lines changed: 122 additions & 41 deletions
diff --git a/‎virtualizarr/tests/test_parsers/conftest.py‎
Lines changed: 52 additions & 0 deletions b/‎virtualizarr/tests/test_parsers/conftest.py‎
Lines changed: 52 additions & 0 deletions
@@ -1,3 +1,22 @@
 # DMR++
 
+The DMRPP parser reads DMR++ (Data Model Response Protocol Plus) XML files, which contain metadata describing datasets and their chunk locations. It creates virtual Zarr arrays backed by the original data files.
+
 ::: virtualizarr.parsers.DMRPPParser
+
+## Usage
+
+```python
+import virtualizarr
+from virtualizarr.parsers import DMRPPParser
+
+# Open a DMR++ file as a virtual dataset
+ds = virtualizarr.open_virtual_dataset(
+    "file:///path/to/dataset.dmrpp",
+    parser=DMRPPParser(),
+)
+```
+
+## Validation
+
+The DMRPP parser includes validation for missing attributes in the DMR++ XML. When required attributes are missing, the parser raises a `ValueError`. When optional attributes are missing, validation issues are accumulated and can be accessed via the `_validation_issues` attribute of the parser instance.
@@ -132,11 +132,58 @@ def __init__(
             If None, the data file path is taken from the DMR++ file.
         """
         self.root = root
+        self._validation_issues: list[str] = []
+        data_filepath_from_root = self._get_attrib(self.root, "name", required=True)
+        assert data_filepath_from_root is not None  # required=True guarantees non-None
         self.data_filepath = (
-            data_filepath if data_filepath is not None else self.root.attrib["name"]
+            data_filepath if data_filepath is not None else data_filepath_from_root
         )
         self.skip_variables = skip_variables or ()
 
+    def _get_attrib(
+        self, element: ET.Element, attrib_name: str, required: bool = False
+    ) -> str | None:
+        """
+        Safely get an attribute from an XML element, logging validation issues.
+
+        Parameters
+        ----------
+        element
+            The XML element to get the attribute from.
+        attrib_name
+            The name of the attribute to get.
+        required
+            If True, raises a ValueError when the attribute is missing. If False,
+            returns None and logs the issue.
+
+        Returns
+        -------
+        str | None
+            The attribute value if found, None otherwise.
+
+        Raises
+        ------
+        ValueError
+            If required is True and the attribute is not found.
+        """
+        if attrib_name in element.attrib:
+            return element.attrib[attrib_name]
+
+        element_info = (
+            element.tag
+            if "name" not in element.attrib
+            else f"{element.tag}[@name='{element.attrib['name']}']"
+        )
+        issue_msg = (
+            f"Missing required attribute '{attrib_name}' in element: {element_info}"
+        )
+        self._validation_issues.append(issue_msg)
+
+        if required:
+            raise ValueError(issue_msg)
+
+        return None
+
     def parse_dataset(
         self,
         object_store: ReadableStore,
@@ -248,7 +295,10 @@ def _split_groups_recursive(
     ) -> dict[Path, ET.Element]:
         group_dict: dict[Path, ET.Element] = {}
         for g in root.iterfind("dap:Group", self._NS):
-            new_path = current_path / Path(g.attrib["name"])
+            group_name = self._get_attrib(g, "name", required=True)
+            if group_name is None:
+                continue
+            new_path = current_path / Path(group_name)
             dataset_tags = [
                 d for d in g if d.tag != "{" + self._NS["dap"] + "}" + "Group"
             ]
@@ -276,14 +326,14 @@ def _parse_dataset(
 
         manifest_dict: dict[str, ManifestArray] = {}
         for var_tag in self._find_var_tags(root):
-            if var_tag.attrib["name"] not in self.skip_variables:
+            var_name = self._get_attrib(var_tag, "name")
+            if var_name and var_name not in self.skip_variables:
                 try:
                     variable = self._parse_variable(var_tag)
-                    manifest_dict[var_tag.attrib["name"]] = variable
+                    manifest_dict[var_name] = variable
                 except (UnboundLocalError, ValueError):
-                    name = var_tag.attrib["name"]
                     warnings.warn(
-                        f"This DMRpp contains the variable {name} that could not"
+                        f"This DMRpp contains the variable {var_name} that could not"
                         " be parsed. Consider adding it to the list  of skipped "
                         "variables, or opening an issue to help resolve this"
                     )
@@ -323,27 +373,35 @@ def _find_var_tags(self, root: ET.Element) -> list[ET.Element]:
             vars_tags += root.findall(f"dap:{dap_dtype}", self._NS)
         return vars_tags
 
-    def _parse_dim(self, root: ET.Element) -> dict[str, int]:
+    def _parse_dim(self, root: ET.Element, dim_index: int = 0) -> dict[str, int]:
         """
         Parse single <Dim> or <Dimension> tag
 
-        If the tag has no name attribute, it is a phony dimension. E.g. <Dim size="300"/> --> {"phony_dim": 300}
+        If the tag has no name attribute, it is a phony dimension. E.g. <Dim size="300"/> --> {"phony_dim_0": 300}
         If the tag has both name and size attributes, it is a regular dimension. E.g. <Dim name="lat" size="1447"/> --> {"lat": 1447}
 
         Parameters
         ----------
         root : ET.Element
             The root element Dim/Dimension tag
+        dim_index : int
+            Index of the dimension, used for naming phony dimensions
 
         Returns
         -------
         dict
-            E.g. {"time": 1, "lat": 1447, "lon": 2895}, {"phony_dim": 300}, {"time": None, "lat": None, "lon": None}
+            E.g. {"time": 1, "lat": 1447, "lon": 2895}, {"phony_dim_0": 300}, {"time": None, "lat": None, "lon": None}
         """
-        if "name" not in root.attrib and "size" in root.attrib:
-            return {"phony_dim": int(root.attrib["size"])}
-        if "name" in root.attrib and "size" in root.attrib:
-            return {Path(root.attrib["name"]).name: int(root.attrib["size"])}
+        size_attr = self._get_attrib(root, "size")
+        name_attr = self._get_attrib(root, "name")
+
+        if size_attr is not None:
+            size = int(size_attr)
+            if name_attr is not None:
+                return {Path(name_attr).name: size}
+            else:
+                return {f"phony_dim_{dim_index}": size}
+
         raise ValueError("Not enough information to parse Dim/Dimension tag")
 
     def _find_dimension_tags(self, root: ET.Element) -> list[ET.Element]:
@@ -352,6 +410,7 @@ def _find_dimension_tags(self, root: ET.Element) -> list[ET.Element]:
 
         First attempts to find Dimension tags, then falls back to Dim tags.
         If Dim tags are found, the fully qualified name is used to find the corresponding Dimension tag.
+        If Dim tags have no name attribute, they are phony dimensions and used directly.
 
         Parameters
         ----------
@@ -365,11 +424,17 @@ def _find_dimension_tags(self, root: ET.Element) -> list[ET.Element]:
         dimension_tags = root.findall("dap:Dimension", self._NS)
         if not dimension_tags:
             # Dim tags contain a fully qualified name that references a Dimension tag elsewhere in the DMR++
+            # or they are phony dimensions (have size but no name)
             dim_tags = root.findall("dap:Dim", self._NS)
             for d in dim_tags:
-                dimension_tag = self.find_node_fqn(d.attrib["name"])
-                if dimension_tag is not None:
-                    dimension_tags.append(dimension_tag)
+                dim_name = self._get_attrib(d, "name")
+                if dim_name is not None:
+                    dimension_tag = self.find_node_fqn(dim_name)
+                    if dimension_tag is not None:
+                        dimension_tags.append(dimension_tag)
+                else:
+                    # Phony dimension - use the Dim tag directly
+                    dimension_tags.append(d)
         return dimension_tags
 
     def _parse_variable(self, var_tag: ET.Element) -> ManifestArray:
@@ -389,8 +454,8 @@ def _parse_variable(self, var_tag: ET.Element) -> ManifestArray:
         # Dimension info
         dims: dict[str, int] = {}
         dimension_tags = self._find_dimension_tags(var_tag)
-        for dim in dimension_tags:
-            dims.update(self._parse_dim(dim))
+        for dim_index, dim in enumerate(dimension_tags):
+            dims.update(self._parse_dim(dim, dim_index=dim_index))
         # convert DAP dtype to numpy dtype
         dtype = np.dtype(
             self._DAP_NP_DTYPE[var_tag.tag.removeprefix("{" + self._NS["dap"] + "}")]
@@ -410,9 +475,9 @@ def _parse_variable(self, var_tag: ET.Element) -> ManifestArray:
                 chunks_shape = tuple(map(int, chunk_dim_text.split()))
             else:
                 chunks_shape = shape
-            if "fillValue" in chunks_tag.attrib:
-                fillValue_attrib = chunks_tag.attrib["fillValue"]
-                array_fill_value = np.array(fillValue_attrib).astype(dtype)[()]
+            fill_value = self._get_attrib(chunks_tag, "fillValue")
+            if fill_value is not None:
+                array_fill_value = np.array(fill_value).astype(dtype)[()]
             if chunks_shape:
                 chunkmanifest = self._parse_chunks(chunks_tag, chunks_shape)
             else:
@@ -455,19 +520,28 @@ def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]:
         """
         attr: dict[str, Any] = {}
         values = []
-        if "type" in attr_tag.attrib and attr_tag.attrib["type"] == "Container":
+        attr_type = self._get_attrib(attr_tag, "type")
+        attr_name = self._get_attrib(attr_tag, "name", required=True)
+
+        if attr_name is None:
+            return {}
+
+        if attr_type == "Container":
             # DMR++ build information that is not part of the dataset
-            if attr_tag.attrib["name"] == "build_dmrpp_metadata":
+            if attr_name == "build_dmrpp_metadata":
                 return {}
             else:
-                container_attr = attr_tag.attrib["name"]
                 warnings.warn(
                     "This DMRpp contains a nested attribute "
-                    f"{container_attr}. Nested attributes cannot "
+                    f"{attr_name}. Nested attributes cannot "
                     "be assigned to a variable or dataset and will be dropped"
                 )
                 return {}
-        dtype = np.dtype(self._DAP_NP_DTYPE[attr_tag.attrib["type"]])
+
+        if attr_type is None:
+            return {}
+
+        dtype = np.dtype(self._DAP_NP_DTYPE[attr_type])
         # if multiple Value tags are present, store as "key": "[v1, v2, ...]"
         for value_tag in attr_tag:
             # cast attribute to native python type using dmr provided dtype
@@ -480,7 +554,7 @@ def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]:
             if val == "*":
                 val = np.nan
             values.append(val)
-        attr[attr_tag.attrib["name"]] = values[0] if len(values) == 1 else values
+        attr[attr_name] = values[0] if len(values) == 1 else values
         return attr
 
     def _parse_filters(
@@ -502,10 +576,11 @@ def _parse_filters(
         list[dict] | None
             E.g. [{"id": "shuffle", "elementsize": 4}, {"id": "zlib", "level": 4}]
         """
-        if "compressionType" in chunks_tag.attrib:
+        compression_type = self._get_attrib(chunks_tag, "compressionType")
+        if compression_type is not None:
             filters: list[dict] = []
             # shuffle deflate --> ["shuffle", "deflate"]
-            compression_types = chunks_tag.attrib["compressionType"].split(" ")
+            compression_types = compression_type.split(" ")
             for c in compression_types:
                 if c == "shuffle":
                     filters.append(
@@ -515,15 +590,17 @@ def _parse_filters(
                         }
                     )
                 elif c == "deflate":
+                    deflate_level = self._get_attrib(chunks_tag, "deflateLevel")
+                    level = (
+                        int(deflate_level)
+                        if deflate_level is not None
+                        else self._DEFAULT_ZLIB_VALUE
+                    )
                     filters.append(
                         {
                             "name": "numcodecs.zlib",
                             "configuration": {
-                                "level": int(
-                                    chunks_tag.attrib.get(
-                                        "deflateLevel", self._DEFAULT_ZLIB_VALUE
-                                    )
-                                ),
+                                "level": level,
                             },
                         }
                     )
@@ -555,19 +632,23 @@ def _parse_chunks(
         chunk_key_template = ".".join(["{}" for i in range(len(default_num))])
         for chunk_tag in chunks_tag.iterfind("dmrpp:chunk", self._NS):
             chunk_num = default_num
-            if "chunkPositionInArray" in chunk_tag.attrib:
+            chunk_position = self._get_attrib(chunk_tag, "chunkPositionInArray")
+            if chunk_position is not None:
                 # "[0,1023,10235]" -> ["0","1023","10235"]
-                chunk_pos = chunk_tag.attrib["chunkPositionInArray"][1:-1].split(",")
+                chunk_pos = chunk_position[1:-1].split(",")
                 # [0,1023,10235] // [1, 1023, 2047] -> [0,1,5]
                 chunk_num = [
                     int(chunk_pos[i]) // chunks_shape[i]
                     for i in range(len(chunks_shape))
                 ]
             # [0,1,5] -> "0.1.5"
             chunk_key = ChunkKey(chunk_key_template.format(*chunk_num))
-            chunkmanifest[chunk_key] = {
-                "path": self.data_filepath,
-                "offset": int(chunk_tag.attrib["offset"]),
-                "length": int(chunk_tag.attrib["nBytes"]),
-            }
+            offset = self._get_attrib(chunk_tag, "offset", required=True)
+            n_bytes = self._get_attrib(chunk_tag, "nBytes", required=True)
+            if offset is not None and n_bytes is not None:
+                chunkmanifest[chunk_key] = {
+                    "path": self.data_filepath,
+                    "offset": int(offset),
+                    "length": int(n_bytes),
+                }
         return ChunkManifest(entries=chunkmanifest)
@@ -1,3 +1,4 @@
+import textwrap
 import warnings
 from pathlib import Path
 
@@ -476,3 +477,54 @@ def big_endian_dtype_hdf5_file(tmpdir):
     dset = f["data"]
     dset[...] = 10
     return filepath
+
+
+@pytest.fixture()
+def dmrpp_xml_simple():
+    """Return a minimal valid DMR++ XML string for testing."""
+    return textwrap.dedent(
+        """\
+        <?xml version="1.0" encoding="ISO-8859-1"?>
+        <Dataset xmlns="http://xml.opendap.org/ns/DAP/4.0#" xmlns:dmrpp="http://xml.opendap.org/dap/dmrpp/1.0.0#" dapVersion="4.0" dmrVersion="1.0" name="simple.nc" dmrpp:href="OPeNDAP_DMRpp_DATA_ACCESS_URL" dmrpp:version="3.21.1-451">
+            <Dimension name="lat" size="25"/>
+            <Dimension name="lon" size="53"/>
+            <Float32 name="temperature">
+                <Dim name="/lat"/>
+                <Dim name="/lon"/>
+                <Attribute name="long_name" type="String">
+                    <Value>Temperature</Value>
+                </Attribute>
+                <Attribute name="units" type="String">
+                    <Value>K</Value>
+                </Attribute>
+                <dmrpp:chunks fillValue="0" byteOrder="LE">
+                    <dmrpp:chunkDimensionSizes>25 53</dmrpp:chunkDimensionSizes>
+                    <dmrpp:chunk offset="100" nBytes="5300"/>
+                </dmrpp:chunks>
+            </Float32>
+        </Dataset>
+        """
+    )
+
+
+@pytest.fixture()
+def dmrpp_xml_with_missing_attrib():
+    """Return a DMR++ XML string with missing attributes for validation testing."""
+    return textwrap.dedent(
+        """\
+        <?xml version="1.0" encoding="ISO-8859-1"?>
+        <Dataset xmlns="http://xml.opendap.org/ns/DAP/4.0#" xmlns:dmrpp="http://xml.opendap.org/dap/dmrpp/1.0.0#" dapVersion="4.0" dmrVersion="1.0" name="validation_test.nc" dmrpp:href="OPeNDAP_DMRpp_DATA_ACCESS_URL" dmrpp:version="3.21.1-451">
+            <Dimension size="10"/>
+            <Float32 name="data">
+                <Dim size="10"/>
+                <Attribute name="long_name" type="String">
+                    <Value>Test Data</Value>
+                </Attribute>
+                <dmrpp:chunks fillValue="0" byteOrder="LE">
+                    <dmrpp:chunkDimensionSizes>10</dmrpp:chunkDimensionSizes>
+                    <dmrpp:chunk offset="100" nBytes="400"/>
+                </dmrpp:chunks>
+            </Float32>
+        </Dataset>
+        """
+    )