Skip to content

Commit 085bcb6

Browse files
authored
Fix dmrpp error handling (#880)
* add check to dim names * update dmrpp parser * update docstrings to include dmrpp * fix mypy
1 parent 9ac9e65 commit 085bcb6

5 files changed

Lines changed: 364 additions & 42 deletions

File tree

docs/api/parsers/dmrpp.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,22 @@
11
# DMR++
22

3+
The DMRPP parser reads DMR++ (Data Model Response Protocol Plus) XML files, which contain metadata describing datasets and their chunk locations. It creates virtual Zarr arrays backed by the original data files.
4+
35
::: virtualizarr.parsers.DMRPPParser
6+
7+
## Usage
8+
9+
```python
10+
import virtualizarr
11+
from virtualizarr.parsers import DMRPPParser
12+
13+
# Open a DMR++ file as a virtual dataset
14+
ds = virtualizarr.open_virtual_dataset(
15+
"file:///path/to/dataset.dmrpp",
16+
parser=DMRPPParser(),
17+
)
18+
```
19+
20+
## Validation
21+
22+
The DMRPP parser includes validation for missing attributes in the DMR++ XML. When required attributes are missing, the parser raises a `ValueError`. When optional attributes are missing, validation issues are accumulated and can be accessed via the `_validation_issues` attribute of the parser instance.

virtualizarr/parsers/dmrpp.py

Lines changed: 122 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -132,11 +132,58 @@ def __init__(
132132
If None, the data file path is taken from the DMR++ file.
133133
"""
134134
self.root = root
135+
self._validation_issues: list[str] = []
136+
data_filepath_from_root = self._get_attrib(self.root, "name", required=True)
137+
assert data_filepath_from_root is not None # required=True guarantees non-None
135138
self.data_filepath = (
136-
data_filepath if data_filepath is not None else self.root.attrib["name"]
139+
data_filepath if data_filepath is not None else data_filepath_from_root
137140
)
138141
self.skip_variables = skip_variables or ()
139142

143+
def _get_attrib(
144+
self, element: ET.Element, attrib_name: str, required: bool = False
145+
) -> str | None:
146+
"""
147+
Safely get an attribute from an XML element, logging validation issues.
148+
149+
Parameters
150+
----------
151+
element
152+
The XML element to get the attribute from.
153+
attrib_name
154+
The name of the attribute to get.
155+
required
156+
If True, raises a ValueError when the attribute is missing. If False,
157+
returns None and logs the issue.
158+
159+
Returns
160+
-------
161+
str | None
162+
The attribute value if found, None otherwise.
163+
164+
Raises
165+
------
166+
ValueError
167+
If required is True and the attribute is not found.
168+
"""
169+
if attrib_name in element.attrib:
170+
return element.attrib[attrib_name]
171+
172+
element_info = (
173+
element.tag
174+
if "name" not in element.attrib
175+
else f"{element.tag}[@name='{element.attrib['name']}']"
176+
)
177+
issue_msg = (
178+
f"Missing required attribute '{attrib_name}' in element: {element_info}"
179+
)
180+
self._validation_issues.append(issue_msg)
181+
182+
if required:
183+
raise ValueError(issue_msg)
184+
185+
return None
186+
140187
def parse_dataset(
141188
self,
142189
object_store: ReadableStore,
@@ -248,7 +295,10 @@ def _split_groups_recursive(
248295
) -> dict[Path, ET.Element]:
249296
group_dict: dict[Path, ET.Element] = {}
250297
for g in root.iterfind("dap:Group", self._NS):
251-
new_path = current_path / Path(g.attrib["name"])
298+
group_name = self._get_attrib(g, "name", required=True)
299+
if group_name is None:
300+
continue
301+
new_path = current_path / Path(group_name)
252302
dataset_tags = [
253303
d for d in g if d.tag != "{" + self._NS["dap"] + "}" + "Group"
254304
]
@@ -276,14 +326,14 @@ def _parse_dataset(
276326

277327
manifest_dict: dict[str, ManifestArray] = {}
278328
for var_tag in self._find_var_tags(root):
279-
if var_tag.attrib["name"] not in self.skip_variables:
329+
var_name = self._get_attrib(var_tag, "name")
330+
if var_name and var_name not in self.skip_variables:
280331
try:
281332
variable = self._parse_variable(var_tag)
282-
manifest_dict[var_tag.attrib["name"]] = variable
333+
manifest_dict[var_name] = variable
283334
except (UnboundLocalError, ValueError):
284-
name = var_tag.attrib["name"]
285335
warnings.warn(
286-
f"This DMRpp contains the variable {name} that could not"
336+
f"This DMRpp contains the variable {var_name} that could not"
287337
" be parsed. Consider adding it to the list of skipped "
288338
"variables, or opening an issue to help resolve this"
289339
)
@@ -323,27 +373,35 @@ def _find_var_tags(self, root: ET.Element) -> list[ET.Element]:
323373
vars_tags += root.findall(f"dap:{dap_dtype}", self._NS)
324374
return vars_tags
325375

326-
def _parse_dim(self, root: ET.Element) -> dict[str, int]:
376+
def _parse_dim(self, root: ET.Element, dim_index: int = 0) -> dict[str, int]:
327377
"""
328378
Parse single <Dim> or <Dimension> tag
329379
330-
If the tag has no name attribute, it is a phony dimension. E.g. <Dim size="300"/> --> {"phony_dim": 300}
380+
If the tag has no name attribute, it is a phony dimension. E.g. <Dim size="300"/> --> {"phony_dim_0": 300}
331381
If the tag has both name and size attributes, it is a regular dimension. E.g. <Dim name="lat" size="1447"/> --> {"lat": 1447}
332382
333383
Parameters
334384
----------
335385
root : ET.Element
336386
The root element Dim/Dimension tag
387+
dim_index : int
388+
Index of the dimension, used for naming phony dimensions
337389
338390
Returns
339391
-------
340392
dict
341-
E.g. {"time": 1, "lat": 1447, "lon": 2895}, {"phony_dim": 300}, {"time": None, "lat": None, "lon": None}
393+
E.g. {"time": 1, "lat": 1447, "lon": 2895}, {"phony_dim_0": 300}, {"time": None, "lat": None, "lon": None}
342394
"""
343-
if "name" not in root.attrib and "size" in root.attrib:
344-
return {"phony_dim": int(root.attrib["size"])}
345-
if "name" in root.attrib and "size" in root.attrib:
346-
return {Path(root.attrib["name"]).name: int(root.attrib["size"])}
395+
size_attr = self._get_attrib(root, "size")
396+
name_attr = self._get_attrib(root, "name")
397+
398+
if size_attr is not None:
399+
size = int(size_attr)
400+
if name_attr is not None:
401+
return {Path(name_attr).name: size}
402+
else:
403+
return {f"phony_dim_{dim_index}": size}
404+
347405
raise ValueError("Not enough information to parse Dim/Dimension tag")
348406

349407
def _find_dimension_tags(self, root: ET.Element) -> list[ET.Element]:
@@ -352,6 +410,7 @@ def _find_dimension_tags(self, root: ET.Element) -> list[ET.Element]:
352410
353411
First attempts to find Dimension tags, then falls back to Dim tags.
354412
If Dim tags are found, the fully qualified name is used to find the corresponding Dimension tag.
413+
If Dim tags have no name attribute, they are phony dimensions and used directly.
355414
356415
Parameters
357416
----------
@@ -365,11 +424,17 @@ def _find_dimension_tags(self, root: ET.Element) -> list[ET.Element]:
365424
dimension_tags = root.findall("dap:Dimension", self._NS)
366425
if not dimension_tags:
367426
# Dim tags contain a fully qualified name that references a Dimension tag elsewhere in the DMR++
427+
# or they are phony dimensions (have size but no name)
368428
dim_tags = root.findall("dap:Dim", self._NS)
369429
for d in dim_tags:
370-
dimension_tag = self.find_node_fqn(d.attrib["name"])
371-
if dimension_tag is not None:
372-
dimension_tags.append(dimension_tag)
430+
dim_name = self._get_attrib(d, "name")
431+
if dim_name is not None:
432+
dimension_tag = self.find_node_fqn(dim_name)
433+
if dimension_tag is not None:
434+
dimension_tags.append(dimension_tag)
435+
else:
436+
# Phony dimension - use the Dim tag directly
437+
dimension_tags.append(d)
373438
return dimension_tags
374439

375440
def _parse_variable(self, var_tag: ET.Element) -> ManifestArray:
@@ -389,8 +454,8 @@ def _parse_variable(self, var_tag: ET.Element) -> ManifestArray:
389454
# Dimension info
390455
dims: dict[str, int] = {}
391456
dimension_tags = self._find_dimension_tags(var_tag)
392-
for dim in dimension_tags:
393-
dims.update(self._parse_dim(dim))
457+
for dim_index, dim in enumerate(dimension_tags):
458+
dims.update(self._parse_dim(dim, dim_index=dim_index))
394459
# convert DAP dtype to numpy dtype
395460
dtype = np.dtype(
396461
self._DAP_NP_DTYPE[var_tag.tag.removeprefix("{" + self._NS["dap"] + "}")]
@@ -410,9 +475,9 @@ def _parse_variable(self, var_tag: ET.Element) -> ManifestArray:
410475
chunks_shape = tuple(map(int, chunk_dim_text.split()))
411476
else:
412477
chunks_shape = shape
413-
if "fillValue" in chunks_tag.attrib:
414-
fillValue_attrib = chunks_tag.attrib["fillValue"]
415-
array_fill_value = np.array(fillValue_attrib).astype(dtype)[()]
478+
fill_value = self._get_attrib(chunks_tag, "fillValue")
479+
if fill_value is not None:
480+
array_fill_value = np.array(fill_value).astype(dtype)[()]
416481
if chunks_shape:
417482
chunkmanifest = self._parse_chunks(chunks_tag, chunks_shape)
418483
else:
@@ -455,19 +520,28 @@ def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]:
455520
"""
456521
attr: dict[str, Any] = {}
457522
values = []
458-
if "type" in attr_tag.attrib and attr_tag.attrib["type"] == "Container":
523+
attr_type = self._get_attrib(attr_tag, "type")
524+
attr_name = self._get_attrib(attr_tag, "name", required=True)
525+
526+
if attr_name is None:
527+
return {}
528+
529+
if attr_type == "Container":
459530
# DMR++ build information that is not part of the dataset
460-
if attr_tag.attrib["name"] == "build_dmrpp_metadata":
531+
if attr_name == "build_dmrpp_metadata":
461532
return {}
462533
else:
463-
container_attr = attr_tag.attrib["name"]
464534
warnings.warn(
465535
"This DMRpp contains a nested attribute "
466-
f"{container_attr}. Nested attributes cannot "
536+
f"{attr_name}. Nested attributes cannot "
467537
"be assigned to a variable or dataset and will be dropped"
468538
)
469539
return {}
470-
dtype = np.dtype(self._DAP_NP_DTYPE[attr_tag.attrib["type"]])
540+
541+
if attr_type is None:
542+
return {}
543+
544+
dtype = np.dtype(self._DAP_NP_DTYPE[attr_type])
471545
# if multiple Value tags are present, store as "key": "[v1, v2, ...]"
472546
for value_tag in attr_tag:
473547
# cast attribute to native python type using dmr provided dtype
@@ -480,7 +554,7 @@ def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]:
480554
if val == "*":
481555
val = np.nan
482556
values.append(val)
483-
attr[attr_tag.attrib["name"]] = values[0] if len(values) == 1 else values
557+
attr[attr_name] = values[0] if len(values) == 1 else values
484558
return attr
485559

486560
def _parse_filters(
@@ -502,10 +576,11 @@ def _parse_filters(
502576
list[dict] | None
503577
E.g. [{"id": "shuffle", "elementsize": 4}, {"id": "zlib", "level": 4}]
504578
"""
505-
if "compressionType" in chunks_tag.attrib:
579+
compression_type = self._get_attrib(chunks_tag, "compressionType")
580+
if compression_type is not None:
506581
filters: list[dict] = []
507582
# shuffle deflate --> ["shuffle", "deflate"]
508-
compression_types = chunks_tag.attrib["compressionType"].split(" ")
583+
compression_types = compression_type.split(" ")
509584
for c in compression_types:
510585
if c == "shuffle":
511586
filters.append(
@@ -515,15 +590,17 @@ def _parse_filters(
515590
}
516591
)
517592
elif c == "deflate":
593+
deflate_level = self._get_attrib(chunks_tag, "deflateLevel")
594+
level = (
595+
int(deflate_level)
596+
if deflate_level is not None
597+
else self._DEFAULT_ZLIB_VALUE
598+
)
518599
filters.append(
519600
{
520601
"name": "numcodecs.zlib",
521602
"configuration": {
522-
"level": int(
523-
chunks_tag.attrib.get(
524-
"deflateLevel", self._DEFAULT_ZLIB_VALUE
525-
)
526-
),
603+
"level": level,
527604
},
528605
}
529606
)
@@ -555,19 +632,23 @@ def _parse_chunks(
555632
chunk_key_template = ".".join(["{}" for i in range(len(default_num))])
556633
for chunk_tag in chunks_tag.iterfind("dmrpp:chunk", self._NS):
557634
chunk_num = default_num
558-
if "chunkPositionInArray" in chunk_tag.attrib:
635+
chunk_position = self._get_attrib(chunk_tag, "chunkPositionInArray")
636+
if chunk_position is not None:
559637
# "[0,1023,10235]" -> ["0","1023","10235"]
560-
chunk_pos = chunk_tag.attrib["chunkPositionInArray"][1:-1].split(",")
638+
chunk_pos = chunk_position[1:-1].split(",")
561639
# [0,1023,10235] // [1, 1023, 2047] -> [0,1,5]
562640
chunk_num = [
563641
int(chunk_pos[i]) // chunks_shape[i]
564642
for i in range(len(chunks_shape))
565643
]
566644
# [0,1,5] -> "0.1.5"
567645
chunk_key = ChunkKey(chunk_key_template.format(*chunk_num))
568-
chunkmanifest[chunk_key] = {
569-
"path": self.data_filepath,
570-
"offset": int(chunk_tag.attrib["offset"]),
571-
"length": int(chunk_tag.attrib["nBytes"]),
572-
}
646+
offset = self._get_attrib(chunk_tag, "offset", required=True)
647+
n_bytes = self._get_attrib(chunk_tag, "nBytes", required=True)
648+
if offset is not None and n_bytes is not None:
649+
chunkmanifest[chunk_key] = {
650+
"path": self.data_filepath,
651+
"offset": int(offset),
652+
"length": int(n_bytes),
653+
}
573654
return ChunkManifest(entries=chunkmanifest)

virtualizarr/tests/test_parsers/conftest.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import textwrap
12
import warnings
23
from pathlib import Path
34

@@ -476,3 +477,54 @@ def big_endian_dtype_hdf5_file(tmpdir):
476477
dset = f["data"]
477478
dset[...] = 10
478479
return filepath
480+
481+
482+
@pytest.fixture()
483+
def dmrpp_xml_simple():
484+
"""Return a minimal valid DMR++ XML string for testing."""
485+
return textwrap.dedent(
486+
"""\
487+
<?xml version="1.0" encoding="ISO-8859-1"?>
488+
<Dataset xmlns="http://xml.opendap.org/ns/DAP/4.0#" xmlns:dmrpp="http://xml.opendap.org/dap/dmrpp/1.0.0#" dapVersion="4.0" dmrVersion="1.0" name="simple.nc" dmrpp:href="OPeNDAP_DMRpp_DATA_ACCESS_URL" dmrpp:version="3.21.1-451">
489+
<Dimension name="lat" size="25"/>
490+
<Dimension name="lon" size="53"/>
491+
<Float32 name="temperature">
492+
<Dim name="/lat"/>
493+
<Dim name="/lon"/>
494+
<Attribute name="long_name" type="String">
495+
<Value>Temperature</Value>
496+
</Attribute>
497+
<Attribute name="units" type="String">
498+
<Value>K</Value>
499+
</Attribute>
500+
<dmrpp:chunks fillValue="0" byteOrder="LE">
501+
<dmrpp:chunkDimensionSizes>25 53</dmrpp:chunkDimensionSizes>
502+
<dmrpp:chunk offset="100" nBytes="5300"/>
503+
</dmrpp:chunks>
504+
</Float32>
505+
</Dataset>
506+
"""
507+
)
508+
509+
510+
@pytest.fixture()
511+
def dmrpp_xml_with_missing_attrib():
512+
"""Return a DMR++ XML string with missing attributes for validation testing."""
513+
return textwrap.dedent(
514+
"""\
515+
<?xml version="1.0" encoding="ISO-8859-1"?>
516+
<Dataset xmlns="http://xml.opendap.org/ns/DAP/4.0#" xmlns:dmrpp="http://xml.opendap.org/dap/dmrpp/1.0.0#" dapVersion="4.0" dmrVersion="1.0" name="validation_test.nc" dmrpp:href="OPeNDAP_DMRpp_DATA_ACCESS_URL" dmrpp:version="3.21.1-451">
517+
<Dimension size="10"/>
518+
<Float32 name="data">
519+
<Dim size="10"/>
520+
<Attribute name="long_name" type="String">
521+
<Value>Test Data</Value>
522+
</Attribute>
523+
<dmrpp:chunks fillValue="0" byteOrder="LE">
524+
<dmrpp:chunkDimensionSizes>10</dmrpp:chunkDimensionSizes>
525+
<dmrpp:chunk offset="100" nBytes="400"/>
526+
</dmrpp:chunks>
527+
</Float32>
528+
</Dataset>
529+
"""
530+
)

0 commit comments

Comments
 (0)