Skip to content

Commit 5fa3d8f

Browse files
committed
update dmrpp migration - soft imports and code refactor
1 parent 93837b9 commit 5fa3d8f

4 files changed

Lines changed: 20 additions & 596 deletions

File tree

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@ hdf = [
5151
]
5252

5353
# dmrpp
54-
dmrpp = ["pydap>=3.5.10"]
54+
dmrpp = [
55+
"pydap @ git+https://github.com/pydap/pydap.git@refs/pull/697/head",
56+
]
5557

5658
zarr = ["arro3-core"]
5759

virtualizarr/parsers/dmrpp.py

Lines changed: 7 additions & 162 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,15 @@
1-
import base64
21
import io
3-
import warnings
4-
from pathlib import Path
52
from typing import Iterable
63
from xml.etree import ElementTree as ET
74

8-
from obspec_utils.protocols import ReadableStore
5+
# from obspec_utils.protocols import ReadableStore
96
from obspec_utils.readers import EagerStoreReader
107
from obspec_utils.registry import ObjectStoreRegistry
11-
from pydap.parsers.dmr import DMRPPParser as _DMRPPParser
128

13-
from virtualizarr.manifests import (
14-
ChunkManifest,
15-
ManifestArray,
16-
ManifestGroup,
17-
ManifestStore,
18-
)
19-
from virtualizarr.manifests.utils import create_v3_array_metadata
20-
from virtualizarr.parsers.utils import encode_cf_fill_value
9+
from virtualizarr.manifests import ManifestStore
10+
from virtualizarr.utils import soft_import
11+
12+
pydap = soft_import("pydap", "parsing dmrpp references", strict=False)
2113

2214

2315
class DMRPPParser:
@@ -71,159 +63,12 @@ def __call__(
7163
else url.removesuffix(".dmrpp")
7264
)
7365

66+
from pydap.virtualizarr.parser import DMRParser
67+
7468
parser = DMRParser(
7569
root=ET.parse(stream).getroot(),
7670
data_filepath=url,
7771
skip_variables=self.skip_variables,
7872
)
7973
manifest_store = parser.parse_dataset(object_store=store, group=self.group)
8074
return manifest_store
81-
82-
83-
class DMRParser:
84-
"""
85-
Parser for the OPeNDAP DMR++ XML format.
86-
Reads groups, dimensions, coordinates, data variables, encoding, chunk manifests, and attributes.
87-
Highly modular to allow support for older dmrpp schema versions. Includes many utility functions to extract
88-
different information such as finding all variable tags, splitting hdf5 groups, parsing dimensions, and more.
89-
90-
OPeNDAP DMR++ homepage: https://docs.opendap.org/index.php/DMR%2B%2B
91-
"""
92-
93-
# DAP and DMRPP XML namespaces
94-
_NS = {
95-
"dap": "http://xml.opendap.org/ns/DAP/4.0#",
96-
"dmrpp": "http://xml.opendap.org/dap/dmrpp/1.0.0#",
97-
}
98-
99-
root: ET.Element
100-
data_filepath: str
101-
102-
def __init__(
103-
self,
104-
root: ET.Element,
105-
data_filepath: str | None = None,
106-
skip_variables: Iterable[str] | None = None,
107-
):
108-
"""
109-
Initialize the DMRParser with the given DMR++ file contents and source data file path.
110-
111-
Parameters
112-
----------
113-
root
114-
Root of the xml tree structure of a DMR++ file.
115-
data_filepath
116-
The path to the actual data file that will be set in the chunk manifests.
117-
If None, the data file path is taken from the DMR++ file.
118-
"""
119-
self.root = root
120-
self.data_filepath = (
121-
data_filepath if data_filepath is not None else self.root.attrib["name"]
122-
)
123-
self.skip_variables = skip_variables or ()
124-
self._validation_issues: list[str] = []
125-
126-
def dmrparser(self) -> _DMRPPParser:
127-
"""Exposes the _DMRPPParser for external use (avoids breaking changes)"""
128-
parser = _DMRPPParser(
129-
root=self.root,
130-
data_filepath=self.data_filepath,
131-
skip_variables=self.skip_variables,
132-
)
133-
self._validation_issues = parser._validation_issues
134-
return parser
135-
136-
def parse_dataset(
137-
self,
138-
object_store: ReadableStore,
139-
group: str | None = None,
140-
) -> ManifestStore:
141-
"""
142-
Parses the given file and creates a ManifestStore.
143-
144-
Parameters
145-
----------
146-
group
147-
The group to parse. Ignored if no groups are present, and the entire
148-
dataset is parsed. If `None` or "/", and groups are present, the first group
149-
is parsed. If not `None` or "/", and no groups are present, a UserWarning
150-
is issued indicating that the group will be ignored.
151-
152-
Returns
153-
-------
154-
ManifestStore
155-
156-
Examples
157-
--------
158-
Open a sample DMR++ file and parse the dataset
159-
"""
160-
group = group or "/"
161-
ngroups = len(self.root.findall("dap:Group", self._NS))
162-
163-
if ngroups == 0 and group != "/":
164-
warnings.warn(
165-
f"No groups in DMR++ file {self.data_filepath!r}; "
166-
f"ignoring group parameter {group!r}"
167-
)
168-
169-
group_path = Path("/") if ngroups == 0 else Path("/") / group.removeprefix("/")
170-
171-
dataset_element = self.dmrparser()._split_groups(self.root).get(group_path)
172-
173-
if dataset_element is None:
174-
raise ValueError(
175-
f"Group {group_path} not found in DMR++ file {self.data_filepath!r}"
176-
)
177-
178-
# get two dictionaries containing relevant metadata
179-
vars_dict, attrs = self.dmrparser()._parse_dataset(dataset_element)
180-
181-
manifest_dict: dict[str, ManifestArray] = {}
182-
183-
for var in vars_dict.keys():
184-
chunkmanifest = vars_dict[var].pop("chunkmanifest", None)
185-
# remove opendap-related metadata
186-
meta = dict(
187-
[
188-
(k, v)
189-
for k, v in vars_dict[var].items()
190-
if k not in ["Maps", "fqn_dims"]
191-
]
192-
)
193-
if "_FillValue" in meta["attributes"]:
194-
encoded_cf_fill_value = encode_cf_fill_value(
195-
meta["attributes"]["_FillValue"], meta["data_type"]
196-
)
197-
meta["attributes"]["_FillValue"] = encoded_cf_fill_value
198-
199-
if "inline" in meta:
200-
# extract data already decoded into array/string
201-
data = meta.pop("inline", None)
202-
bdata = base64.b64encode(data)
203-
# chunk_entry = ChunkEntry(
204-
# path="", offset=0, length=len(bdata), data=bdata
205-
# )
206-
# chunkmanifest = ChunkManifest(entries=chunk_entry)
207-
208-
chunks = {
209-
"0.0": {
210-
"path": "__inline__",
211-
"offset": 0,
212-
"length": len(bdata),
213-
"data": bdata,
214-
},
215-
}
216-
chunkmanifest = ChunkManifest(entries=chunks)
217-
else:
218-
chunkmanifest = ChunkManifest(chunkmanifest)
219-
220-
metadata = create_v3_array_metadata(**meta)
221-
manifest_dict[var] = ManifestArray(
222-
metadata=metadata, chunkmanifest=chunkmanifest
223-
)
224-
225-
manifest_group = ManifestGroup(arrays=manifest_dict, attributes=attrs)
226-
registry: ObjectStoreRegistry = ObjectStoreRegistry()
227-
registry.register(self.data_filepath, object_store)
228-
229-
return ManifestStore(registry=registry, group=manifest_group)

virtualizarr/tests/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,5 @@ def _importorskip(
4444
has_arro3, requires_arro3 = _importorskip("arro3.core")
4545
# The GribberishParser is new in gribberish 1.0.0.
4646
has_grib, requires_grib = _importorskip("gribberish", minversion="1.0.0")
47+
# The DMRPPParser is new in pydap 3.5.10
48+
has_pydap, requires_pydap = _importorskip("pydap")

0 commit comments

Comments
 (0)