|
1 | | -import base64 |
2 | 1 | import io |
3 | | -import warnings |
4 | | -from pathlib import Path |
5 | 2 | from typing import Iterable |
6 | 3 | from xml.etree import ElementTree as ET |
7 | 4 |
|
8 | | -from obspec_utils.protocols import ReadableStore |
| 5 | +# from obspec_utils.protocols import ReadableStore |
9 | 6 | from obspec_utils.readers import EagerStoreReader |
10 | 7 | from obspec_utils.registry import ObjectStoreRegistry |
11 | | -from pydap.parsers.dmr import DMRPPParser as _DMRPPParser |
12 | 8 |
|
13 | | -from virtualizarr.manifests import ( |
14 | | - ChunkManifest, |
15 | | - ManifestArray, |
16 | | - ManifestGroup, |
17 | | - ManifestStore, |
18 | | -) |
19 | | -from virtualizarr.manifests.utils import create_v3_array_metadata |
20 | | -from virtualizarr.parsers.utils import encode_cf_fill_value |
| 9 | +from virtualizarr.manifests import ManifestStore |
| 10 | +from virtualizarr.utils import soft_import |
| 11 | + |
| 12 | +pydap = soft_import("pydap", "parsing dmrpp references", strict=False) |
21 | 13 |
|
22 | 14 |
|
23 | 15 | class DMRPPParser: |
@@ -71,159 +63,12 @@ def __call__( |
71 | 63 | else url.removesuffix(".dmrpp") |
72 | 64 | ) |
73 | 65 |
|
| 66 | + from pydap.virtualizarr.parser import DMRParser |
| 67 | + |
74 | 68 | parser = DMRParser( |
75 | 69 | root=ET.parse(stream).getroot(), |
76 | 70 | data_filepath=url, |
77 | 71 | skip_variables=self.skip_variables, |
78 | 72 | ) |
79 | 73 | manifest_store = parser.parse_dataset(object_store=store, group=self.group) |
80 | 74 | return manifest_store |
81 | | - |
82 | | - |
83 | | -class DMRParser: |
84 | | - """ |
85 | | - Parser for the OPeNDAP DMR++ XML format. |
86 | | - Reads groups, dimensions, coordinates, data variables, encoding, chunk manifests, and attributes. |
87 | | - Highly modular to allow support for older dmrpp schema versions. Includes many utility functions to extract |
88 | | - different information such as finding all variable tags, splitting hdf5 groups, parsing dimensions, and more. |
89 | | -
|
90 | | - OPeNDAP DMR++ homepage: https://docs.opendap.org/index.php/DMR%2B%2B |
91 | | - """ |
92 | | - |
93 | | - # DAP and DMRPP XML namespaces |
94 | | - _NS = { |
95 | | - "dap": "http://xml.opendap.org/ns/DAP/4.0#", |
96 | | - "dmrpp": "http://xml.opendap.org/dap/dmrpp/1.0.0#", |
97 | | - } |
98 | | - |
99 | | - root: ET.Element |
100 | | - data_filepath: str |
101 | | - |
102 | | - def __init__( |
103 | | - self, |
104 | | - root: ET.Element, |
105 | | - data_filepath: str | None = None, |
106 | | - skip_variables: Iterable[str] | None = None, |
107 | | - ): |
108 | | - """ |
109 | | - Initialize the DMRParser with the given DMR++ file contents and source data file path. |
110 | | -
|
111 | | - Parameters |
112 | | - ---------- |
113 | | - root |
114 | | - Root of the xml tree structure of a DMR++ file. |
115 | | - data_filepath |
116 | | - The path to the actual data file that will be set in the chunk manifests. |
117 | | - If None, the data file path is taken from the DMR++ file. |
118 | | - """ |
119 | | - self.root = root |
120 | | - self.data_filepath = ( |
121 | | - data_filepath if data_filepath is not None else self.root.attrib["name"] |
122 | | - ) |
123 | | - self.skip_variables = skip_variables or () |
124 | | - self._validation_issues: list[str] = [] |
125 | | - |
126 | | - def dmrparser(self) -> _DMRPPParser: |
127 | | - """Exposes the _DMRParser to external use (avoids breaking changes)""" |
128 | | - parser = _DMRPPParser( |
129 | | - root=self.root, |
130 | | - data_filepath=self.data_filepath, |
131 | | - skip_variables=self.skip_variables, |
132 | | - ) |
133 | | - self._validation_issues = parser._validation_issues |
134 | | - return parser |
135 | | - |
136 | | - def parse_dataset( |
137 | | - self, |
138 | | - object_store: ReadableStore, |
139 | | - group: str | None = None, |
140 | | - ) -> ManifestStore: |
141 | | - """ |
142 | | - Parses the given file and creates a ManifestStore. |
143 | | -
|
144 | | - Parameters |
145 | | - ---------- |
146 | | - group |
147 | | - The group to parse. Ignored if no groups are present, and the entire |
148 | | - dataset is parsed. If `None` or "/", and groups are present, the first group |
149 | | - is parsed. If not `None` or "/", and no groups are present, a UserWarning |
150 | | - is issued indicating that the group will be ignored. |
151 | | -
|
152 | | - Returns |
153 | | - ------- |
154 | | - ManifestStore |
155 | | -
|
156 | | - Examples |
157 | | - -------- |
158 | | - Open a sample DMR++ file and parse the dataset |
159 | | - """ |
160 | | - group = group or "/" |
161 | | - ngroups = len(self.root.findall("dap:Group", self._NS)) |
162 | | - |
163 | | - if ngroups == 0 and group != "/": |
164 | | - warnings.warn( |
165 | | - f"No groups in DMR++ file {self.data_filepath!r}; " |
166 | | - f"ignoring group parameter {group!r}" |
167 | | - ) |
168 | | - |
169 | | - group_path = Path("/") if ngroups == 0 else Path("/") / group.removeprefix("/") |
170 | | - |
171 | | - dataset_element = self.dmrparser()._split_groups(self.root).get(group_path) |
172 | | - |
173 | | - if dataset_element is None: |
174 | | - raise ValueError( |
175 | | - f"Group {group_path} not found in DMR++ file {self.data_filepath!r}" |
176 | | - ) |
177 | | - |
178 | | - # get two dictionary containing relevant metadata |
179 | | - vars_dict, attrs = self.dmrparser()._parse_dataset(dataset_element) |
180 | | - |
181 | | - manifest_dict: dict[str, ManifestArray] = {} |
182 | | - |
183 | | - for var in vars_dict.keys(): |
184 | | - chunkmanifest = vars_dict[var].pop("chunkmanifest", None) |
185 | | - # remove opendap-related metadata |
186 | | - meta = dict( |
187 | | - [ |
188 | | - (k, v) |
189 | | - for k, v in vars_dict[var].items() |
190 | | - if k not in ["Maps", "fqn_dims"] |
191 | | - ] |
192 | | - ) |
193 | | - if "_FillValue" in meta["attributes"]: |
194 | | - encoded_cf_fill_value = encode_cf_fill_value( |
195 | | - meta["attributes"]["_FillValue"], meta["data_type"] |
196 | | - ) |
197 | | - meta["attributes"]["_FillValue"] = encoded_cf_fill_value |
198 | | - |
199 | | - if "inline" in meta: |
200 | | - # extract data already decoded into array/string |
201 | | - data = meta.pop("inline", None) |
202 | | - bdata = base64.b64encode(data) |
203 | | - # chunk_entry = ChunkEntry( |
204 | | - # path="", offset=0, length=len(bdata), data=bdata |
205 | | - # ) |
206 | | - # chunkmanifest = ChunkManifest(entries=chunk_entry) |
207 | | - |
208 | | - chunks = { |
209 | | - "0.0": { |
210 | | - "path": "__inline__", |
211 | | - "offset": 0, |
212 | | - "length": len(bdata), |
213 | | - "data": bdata, |
214 | | - }, |
215 | | - } |
216 | | - chunkmanifest = ChunkManifest(entries=chunks) |
217 | | - else: |
218 | | - chunkmanifest = ChunkManifest(chunkmanifest) |
219 | | - |
220 | | - metadata = create_v3_array_metadata(**meta) |
221 | | - manifest_dict[var] = ManifestArray( |
222 | | - metadata=metadata, chunkmanifest=chunkmanifest |
223 | | - ) |
224 | | - |
225 | | - manifest_group = ManifestGroup(arrays=manifest_dict, attributes=attrs) |
226 | | - registry: ObjectStoreRegistry = ObjectStoreRegistry() |
227 | | - registry.register(self.data_filepath, object_store) |
228 | | - |
229 | | - return ManifestStore(registry=registry, group=manifest_group) |
0 commit comments