@@ -132,11 +132,58 @@ def __init__(
132132 If None, the data file path is taken from the DMR++ file.
133133 """
134134 self .root = root
135+ self ._validation_issues : list [str ] = []
136+ data_filepath_from_root = self ._get_attrib (self .root , "name" , required = True )
137+ assert data_filepath_from_root is not None # required=True guarantees non-None
135138 self .data_filepath = (
136- data_filepath if data_filepath is not None else self . root . attrib [ "name" ]
139+ data_filepath if data_filepath is not None else data_filepath_from_root
137140 )
138141 self .skip_variables = skip_variables or ()
139142
143+ def _get_attrib (
144+ self , element : ET .Element , attrib_name : str , required : bool = False
145+ ) -> str | None :
146+ """
147+ Safely get an attribute from an XML element, logging validation issues.
148+
149+ Parameters
150+ ----------
151+ element
152+ The XML element to get the attribute from.
153+ attrib_name
154+ The name of the attribute to get.
155+ required
156+ If True, raises a ValueError when the attribute is missing. If False,
157+ returns None and logs the issue.
158+
159+ Returns
160+ -------
161+ str | None
162+ The attribute value if found, None otherwise.
163+
164+ Raises
165+ ------
166+ ValueError
167+ If required is True and the attribute is not found.
168+ """
169+ if attrib_name in element .attrib :
170+ return element .attrib [attrib_name ]
171+
172+ element_info = (
173+ element .tag
174+ if "name" not in element .attrib
175+ else f"{ element .tag } [@name='{ element .attrib ['name' ]} ']"
176+ )
177+ issue_msg = (
178+ f"Missing required attribute '{ attrib_name } ' in element: { element_info } "
179+ )
180+ self ._validation_issues .append (issue_msg )
181+
182+ if required :
183+ raise ValueError (issue_msg )
184+
185+ return None
186+
140187 def parse_dataset (
141188 self ,
142189 object_store : ReadableStore ,
@@ -248,7 +295,10 @@ def _split_groups_recursive(
248295 ) -> dict [Path , ET .Element ]:
249296 group_dict : dict [Path , ET .Element ] = {}
250297 for g in root .iterfind ("dap:Group" , self ._NS ):
251- new_path = current_path / Path (g .attrib ["name" ])
298+ group_name = self ._get_attrib (g , "name" , required = True )
299+ if group_name is None :
300+ continue
301+ new_path = current_path / Path (group_name )
252302 dataset_tags = [
253303 d for d in g if d .tag != "{" + self ._NS ["dap" ] + "}" + "Group"
254304 ]
@@ -276,14 +326,14 @@ def _parse_dataset(
276326
277327 manifest_dict : dict [str , ManifestArray ] = {}
278328 for var_tag in self ._find_var_tags (root ):
279- if var_tag .attrib ["name" ] not in self .skip_variables :
329+ var_name = self ._get_attrib (var_tag , "name" )
330+ if var_name and var_name not in self .skip_variables :
280331 try :
281332 variable = self ._parse_variable (var_tag )
282- manifest_dict [var_tag . attrib [ "name" ] ] = variable
333+ manifest_dict [var_name ] = variable
283334 except (UnboundLocalError , ValueError ):
284- name = var_tag .attrib ["name" ]
285335 warnings .warn (
286- f"This DMRpp contains the variable { name } that could not"
336+ f"This DMRpp contains the variable { var_name } that could not"
287337 " be parsed. Consider adding it to the list of skipped "
288338 "variables, or opening an issue to help resolve this"
289339 )
@@ -323,27 +373,35 @@ def _find_var_tags(self, root: ET.Element) -> list[ET.Element]:
323373 vars_tags += root .findall (f"dap:{ dap_dtype } " , self ._NS )
324374 return vars_tags
325375
326- def _parse_dim (self , root : ET .Element ) -> dict [str , int ]:
376+ def _parse_dim (self , root : ET .Element , dim_index : int = 0 ) -> dict [str , int ]:
327377 """
328378 Parse single <Dim> or <Dimension> tag
329379
330- If the tag has no name attribute, it is a phony dimension. E.g. <Dim size="300"/> --> {"phony_dim ": 300}
380+ If the tag has no name attribute, it is a phony dimension. E.g. <Dim size="300"/> --> {"phony_dim_0 ": 300}
331381 If the tag has both name and size attributes, it is a regular dimension. E.g. <Dim name="lat" size="1447"/> --> {"lat": 1447}
332382
333383 Parameters
334384 ----------
335385 root : ET.Element
336386 The root element Dim/Dimension tag
387+ dim_index : int
388+ Index of the dimension, used for naming phony dimensions
337389
338390 Returns
339391 -------
340392 dict
341- E.g. {"time": 1, "lat": 1447, "lon": 2895}, {"phony_dim ": 300}, {"time": None, "lat": None, "lon": None}
393+ E.g. {"time": 1, "lat": 1447, "lon": 2895}, {"phony_dim_0 ": 300}, {"time": None, "lat": None, "lon": None}
342394 """
343- if "name" not in root .attrib and "size" in root .attrib :
344- return {"phony_dim" : int (root .attrib ["size" ])}
345- if "name" in root .attrib and "size" in root .attrib :
346- return {Path (root .attrib ["name" ]).name : int (root .attrib ["size" ])}
395+ size_attr = self ._get_attrib (root , "size" )
396+ name_attr = self ._get_attrib (root , "name" )
397+
398+ if size_attr is not None :
399+ size = int (size_attr )
400+ if name_attr is not None :
401+ return {Path (name_attr ).name : size }
402+ else :
403+ return {f"phony_dim_{ dim_index } " : size }
404+
347405 raise ValueError ("Not enough information to parse Dim/Dimension tag" )
348406
349407 def _find_dimension_tags (self , root : ET .Element ) -> list [ET .Element ]:
@@ -352,6 +410,7 @@ def _find_dimension_tags(self, root: ET.Element) -> list[ET.Element]:
352410
353411 First attempts to find Dimension tags, then falls back to Dim tags.
354412 If Dim tags are found, the fully qualified name is used to find the corresponding Dimension tag.
413+ If Dim tags have no name attribute, they are phony dimensions and used directly.
355414
356415 Parameters
357416 ----------
@@ -365,11 +424,17 @@ def _find_dimension_tags(self, root: ET.Element) -> list[ET.Element]:
365424 dimension_tags = root .findall ("dap:Dimension" , self ._NS )
366425 if not dimension_tags :
367426 # Dim tags contain a fully qualified name that references a Dimension tag elsewhere in the DMR++
427+ # or they are phony dimensions (have size but no name)
368428 dim_tags = root .findall ("dap:Dim" , self ._NS )
369429 for d in dim_tags :
370- dimension_tag = self .find_node_fqn (d .attrib ["name" ])
371- if dimension_tag is not None :
372- dimension_tags .append (dimension_tag )
430+ dim_name = self ._get_attrib (d , "name" )
431+ if dim_name is not None :
432+ dimension_tag = self .find_node_fqn (dim_name )
433+ if dimension_tag is not None :
434+ dimension_tags .append (dimension_tag )
435+ else :
436+ # Phony dimension - use the Dim tag directly
437+ dimension_tags .append (d )
373438 return dimension_tags
374439
375440 def _parse_variable (self , var_tag : ET .Element ) -> ManifestArray :
@@ -389,8 +454,8 @@ def _parse_variable(self, var_tag: ET.Element) -> ManifestArray:
389454 # Dimension info
390455 dims : dict [str , int ] = {}
391456 dimension_tags = self ._find_dimension_tags (var_tag )
392- for dim in dimension_tags :
393- dims .update (self ._parse_dim (dim ))
457+ for dim_index , dim in enumerate ( dimension_tags ) :
458+ dims .update (self ._parse_dim (dim , dim_index = dim_index ))
394459 # convert DAP dtype to numpy dtype
395460 dtype = np .dtype (
396461 self ._DAP_NP_DTYPE [var_tag .tag .removeprefix ("{" + self ._NS ["dap" ] + "}" )]
@@ -410,9 +475,9 @@ def _parse_variable(self, var_tag: ET.Element) -> ManifestArray:
410475 chunks_shape = tuple (map (int , chunk_dim_text .split ()))
411476 else :
412477 chunks_shape = shape
413- if "fillValue" in chunks_tag . attrib :
414- fillValue_attrib = chunks_tag . attrib [ "fillValue" ]
415- array_fill_value = np .array (fillValue_attrib ).astype (dtype )[()]
478+ fill_value = self . _get_attrib ( chunks_tag , "fillValue" )
479+ if fill_value is not None :
480+ array_fill_value = np .array (fill_value ).astype (dtype )[()]
416481 if chunks_shape :
417482 chunkmanifest = self ._parse_chunks (chunks_tag , chunks_shape )
418483 else :
@@ -455,19 +520,28 @@ def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]:
455520 """
456521 attr : dict [str , Any ] = {}
457522 values = []
458- if "type" in attr_tag .attrib and attr_tag .attrib ["type" ] == "Container" :
523+ attr_type = self ._get_attrib (attr_tag , "type" )
524+ attr_name = self ._get_attrib (attr_tag , "name" , required = True )
525+
526+ if attr_name is None :
527+ return {}
528+
529+ if attr_type == "Container" :
459530 # DMR++ build information that is not part of the dataset
460- if attr_tag . attrib [ "name" ] == "build_dmrpp_metadata" :
531+ if attr_name == "build_dmrpp_metadata" :
461532 return {}
462533 else :
463- container_attr = attr_tag .attrib ["name" ]
464534 warnings .warn (
465535 "This DMRpp contains a nested attribute "
466- f"{ container_attr } . Nested attributes cannot "
536+ f"{ attr_name } . Nested attributes cannot "
467537 "be assigned to a variable or dataset and will be dropped"
468538 )
469539 return {}
470- dtype = np .dtype (self ._DAP_NP_DTYPE [attr_tag .attrib ["type" ]])
540+
541+ if attr_type is None :
542+ return {}
543+
544+ dtype = np .dtype (self ._DAP_NP_DTYPE [attr_type ])
471545 # if multiple Value tags are present, store as "key": "[v1, v2, ...]"
472546 for value_tag in attr_tag :
473547 # cast attribute to native python type using dmr provided dtype
@@ -480,7 +554,7 @@ def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]:
480554 if val == "*" :
481555 val = np .nan
482556 values .append (val )
483- attr [attr_tag . attrib [ "name" ] ] = values [0 ] if len (values ) == 1 else values
557+ attr [attr_name ] = values [0 ] if len (values ) == 1 else values
484558 return attr
485559
486560 def _parse_filters (
@@ -502,10 +576,11 @@ def _parse_filters(
502576 list[dict] | None
503577 E.g. [{"id": "shuffle", "elementsize": 4}, {"id": "zlib", "level": 4}]
504578 """
505- if "compressionType" in chunks_tag .attrib :
579+ compression_type = self ._get_attrib (chunks_tag , "compressionType" )
580+ if compression_type is not None :
506581 filters : list [dict ] = []
507582 # shuffle deflate --> ["shuffle", "deflate"]
508- compression_types = chunks_tag . attrib [ "compressionType" ] .split (" " )
583+ compression_types = compression_type .split (" " )
509584 for c in compression_types :
510585 if c == "shuffle" :
511586 filters .append (
@@ -515,15 +590,17 @@ def _parse_filters(
515590 }
516591 )
517592 elif c == "deflate" :
593+ deflate_level = self ._get_attrib (chunks_tag , "deflateLevel" )
594+ level = (
595+ int (deflate_level )
596+ if deflate_level is not None
597+ else self ._DEFAULT_ZLIB_VALUE
598+ )
518599 filters .append (
519600 {
520601 "name" : "numcodecs.zlib" ,
521602 "configuration" : {
522- "level" : int (
523- chunks_tag .attrib .get (
524- "deflateLevel" , self ._DEFAULT_ZLIB_VALUE
525- )
526- ),
603+ "level" : level ,
527604 },
528605 }
529606 )
@@ -555,19 +632,23 @@ def _parse_chunks(
555632 chunk_key_template = "." .join (["{}" for i in range (len (default_num ))])
556633 for chunk_tag in chunks_tag .iterfind ("dmrpp:chunk" , self ._NS ):
557634 chunk_num = default_num
558- if "chunkPositionInArray" in chunk_tag .attrib :
635+ chunk_position = self ._get_attrib (chunk_tag , "chunkPositionInArray" )
636+ if chunk_position is not None :
559637 # "[0,1023,10235]" -> ["0","1023","10235"]
560- chunk_pos = chunk_tag . attrib [ "chunkPositionInArray" ] [1 :- 1 ].split ("," )
638+ chunk_pos = chunk_position [1 :- 1 ].split ("," )
561639 # [0,1023,10235] // [1, 1023, 2047] -> [0,1,5]
562640 chunk_num = [
563641 int (chunk_pos [i ]) // chunks_shape [i ]
564642 for i in range (len (chunks_shape ))
565643 ]
566644 # [0,1,5] -> "0.1.5"
567645 chunk_key = ChunkKey (chunk_key_template .format (* chunk_num ))
568- chunkmanifest [chunk_key ] = {
569- "path" : self .data_filepath ,
570- "offset" : int (chunk_tag .attrib ["offset" ]),
571- "length" : int (chunk_tag .attrib ["nBytes" ]),
572- }
646+ offset = self ._get_attrib (chunk_tag , "offset" , required = True )
647+ n_bytes = self ._get_attrib (chunk_tag , "nBytes" , required = True )
648+ if offset is not None and n_bytes is not None :
649+ chunkmanifest [chunk_key ] = {
650+ "path" : self .data_filepath ,
651+ "offset" : int (offset ),
652+ "length" : int (n_bytes ),
653+ }
573654 return ChunkManifest (entries = chunkmanifest )
0 commit comments