Skip to content

Bug in HDFParser? Different length obtained to h5dump #968

@charles-turner-1

Description

@charles-turner-1

I'm trying to virtualise some netCDF files in a Ceph store, and I'm running into a slightly weird issue. The bucket has public read set, so this should function as a minimal reproducible example:

Code:

from virtualizarr import open_virtual_dataset 
from virtualizarr.parsers import HDFParser
from obstore.store import S3Store
from obspec_utils.registry import ObjectStoreRegistry

import numcodecs.zarr3


endpoint = "https://projects.pawsey.org.au"
bucket = "s3://zarr-data-stream-test"
file_pseudopath = 'output/SWWA/WA-DWER/ERA5/historical/r1i1p1f1/R3/v1/mon/pr/pr_SWWA_ERA5_historical_r1i1p1f1_R3_v1_mon_198001-198012.nc'


store = S3Store.from_url(
    f"{bucket}",
    endpoint=endpoint,
    skip_signature=True,
)

registry = ObjectStoreRegistry({f"{bucket}": store})

parser = HDFParser()


url = f"{bucket}/{file_pseudopath}"

open_virtual_dataset(
    url = url,
    parser=parser,
    registry=registry,
    decode_times=False,
)

Output:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[3], line 31
     27 
     28 
     29 url = f"{bucket}/{file_pseudopath}"
     30 
---> 31 open_virtual_dataset(
     32     url = url,
     33     parser=parser,
     34     registry=registry,

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/virtualizarr/xarray.py:231, in open_virtual_dataset(url, registry, parser, drop_variables, loadable_variables, decode_times)
    224 filepath = validate_and_normalize_path_to_uri(url, fs_root=Path.cwd().as_uri())
    226 manifest_store = parser(
    227     url=filepath,
    228     registry=registry,
    229 )
--> [231](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/virtualizarr/xarray.py:231) ds = manifest_store.to_virtual_dataset(
    232     loadable_variables=loadable_variables,
    233     decode_times=decode_times,
    234 )
    235 return ds.drop_vars(list(drop_variables or ()))

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/virtualizarr/manifests/store.py:328, in ManifestStore.to_virtual_dataset(self, group, loadable_variables, decode_times)
    323 if loadable_variables and self._registry.map is None:
    324     raise ValueError(
    325         f"ManifestStore contains an empty store registry, but {loadable_variables} were provided as loadable variables. Must provide an ObjectStore instance in order to load variables."
    326     )
--> [328](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/virtualizarr/manifests/store.py:328) return construct_virtual_dataset(
    329     manifest_store=self,
    330     group=group,
    331     loadable_variables=loadable_variables,
    332     decode_times=decode_times,
    333 )

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/virtualizarr/xarray.py:463, in construct_virtual_dataset(manifest_store, group, loadable_variables, decode_times, reader_options)
    459     manifestgroup = manifest_store._group
    461 fully_virtual_ds = manifestgroup.to_virtual_dataset()
--> [463](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/virtualizarr/xarray.py:463) with xr.open_zarr(
    464     manifest_store,
    465     group=group,
    466     consolidated=False,
    467     zarr_format=3,
    468     chunks=None,
    469     decode_times=decode_times,
    470 ) as loadable_ds:
    471     return replace_virtual_with_loadable_vars(
    472         fully_virtual_ds, loadable_ds, loadable_variables
    473     )

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/backends/zarr.py:1609, in open_zarr(store, group, synchronizer, chunks, decode_cf, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, consolidated, overwrite_encoded_chunks, chunk_store, storage_options, decode_timedelta, use_cftime, zarr_version, zarr_format, use_zarr_fill_value_as_mask, chunked_array_type, from_array_kwargs, create_default_indexes, **kwargs)
   1595     raise TypeError(
   1596         "open_zarr() got unexpected keyword arguments " + ",".join(kwargs.keys())
   1597     )
   1599 backend_kwargs = {
   1600     "synchronizer": synchronizer,
   1601     "consolidated": consolidated,
   (...)   1606     "zarr_format": zarr_format,
   1607 }
-> [1609](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/backends/zarr.py:1609) ds = open_dataset(
   1610     filename_or_obj=store,
   1611     group=group,
   1612     decode_cf=decode_cf,
   1613     mask_and_scale=mask_and_scale,
   1614     decode_times=decode_times,
   1615     concat_characters=concat_characters,
   1616     decode_coords=decode_coords,
   1617     engine="zarr",
   1618     chunks=chunks,
   1619     drop_variables=drop_variables,
   1620     create_default_indexes=create_default_indexes,
   1621     chunked_array_type=chunked_array_type,
   1622     from_array_kwargs=from_array_kwargs,
   1623     backend_kwargs=backend_kwargs,
   1624     decode_timedelta=decode_timedelta,
   1625     use_cftime=use_cftime,
   1626     zarr_version=zarr_version,
   1627     use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask,
   1628 )
   1629 return ds

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/backends/api.py:613, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, create_default_indexes, inline_array, chunked_array_type, from_array_kwargs, backend_kwargs, **kwargs)
    606 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
    607 backend_ds = backend.open_dataset(
    608     filename_or_obj,
    609     drop_variables=drop_variables,
    610     **decoders,
    611     **kwargs,
    612 )
--> [613](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/backends/api.py:613) ds = _dataset_from_backend_dataset(
    614     backend_ds,
    615     filename_or_obj,
    616     engine,
    617     chunks,
    618     cache,
    619     overwrite_encoded_chunks,
    620     inline_array,
    621     chunked_array_type,
    622     from_array_kwargs,
    623     drop_variables=drop_variables,
    624     create_default_indexes=create_default_indexes,
    625     **decoders,
    626     **kwargs,
    627 )
    628 return ds

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/backends/api.py:303, in _dataset_from_backend_dataset(backend_ds, filename_or_obj, engine, chunks, cache, overwrite_encoded_chunks, inline_array, chunked_array_type, from_array_kwargs, create_default_indexes, **extra_tokens)
    300 _protect_dataset_variables_inplace(backend_ds, cache)
    302 if create_default_indexes:
--> [303](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/backends/api.py:303)     ds = _maybe_create_default_indexes(backend_ds)
    304 else:
    305     ds = backend_ds

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/backends/api.py:279, in _maybe_create_default_indexes(ds)
    273 def _maybe_create_default_indexes(ds):
    274     to_index = {
    275         name: coord.variable
    276         for name, coord in ds.coords.items()
    277         if coord.dims == (name,) and name not in ds.xindexes
    278     }
--> [279](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/backends/api.py:279)     return ds.assign_coords(Coordinates(to_index))

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/coordinates.py:324, in Coordinates.__init__(self, coords, indexes)
    322 var = as_variable(data, name=name, auto_convert=False)
    323 if var.dims == (name,) and indexes is None:
--> [324](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/coordinates.py:324)     index, index_vars = create_default_index_implicit(var, list(coords))
    325     default_indexes.update(dict.fromkeys(index_vars, index))
    326     variables.update(index_vars)

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/indexes.py:1665, in create_default_index_implicit(dim_variable, all_variables)
   1663 else:
   1664     dim_var = {name: dim_variable}
-> [1665](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/indexes.py:1665)     index = PandasIndex.from_variables(dim_var, options={})
   1666     index_vars = index.create_variables(dim_var)
   1668 return index, index_vars

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/indexes.py:728, in PandasIndex.from_variables(cls, variables, options)
    720 dim = var.dims[0]
    722 # TODO: (benbovy - explicit indexes): add __index__ to ExplicitlyIndexesNDArrayMixin?
    723 # this could be eventually used by Variable.to_index() and would remove the need to perform
    724 # the checks below.
    725 
    726 # preserve wrapped pd.Index (if any)
    727 # accessing `.data` can load data from disk, so we only access if needed
--> [728](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/indexes.py:728) data = var._data if isinstance(var._data, PandasIndexingAdapter) else var.data  # type: ignore[redundant-expr]
    729 # multi-index level variable: get level index
    730 if isinstance(var._data, PandasMultiIndexingAdapter):

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/variable.py:456, in Variable.data(self)
    454     duck_array = self._data.array
    455 elif isinstance(self._data, indexing.ExplicitlyIndexed):
--> [456](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/variable.py:456)     duck_array = self._data.get_duck_array()
    457 elif is_duck_array(self._data):
    458     duck_array = self._data

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/indexing.py:970, in MemoryCachedArray.get_duck_array(self)
    969 def get_duck_array(self):
--> [970](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/indexing.py:970)     duck_array = self.array.get_duck_array()
    971     # ensure the array object is cached in-memory
    972     self.array = as_indexable(duck_array)

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/indexing.py:924, in CopyOnWriteArray.get_duck_array(self)
    923 def get_duck_array(self):
--> [924](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/indexing.py:924)     return self.array.get_duck_array()

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/indexing.py:764, in LazilyIndexedArray.get_duck_array(self)
    761 from xarray.backends.common import BackendArray
    763 if isinstance(self.array, BackendArray):
--> [764](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/indexing.py:764)     array = self.array[self.key]
    765 else:
    766     array = apply_indexer(self.array, self.key)

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/backends/zarr.py:267, in ZarrArrayWrapper.__getitem__(self, key)
    265 elif isinstance(key, indexing.OuterIndexer):
    266     method = self._oindex
--> [267](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/backends/zarr.py:267) return indexing.explicit_indexing_adapter(
    268     key, array.shape, indexing.IndexingSupport.VECTORIZED, method
    269 )

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/indexing.py:1156, in explicit_indexing_adapter(key, shape, indexing_support, raw_indexing_method)
   1134 """Support explicit indexing by delegating to a raw indexing method.
   1135 
   1136 Outer and/or vectorized indexers are supported by indexing a second time
   (...)   1153 Indexing result, in the form of a duck numpy-array.
   1154 """
   1155 raw_key, numpy_indices = decompose_indexer(key, shape, indexing_support)
-> [1156](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/core/indexing.py:1156) result = raw_indexing_method(raw_key.tuple)
   1157 if numpy_indices.tuple:
   1158     # index the loaded duck array
   1159     indexable = as_indexable(result)

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/backends/zarr.py:230, in ZarrArrayWrapper._getitem(self, key)
    229 def _getitem(self, key):
--> [230](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/xarray/backends/zarr.py:230)     return self._array[key]

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/array.py:2868, in Array.__getitem__(self, selection)
   2866     return self.vindex[cast("CoordinateSelection | MaskSelection", selection)]
   2867 elif is_pure_orthogonal_indexing(pure_selection, self.ndim):
-> [2868](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/array.py:2868)     return self.get_orthogonal_selection(pure_selection, fields=fields)
   2869 else:
   2870     return self.get_basic_selection(cast("BasicSelection", pure_selection), fields=fields)

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/array.py:3339, in Array.get_orthogonal_selection(self, selection, out, fields, prototype)
   3337     prototype = default_buffer_prototype()
   3338 indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid)
-> [3339](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/array.py:3339) return sync(
   3340     self.async_array._get_selection(
   3341         indexer=indexer, out=out, fields=fields, prototype=prototype
   3342     )
   3343 )

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/sync.py:159, in sync(coro, loop, timeout)
    156 return_result = next(iter(finished)).result()
    158 if isinstance(return_result, BaseException):
--> [159](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/sync.py:159)     raise return_result
    160 else:
    161     return return_result

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/sync.py:119, in _runner(coro)
    114 """
    115 Await a coroutine and return the result of running it. If awaiting the coroutine raises an
    116 exception, the exception will be returned.
    117 """
    118 try:
--> [119](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/sync.py:119)     return await coro
    120 except Exception as ex:
    121     return ex

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/array.py:1565, in AsyncArray._get_selection(self, indexer, prototype, out, fields)
   1562         _config = replace(_config, order=self.order)
   1564     # reading chunks and decoding them
-> [1565](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/array.py:1565)     await self.codec_pipeline.read(
   1566         [
   1567             (
   1568                 self.store_path / self.metadata.encode_chunk_key(chunk_coords),
   1569                 self.metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype),
   1570                 chunk_selection,
   1571                 out_selection,
   1572                 is_complete_chunk,
   1573             )
   1574             for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer
   1575         ],
   1576         out_buffer,
   1577         drop_axes=indexer.drop_axes,
   1578     )
   1579 if isinstance(indexer, BasicIndexer) and indexer.shape == ():
   1580     return out_buffer.as_scalar()

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/codec_pipeline.py:473, in BatchedCodecPipeline.read(self, batch_info, out, drop_axes)
    467 async def read(
    468     self,
    469     batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
    470     out: NDBuffer,
    471     drop_axes: tuple[int, ...] = (),
    472 ) -> None:
--> [473](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/codec_pipeline.py:473)     await concurrent_map(
    474         [
    475             (single_batch_info, out, drop_axes)
    476             for single_batch_info in batched(batch_info, self.batch_size)
    477         ],
    478         self.read_batch,
    479         config.get("async.concurrency"),
    480     )

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/common.py:[116](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/common.py:116), in concurrent_map(items, func, limit)
    113     async with sem:
    114         return await func(*item)
--> 116 return await asyncio.gather(*[asyncio.ensure_future(run(item)) for item in items])

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/common.py:[114](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/common.py:114), in concurrent_map.<locals>.run(item)
    112 async def run(item: tuple[Any]) -> V:
    113     async with sem:
--> 114         return await func(*item)

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/codec_pipeline.py:275, in BatchedCodecPipeline.read_batch(self, batch_info, out, drop_axes)
    269 else:
    270     chunk_bytes_batch = await concurrent_map(
    271         [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info],
    272         lambda byte_getter, prototype: byte_getter.get(prototype),
    273         config.get("async.concurrency"),
    274     )
--> [275](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/codec_pipeline.py:275)     chunk_array_batch = await self.decode_batch(
    276         [
    277             (chunk_bytes, chunk_spec)
    278             for chunk_bytes, (_, chunk_spec, *_) in zip(
    279                 chunk_bytes_batch, batch_info, strict=False
    280             )
    281         ],
    282     )
    283     for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip(
    284         chunk_array_batch, batch_info, strict=False
    285     ):
    286         if chunk_array is not None:

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/codec_pipeline.py:190, in BatchedCodecPipeline.decode_batch(self, chunk_bytes_and_specs)
    183 (
    184     aa_codecs_with_spec,
    185     ab_codec_with_spec,
    186     bb_codecs_with_spec,
    187 ) = self._codecs_with_resolved_metadata_batched(chunk_specs)
    189 for bb_codec, chunk_spec_batch in bb_codecs_with_spec[::-1]:
--> [190](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/codec_pipeline.py:190)     chunk_bytes_batch = await bb_codec.decode(
    191         zip(chunk_bytes_batch, chunk_spec_batch, strict=False)
    192     )
    194 ab_codec, chunk_spec_batch = ab_codec_with_spec
    195 chunk_array_batch = await ab_codec.decode(
    196     zip(chunk_bytes_batch, chunk_spec_batch, strict=False)
    197 )

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/abc/codec.py:159, in BaseCodec.decode(self, chunks_and_specs)
    143 async def decode(
    144     self,
    145     chunks_and_specs: Iterable[tuple[CodecOutput | None, ArraySpec]],
    146 ) -> Iterable[CodecInput | None]:
    147     """Decodes a batch of chunks.
    148     Chunks can be None in which case they are ignored by the codec.
    149 
   (...)    157     Iterable[CodecInput | None]
    158     """
--> [159](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/abc/codec.py:159)     return await _batching_helper(self._decode_single, chunks_and_specs)

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/abc/codec.py:467, in _batching_helper(func, batch_info)
    463 async def _batching_helper(
    464     func: Callable[[CodecInput, ArraySpec], Awaitable[CodecOutput | None]],
    465     batch_info: Iterable[tuple[CodecInput | None, ArraySpec]],
    466 ) -> list[CodecOutput | None]:
--> [467](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/abc/codec.py:467)     return await concurrent_map(
    468         list(batch_info),
    469         _noop_for_none(func),
    470         config.get("async.concurrency"),
    471     )

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/common.py:116, in concurrent_map(items, func, limit)
    113     async with sem:
    114         return await func(*item)
--> [116](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/common.py:116) return await asyncio.gather(*[asyncio.ensure_future(run(item)) for item in items])

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/common.py:114, in concurrent_map.<locals>.run(item)
    112 async def run(item: tuple[Any]) -> V:
    113     async with sem:
--> [114](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/common.py:114)         return await func(*item)

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/abc/codec.py:480, in _noop_for_none.<locals>.wrap(chunk, chunk_spec)
    478 if chunk is None:
    479     return None
--> [480](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/abc/codec.py:480) return await func(chunk, chunk_spec)

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/codecs/numcodecs/_codecs.py:144, in _NumcodecsBytesBytesCodec._decode_single(self, chunk_data, chunk_spec)
    143 async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
--> [144](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/codecs/numcodecs/_codecs.py:144)     return await asyncio.to_thread(
    145         as_numpy_array_wrapper,
    146         self._codec.decode,
    147         chunk_data,
    148         chunk_spec.prototype,
    149     )

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/asyncio/threads.py:25, in to_thread(func, *args, **kwargs)
     23 ctx = contextvars.copy_context()
     24 func_call = functools.partial(ctx.run, func, *args, **kwargs)
---> [25](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/asyncio/threads.py:25) return await loop.run_in_executor(None, func_call)

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/concurrent/futures/thread.py:86, in _WorkItem.run(self, ctx)
     83     return
     85 try:
---> [86](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/concurrent/futures/thread.py:86)     result = ctx.run(self.task)
     87 except BaseException as exc:
     88     self.future.set_exception(exc)

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/concurrent/futures/thread.py:73, in WorkerContext.run(self, task)
     71 def run(self, task):
     72     fn, args, kwargs = task
---> [73](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/concurrent/futures/thread.py:73)     return fn(*args, **kwargs)

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/buffer/cpu.py:219, in as_numpy_array_wrapper(func, buf, prototype)
    194 def as_numpy_array_wrapper(
    195     func: Callable[[npt.NDArray[Any]], bytes], buf: core.Buffer, prototype: core.BufferPrototype
    196 ) -> core.Buffer:
    197     """Converts the input of `func` to a numpy array and the output back to `Buffer`.
    198 
    199     This function is useful when calling a `func` that only support host memory such
   (...)    217         The result of `func` converted to a `Buffer`
    218     """
--> [219](https://file+.vscode-resource.vscode-cdn.net/Users/u1166368/vue/dwer-csi-streamer/~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/zarr/core/buffer/cpu.py:219)     return prototype.buffer.from_bytes(func(buf.as_numpy_array()))

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/numcodecs/shuffle.py:51, in Shuffle.decode(self, buf, out)
     50 def decode(self, buf, out=None):
---> 51     buf, out = self._prepare_arrays(buf, out)
     53     if self.elementsize <= 1:
     54         return out  # no shuffling needed

File ~/vue/dwer-csi-streamer/python/.pixi/envs/default/lib/python3.14/site-packages/numcodecs/shuffle.py:36, in Shuffle._prepare_arrays(self, buf, out)
     33     return buf, out
     35 if buf.nbytes % self.elementsize != 0:
---> 36     raise ValueError("Shuffle buffer is not an integer multiple of elementsize")
     38 return buf, out

ValueError: Shuffle buffer is not an integer multiple of elementsize

If I use h5dump to look at what's inside the netCDF file itself (I'm just looking at time_bnds here, but this seems to affect other variables too - specifying drop_variables=['time_bnds'] doesn't solve the issue):

!cd ~ && curl -Os https://projects.pawsey.org.au/zarr-data-stream-test/output/SWWA/WA-DWER/ERA5/historical/r1i1p1f1/R3/v1/mon/tasmax/tasmax_SWWA_ERA5_historical_r1i1p1f1_R3_v1_mon_201001-201012.nc && h5dump -pH ~/tasmax_SWWA_ERA5_historical_r1i1p1f1_R3_v1_mon_201001-201012.nc | tail -43

DATASET "time_bnds" {
      DATATYPE  H5T_IEEE_F64LE
      DATASPACE  SIMPLE { ( 12, 2 ) / ( 12, 2 ) }
      STORAGE_LAYOUT {
         CHUNKED ( 12, 2 )
         SIZE 61 (3.148:1 COMPRESSION)
      }
      FILTERS {
         CHECKSUM FLETCHER32
         PREPROCESSING SHUFFLE
         COMPRESSION DEFLATE { LEVEL 5 }
      }
      FILLVALUE {
         FILL_TIME H5D_FILL_TIME_NEVER
         VALUE  H5D_FILL_VALUE_DEFAULT
      }
      ALLOCATION_TIME {
         H5D_ALLOC_TIME_INCR
      }
      ATTRIBUTE "DIMENSION_LIST" {
         DATATYPE  H5T_VLEN { H5T_REFERENCE { H5T_STD_REF_OBJECT } }
         DATASPACE  SIMPLE { ( 2 ) / ( 2 ) }
      }
      ATTRIBUTE "_Netcdf4Coordinates" {
         DATATYPE  H5T_STD_I32LE
         DATASPACE  SIMPLE { ( 2 ) / ( 2 ) }
      }
      ATTRIBUTE "_Netcdf4Dimid" {
         DATATYPE  H5T_STD_I32LE
         DATASPACE  SCALAR
      }
      ATTRIBUTE "long_name" {
         DATATYPE  H5T_STRING {
            STRSIZE 11;
            STRPAD H5T_STR_NULLTERM;
            CSET H5T_CSET_ASCII;
            CTYPE H5T_C_S1;
         }
         DATASPACE  SCALAR
      }
   }
}
}

If I look at the size in the manifest:

>>> manifest_store = parser(
    url=url,
    registry=registry,
)

>>> manifest_store._group.arrays['time_bnds'].manifest['0.0'].get('length')

np.uint64(59)

So it looks to me like the HDFParser is somehow off by two bytes here: h5dump reports a size of 61 bytes for this chunk, but the manifest records a length of 59. This might actually be a bug in numcodecs, but I found it via virtualizarr. I'm not totally sure where it belongs, so I'm happy to close this and move the issue over there if that's a more appropriate place for it.


Pixi Environment:

[workspace]
authors = ["Charles Turner <charles.turner@anu.edu.au>"]
channels = ["conda-forge"]
name = "python"
platforms = ["osx-arm64"]
version = "0.1.0"

[tasks]

[dependencies]
s3fs = ">=2026.3.0,<2027"
boto3 = ">=1.42.70,<2"
botocore = ">=1.42.70,<2"
ipykernel = ">=7.2.0,<8"
jupyterlab = ">=4.5.6,<5"
virtualizarr = ">=2.5.0,<3"
zarr = ">=3.1.5,<4"
xarray = ">=2026.2.0,<2027"
obstore = ">=0.9.2,<0.10"
dask = ">=2026.3.0,<2027"
h5netcdf = ">=1.8.1,<2"
matplotlib = ">=3.10.8,<4"

[pypi-dependencies]
dotenv = ">=0.9.9, <0.10"
pdbpp = ">=0.12.1, <0.13"

Metadata

Metadata

Assignees

No one assigned

    Labels

    HDF parser (Non-kerchunk-based HDF parser), bug (Something isn't working)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions