Skip to content
50 changes: 44 additions & 6 deletions virtualizarr/parsers/hdf/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,16 @@
from h5py import Group as H5Group


def _squeeze_indices(shape: tuple) -> list[int]:
"""Return indices of dimensions where shape is greater than 1."""
return [i for i, s in enumerate(shape) if s > 1]


def _construct_manifest_array(
filepath: str,
dataset: H5Dataset,
group: str,
squeeze: bool = False,
) -> ManifestArray:
"""
Construct a ManifestArray from an h5py dataset
Expand All @@ -60,6 +66,15 @@ def _construct_manifest_array(
# chunk dimensions (enforced by zarr-python >= 3.2.0). See
# https://github.com/zarr-developers/zarr-python/issues/3711.
chunks = dataset.chunks or tuple(max(s, 1) for s in dataset.shape)
# When squeeze=True, we keep only dimensions of size > 1.
# So squeeze=True on its own drops any dim that was length 0 (or less) in the original dataset and the clamp is technically redundant.
# But when squeeze=False, the clamp is necessary to prevent having zero-length chunk dimensions.
keep_indices = (
_squeeze_indices(dataset.shape) if squeeze else list(range(len(dataset.shape)))
)
keep_chunks = tuple(chunks[i] for i in keep_indices)
keep_shape = tuple(dataset.shape[i] for i in keep_indices)

codecs = codecs_from_dataset(dataset)
attrs = _extract_attrs(dataset)
dtype = dataset.dtype
Expand All @@ -82,16 +97,17 @@ def _construct_manifest_array(

fill_value = dataset.fillvalue.item()
dims = tuple(_dataset_dims(dataset, group=group))
keep_dims = tuple(dims[i] for i in keep_indices)
metadata = create_v3_array_metadata(
shape=dataset.shape,
shape=keep_shape,
data_type=dtype,
chunk_shape=chunks,
chunk_shape=keep_chunks,
fill_value=fill_value,
codecs=codec_configs,
dimension_names=dims,
dimension_names=keep_dims,
attributes=attrs,
)
manifest = _dataset_chunk_manifest(filepath, dataset)
manifest = _dataset_chunk_manifest(filepath, dataset, squeeze=squeeze)
return ManifestArray(metadata=metadata, chunkmanifest=manifest)


Expand All @@ -101,6 +117,7 @@ def _construct_manifest_group(
*,
group: str | None = None,
drop_variables: Iterable[str] | None = None,
squeeze: bool = False,
) -> ManifestGroup:
"""
Construct a virtual Group from a HDF dataset.
Expand All @@ -120,7 +137,9 @@ def _construct_manifest_group(
drop_variables = set(drop_variables or ()) | set(non_coordinate_dimension_vars)
group_name = str(g.name) # NOTE: this will always include leading "/"
arrays = {
key: _construct_manifest_array(filepath, dataset, group_name)
key: _construct_manifest_array(
filepath, dataset, group_name, squeeze=squeeze
)
for key in g.keys()
if key not in drop_variables
if isinstance(dataset := g[key], h5py.Dataset)
Expand All @@ -130,6 +149,7 @@ def _construct_manifest_group(
filepath,
reader,
group=str(Path(group) / key) if group is not None else key,
squeeze=squeeze,
)
for key in g.keys()
if key not in drop_variables
Expand All @@ -146,6 +166,7 @@ def __init__(
group: str | None = None,
drop_variables: Iterable[str] | None = None,
reader_factory: ReaderFactory = BlockStoreReader,
squeeze: bool = False,
):
"""
Instantiate a parser that can be used to virtualize HDF5/NetCDF4 files using the
Expand All @@ -163,10 +184,13 @@ def __init__(
Must return an object implementing the
[ReadableFile][obspec_utils.protocols.ReadableFile] protocol.
Default is [BlockStoreReader][obspec_utils.readers.BlockStoreReader].
squeeze
If `True`, remove dimensions of size 1 from arrays (default: `False`).
"""
self.group = group
self.drop_variables = drop_variables
self.reader_factory = reader_factory
self.squeeze = squeeze

def __call__(
self,
Expand Down Expand Up @@ -196,6 +220,7 @@ def __call__(
reader=reader,
group=self.group,
drop_variables=self.drop_variables,
squeeze=self.squeeze,
)
# Convert to a manifest store
return ManifestStore(registry=registry, group=manifest_group)
Expand All @@ -204,6 +229,7 @@ def __call__(
def _dataset_chunk_manifest(
filepath: str,
dataset: H5Dataset,
squeeze: bool = False,
) -> ChunkManifest:
"""
Generate ChunkManifest for HDF5 dataset.
Expand All @@ -221,6 +247,9 @@ def _dataset_chunk_manifest(
A Virtualizarr ChunkManifest
"""
dsid = dataset.id
keep_indices = (
_squeeze_indices(dataset.shape) if squeeze else list(range(len(dataset.shape)))
)
if dataset.chunks is None:
if dsid.get_offset() is None:
chunk_manifest = ChunkManifest(entries={}, shape=dataset.shape)
Expand All @@ -231,7 +260,7 @@ def _dataset_chunk_manifest(
lengths=np.array(dsid.get_storage_size(), dtype=np.uint64),
)
else:
key_list = [0] * (len(dataset.shape) or 1)
key_list = [0] * (len(keep_indices) or 1)
key = ".".join(map(str, key_list))

chunk_entry: ChunkEntry = ChunkEntry.with_validation( # type: ignore[attr-defined]
Expand Down Expand Up @@ -268,6 +297,15 @@ def add_chunk_info(blob):
for index in range(num_chunks):
add_chunk_info(dsid.get_chunk_info(index))

# we squeeze here rather than in get_key
squeeze_axes = tuple(
i for i in range(len(dataset.chunks)) if i not in set(keep_indices)
)
if squeeze_axes:
paths = np.squeeze(paths, axis=squeeze_axes)
offsets = np.squeeze(offsets, axis=squeeze_axes)
lengths = np.squeeze(lengths, axis=squeeze_axes)

chunk_manifest = ChunkManifest.from_arrays(
paths=paths, # type: ignore
offsets=offsets,
Expand Down
41 changes: 41 additions & 0 deletions virtualizarr/tests/test_parsers/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,47 @@ def big_endian_dtype_hdf5_file(tmpdir):
return filepath


@pytest.fixture(
params=[
{"N": 50, "M": 100, "chunked": True, "chunks": (5, 25)},
{"N": 50, "M": 100, "chunked": False, "chunks": None},
{"N": 1, "M": 100, "chunked": True, "chunks": (1, 25)},
],
ids=["chunked", "not_chunked", "singleton_chunked"],
)
def singleton_padded_dimension_hdf5_file(tmp_path: Path, request) -> tuple:
"""HDF5 file mimicking MATLAB layout: a 2D array (N, M) plus coordinate
arrays shaped (N, 1) and (1, M)."""
N = request.param["N"]
M = request.param["M"]
chunked = request.param["chunked"]
chunks = request.param["chunks"]
filepath = str(tmp_path / "singleton_dimension_layout.nc")

dataset_args = {
"data": {
"name": "data",
"data": np.random.random((N, M)),
},
"row_coord": {
"name": "row_coord",
"data": np.random.random((N, 1)),
},
"col_coord": {
"name": "col_coord",
"data": np.random.random((1, M)),
},
}
if chunks is not None:
dataset_args["data"]["chunks"] = chunks

with h5py.File(filepath, "w") as f:
for v in dataset_args.values():
f.create_dataset(**v)

return f"file://{filepath}", N, M, chunked, chunks


@pytest.fixture()
def dmrpp_xml_simple():
"""Return a minimal valid DMR++ XML string for testing."""
Expand Down
26 changes: 26 additions & 0 deletions virtualizarr/tests/test_parsers/test_hdf/test_hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,32 @@ def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_url):
manifest_store = manifest_store_from_hdf_url(chunked_roundtrip_hdf5_url)
assert manifest_store._group.arrays["var2"].manifest.shape_chunk_grid == (2, 8)

def test_singleton_dimensions_squeezed(self, singleton_padded_dimension_hdf5_file):
url, N, M, chunked, chunks = singleton_padded_dimension_hdf5_file
manifest_store = manifest_store_from_hdf_url(url, squeeze=True)
expected_data_shape = (M,) if N == 1 else (N, M)
assert manifest_store._group.arrays["data"].shape == expected_data_shape
if chunked:
expected_chunks = tuple(c for c, n in zip(chunks, (N, M)) if n != 1)
else:
expected_chunks = expected_data_shape
assert manifest_store._group.arrays["data"].chunks == expected_chunks
assert manifest_store._group.arrays["row_coord"].shape == (
() if N == 1 else (N,)
)
assert manifest_store._group.arrays["col_coord"].shape == (M,)

def test_singleton_dimensions_not_squeezed(
self, singleton_padded_dimension_hdf5_file
):
url, N, M, chunked, chunks = singleton_padded_dimension_hdf5_file
manifest_store = manifest_store_from_hdf_url(url)
assert manifest_store._group.arrays["data"].shape == (N, M)
expected_chunks = chunks if chunked else (N, M)
assert manifest_store._group.arrays["data"].chunks == expected_chunks
assert manifest_store._group.arrays["row_coord"].shape == (N, 1)
assert manifest_store._group.arrays["col_coord"].shape == (1, M)


@requires_hdf5plugin
@requires_imagecodecs
Expand Down
26 changes: 26 additions & 0 deletions virtualizarr/tests/test_parsers/test_hdf/test_hdf_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,32 @@ def test_non_coord_dim_roundtrip(self, tmp_path, non_coord_dim, local_registry):
with xr.open_dataset(kerchunk_file, engine="kerchunk") as roundtrip:
xrt.assert_equal(ds, roundtrip)

def test_singleton_dim_roundtrip(
self, tmp_path, singleton_padded_dimension_hdf5_file, local_registry
):
import numpy as np

parser = HDFParser(squeeze=True)
filepath, *_ = singleton_padded_dimension_hdf5_file
with (
xr.open_dataset(
filepath, engine="h5netcdf", backend_kwargs={"phony_dims": "sort"}
).squeeze(drop=True) as ds,
open_virtual_dataset(
url=filepath,
registry=local_registry,
parser=parser,
) as vds,
):
kerchunk_file = str(tmp_path / "kerchunk.json")
vds.vz.to_kerchunk(kerchunk_file, format="json")
with xr.open_dataset(kerchunk_file, engine="kerchunk") as roundtrip:
for var_name in ds.data_vars:
assert ds[var_name].shape == roundtrip[var_name].shape
np.testing.assert_array_equal(
ds[var_name].values, roundtrip[var_name].values
)

@requires_icechunk
def test_cf_fill_value_roundtrip(
self, tmp_path, cf_fill_value_hdf5_file, local_registry
Expand Down
4 changes: 2 additions & 2 deletions virtualizarr/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ def obstore_http(url: str) -> ObjectStore:
return store


def manifest_store_from_hdf_url(url, group: str | None = None):
def manifest_store_from_hdf_url(url, group: str | None = None, squeeze: bool = False):
registry: ObjectStoreRegistry = ObjectStoreRegistry()
registry.register(url, obstore_local(url=url))
parser = HDFParser(group=group)
parser = HDFParser(group=group, squeeze=squeeze)
return parser(url=url, registry=registry)
Loading