From 632c7f040a4e10c8312e65762c08f3e535c91743 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Fri, 17 Apr 2026 09:28:38 -0700 Subject: [PATCH 1/8] Add failing test --- virtualizarr/tests/test_parsers/conftest.py | 15 +++++++++++++++ .../tests/test_parsers/test_hdf/test_hdf.py | 12 ++++++++++++ 2 files changed, 27 insertions(+) diff --git a/virtualizarr/tests/test_parsers/conftest.py b/virtualizarr/tests/test_parsers/conftest.py index 23d30a4f..ee5d1490 100644 --- a/virtualizarr/tests/test_parsers/conftest.py +++ b/virtualizarr/tests/test_parsers/conftest.py @@ -479,6 +479,21 @@ def big_endian_dtype_hdf5_file(tmpdir): return filepath +@pytest.fixture +def singleton_padded_dimension_hdf5_file(tmp_path: Path) -> str: + """HDF5 file mimicking MATLAB layout: a 2D array (N, M) plus coordinate + arrays shaped (N, 1) and (1, M).""" + N, M = 10, 5 + filepath = str(tmp_path / "singleton_dimension_layout.nc") + + with h5py.File(filepath, "w") as f: + f.create_dataset(name="data", data=np.random.random((N, M))) + f.create_dataset(name="row_coord", data=np.random.random((N, 1))) + f.create_dataset(name="col_coord", data=np.random.random((1, M))) + + return f"file://{filepath}" + + @pytest.fixture() def dmrpp_xml_simple(): """Return a minimal valid DMR++ XML string for testing.""" diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py index 9854174f..4a10b101 100644 --- a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py @@ -41,6 +41,18 @@ def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_url): manifest_store = manifest_store_from_hdf_url(chunked_roundtrip_hdf5_url) assert manifest_store._group.arrays["var2"].manifest.shape_chunk_grid == (2, 8) + def test_singleton_dimensions_squeezed(self, singleton_padded_dimension_hdf5_file): + manifest_store = manifest_store_from_hdf_url( + singleton_padded_dimension_hdf5_file + ) + assert 
manifest_store._group.arrays["data"].manifest.shape_chunk_grid == (10, 5) + assert manifest_store._group.arrays["row_coord"].manifest.shape_chunk_grid == ( + 10, + ) + assert manifest_store._group.arrays["col_coord"].manifest.shape_chunk_grid == ( + 5, + ) + @requires_hdf5plugin @requires_imagecodecs From 696c05107aa08622630581891f2cc7369c1d3605 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Fri, 17 Apr 2026 11:14:55 -0700 Subject: [PATCH 2/8] Test should test shape not chunk grid --- virtualizarr/tests/test_parsers/test_hdf/test_hdf.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py index 4a10b101..2543c8bf 100644 --- a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py @@ -45,13 +45,9 @@ def test_singleton_dimensions_squeezed(self, singleton_padded_dimension_hdf5_fil manifest_store = manifest_store_from_hdf_url( singleton_padded_dimension_hdf5_file ) - assert manifest_store._group.arrays["data"].manifest.shape_chunk_grid == (10, 5) - assert manifest_store._group.arrays["row_coord"].manifest.shape_chunk_grid == ( - 10, - ) - assert manifest_store._group.arrays["col_coord"].manifest.shape_chunk_grid == ( - 5, - ) + assert manifest_store._group.arrays["data"].shape == (10, 5) + assert manifest_store._group.arrays["row_coord"].shape == (10,) + assert manifest_store._group.arrays["col_coord"].shape == (5,) @requires_hdf5plugin From 03e3a1ca126d364e89ddaf8f7aac4a0ce5358a25 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Fri, 17 Apr 2026 11:15:54 -0700 Subject: [PATCH 3/8] Add function for squeeze indices and use it to squeeze path, offset, length --- virtualizarr/parsers/hdf/hdf.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py index 
a180a502..1b371603 100644 --- a/virtualizarr/parsers/hdf/hdf.py +++ b/virtualizarr/parsers/hdf/hdf.py @@ -35,6 +35,11 @@ from h5py import Group as H5Group +def _squeeze_indices(chunks: tuple) -> list[int]: + """Return indices of dimensions where chunk size is not 1.""" + return [i for i, s in enumerate(chunks) if s != 1] + + def _construct_manifest_array( filepath: str, dataset: H5Dataset, @@ -57,6 +62,9 @@ def _construct_manifest_array( ManifestArray """ chunks = dataset.chunks or dataset.shape + keep_indices = _squeeze_indices(chunks) + keep_chunks = tuple(chunks[i] for i in keep_indices) + keep_shape = tuple(dataset.shape[i] for i in keep_indices) codecs = codecs_from_dataset(dataset) attrs = _extract_attrs(dataset) dtype = dataset.dtype @@ -79,13 +87,14 @@ def _construct_manifest_array( fill_value = dataset.fillvalue.item() dims = tuple(_dataset_dims(dataset, group=group)) + keep_dims = tuple(dims[i] for i in keep_indices) metadata = create_v3_array_metadata( - shape=dataset.shape, + shape=keep_shape, data_type=dtype, - chunk_shape=chunks, + chunk_shape=keep_chunks, fill_value=fill_value, codecs=codec_configs, - dimension_names=dims, + dimension_names=keep_dims, attributes=attrs, ) manifest = _dataset_chunk_manifest(filepath, dataset) @@ -218,6 +227,7 @@ def _dataset_chunk_manifest( A Virtualizarr ChunkManifest """ dsid = dataset.id + keep_indices = _squeeze_indices(dataset.chunks or dataset.shape) if dataset.chunks is None: if dsid.get_offset() is None: chunk_manifest = ChunkManifest(entries={}, shape=dataset.shape) @@ -228,7 +238,7 @@ def _dataset_chunk_manifest( lengths=np.array(dsid.get_storage_size(), dtype=np.uint64), ) else: - key_list = [0] * (len(dataset.shape) or 1) + key_list = [0] * (len(keep_indices) or 1) key = ".".join(map(str, key_list)) chunk_entry: ChunkEntry = ChunkEntry.with_validation( # type: ignore[attr-defined] @@ -265,6 +275,14 @@ def add_chunk_info(blob): for index in range(num_chunks): add_chunk_info(dsid.get_chunk_info(index)) + 
squeeze_axes = tuple( + i for i in range(len(dataset.chunks)) if i not in set(keep_indices) + ) + if squeeze_axes: + paths = np.squeeze(paths, axis=squeeze_axes) + offsets = np.squeeze(offsets, axis=squeeze_axes) + lengths = np.squeeze(lengths, axis=squeeze_axes) + chunk_manifest = ChunkManifest.from_arrays( paths=paths, # type: ignore offsets=offsets, From 553ab4546caa091ceb6091cb17216e0f3f29b1bf Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Fri, 17 Apr 2026 11:37:46 -0700 Subject: [PATCH 4/8] Add squeeze boolean parameter to HDFParser.__init__ --- virtualizarr/parsers/hdf/hdf.py | 24 +++++++++++++++---- .../tests/test_parsers/test_hdf/test_hdf.py | 12 +++++++++- virtualizarr/tests/utils.py | 4 ++-- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py index 1b371603..80b8bd69 100644 --- a/virtualizarr/parsers/hdf/hdf.py +++ b/virtualizarr/parsers/hdf/hdf.py @@ -44,6 +44,7 @@ def _construct_manifest_array( filepath: str, dataset: H5Dataset, group: str, + squeeze: bool = False, ) -> ManifestArray: """ Construct a ManifestArray from an h5py dataset @@ -62,7 +63,7 @@ def _construct_manifest_array( ManifestArray """ chunks = dataset.chunks or dataset.shape - keep_indices = _squeeze_indices(chunks) + keep_indices = _squeeze_indices(chunks) if squeeze else list(range(len(chunks))) keep_chunks = tuple(chunks[i] for i in keep_indices) keep_shape = tuple(dataset.shape[i] for i in keep_indices) codecs = codecs_from_dataset(dataset) @@ -97,7 +98,7 @@ def _construct_manifest_array( dimension_names=keep_dims, attributes=attrs, ) - manifest = _dataset_chunk_manifest(filepath, dataset) + manifest = _dataset_chunk_manifest(filepath, dataset, squeeze=squeeze) return ManifestArray(metadata=metadata, chunkmanifest=manifest) @@ -107,6 +108,7 @@ def _construct_manifest_group( *, group: str | None = None, drop_variables: Iterable[str] | None = None, + squeeze: bool = False, ) -> ManifestGroup: """ 
Construct a virtual Group from a HDF dataset. @@ -126,7 +128,9 @@ def _construct_manifest_group( drop_variables = set(drop_variables or ()) | set(non_coordinate_dimension_vars) group_name = str(g.name) # NOTE: this will always include leading "/" arrays = { - key: _construct_manifest_array(filepath, dataset, group_name) + key: _construct_manifest_array( + filepath, dataset, group_name, squeeze=squeeze + ) for key in g.keys() if key not in drop_variables if isinstance(dataset := g[key], h5py.Dataset) @@ -136,6 +140,7 @@ def _construct_manifest_group( filepath, reader, group=str(Path(group) / key) if group is not None else key, + squeeze=squeeze, ) for key in g.keys() if key not in drop_variables @@ -152,6 +157,7 @@ def __init__( group: str | None = None, drop_variables: Iterable[str] | None = None, reader_factory: ReaderFactory = BlockStoreReader, + squeeze: bool = False, ): """ Instantiate a parser that can be used to virtualize HDF5/NetCDF4 files using the @@ -169,10 +175,13 @@ def __init__( Must return an object implementing the [ReadableFile][obspec_utils.protocols.ReadableFile] protocol. Default is [BlockStoreReader][obspec_utils.readers.BlockStoreReader]. + squeeze + If `True`, remove dimensions of size 1 from arrays (default: `False`). """ self.group = group self.drop_variables = drop_variables self.reader_factory = reader_factory + self.squeeze = squeeze def __call__( self, @@ -202,6 +211,7 @@ def __call__( reader=reader, group=self.group, drop_variables=self.drop_variables, + squeeze=self.squeeze, ) # Convert to a manifest store return ManifestStore(registry=registry, group=manifest_group) @@ -210,6 +220,7 @@ def __call__( def _dataset_chunk_manifest( filepath: str, dataset: H5Dataset, + squeeze: bool = False, ) -> ChunkManifest: """ Generate ChunkManifest for HDF5 dataset. 
@@ -227,7 +238,12 @@ def _dataset_chunk_manifest( A Virtualizarr ChunkManifest """ dsid = dataset.id - keep_indices = _squeeze_indices(dataset.chunks or dataset.shape) + chunks_or_shape = dataset.chunks or dataset.shape + keep_indices = ( + _squeeze_indices(chunks_or_shape) + if squeeze + else list(range(len(chunks_or_shape))) + ) if dataset.chunks is None: if dsid.get_offset() is None: chunk_manifest = ChunkManifest(entries={}, shape=dataset.shape) diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py index 2543c8bf..ea289715 100644 --- a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py @@ -43,12 +43,22 @@ def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_url): def test_singleton_dimensions_squeezed(self, singleton_padded_dimension_hdf5_file): manifest_store = manifest_store_from_hdf_url( - singleton_padded_dimension_hdf5_file + singleton_padded_dimension_hdf5_file, squeeze=True ) assert manifest_store._group.arrays["data"].shape == (10, 5) assert manifest_store._group.arrays["row_coord"].shape == (10,) assert manifest_store._group.arrays["col_coord"].shape == (5,) + def test_singleton_dimensions_not_squeezed( + self, singleton_padded_dimension_hdf5_file + ): + manifest_store = manifest_store_from_hdf_url( + singleton_padded_dimension_hdf5_file, + ) + assert manifest_store._group.arrays["data"].shape == (10, 5) + assert manifest_store._group.arrays["row_coord"].shape == (10, 1) + assert manifest_store._group.arrays["col_coord"].shape == (1, 5) + @requires_hdf5plugin @requires_imagecodecs diff --git a/virtualizarr/tests/utils.py b/virtualizarr/tests/utils.py index 4ce2fece..7d6c1814 100644 --- a/virtualizarr/tests/utils.py +++ b/virtualizarr/tests/utils.py @@ -40,8 +40,8 @@ def obstore_http(url: str) -> ObjectStore: return store -def manifest_store_from_hdf_url(url, group: str | None = None): +def manifest_store_from_hdf_url(url, 
group: str | None = None, squeeze: bool = False): registry: ObjectStoreRegistry = ObjectStoreRegistry() registry.register(url, obstore_local(url=url)) - parser = HDFParser(group=group) + parser = HDFParser(group=group, squeeze=squeeze) return parser(url=url, registry=registry) From 01a000113b9a4c8001030d2886c5d74eacf191ef Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Sun, 19 Apr 2026 10:01:40 -0700 Subject: [PATCH 5/8] Add test case for when chunks include a singleton axis --- virtualizarr/parsers/hdf/hdf.py | 2 +- virtualizarr/tests/test_parsers/conftest.py | 41 +++++++++++++++---- .../tests/test_parsers/test_hdf/test_hdf.py | 30 +++++++++----- 3 files changed, 54 insertions(+), 19 deletions(-) diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py index 80b8bd69..4c5f939c 100644 --- a/virtualizarr/parsers/hdf/hdf.py +++ b/virtualizarr/parsers/hdf/hdf.py @@ -290,7 +290,7 @@ def add_chunk_info(blob): else: for index in range(num_chunks): add_chunk_info(dsid.get_chunk_info(index)) - + # we squeeze here rather than in get_key squeeze_axes = tuple( i for i in range(len(dataset.chunks)) if i not in set(keep_indices) ) diff --git a/virtualizarr/tests/test_parsers/conftest.py b/virtualizarr/tests/test_parsers/conftest.py index ee5d1490..d7082235 100644 --- a/virtualizarr/tests/test_parsers/conftest.py +++ b/virtualizarr/tests/test_parsers/conftest.py @@ -479,19 +479,46 @@ def big_endian_dtype_hdf5_file(tmpdir): return filepath -@pytest.fixture -def singleton_padded_dimension_hdf5_file(tmp_path: Path) -> str: +@pytest.fixture( + params=[ + # {"N": 50, "M": 100, "chunked": True, "chunks": (5, 25)}, + # {"N": 50, "M": 100, "chunked": False, "chunks": None}, + {"N": 1, "M": 100, "chunked": True, "chunks": (1, 25)}, + ], + # ids=["chunked", "not_chunked", "singleton_chunked"], + ids=["singleton_chunked"], +) +def singleton_padded_dimension_hdf5_file(tmp_path: Path, request) -> tuple: """HDF5 file mimicking MATLAB layout: a 2D array (N, M) plus 
coordinate arrays shaped (N, 1) and (1, M).""" - N, M = 10, 5 + N = request.param["N"] + M = request.param["M"] + chunked = request.param["chunked"] + chunks = request.param["chunks"] filepath = str(tmp_path / "singleton_dimension_layout.nc") + dataset_args = { + "data": { + "name": "data", + "data": np.random.random((N, M)), + }, + "row_coord": { + "name": "row_coord", + "data": np.random.random((N, 1)), + }, + "col_coord": { + "name": "col_coord", + "data": np.random.random((1, M)), + }, + } + if chunks is not None: + dataset_args["data"]["chunks"] = chunks + with h5py.File(filepath, "w") as f: - f.create_dataset(name="data", data=np.random.random((N, M))) - f.create_dataset(name="row_coord", data=np.random.random((N, 1))) - f.create_dataset(name="col_coord", data=np.random.random((1, M))) + for v in dataset_args.values(): + f.create_dataset(**v) - return f"file://{filepath}" + return f"file://{filepath}", N, M, chunked, chunks @pytest.fixture() diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py index ea289715..7bd270fe 100644 --- a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py @@ -42,22 +42,30 @@ def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_url): assert manifest_store._group.arrays["var2"].manifest.shape_chunk_grid == (2, 8) def test_singleton_dimensions_squeezed(self, singleton_padded_dimension_hdf5_file): - manifest_store = manifest_store_from_hdf_url( - singleton_padded_dimension_hdf5_file, squeeze=True + url, N, M, chunked, chunks = singleton_padded_dimension_hdf5_file + manifest_store = manifest_store_from_hdf_url(url, squeeze=True) + expected_data_shape = (M,) if N == 1 else (N, M) + assert manifest_store._group.arrays["data"].shape == expected_data_shape + if chunked: + expected_chunks = tuple(c for c, n in zip(chunks, (N, M)) if n != 1) + else: + expected_chunks = expected_data_shape + assert 
manifest_store._group.arrays["data"].chunks == expected_chunks + assert manifest_store._group.arrays["row_coord"].shape == ( + () if N == 1 else (N,) ) - assert manifest_store._group.arrays["data"].shape == (10, 5) - assert manifest_store._group.arrays["row_coord"].shape == (10,) - assert manifest_store._group.arrays["col_coord"].shape == (5,) + assert manifest_store._group.arrays["col_coord"].shape == (M,) def test_singleton_dimensions_not_squeezed( self, singleton_padded_dimension_hdf5_file ): - manifest_store = manifest_store_from_hdf_url( - singleton_padded_dimension_hdf5_file, - ) - assert manifest_store._group.arrays["data"].shape == (10, 5) - assert manifest_store._group.arrays["row_coord"].shape == (10, 1) - assert manifest_store._group.arrays["col_coord"].shape == (1, 5) + url, N, M, chunked, chunks = singleton_padded_dimension_hdf5_file + manifest_store = manifest_store_from_hdf_url(url) + assert manifest_store._group.arrays["data"].shape == (N, M) + expected_chunks = chunks if chunked else (N, M) + assert manifest_store._group.arrays["data"].chunks == expected_chunks + assert manifest_store._group.arrays["row_coord"].shape == (N, 1) + assert manifest_store._group.arrays["col_coord"].shape == (1, M) @requires_hdf5plugin From a7589205ade1628c7e59e64261ab4c02e1c4644d Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Sun, 19 Apr 2026 16:04:13 -0700 Subject: [PATCH 6/8] Add integration test --- virtualizarr/parsers/hdf/hdf.py | 1 + virtualizarr/tests/test_parsers/conftest.py | 7 +++-- .../test_hdf/test_hdf_integration.py | 26 +++++++++++++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py index 4c5f939c..61bd4fff 100644 --- a/virtualizarr/parsers/hdf/hdf.py +++ b/virtualizarr/parsers/hdf/hdf.py @@ -290,6 +290,7 @@ def add_chunk_info(blob): else: for index in range(num_chunks): add_chunk_info(dsid.get_chunk_info(index)) + # we squeeze here rather than in get_key 
squeeze_axes = tuple( i for i in range(len(dataset.chunks)) if i not in set(keep_indices) diff --git a/virtualizarr/tests/test_parsers/conftest.py b/virtualizarr/tests/test_parsers/conftest.py index d7082235..78fb1aff 100644 --- a/virtualizarr/tests/test_parsers/conftest.py +++ b/virtualizarr/tests/test_parsers/conftest.py @@ -481,12 +481,11 @@ def big_endian_dtype_hdf5_file(tmpdir): @pytest.fixture( params=[ - # {"N": 50, "M": 100, "chunked": True, "chunks": (5, 25)}, - # {"N": 50, "M": 100, "chunked": False, "chunks": None}, + {"N": 50, "M": 100, "chunked": True, "chunks": (5, 25)}, + {"N": 50, "M": 100, "chunked": False, "chunks": None}, {"N": 1, "M": 100, "chunked": True, "chunks": (1, 25)}, ], - # ids=["chunked", "not_chunked", "singleton_chunked"], - ids=["singleton_chunked"], + ids=["chunked", "not_chunked", "singleton_chunked"], ) def singleton_padded_dimension_hdf5_file(tmp_path: Path, request) -> tuple: """HDF5 file mimicking MATLAB layout: a 2D array (N, M) plus coordinate diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf_integration.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf_integration.py index e779317c..b195549b 100644 --- a/virtualizarr/tests/test_parsers/test_hdf/test_hdf_integration.py +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf_integration.py @@ -96,6 +96,32 @@ def test_non_coord_dim_roundtrip(self, tmp_path, non_coord_dim, local_registry): with xr.open_dataset(kerchunk_file, engine="kerchunk") as roundtrip: xrt.assert_equal(ds, roundtrip) + def test_singleton_dim_roundtrip( + self, tmp_path, singleton_padded_dimension_hdf5_file, local_registry + ): + import numpy as np + + parser = HDFParser(squeeze=True) + filepath, *_ = singleton_padded_dimension_hdf5_file + with ( + xr.open_dataset( + filepath, engine="h5netcdf", backend_kwargs={"phony_dims": "sort"} + ).squeeze(drop=True) as ds, + open_virtual_dataset( + url=filepath, + registry=local_registry, + parser=parser, + ) as vds, + ): + kerchunk_file = 
str(tmp_path / "kerchunk.json") + vds.vz.to_kerchunk(kerchunk_file, format="json") + with xr.open_dataset(kerchunk_file, engine="kerchunk") as roundtrip: + for var_name in ds.data_vars: + assert ds[var_name].shape == roundtrip[var_name].shape + np.testing.assert_array_equal( + ds[var_name].values, roundtrip[var_name].values + ) + @requires_icechunk def test_cf_fill_value_roundtrip( self, tmp_path, cf_fill_value_hdf5_file, local_registry From 51dd9111d46dd2b6811d195a8e1b3214bc80f76e Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Fri, 24 Apr 2026 14:11:23 -0700 Subject: [PATCH 7/8] Modify comment --- virtualizarr/parsers/hdf/hdf.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py index a0e6ecb8..3e75c72d 100644 --- a/virtualizarr/parsers/hdf/hdf.py +++ b/virtualizarr/parsers/hdf/hdf.py @@ -68,10 +68,8 @@ def _construct_manifest_array( # this uses dataset.chunks if present or dataset.shape if chunks are not present # if using dataset.shape, it enforces that each dimension is at least size one chunks = dataset.chunks or tuple(max(s, 1) for s in dataset.shape) - # this says we only want to keep indices of dimensions where the size is greater than 1, - # which only somewhat overrides the logic above since we will remove any dimensions that are of size 1 - # taken together it means we remove any dimension that is of size 1 or less - # and any chunk dimension that is of size 1 or less + # keep only dimensions where chunk size != 1; after clamping above, no dim can be < 1, + # so this effectively drops any dim that was zero-length in the original dataset. 
keep_indices = _squeeze_indices(chunks) if squeeze else list(range(len(chunks))) keep_chunks = tuple(chunks[i] for i in keep_indices) keep_shape = tuple(dataset.shape[i] for i in keep_indices) From 6dc9337d38ea4456740c951ce493eec1bddd536e Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Fri, 24 Apr 2026 14:40:00 -0700 Subject: [PATCH 8/8] Modify comments --- virtualizarr/parsers/hdf/hdf.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py index 3e75c72d..874bd153 100644 --- a/virtualizarr/parsers/hdf/hdf.py +++ b/virtualizarr/parsers/hdf/hdf.py @@ -35,9 +35,9 @@ from h5py import Group as H5Group -def _squeeze_indices(chunks: tuple) -> list[int]: - """Return indices of dimensions where chunk size is not 1.""" - return [i for i, s in enumerate(chunks) if s != 1] +def _squeeze_indices(shape: tuple) -> list[int]: + """Return indices of dimensions where shape is greater than 1.""" + return [i for i, s in enumerate(shape) if s > 1] def _construct_manifest_array( @@ -65,12 +65,13 @@ def _construct_manifest_array( # Clamp each dim to >= 1: zarr v3 allows shape=(0,) but forbids zero-length # chunk dimensions (enforced by zarr-python >= 3.2.0). See # https://github.com/zarr-developers/zarr-python/issues/3711. - # this uses dataset.chunks if present or dataset.shape if chunks are not present - # if using dataset.shape, it enforces that each dimension is at least size one chunks = dataset.chunks or tuple(max(s, 1) for s in dataset.shape) - # keep only dimensions where chunk size != 1; after clamping above, no dim can be < 1, - # so this effectively drops any dim that was zero-length in the original dataset. - keep_indices = _squeeze_indices(chunks) if squeeze else list(range(len(chunks))) + # When squeeze=True, we keep only dimensions of size > 1. 
+ # So squeeze=True on its own drops any dim that was length 0 or 1 in the original dataset, and the clamp is technically redundant in that case. + # But when squeeze=False, the clamp is necessary to prevent having zero-length chunk dimensions. + keep_indices = ( + _squeeze_indices(dataset.shape) if squeeze else list(range(len(dataset.shape))) + ) keep_chunks = tuple(chunks[i] for i in keep_indices) keep_shape = tuple(dataset.shape[i] for i in keep_indices) @@ -246,11 +247,8 @@ def _dataset_chunk_manifest( A Virtualizarr ChunkManifest """ dsid = dataset.id - chunks_or_shape = dataset.chunks or dataset.shape keep_indices = ( - _squeeze_indices(chunks_or_shape) - if squeeze - else list(range(len(chunks_or_shape))) + _squeeze_indices(dataset.shape) if squeeze else list(range(len(dataset.shape))) ) if dataset.chunks is None: if dsid.get_offset() is None: