diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4ef6e7d86e2..3b8be98b293 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,6 +21,13 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ +- When writing to NetCDF files with groups, Xarray no longer redefines dimensions + that have the same size in parent groups (:issue:`10241`). This conforms with + `CF Conventions for group scrope `_ + but may require adjustments for code that consumes NetCDF files produced by + Xarray. + By `Stephan Hoyer `_. + Deprecations ~~~~~~~~~~~~ diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 542ca4c897b..b94b11120ae 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -256,6 +256,20 @@ def find_root_and_group(ds): return ds, group +def collect_ancestor_dimensions(group) -> dict[str, int]: + """Returns dimensions defined in parent groups. + + If dimensions are defined in multiple ancestors, use the size of the closest + ancestor. + """ + dims = {} + while (group := group.parent) is not None: + for k, v in group.dimensions.items(): + if k not in dims: + dims[k] = len(v) + return dims + + def datatree_from_dict_with_io_cleanup(groups_dict: Mapping[str, Dataset]) -> DataTree: """DataTree.from_dict with file clean-up.""" try: @@ -308,6 +322,9 @@ class AbstractDataStore: def get_dimensions(self): # pragma: no cover raise NotImplementedError() + def get_parent_dimensions(self): # pragma: no cover + return {} + def get_attrs(self): # pragma: no cover raise NotImplementedError() @@ -563,13 +580,14 @@ def set_dimensions(self, variables, unlimited_dims=None): if unlimited_dims is None: unlimited_dims = set() + parent_dims = self.get_parent_dimensions() existing_dims = self.get_dimensions() dims = {} for v in unlimited_dims: # put unlimited_dims first dims[v] = None for v in variables.values(): - dims.update(dict(zip(v.dims, v.shape, strict=True))) + dims |= v.sizes for dim, length in dims.items(): if dim in existing_dims and length != existing_dims[dim]: @@ -577,7 +595,7 @@ def set_dimensions(self, variables, unlimited_dims=None): "Unable to update size for existing dimension" f"{dim!r} ({length} != {existing_dims[dim]})" ) - elif dim not in existing_dims: + elif dim not in existing_dims and length != parent_dims.get(dim): is_unlimited = dim in unlimited_dims self.set_dimension(dim, length, is_unlimited) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 24a3324bf62..2c8e3c28f83 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -16,6 +16,7 @@ WritableCFDataStore, _normalize_path, _open_remote_file, + collect_ancestor_dimensions, datatree_from_dict_with_io_cleanup, find_root_and_group, ) @@ -287,6 +288,9 @@ def get_attrs(self): def get_dimensions(self): return FrozenDict((k, len(v)) for k, v in self.ds.dimensions.items()) + def get_parent_dimensions(self): + return FrozenDict(collect_ancestor_dimensions(self.ds)) + def get_encoding(self): return { "unlimited_dims": { diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index ab1841461f4..ecd652f50f4 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -16,6 +16,7 @@ T_PathFileOrDataStore, WritableCFDataStore, _normalize_path, + collect_ancestor_dimensions, datatree_from_dict_with_io_cleanup, find_root_and_group, robust_getitem, @@ -518,6 +519,9 @@ def get_attrs(self): def get_dimensions(self): return FrozenDict((k, len(v)) for k, v in self.ds.dimensions.items()) + def get_parent_dimensions(self): + return FrozenDict(collect_ancestor_dimensions(self.ds)) + def get_encoding(self): return { "unlimited_dims": { diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ab90723d1aa..eb98df5229f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1651,6 +1651,17 @@ def test_write_groups(self) -> None: with self.open(tmp_file, group="data/2") as actual2: assert_identical(data2, actual2) + def test_child_group_with_inconsistent_dimensions(self) -> None: + base = Dataset(coords={"x": [1, 2]}) + child = Dataset(coords={"x": [1, 2, 3]}) + with create_tmp_file() as tmp_file: + self.save(base, tmp_file) + self.save(child, tmp_file, group="child", mode="a") + with self.open(tmp_file) as actual_base: + assert_identical(base, actual_base) + with self.open(tmp_file, group="child") as actual_child: + assert_identical(child, actual_child) + @pytest.mark.parametrize( "input_strings, is_bytes", [ diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index ec57993c4b2..2bdae6def27 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -265,6 +265,23 @@ def test_write_subgroup(self, tmpdir): assert_equal(original_dt, roundtrip_dt) assert_identical(expected_dt, roundtrip_dt) + @requires_netCDF4 + def test_no_redundant_dimensions(self, tmpdir): + # regression test for https://github.com/pydata/xarray/issues/10241 + original_dt = DataTree.from_dict( + { + "/": xr.Dataset(coords={"x": [1, 2, 3]}), + "/child": xr.Dataset({"foo": ("x", [4, 5, 6])}), + } + ) + filepath = tmpdir / "test.zarr" + original_dt.to_netcdf(filepath, engine=self.engine) + + root = nc4.Dataset(str(filepath)) + child = root.groups["child"] + assert list(root.dimensions) == ["x"] + assert list(child.dimensions) == [] + @requires_netCDF4 class TestNetCDF4DatatreeIO(DatatreeIOBase):