From f50be15806736b84e18d0558c0da4531fafc2b5c Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 11 Aug 2025 11:02:48 -0700 Subject: [PATCH 1/4] Avoid refining parent dimensions in NetCDF files This changes the way DataTree objects are written to disk in a single call to `DataTree.to_netcdf()`, as well as netCDF groups written by passing an explicit `group` to `Dataset.to_netcdf()`. Conceivably we could only adjust the behavior for `DataTree`, but doing so for `Dataset` was well felt more consistent to me (and was also easier to implement). Fixes: GH10241 --- doc/whats-new.rst | 7 +++++++ xarray/backends/common.py | 8 ++++++-- xarray/backends/h5netcdf_.py | 10 ++++++++++ xarray/backends/netCDF4_.py | 10 ++++++++++ xarray/tests/test_backends.py | 11 +++++++++++ xarray/tests/test_backends_datatree.py | 17 +++++++++++++++++ 6 files changed, 61 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 78ef2875b31..a2a3b7d24ba 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -19,6 +19,13 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ +- When writing to NetCDF files with groups, Xarray no longer redefines dimensions + that have the same size in parent groups (:issue:`10241`). This conforms with + `CF Conventions `_ + but may require adjustments for code that consumes NetCDF files produced by + Xarray. + By `Stephan Hoyer `_. + Deprecations ~~~~~~~~~~~~ diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 542ca4c897b..d4daea12489 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -308,6 +308,9 @@ class AbstractDataStore: def get_dimensions(self): # pragma: no cover raise NotImplementedError() + def get_parent_dimensions(self): # pragma: no cover + raise {} + def get_attrs(self): # pragma: no cover raise NotImplementedError() @@ -563,13 +566,14 @@ def set_dimensions(self, variables, unlimited_dims=None): if unlimited_dims is None: unlimited_dims = set() + parent_dims = self.get_parent_dimensions() existing_dims = self.get_dimensions() dims = {} for v in unlimited_dims: # put unlimited_dims first dims[v] = None for v in variables.values(): - dims.update(dict(zip(v.dims, v.shape, strict=True))) + dims |= v.sizes for dim, length in dims.items(): if dim in existing_dims and length != existing_dims[dim]: @@ -577,7 +581,7 @@ def set_dimensions(self, variables, unlimited_dims=None): "Unable to update size for existing dimension" f"{dim!r} ({length} != {existing_dims[dim]})" ) - elif dim not in existing_dims: + elif dim not in existing_dims and length != parent_dims.get(dim): is_unlimited = dim in unlimited_dims self.set_dimension(dim, length, is_unlimited) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 24a3324bf62..72d065ff87b 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -287,6 +287,16 @@ def get_attrs(self): def get_dimensions(self): return FrozenDict((k, len(v)) for k, v in self.ds.dimensions.items()) + def get_parent_dimensions(self): + group = self.ds + dims = {} + # dimensions defined in child groups have higher precedence + while (group := group.parent) is not None: + for k, v in group.dimensions.items(): + if k not in dims: + dims[k] = len(v) + return FrozenDict(dims) + def get_encoding(self): return { "unlimited_dims": { diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index ab1841461f4..d9609507aa5 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -518,6 +518,16 @@ def get_attrs(self): def get_dimensions(self): return FrozenDict((k, len(v)) for k, v in self.ds.dimensions.items()) + def get_parent_dimensions(self): + group = self.ds + dims = {} + # dimensions defined in child groups have higher precedence + while (group := group.parent) is not None: + for k, v in group.dimensions.items(): + if k not in dims: + dims[k] = len(v) + return FrozenDict(dims) + def get_encoding(self): return { "unlimited_dims": { diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2ff73203580..672df79ee96 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1610,6 +1610,17 @@ def test_write_groups(self) -> None: with self.open(tmp_file, group="data/2") as actual2: assert_identical(data2, actual2) + def test_child_group_with_inconsistent_dimensions(self) -> None: + base = Dataset(coords={"x": [1, 2]}) + child = Dataset(coords={"x": [1, 2, 3]}) + with create_tmp_file() as tmp_file: + self.save(base, tmp_file) + self.save(child, tmp_file, group="child", mode="a") + with self.open(tmp_file) as actual_base: + assert_identical(base, actual_base) + with self.open(tmp_file, group="child") as actual_child: + assert_identical(child, actual_child) + @pytest.mark.parametrize( "input_strings, is_bytes", [ diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index ec57993c4b2..2bdae6def27 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -265,6 +265,23 @@ def test_write_subgroup(self, tmpdir): assert_equal(original_dt, roundtrip_dt) assert_identical(expected_dt, roundtrip_dt) + @requires_netCDF4 + def test_no_redundant_dimensions(self, tmpdir): + # regression test for https://github.com/pydata/xarray/issues/10241 + original_dt = DataTree.from_dict( + { + "/": xr.Dataset(coords={"x": [1, 2, 3]}), + "/child": xr.Dataset({"foo": ("x", [4, 5, 6])}), + } + ) + filepath = tmpdir / "test.zarr" + original_dt.to_netcdf(filepath, engine=self.engine) + + root = nc4.Dataset(str(filepath)) + child = root.groups["child"] + assert list(root.dimensions) == ["x"] + assert list(child.dimensions) == [] + @requires_netCDF4 class TestNetCDF4DatatreeIO(DatatreeIOBase): From 09fe0f2141aa0b6dc9ec376987cd310056b0b5e6 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 11 Aug 2025 12:44:39 -0700 Subject: [PATCH 2/4] Typo --- xarray/backends/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index d4daea12489..36706ce0e7d 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -309,7 +309,7 @@ def get_dimensions(self): # pragma: no cover raise NotImplementedError() def get_parent_dimensions(self): # pragma: no cover - raise {} + return {} def get_attrs(self): # pragma: no cover raise NotImplementedError() From bca5053b5ab502f3be46ec0dd279330f93ce8d86 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 11 Aug 2025 13:19:42 -0700 Subject: [PATCH 3/4] doc failure --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index a2a3b7d24ba..faac159331f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,7 +21,7 @@ Breaking changes - When writing to NetCDF files with groups, Xarray no longer redefines dimensions that have the same size in parent groups (:issue:`10241`). This conforms with - `CF Conventions `_ + `CF Conventions for group scrope `_ but may require adjustments for code that consumes NetCDF files produced by Xarray. By `Stephan Hoyer `_. From 7161f9b3ed41328692b7082f1284415e97386529 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 13 Aug 2025 09:38:01 -0700 Subject: [PATCH 4/4] Create collect_ancestor_dimensions() helper function --- xarray/backends/common.py | 14 ++++++++++++++ xarray/backends/h5netcdf_.py | 10 ++-------- xarray/backends/netCDF4_.py | 10 ++-------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 36706ce0e7d..b94b11120ae 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -256,6 +256,20 @@ def find_root_and_group(ds): return ds, group +def collect_ancestor_dimensions(group) -> dict[str, int]: + """Returns dimensions defined in parent groups. + + If dimensions are defined in multiple ancestors, use the size of the closest + ancestor. + """ + dims = {} + while (group := group.parent) is not None: + for k, v in group.dimensions.items(): + if k not in dims: + dims[k] = len(v) + return dims + + def datatree_from_dict_with_io_cleanup(groups_dict: Mapping[str, Dataset]) -> DataTree: """DataTree.from_dict with file clean-up.""" try: diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 72d065ff87b..2c8e3c28f83 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -16,6 +16,7 @@ WritableCFDataStore, _normalize_path, _open_remote_file, + collect_ancestor_dimensions, datatree_from_dict_with_io_cleanup, find_root_and_group, ) @@ -288,14 +289,7 @@ def get_dimensions(self): return FrozenDict((k, len(v)) for k, v in self.ds.dimensions.items()) def get_parent_dimensions(self): - group = self.ds - dims = {} - # dimensions defined in child groups have higher precedence - while (group := group.parent) is not None: - for k, v in group.dimensions.items(): - if k not in dims: - dims[k] = len(v) - return FrozenDict(dims) + return FrozenDict(collect_ancestor_dimensions(self.ds)) def get_encoding(self): return { diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index d9609507aa5..ecd652f50f4 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -16,6 +16,7 @@ T_PathFileOrDataStore, WritableCFDataStore, _normalize_path, + collect_ancestor_dimensions, datatree_from_dict_with_io_cleanup, find_root_and_group, robust_getitem, @@ -519,14 +520,7 @@ def get_dimensions(self): return FrozenDict((k, len(v)) for k, v in self.ds.dimensions.items()) def get_parent_dimensions(self): - group = self.ds - dims = {} - # dimensions defined in child groups have higher precedence - while (group := group.parent) is not None: - for k, v in group.dimensions.items(): - if k not in dims: - dims[k] = len(v) - return FrozenDict(dims) + return FrozenDict(collect_ancestor_dimensions(self.ds)) def get_encoding(self): return {