From 49c9ea4ba924a04bf705c8bd32f848c8688a7092 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Tue, 8 Jul 2025 13:06:53 +0200 Subject: [PATCH 01/14] The align_chunks parameter was not being sent on the to_zarr method of the datasets --- xarray/core/dataset.py | 1 + xarray/tests/test_backends.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6de626a159b..3399d116c6f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2302,6 +2302,7 @@ def to_zarr( append_dim=append_dim, region=region, safe_chunks=safe_chunks, + align_chunks=align_chunks, zarr_version=zarr_version, zarr_format=zarr_format, write_empty_chunks=write_empty_chunks, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 785b06a26fd..4c828e19227 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6868,6 +6868,21 @@ def test_zarr_safe_chunk_region(self, mode: Literal["r+", "a"]): chunk = chunk.chunk() self.save(store, chunk.chunk(), region=region) + @requires_dask + def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: + skip_if_zarr_format_3(tmp_store) + dataset = DataArray( + np.arange(4), dims=["a"], coords={"a": np.arange(4)} + ).chunk(a=(2, 1, 1)).to_dataset(name="foo") + + dataset.to_zarr( + tmp_store, + align_chunks=True, + encoding={"foo": {"chunks": (3,)}}, + ) + with open_dataset(tmp_store, engine="zarr") as loaded_ds: + assert_identical(dataset, loaded_ds) + @requires_h5netcdf @requires_fsspec From fa00c95755684e2204eefcf27a4f62845c03356a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Jul 2025 12:54:49 +0000 Subject: [PATCH 02/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_backends.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git 
a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4c828e19227..6da34c23a6e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6871,9 +6871,11 @@ def test_zarr_safe_chunk_region(self, mode: Literal["r+", "a"]): @requires_dask def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: skip_if_zarr_format_3(tmp_store) - dataset = DataArray( - np.arange(4), dims=["a"], coords={"a": np.arange(4)} - ).chunk(a=(2, 1, 1)).to_dataset(name="foo") + dataset = ( + DataArray(np.arange(4), dims=["a"], coords={"a": np.arange(4)}) + .chunk(a=(2, 1, 1)) + .to_dataset(name="foo") + ) dataset.to_zarr( tmp_store, From 6d3ff30a9f99983a54bb00ee2e4ad5a75b65fdf5 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Tue, 8 Jul 2025 15:05:03 +0200 Subject: [PATCH 03/14] Add a note on the whats-new.rst about the error of the align_chunks for datasets --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e8b602e9dc9..cf245750df6 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,6 +25,9 @@ Deprecations Bug fixes ~~~~~~~~~ +- Fix the ``align_chunks`` parameter on the :py:meth:`~xarray.Dataset.to_zarr` method, it was not being + passed to the underlying :py:meth:`~xarray.backends.api` method (:issue:`10501`, :pull:`10516`). 
+ Documentation ~~~~~~~~~~~~~ From 62e3ddbc180acd879b8ad651c725af9dbcbf70d0 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Tue, 8 Jul 2025 15:25:19 +0200 Subject: [PATCH 04/14] Fix a ValueError on the test_dataset_to_zarr_align_chunks_true --- xarray/tests/test_backends.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6da34c23a6e..3fff0f78cf2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6870,20 +6870,22 @@ def test_zarr_safe_chunk_region(self, mode: Literal["r+", "a"]): @requires_dask def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: - skip_if_zarr_format_3(tmp_store) - dataset = ( - DataArray(np.arange(4), dims=["a"], coords={"a": np.arange(4)}) - .chunk(a=(2, 1, 1)) - .to_dataset(name="foo") - ) + # This test is a replica of the one in `test_dataarray_to_zarr_align_chunks_true` + # but for datasets + with self.create_zarr_target() as store: + ds = ( + DataArray(np.arange(4), dims=["a"], coords={"a": np.arange(4)}) + .chunk(a=(2, 1, 1)) + .to_dataset(name="foo") + ) - dataset.to_zarr( - tmp_store, - align_chunks=True, - encoding={"foo": {"chunks": (3,)}}, - ) - with open_dataset(tmp_store, engine="zarr") as loaded_ds: - assert_identical(dataset, loaded_ds) + self.save( + store, + ds, + align_chunks=True, + encoding={"foo": {"chunks": (3,)}}, + mode="w", + ) @requires_h5netcdf From a2789f6da1fc77238c36fb732bead1d52727b3b6 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Wed, 9 Jul 2025 17:28:10 +0200 Subject: [PATCH 05/14] Fix the case when enc_chunks are bigger than the dask chunks --- xarray/backends/chunks.py | 29 +++++++++++++++++++---------- xarray/tests/test_backends.py | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 13 deletions(-) diff --git a/xarray/backends/chunks.py b/xarray/backends/chunks.py index f17f5375976..11744eff194 100644 --- 
a/xarray/backends/chunks.py +++ b/xarray/backends/chunks.py @@ -138,9 +138,6 @@ def build_grid_chunks( chunk_size: int, region: slice | None = None, ) -> tuple[int, ...]: - if region is None: - region = slice(0, size) - region_start = region.start or 0 # Generate the zarr chunks inside the region of this dim chunks_on_region = [chunk_size - (region_start % chunk_size)] @@ -159,6 +156,15 @@ def grid_rechunk( if not nd_var_chunks: return v + # This is useful for the scenarios where the enc_chunks are bigger than the + # variable chunks, which happens when the user specifies the enc_chunks manually. + enc_chunks = tuple( + min(enc_chunk, sum(v_chunk)) + for enc_chunk, v_chunk in zip( + enc_chunks, v.chunks, strict=True + ) + ) + nd_grid_chunks = tuple( build_grid_chunks( sum(var_chunks), @@ -191,9 +197,9 @@ def validate_grid_chunks_alignment( base_error = ( "Specified Zarr chunks encoding['chunks']={enc_chunks!r} for " "variable named {name!r} would overlap multiple Dask chunks. " - "Check the chunk at position {var_chunk_pos}, which has a size of " - "{var_chunk_size} on dimension {dim_i}. It is unaligned with " - "backend chunks of size {chunk_size} in region {region}. " + "Please check the Dask chunks at position {var_chunk_pos} and " + "{var_chunk_pos_next}, on axis {axis}, they are overlapped " + "on the same Zarr chunk in the region {region}. " "Writing this array in parallel with Dask could lead to corrupted data. " "To resolve this issue, consider one of the following options: " "- Rechunk the array using `chunk()`. " @@ -202,7 +208,7 @@ def validate_grid_chunks_alignment( "- Enable automatic chunks alignment with `align_chunks=True`." 
) - for dim_i, chunk_size, var_chunks, interval, size in zip( + for axis, chunk_size, var_chunks, interval, size in zip( range(len(enc_chunks)), enc_chunks, nd_var_chunks, @@ -215,9 +221,10 @@ def validate_grid_chunks_alignment( raise ValueError( base_error.format( var_chunk_pos=i + 1, + var_chunk_pos_next=i + 2, var_chunk_size=chunk, + axis=axis, name=name, - dim_i=dim_i, chunk_size=chunk_size, region=interval, enc_chunks=enc_chunks, @@ -237,9 +244,10 @@ def validate_grid_chunks_alignment( raise ValueError( base_error.format( var_chunk_pos=0, + var_chunk_pos_next=0, var_chunk_size=var_chunks[0], + axis=axis, name=name, - dim_i=dim_i, chunk_size=chunk_size, region=interval, enc_chunks=enc_chunks, @@ -251,9 +259,10 @@ def validate_grid_chunks_alignment( error_on_last_chunk = base_error.format( var_chunk_pos=len(var_chunks) - 1, + var_chunk_pos_next=len(var_chunks) - 1, var_chunk_size=var_chunks[-1], + axis=axis, name=name, - dim_i=dim_i, chunk_size=chunk_size, region=interval, enc_chunks=enc_chunks, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3fff0f78cf2..017b239e3af 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6874,8 +6874,15 @@ def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: # but for datasets with self.create_zarr_target() as store: ds = ( - DataArray(np.arange(4), dims=["a"], coords={"a": np.arange(4)}) - .chunk(a=(2, 1, 1)) + DataArray( + np.arange(4).reshape((2, 2)), + dims=["a", "b"], + coords={ + "a": np.arange(2), + "b": np.arange(2), + } + ) + .chunk(a=(1, 1), b=(1, 1)) .to_dataset(name="foo") ) @@ -6883,9 +6890,31 @@ def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: store, ds, align_chunks=True, - encoding={"foo": {"chunks": (3,)}}, + encoding={"foo": {"chunks": (3,3)}}, mode="w", ) + assert_identical(ds, xr.open_zarr(store)) + + ds = ( + DataArray( + np.arange(4, 8).reshape((2, 2)), + dims=["a", "b"], + coords={ + "a": np.arange(2), + "b": 
np.arange(2), + } + ) + .chunk(a=(1, 1), b=(1, 1)) + .to_dataset(name="foo") + ) + + self.save( + store, + ds, + align_chunks=True, + region="auto", + ) + assert_identical(ds, xr.open_zarr(store)) @requires_h5netcdf From 60c6c75f902075ae883c06c4991ea6069c6ea965 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Wed, 9 Jul 2025 17:28:50 +0200 Subject: [PATCH 06/14] Linter --- xarray/backends/chunks.py | 4 +--- xarray/tests/test_backends.py | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/xarray/backends/chunks.py b/xarray/backends/chunks.py index 11744eff194..fbf085165c1 100644 --- a/xarray/backends/chunks.py +++ b/xarray/backends/chunks.py @@ -160,9 +160,7 @@ def grid_rechunk( # variable chunks, which happens when the user specifies the enc_chunks manually. enc_chunks = tuple( min(enc_chunk, sum(v_chunk)) - for enc_chunk, v_chunk in zip( - enc_chunks, v.chunks, strict=True - ) + for enc_chunk, v_chunk in zip(enc_chunks, v.chunks, strict=True) ) nd_grid_chunks = tuple( diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 017b239e3af..072b81121f5 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6880,7 +6880,7 @@ def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: coords={ "a": np.arange(2), "b": np.arange(2), - } + }, ) .chunk(a=(1, 1), b=(1, 1)) .to_dataset(name="foo") @@ -6890,7 +6890,7 @@ def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: store, ds, align_chunks=True, - encoding={"foo": {"chunks": (3,3)}}, + encoding={"foo": {"chunks": (3, 3)}}, mode="w", ) assert_identical(ds, xr.open_zarr(store)) @@ -6902,7 +6902,7 @@ def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: coords={ "a": np.arange(2), "b": np.arange(2), - } + }, ) .chunk(a=(1, 1), b=(1, 1)) .to_dataset(name="foo") From f0d60a63ce37d9a6cee6ab5442becd61aabbc30c Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Thu, 10 Jul 2025 11:50:07 +0200 Subject: [PATCH 
07/14] Fix small reintroduced issue when the region is None --- xarray/backends/chunks.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xarray/backends/chunks.py b/xarray/backends/chunks.py index fbf085165c1..65593933086 100644 --- a/xarray/backends/chunks.py +++ b/xarray/backends/chunks.py @@ -138,6 +138,9 @@ def build_grid_chunks( chunk_size: int, region: slice | None = None, ) -> tuple[int, ...]: + if region is None: + region = slice(0, size) + region_start = region.start or 0 # Generate the zarr chunks inside the region of this dim chunks_on_region = [chunk_size - (region_start % chunk_size)] From b471a8c5027e6d703b6fc9bace5c5f1afa704f9c Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Thu, 10 Jul 2025 17:13:43 +0200 Subject: [PATCH 08/14] Fix mypy issues --- xarray/backends/chunks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/chunks.py b/xarray/backends/chunks.py index 65593933086..1fc2681c3e0 100644 --- a/xarray/backends/chunks.py +++ b/xarray/backends/chunks.py @@ -162,8 +162,8 @@ def grid_rechunk( # This is useful for the scenarios where the enc_chunks are bigger than the # variable chunks, which happens when the user specifies the enc_chunks manually. 
enc_chunks = tuple( - min(enc_chunk, sum(v_chunk)) - for enc_chunk, v_chunk in zip(enc_chunks, v.chunks, strict=True) + min(enc_chunk, sum(var_chunk)) + for enc_chunk, var_chunk in zip(enc_chunks, nd_var_chunks, strict=True) ) nd_grid_chunks = tuple( From 328161ad8cd4c44570b7df4a5fa80a6b39336943 Mon Sep 17 00:00:00 2001 From: joseph nowak Date: Fri, 11 Jul 2025 11:33:21 +0200 Subject: [PATCH 09/14] Update whats-new.rst --- doc/whats-new.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index a280f66c500..951680a0ed2 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,6 +25,8 @@ Deprecations Bug fixes ~~~~~~~~~ +- Fix the ``align_chunks`` parameter on the :py:meth:`~xarray.Dataset.to_zarr` method, it was not being + passed to the underlying :py:meth:`~xarray.backends.api` method (:issue:`10501`, :pull:`10516`). Documentation ~~~~~~~~~~~~~ @@ -65,9 +67,6 @@ Bug fixes :py:meth:`Dataset.assign_coords` for advanced use cases (:issue:`10499`). By `Dhruva Kumar Kaushal `_. -- Fix the ``align_chunks`` parameter on the :py:meth:`~xarray.Dataset.to_zarr` method, it was not being - passed to the underlying :py:meth:`~xarray.backends.api` method (:issue:`10501`, :pull:`10516`). - Documentation ~~~~~~~~~~~~~ - A `new gallery `_ showing off the possibilities enabled by flexible indexes. 
From 8e9c2847689d58280942e5c439f3ecf2001ef6f1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 13 Jul 2025 15:12:07 +0000 Subject: [PATCH 10/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6ad545d0072..262a11ff667 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -27,7 +27,7 @@ Bug fixes - Fix the ``align_chunks`` parameter on the :py:meth:`~xarray.Dataset.to_zarr` method, it was not being passed to the underlying :py:meth:`~xarray.backends.api` method (:issue:`10501`, :pull:`10516`). - + - Fix Pydap Datatree backend testing. Testing now compares elements of (unordered) two sets (before, lists) (:pull:`10525`). By `Miguel Jimenez-Urias `_. From a8c0172a71dd3b4c408331c2bac493807f708245 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Aug 2025 13:40:22 +0000 Subject: [PATCH 11/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4a157febf69..d55fea221bf 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -36,7 +36,7 @@ Bug fixes redundant computation of Dask arrays with cross-group dependencies (:issue:`10637`). By `Stephan Hoyer `_. - + - Fix the ``align_chunks`` parameter on the :py:meth:`~xarray.Dataset.to_zarr` method, it was not being passed to the underlying :py:meth:`~xarray.backends.api` method (:issue:`10501`, :pull:`10516`). 
From ca72ab62375f2a4bd44832d9d05cd29e719d1b22 Mon Sep 17 00:00:00 2001 From: josephnowak Date: Fri, 5 Sep 2025 16:24:25 +0200 Subject: [PATCH 12/14] Use "v" instead of "var" to follow the name convention used on the rest of Xarray, move the modification of the enc_chunks to the build_grid_chunks function, add an additional test to cover the scenario where the chunk is bigger than the size of the array --- xarray/backends/chunks.py | 106 +++++++++++++-------------- xarray/backends/zarr.py | 2 +- xarray/tests/test_backends_chunks.py | 14 ++-- 3 files changed, 60 insertions(+), 62 deletions(-) diff --git a/xarray/backends/chunks.py b/xarray/backends/chunks.py index 1fc2681c3e0..c255c7db591 100644 --- a/xarray/backends/chunks.py +++ b/xarray/backends/chunks.py @@ -4,20 +4,18 @@ def align_nd_chunks( - nd_var_chunks: tuple[tuple[int, ...], ...], + nd_v_chunks: tuple[tuple[int, ...], ...], nd_backend_chunks: tuple[tuple[int, ...], ...], ) -> tuple[tuple[int, ...], ...]: - if len(nd_backend_chunks) != len(nd_var_chunks): + if len(nd_backend_chunks) != len(nd_v_chunks): raise ValueError( "The number of dimensions on the backend and the variable must be the same." ) nd_aligned_chunks: list[tuple[int, ...]] = [] - for backend_chunks, var_chunks in zip( - nd_backend_chunks, nd_var_chunks, strict=True - ): + for backend_chunks, v_chunks in zip(nd_backend_chunks, nd_v_chunks, strict=True): # Validate that they have the same number of elements - if sum(backend_chunks) != sum(var_chunks): + if sum(backend_chunks) != sum(v_chunks): raise ValueError( "The number of elements in the backend does not " "match the number of elements in the variable. 
" @@ -42,8 +40,8 @@ def align_nd_chunks( nd_aligned_chunks.append(backend_chunks) continue - if len(var_chunks) == 1: - nd_aligned_chunks.append(var_chunks) + if len(v_chunks) == 1: + nd_aligned_chunks.append(v_chunks) continue # Size of the chunk on the backend @@ -51,7 +49,7 @@ def align_nd_chunks( # The ideal size of the chunks is the maximum of the two; this would avoid # that we use more memory than expected - max_chunk = max(fixed_chunk, *var_chunks) + max_chunk = max(fixed_chunk, *v_chunks) # The algorithm assumes that the chunks on this array are aligned except the last one # because it can be considered a partial one @@ -59,22 +57,22 @@ def align_nd_chunks( # For simplicity of the algorithm, let's transform the Array chunks in such a way that # we remove the partial chunks. To achieve this, we add artificial data to the borders - t_var_chunks = list(var_chunks) - t_var_chunks[0] += fixed_chunk - backend_chunks[0] - t_var_chunks[-1] += fixed_chunk - backend_chunks[-1] + t_v_chunks = list(v_chunks) + t_v_chunks[0] += fixed_chunk - backend_chunks[0] + t_v_chunks[-1] += fixed_chunk - backend_chunks[-1] # The unfilled_size is the amount of space that has not been filled on the last # processed chunk; this is equivalent to the amount of data that would need to be # added to a partial Zarr chunk to fill it up to the fixed_chunk size unfilled_size = 0 - for var_chunk in t_var_chunks: + for v_chunk in t_v_chunks: # Ideally, we should try to preserve the original Dask chunks, but this is only # possible if the last processed chunk was aligned (unfilled_size == 0) - ideal_chunk = var_chunk + ideal_chunk = v_chunk if unfilled_size: # If that scenario is not possible, the best option is to merge the chunks - ideal_chunk = var_chunk + aligned_chunks[-1] + ideal_chunk = v_chunk + aligned_chunks[-1] while ideal_chunk: if not unfilled_size: @@ -105,27 +103,27 @@ def align_nd_chunks( border_size = fixed_chunk - backend_chunks[::order][0] aligned_chunks = 
aligned_chunks[::order] aligned_chunks[0] -= border_size - t_var_chunks = t_var_chunks[::order] - t_var_chunks[0] -= border_size + t_v_chunks = t_v_chunks[::order] + t_v_chunks[0] -= border_size if ( len(aligned_chunks) >= 2 and aligned_chunks[0] + aligned_chunks[1] <= max_chunk - and aligned_chunks[0] != t_var_chunks[0] + and aligned_chunks[0] != t_v_chunks[0] ): # The artificial data added to the border can introduce inefficient chunks # on the borders, for that reason, we will check if we can merge them or not # Example: # backend_chunks = [6, 6, 1] - # var_chunks = [6, 7] - # t_var_chunks = [6, 12] - # The ideal output should preserve the same var_chunks, but the previous loop + # v_chunks = [6, 7] + # t_v_chunks = [6, 12] + # The ideal output should preserve the same v_chunks, but the previous loop # is going to produce aligned_chunks = [6, 6, 6] # And after removing the artificial data, we will end up with aligned_chunks = [6, 6, 1] # which is not ideal and can be merged into a single chunk aligned_chunks[1] += aligned_chunks[0] aligned_chunks = aligned_chunks[1:] - t_var_chunks = t_var_chunks[::order] + t_v_chunks = t_v_chunks[::order] aligned_chunks = aligned_chunks[::order] nd_aligned_chunks.append(tuple(aligned_chunks)) @@ -144,6 +142,11 @@ def build_grid_chunks( region_start = region.start or 0 # Generate the zarr chunks inside the region of this dim chunks_on_region = [chunk_size - (region_start % chunk_size)] + if chunks_on_region[0] >= size: + # This is useful for the scenarios where the chunk_size is bigger + # than the variable chunks, which can happen when the user specifies + # the enc_chunks manually. 
+ return (size,) chunks_on_region.extend([chunk_size] * ((size - chunks_on_region[0]) // chunk_size)) if (size - chunks_on_region[0]) % chunk_size != 0: chunks_on_region.append((size - chunks_on_region[0]) % chunk_size) @@ -155,30 +158,23 @@ def grid_rechunk( enc_chunks: tuple[int, ...], region: tuple[slice, ...], ) -> Variable: - nd_var_chunks = v.chunks - if not nd_var_chunks: + nd_v_chunks = v.chunks + if not nd_v_chunks: return v - # This is useful for the scenarios where the enc_chunks are bigger than the - # variable chunks, which happens when the user specifies the enc_chunks manually. - enc_chunks = tuple( - min(enc_chunk, sum(var_chunk)) - for enc_chunk, var_chunk in zip(enc_chunks, nd_var_chunks, strict=True) - ) - nd_grid_chunks = tuple( build_grid_chunks( - sum(var_chunks), + v_size, region=interval, chunk_size=chunk_size, ) - for var_chunks, chunk_size, interval in zip( - nd_var_chunks, enc_chunks, region, strict=True + for v_size, chunk_size, interval in zip( + v.shape, enc_chunks, region, strict=True ) ) nd_aligned_chunks = align_nd_chunks( - nd_var_chunks=nd_var_chunks, + nd_v_chunks=nd_v_chunks, nd_backend_chunks=nd_grid_chunks, ) v = v.chunk(dict(zip(v.dims, nd_aligned_chunks, strict=True))) @@ -186,20 +182,20 @@ def grid_rechunk( def validate_grid_chunks_alignment( - nd_var_chunks: tuple[tuple[int, ...], ...] | None, + nd_v_chunks: tuple[tuple[int, ...], ...] | None, enc_chunks: tuple[int, ...], backend_shape: tuple[int, ...], region: tuple[slice, ...], allow_partial_chunks: bool, name: str, ): - if nd_var_chunks is None: + if nd_v_chunks is None: return base_error = ( "Specified Zarr chunks encoding['chunks']={enc_chunks!r} for " "variable named {name!r} would overlap multiple Dask chunks. 
" - "Please check the Dask chunks at position {var_chunk_pos} and " - "{var_chunk_pos_next}, on axis {axis}, they are overlapped " + "Please check the Dask chunks at position {v_chunk_pos} and " + "{v_chunk_pos_next}, on axis {axis}, they are overlapped " "on the same Zarr chunk in the region {region}. " "Writing this array in parallel with Dask could lead to corrupted data. " "To resolve this issue, consider one of the following options: " @@ -209,21 +205,21 @@ def validate_grid_chunks_alignment( "- Enable automatic chunks alignment with `align_chunks=True`." ) - for axis, chunk_size, var_chunks, interval, size in zip( + for axis, chunk_size, v_chunks, interval, size in zip( range(len(enc_chunks)), enc_chunks, - nd_var_chunks, + nd_v_chunks, region, backend_shape, strict=True, ): - for i, chunk in enumerate(var_chunks[1:-1]): + for i, chunk in enumerate(v_chunks[1:-1]): if chunk % chunk_size: raise ValueError( base_error.format( - var_chunk_pos=i + 1, - var_chunk_pos_next=i + 2, - var_chunk_size=chunk, + v_chunk_pos=i + 1, + v_chunk_pos_next=i + 2, + v_chunk_size=chunk, axis=axis, name=name, chunk_size=chunk_size, @@ -234,19 +230,19 @@ def validate_grid_chunks_alignment( interval_start = interval.start or 0 - if len(var_chunks) > 1: + if len(v_chunks) > 1: # The first border size is the amount of data that needs to be updated on the # first chunk taking into account the region slice. 
first_border_size = chunk_size if allow_partial_chunks: first_border_size = chunk_size - interval_start % chunk_size - if (var_chunks[0] - first_border_size) % chunk_size: + if (v_chunks[0] - first_border_size) % chunk_size: raise ValueError( base_error.format( - var_chunk_pos=0, - var_chunk_pos_next=0, - var_chunk_size=var_chunks[0], + v_chunk_pos=0, + v_chunk_pos_next=0, + v_chunk_size=v_chunks[0], axis=axis, name=name, chunk_size=chunk_size, @@ -259,9 +255,9 @@ def validate_grid_chunks_alignment( region_stop = interval.stop or size error_on_last_chunk = base_error.format( - var_chunk_pos=len(var_chunks) - 1, - var_chunk_pos_next=len(var_chunks) - 1, - var_chunk_size=var_chunks[-1], + v_chunk_pos=len(v_chunks) - 1, + v_chunk_pos_next=len(v_chunks) - 1, + v_chunk_size=v_chunks[-1], axis=axis, name=name, chunk_size=chunk_size, @@ -277,7 +273,7 @@ def validate_grid_chunks_alignment( # If the region is covering the last chunk then check # if the reminder with the default chunk size # is equal to the size of the last chunk - if var_chunks[-1] % chunk_size != size % chunk_size: + if v_chunks[-1] % chunk_size != size % chunk_size: raise ValueError(error_on_last_chunk) - elif var_chunks[-1] % chunk_size: + elif v_chunks[-1] % chunk_size: raise ValueError(error_on_last_chunk) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 452a5751228..f0578ca9352 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1249,7 +1249,7 @@ def set_variables( # threads shape = zarr_shape or v.shape validate_grid_chunks_alignment( - nd_var_chunks=v.chunks, + nd_v_chunks=v.chunks, enc_chunks=encoding["chunks"], region=region, allow_partial_chunks=self._mode != "r+", diff --git a/xarray/tests/test_backends_chunks.py b/xarray/tests/test_backends_chunks.py index 61b844d84be..bb1297d0db3 100644 --- a/xarray/tests/test_backends_chunks.py +++ b/xarray/tests/test_backends_chunks.py @@ -14,6 +14,8 @@ (10, 3, None, (3, 3, 3, 1)), (10, 3, slice(None, 10), (3, 3, 3, 
1)), (10, 3, slice(0, None), (3, 3, 3, 1)), + (2, 10, slice(0, 3), (2,)), + (4, 10, slice(7, 10), (3, 1)), ], ) def test_build_grid_chunks(size, chunk_size, region, expected_chunks): @@ -26,16 +28,16 @@ def test_build_grid_chunks(size, chunk_size, region, expected_chunks): @pytest.mark.parametrize( - "nd_var_chunks, nd_backend_chunks, expected_chunks", + "nd_v_chunks, nd_backend_chunks, expected_chunks", [ (((2, 2, 2, 2),), ((3, 3, 2),), ((3, 3, 2),)), # ND cases (((2, 4), (2, 3)), ((2, 2, 2), (3, 2)), ((2, 4), (3, 2))), ], ) -def test_align_nd_chunks(nd_var_chunks, nd_backend_chunks, expected_chunks): +def test_align_nd_chunks(nd_v_chunks, nd_backend_chunks, expected_chunks): aligned_nd_chunks = align_nd_chunks( - nd_var_chunks=nd_var_chunks, + nd_v_chunks=nd_v_chunks, nd_backend_chunks=nd_backend_chunks, ) assert aligned_nd_chunks == expected_chunks @@ -43,7 +45,7 @@ def test_align_nd_chunks(nd_var_chunks, nd_backend_chunks, expected_chunks): @requires_dask @pytest.mark.parametrize( - "enc_chunks, region, nd_var_chunks, expected_chunks", + "enc_chunks, region, nd_v_chunks, expected_chunks", [ ( (3,), @@ -93,7 +95,7 @@ def test_align_nd_chunks(nd_var_chunks, nd_backend_chunks, expected_chunks): ), ], ) -def test_grid_rechunk(enc_chunks, region, nd_var_chunks, expected_chunks): +def test_grid_rechunk(enc_chunks, region, nd_v_chunks, expected_chunks): dims = [f"dim_{i}" for i in range(len(region))] coords = { dim: list(range(r.start, r.stop)) for dim, r in zip(dims, region, strict=False) @@ -104,7 +106,7 @@ def test_grid_rechunk(enc_chunks, region, nd_var_chunks, expected_chunks): dims=dims, coords=coords, ) - arr = arr.chunk(dict(zip(dims, nd_var_chunks, strict=False))) + arr = arr.chunk(dict(zip(dims, nd_v_chunks, strict=False))) result = grid_rechunk( arr.variable, From 6578f0bcbdc6384b5382e5b619656339ed4e5a0b Mon Sep 17 00:00:00 2001 From: josephnowak Date: Fri, 5 Sep 2025 16:30:55 +0200 Subject: [PATCH 13/14] Update the whats-new.rst --- doc/whats-new.rst | 5 
++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c4bc21ccce3..cf12ae6a28f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,6 +25,8 @@ Deprecations Bug fixes ~~~~~~~~~ +- Fix the ``align_chunks`` parameter on the :py:meth:`~xarray.Dataset.to_zarr` method, it was not being + passed to the underlying :py:meth:`~xarray.backends.api` method (:issue:`10501`, :pull:`10516`). Documentation ~~~~~~~~~~~~~ @@ -102,9 +104,6 @@ Bug fixes (:issue:`10637`). By `Stephan Hoyer `_. -- Fix the ``align_chunks`` parameter on the :py:meth:`~xarray.Dataset.to_zarr` method, it was not being - passed to the underlying :py:meth:`~xarray.backends.api` method (:issue:`10501`, :pull:`10516`). - Documentation ~~~~~~~~~~~~~ From 08c2e9d7141ca20c2e610218a7beb8394e539fa3 Mon Sep 17 00:00:00 2001 From: josephnowak Date: Mon, 8 Sep 2025 13:14:25 +0200 Subject: [PATCH 14/14] Fix whats-new.rst --- doc/whats-new.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index cf12ae6a28f..624f161d773 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -17,6 +17,10 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ +- :py:meth:`Dataset.update` now returns ``None``, instead of the updated dataset. This + completes the deprecation cycle started in version 0.17. The method still updates the + dataset in-place. (:issue:`10167`) + By `Maximilian Roos `_. Deprecations ~~~~~~~~~~~~ @@ -103,10 +107,6 @@ Bug fixes redundant computation of Dask arrays with cross-group dependencies (:issue:`10637`). By `Stephan Hoyer `_. - -Documentation -~~~~~~~~~~~~~ - - :py:meth:`DataTree.to_netcdf` had h5netcdf hard-coded as default (:issue:`10654`). By `Stephan Hoyer `_.