From eb1a967a92501d72af643f3433d6260e767c111e Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Sat, 12 Jul 2025 17:59:34 -0700 Subject: [PATCH 01/36] All works, just need to satisfy mypy and whatnot now --- xarray/namedarray/daskmanager.py | 54 ++++++++++++++++++++++++++++++++ xarray/tests/test_dask.py | 28 +++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 6485ba375f5..3ef0229356b 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -4,8 +4,10 @@ from typing import TYPE_CHECKING, Any import numpy as np +import dask from xarray.core.indexing import ImplicitToExplicitIndexingAdapter +from xarray.core.common import _contains_cftime_datetimes from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray from xarray.namedarray.utils import is_duck_dask_array, module_available @@ -16,6 +18,7 @@ _NormalizedChunks, duckarray, ) + from xarray.namedarray.parallelcompat import _Chunks try: from dask.array import Array as DaskArray @@ -264,3 +267,54 @@ def shuffle( if chunks != "auto": raise NotImplementedError("Only chunks='auto' is supported at present.") return dask.array.shuffle(x, indexer, axis, chunks="auto") + + def rechunk( # type: ignore[override] + self, + data: T_ChunkedArray, + chunks: _NormalizedChunks | tuple[int, ...] | _Chunks, + **kwargs: Any, + ) -> Any: + """ + Changes the chunking pattern of the given array. + + Called when the .chunk method is called on an xarray object that is already chunked. + + Parameters + ---------- + data : dask array + Array to be rechunked. + chunks : int, tuple, dict or str, optional + The new block dimensions to create. -1 indicates the full size of the + corresponding dimension. Default is "auto" which automatically + determines chunk sizes. + + Returns + ------- + chunked array + + See Also + -------- + dask.array.Array.rechunk + cubed.Array.rechunk + """ + + if _contains_cftime_datetimes(data): + # Preprocess chunks if they're cftime + cftime_nbytes_approx = 64 + from dask.utils import parse_bytes + target_chunksize = parse_bytes(dask.config.get("array.chunk-size")) + + # Calculate total elements per chunk + elements_per_chunk = target_chunksize // cftime_nbytes_approx + + # Distribute elements across dimensions + # Simple approach: try to make chunks roughly cubic + ndim = data.ndim # type:ignore + shape = data.shape # type:ignore + if ndim > 0: + chunk_size_per_dim = int(elements_per_chunk ** (1.0 / ndim)) + chunks = tuple(min(chunk_size_per_dim, dim_size) for dim_size in shape) + else: + chunks = () + + return data.rechunk(chunks, **kwargs) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 9024f2ae677..7f67769fd7c 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1059,6 +1059,27 @@ def make_da(): return da +def make_da_cftime(): + yrs = np.arange(2000,2120) + cftime_dates = xr.date_range( + start=f"{yrs[0]}-01-01", + end=f"{yrs[-1]}-12-31", + freq="1YE", + use_cftime=True, + ) + yr_array = np.tile(cftime_dates.values, (10, 1)) + da = xr.DataArray( + yr_array, + dims=["x", "t"], + coords={"x": np.arange(10), "t": cftime_dates}, + name="a", + ).chunk({"x": 4, "t": 5}) + da.x.attrs["long_name"] = "x" + da.attrs["test"] = "test" + da.coords["c2"] = 0.5 + da.coords["ndcoord"] = da.x * 2 + + return da def make_ds(): map_ds = xr.Dataset() @@ -1140,6 +1161,13 @@ def test_auto_chunk_da(obj): np.testing.assert_array_equal(actual, expected) assert actual.chunks == expected.chunks +@pytest.mark.parametrize("obj", [make_da_cftime()]) +def test_auto_chunk_da_cftime(obj): + actual = obj.chunk("auto").data + expected = obj.data.rechunk({0: 10, 1: 120}) + np.testing.assert_array_equal(actual, expected) + assert actual.chunks == expected.chunks + def test_map_blocks_error(map_da, map_ds): def bad_func(darray): From 852476d127df64540298369e40ae20ce2f89aa2d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 13 Jul 2025 01:10:13 +0000 Subject: [PATCH 02/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/namedarray/daskmanager.py | 25 +++++++++++++------------ xarray/tests/test_dask.py | 7 +++++-- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 3ef0229356b..945bcf077dd 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -3,11 +3,11 @@ from collections.abc import Callable, Iterable, Sequence from typing import TYPE_CHECKING, Any -import numpy as np import dask +import numpy as np -from xarray.core.indexing import ImplicitToExplicitIndexingAdapter from xarray.core.common import _contains_cftime_datetimes +from xarray.core.indexing import ImplicitToExplicitIndexingAdapter from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray from xarray.namedarray.utils import is_duck_dask_array, module_available @@ -268,12 +268,12 @@ def shuffle( raise NotImplementedError("Only chunks='auto' is supported at present.") return dask.array.shuffle(x, indexer, axis, chunks="auto") - def rechunk( # type: ignore[override] - self, - data: T_ChunkedArray, - chunks: _NormalizedChunks | tuple[int, ...] | _Chunks, - **kwargs: Any, - ) -> Any: + def rechunk( # type: ignore[override] + self, + data: T_ChunkedArray, + chunks: _NormalizedChunks | tuple[int, ...] | _Chunks, + **kwargs: Any, + ) -> Any: """ Changes the chunking pattern of the given array. @@ -302,15 +302,16 @@ def rechunk( # type: ignore[override] # Preprocess chunks if they're cftime cftime_nbytes_approx = 64 from dask.utils import parse_bytes + target_chunksize = parse_bytes(dask.config.get("array.chunk-size")) - + # Calculate total elements per chunk elements_per_chunk = target_chunksize // cftime_nbytes_approx - + # Distribute elements across dimensions # Simple approach: try to make chunks roughly cubic - ndim = data.ndim # type:ignore - shape = data.shape # type:ignore + ndim = data.ndim # type:ignore + shape = data.shape # type:ignore if ndim > 0: chunk_size_per_dim = int(elements_per_chunk ** (1.0 / ndim)) chunks = tuple(min(chunk_size_per_dim, dim_size) for dim_size in shape) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 7f67769fd7c..68a93dfc9e2 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1059,8 +1059,9 @@ def make_da(): return da + def make_da_cftime(): - yrs = np.arange(2000,2120) + yrs = np.arange(2000, 2120) cftime_dates = xr.date_range( start=f"{yrs[0]}-01-01", end=f"{yrs[-1]}-12-31", @@ -1078,9 +1079,10 @@ def make_da_cftime(): da.attrs["test"] = "test" da.coords["c2"] = 0.5 da.coords["ndcoord"] = da.x * 2 - + return da + def make_ds(): map_ds = xr.Dataset() map_ds["a"] = make_da() @@ -1161,6 +1163,7 @@ def test_auto_chunk_da(obj): np.testing.assert_array_equal(actual, expected) assert actual.chunks == expected.chunks + @pytest.mark.parametrize("obj", [make_da_cftime()]) def test_auto_chunk_da_cftime(obj): actual = obj.chunk("auto").data From 1aba531a52231043ad5db5fe6927a43792e5341c Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Sat, 12 Jul 2025 18:15:09 -0700 Subject: [PATCH 03/36] Fix moving import to be optional --- xarray/namedarray/daskmanager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 945bcf077dd..c1c9cd1fa86 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -3,7 +3,6 @@ from collections.abc import Callable, Iterable, Sequence from typing import TYPE_CHECKING, Any -import dask import numpy as np from xarray.core.common import _contains_cftime_datetimes @@ -302,8 +301,9 @@ def rechunk( # type: ignore[override] # Preprocess chunks if they're cftime cftime_nbytes_approx = 64 from dask.utils import parse_bytes + from dask import config as dask_config - target_chunksize = parse_bytes(dask.config.get("array.chunk-size")) + target_chunksize = parse_bytes(dask_config.get("array.chunk-size")) # Calculate total elements per chunk elements_per_chunk = target_chunksize // cftime_nbytes_approx From 9429c3d0c2f43cb6cb056916f339e3f36cc39052 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 13 Jul 2025 01:15:37 +0000 Subject: [PATCH 04/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/namedarray/daskmanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index c1c9cd1fa86..6c9128c93b3 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -300,8 +300,8 @@ def rechunk( # type: ignore[override] if _contains_cftime_datetimes(data): # Preprocess chunks if they're cftime cftime_nbytes_approx = 64 - from dask.utils import parse_bytes from dask import config as dask_config + from dask.utils import parse_bytes target_chunksize = parse_bytes(dask_config.get("array.chunk-size")) From 3c9d27ef7f4af34e1fbeb9c956c6c6121c0a2e91 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Sat, 12 Jul 2025 18:18:10 -0700 Subject: [PATCH 05/36] Make mypy happy --- xarray/namedarray/daskmanager.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index c1c9cd1fa86..6a14c6dc2cf 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -300,18 +300,16 @@ def rechunk( # type: ignore[override] if _contains_cftime_datetimes(data): # Preprocess chunks if they're cftime cftime_nbytes_approx = 64 - from dask.utils import parse_bytes from dask import config as dask_config + from dask.utils import parse_bytes target_chunksize = parse_bytes(dask_config.get("array.chunk-size")) - # Calculate total elements per chunk elements_per_chunk = target_chunksize // cftime_nbytes_approx - # Distribute elements across dimensions - # Simple approach: try to make chunks roughly cubic - ndim = data.ndim # type:ignore - shape = data.shape # type:ignore + # Try to make chunks roughly cubic + ndim = data.ndim # type:ignore[attr-defined] + shape = data.shape # type:ignore[attr-defined] if ndim > 0: chunk_size_per_dim = int(elements_per_chunk ** (1.0 / ndim)) chunks = tuple(min(chunk_size_per_dim, dim_size) for dim_size in shape) From 5153d2d96717850fcbfb5137a52c566e7921a5d6 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Sat, 12 Jul 2025 20:26:05 -0700 Subject: [PATCH 06/36] Add some clarifying comments about what we need to do to optimise this --- xarray/namedarray/daskmanager.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 6a14c6dc2cf..afa6a6fa463 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -307,7 +307,12 @@ def rechunk( # type: ignore[override] elements_per_chunk = target_chunksize // cftime_nbytes_approx - # Try to make chunks roughly cubic + """ + Try to make chunks roughly cubic. This needs to be a bit smarter, it + really ought to account for xr.structure.chunks._getchunk and try to + use the default encoding to set the chunk size. + """ + ndim = data.ndim # type:ignore[attr-defined] shape = data.shape # type:ignore[attr-defined] if ndim > 0: From cfdc31b26cb082fee61dd4db7c3ced7a6fd54ace Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Tue, 15 Jul 2025 09:00:42 +1000 Subject: [PATCH 07/36] @dcherian's suggestions. Just need to update chunking strategy to respect disk chunks sensibly & this should be ready to go, I think --- xarray/namedarray/daskmanager.py | 23 +++++++---------------- xarray/namedarray/utils.py | 27 +++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index afa6a6fa463..5466ad44737 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -299,26 +299,17 @@ def rechunk( # type: ignore[override] if _contains_cftime_datetimes(data): # Preprocess chunks if they're cftime - cftime_nbytes_approx = 64 + from dask import config as dask_config from dask.utils import parse_bytes + from xarray.namedarray.utils import build_chunkspec + target_chunksize = parse_bytes(dask_config.get("array.chunk-size")) - elements_per_chunk = target_chunksize // cftime_nbytes_approx - - """ - Try to make chunks roughly cubic. This needs to be a bit smarter, it - really ought to account for xr.structure.chunks._getchunk and try to - use the default encoding to set the chunk size. - """ - - ndim = data.ndim # type:ignore[attr-defined] - shape = data.shape # type:ignore[attr-defined] - if ndim > 0: - chunk_size_per_dim = int(elements_per_chunk ** (1.0 / ndim)) - chunks = tuple(min(chunk_size_per_dim, dim_size) for dim_size in shape) - else: - chunks = () + chunks = build_chunkspec( + data, + target_chunksize=target_chunksize, + ) return data.rechunk(chunks, **kwargs) diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 96060730345..4a3bcbcdacd 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import importlib +import sys import warnings from collections.abc import Hashable, Iterable, Iterator, Mapping from functools import lru_cache @@ -16,6 +17,8 @@ from numpy.typing import NDArray + from xarray.namedarray.parallelcompat import T_ChunkedArray + try: from dask.array.core import Array as DaskArray from dask.typing import DaskCollection @@ -195,6 +198,30 @@ def either_dict_or_kwargs( return pos_kwargs +def build_chunkspec( + data: T_ChunkedArray, + target_chunksize: int, +) -> tuple[int, ...]: + """ + Try to make chunks roughly cubic. This needs to be a bit smarter, it + really ought to account for xr.structure.chunks._getchunk and try to + use the default encoding to set the chunk size. + """ + from xarray.core.formatting import first_n_items + + cftime_nbytes_approx: int = sys.getsizeof(first_n_items(data, 1)) # type: ignore[no-untyped-call] + elements_per_chunk = target_chunksize // cftime_nbytes_approx + ndim = data.ndim # type:ignore[attr-defined] + shape = data.shape # type:ignore[attr-defined] + if ndim > 0: + chunk_size_per_dim = int(elements_per_chunk ** (1.0 / ndim)) + chunks = tuple(min(chunk_size_per_dim, dim_size) for dim_size in shape) + else: + chunks = () + + return chunks + + class ReprObject: """Object that prints as the given value, for use with sentinel values.""" From e58d6d719b68a42499744b3acd9b29bfa85548e5 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Wed, 23 Jul 2025 16:36:42 +0800 Subject: [PATCH 08/36] Can now load cftime arrays with auto-chunking. Implementation still kinda janky --- xarray/structure/chunks.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/xarray/structure/chunks.py b/xarray/structure/chunks.py index 281cfe278f1..4f9c414f08c 100644 --- a/xarray/structure/chunks.py +++ b/xarray/structure/chunks.py @@ -11,6 +11,7 @@ from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union, overload from xarray.core import utils +from xarray.core.common import _contains_cftime_datetimes from xarray.core.utils import emit_user_level_warning from xarray.core.variable import IndexVariable, Variable from xarray.namedarray.parallelcompat import ( @@ -83,9 +84,27 @@ def _get_chunk(var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint): for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape, strict=True) ) - chunk_shape = chunkmanager.normalize_chunks( - chunk_shape, shape=shape, dtype=var.dtype, previous_chunks=preferred_chunk_shape - ) + if _contains_cftime_datetimes(var): + # If we have cftime datetimes, need to preprocess them - we can't pass + # an object dtype into DaskManager.normalize_chunks. + from dask import config as dask_config + from dask.utils import parse_bytes + + from xarray.namedarray.utils import build_chunkspec + + target_chunksize = parse_bytes(dask_config.get("array.chunk-size")) + chunk_shape = build_chunkspec(var, target_chunksize=target_chunksize) + + chunk_shape = chunkmanager.normalize_chunks( + chunk_shape, shape=shape, previous_chunks=preferred_chunk_shape + ) + else: + chunk_shape = chunkmanager.normalize_chunks( + chunk_shape, + shape=shape, + dtype=var.dtype, + previous_chunks=preferred_chunk_shape, + ) # Warn where requested chunks break preferred chunks, provided that the variable # contains data. From f953976fc162a16b1caa4cdf180e5bfa9014d249 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 25 Jul 2025 09:20:52 +0800 Subject: [PATCH 09/36] Test for autochunking when reading from disk --- xarray/tests/test_backends.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6997be200b1..34c87b9d2f1 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5416,6 +5416,35 @@ def test_open_multi_dataset(self) -> None: ) as actual: assert_identical(expected, actual) + def test_open_dataset_cftime_autochunk(self) -> None: + """Create a dataset with cftime datetime objects and + ensure that auto-chunking works correctly.""" + import cftime + + from xarray.core.common import _contains_cftime_datetimes + + original = xr.Dataset( + { + "foo": ("time", [0.0]), + "time_bnds": ( + ("time", "bnds"), + [ + [ + cftime.Datetime360Day(2005, 12, 1, 0, 0, 0, 0), + cftime.Datetime360Day(2005, 12, 2, 0, 0, 0, 0), + ] + ], + ), + }, + {"time": [cftime.Datetime360Day(2005, 12, 1, 12, 0, 0, 0)]}, + ) + with create_tmp_file() as tmp: + original.to_netcdf(tmp) + with open_dataset(tmp, chunks="auto") as actual: + assert isinstance(actual.time_bnds.variable.data, da.Array) + assert _contains_cftime_datetimes(actual.time) + assert_identical(original, actual) + # Flaky test. Very open to contributions on fixing this @pytest.mark.flaky def test_dask_roundtrip(self) -> None: From 67065244af3dcd211e059d3c85b9529ee18ab3ab Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 25 Jul 2025 11:25:49 +0800 Subject: [PATCH 10/36] replace `build_chunkspec` with faking the dtype of a cftime array & adjusting target chunksize so we get sensible chunks using dask's default chunking strategy --- xarray/namedarray/daskmanager.py | 16 ++++++++++----- xarray/namedarray/utils.py | 34 +++++++++++++++----------------- xarray/structure/chunks.py | 34 +++++++++++++++++++------------- 3 files changed, 47 insertions(+), 37 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 5466ad44737..c5c781cbcee 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -301,15 +301,21 @@ def rechunk( # type: ignore[override] # Preprocess chunks if they're cftime from dask import config as dask_config + from dask.array.core import normalize_chunks from dask.utils import parse_bytes - from xarray.namedarray.utils import build_chunkspec + from xarray.namedarray.utils import fake_target_chunksize target_chunksize = parse_bytes(dask_config.get("array.chunk-size")) - - chunks = build_chunkspec( - data, - target_chunksize=target_chunksize, + limit, var_dtype = fake_target_chunksize( + data, target_chunksize=target_chunksize ) + chunks = normalize_chunks( + chunks, + shape=data.shape, + dtype=var_dtype, + limit=limit, + ) # type: ignore[no-untyped-call] + return data.rechunk(chunks, **kwargs) diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 4a3bcbcdacd..d1f95a4bc0c 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -17,8 +17,6 @@ from numpy.typing import NDArray - from xarray.namedarray.parallelcompat import T_ChunkedArray - try: from dask.array.core import Array as DaskArray from dask.typing import DaskCollection @@ -26,7 +24,7 @@ DaskArray = NDArray # type: ignore[assignment, misc] DaskCollection: Any = NDArray # type: ignore[no-redef] - from xarray.namedarray._typing import _Dim, duckarray + from xarray.namedarray._typing import _Dim, _DType, duckarray K = TypeVar("K") @@ -198,28 +196,28 @@ def either_dict_or_kwargs( return pos_kwargs -def build_chunkspec( - data: T_ChunkedArray, +def fake_target_chunksize( + data: Any, # Should be duckarray I think, but causes upsteam issues target_chunksize: int, -) -> tuple[int, ...]: +) -> tuple[int, _DType]: """ - Try to make chunks roughly cubic. This needs to be a bit smarter, it - really ought to account for xr.structure.chunks._getchunk and try to - use the default encoding to set the chunk size. + Naughty trick - let's get the ratio of our cftime_nbytes, and then compute + the ratio of that size to a np.float64. Then we can just adjust our target_chunksize + and use the default dask chunking algorithm to get a reasonable chunk size. """ + import numpy as np + from xarray.core.formatting import first_n_items + output_dtype: _DType = np.dtype(np.float64) # type: ignore[assignment] + cftime_nbytes_approx: int = sys.getsizeof(first_n_items(data, 1)) # type: ignore[no-untyped-call] - elements_per_chunk = target_chunksize // cftime_nbytes_approx - ndim = data.ndim # type:ignore[attr-defined] - shape = data.shape # type:ignore[attr-defined] - if ndim > 0: - chunk_size_per_dim = int(elements_per_chunk ** (1.0 / ndim)) - chunks = tuple(min(chunk_size_per_dim, dim_size) for dim_size in shape) - else: - chunks = () - return chunks + f64_nbytes = output_dtype.itemsize # Should be 8 bytes + + target_chunksize = int(target_chunksize * (cftime_nbytes_approx / f64_nbytes)) + + return target_chunksize, output_dtype class ReprObject: diff --git a/xarray/structure/chunks.py b/xarray/structure/chunks.py index 4f9c414f08c..31463224f95 100644 --- a/xarray/structure/chunks.py +++ b/xarray/structure/chunks.py @@ -12,7 +12,7 @@ from xarray.core import utils from xarray.core.common import _contains_cftime_datetimes -from xarray.core.utils import emit_user_level_warning +from xarray.core.utils import emit_user_level_warning, is_dict_like from xarray.core.variable import IndexVariable, Variable from xarray.namedarray.parallelcompat import ( ChunkManagerEntrypoint, @@ -84,27 +84,33 @@ def _get_chunk(var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint): for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape, strict=True) ) - if _contains_cftime_datetimes(var): + # Chunks can be either dict-like or tuple-like (according to type annotations) + # at this point, so check for # this before we manually construct our chunk + # spec- if we've set chunks to auto + _chunks = list(chunks.values()) if is_dict_like(chunks) else chunks + auto_chunks = all(_chunk == "auto" for _chunk in _chunks) + + if _contains_cftime_datetimes(var) and auto_chunks: # If we have cftime datetimes, need to preprocess them - we can't pass # an object dtype into DaskManager.normalize_chunks. from dask import config as dask_config from dask.utils import parse_bytes - from xarray.namedarray.utils import build_chunkspec + from xarray.namedarray.utils import fake_target_chunksize target_chunksize = parse_bytes(dask_config.get("array.chunk-size")) - chunk_shape = build_chunkspec(var, target_chunksize=target_chunksize) - - chunk_shape = chunkmanager.normalize_chunks( - chunk_shape, shape=shape, previous_chunks=preferred_chunk_shape - ) + limit, var_dtype = fake_target_chunksize(var, target_chunksize=target_chunksize) else: - chunk_shape = chunkmanager.normalize_chunks( - chunk_shape, - shape=shape, - dtype=var.dtype, - previous_chunks=preferred_chunk_shape, - ) + var_dtype = var.dtype + limit = None + + chunk_shape = chunkmanager.normalize_chunks( + chunk_shape, + shape=shape, + dtype=var_dtype, + limit=limit, + previous_chunks=preferred_chunk_shape, + ) # Warn where requested chunks break preferred chunks, provided that the variable # contains data. From 4e56acddc26be74c8c8cb634ac7967f4692b4009 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 25 Jul 2025 03:26:30 +0000 Subject: [PATCH 11/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/namedarray/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index d1f95a4bc0c..252df9573aa 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -197,7 +197,7 @@ def either_dict_or_kwargs( def fake_target_chunksize( - data: Any, # Should be duckarray I think, but causes upsteam issues + data: Any, # Should be duckarray I think, but causes upstream issues target_chunksize: int, ) -> tuple[int, _DType]: """ From 5d00b0afcddff9bbd0bb213704f5c3f857657c3f Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 25 Jul 2025 13:15:16 +0800 Subject: [PATCH 12/36] Remove redundant comments, rename things to make them clearer, add more comprehensive tests, etc & Shut mypy up --- xarray/namedarray/daskmanager.py | 6 ++--- xarray/namedarray/utils.py | 11 +++++---- xarray/tests/test_namedarray.py | 40 ++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index c5c781cbcee..9b674213efa 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -298,8 +298,6 @@ def rechunk( # type: ignore[override] """ if _contains_cftime_datetimes(data): - # Preprocess chunks if they're cftime - from dask import config as dask_config from dask.array.core import normalize_chunks from dask.utils import parse_bytes @@ -307,13 +305,13 @@ def rechunk( # type: ignore[override] from xarray.namedarray.utils import fake_target_chunksize target_chunksize = parse_bytes(dask_config.get("array.chunk-size")) - limit, var_dtype = fake_target_chunksize( + limit, var_dtype = fake_target_chunksize( # type: ignore[var-annotated] data, target_chunksize=target_chunksize ) chunks = normalize_chunks( chunks, - shape=data.shape, + shape=data.shape, # type: ignore[attr-defined] dtype=var_dtype, limit=limit, ) # type: ignore[no-untyped-call] diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 252df9573aa..9fadc85d544 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -24,7 +24,7 @@ DaskArray = NDArray # type: ignore[assignment, misc] DaskCollection: Any = NDArray # type: ignore[no-redef] - from xarray.namedarray._typing import _Dim, _DType, duckarray + from xarray.namedarray._typing import DuckArray, _Dim, _DType, duckarray K = TypeVar("K") @@ -197,7 +197,7 @@ def either_dict_or_kwargs( def fake_target_chunksize( - data: Any, # Should be duckarray I think, but causes upstream issues + data: DuckArray[Any], target_chunksize: int, ) -> tuple[int, _DType]: """ @@ -211,11 +211,14 @@ def fake_target_chunksize( output_dtype: _DType = np.dtype(np.float64) # type: ignore[assignment] - cftime_nbytes_approx: int = sys.getsizeof(first_n_items(data, 1)) # type: ignore[no-untyped-call] + if data.dtype == object: + nbytes_approx: int = sys.getsizeof(first_n_items(data, 1)) # type: ignore[no-untyped-call] + else: + nbytes_approx = data[0].itemsize f64_nbytes = output_dtype.itemsize # Should be 8 bytes - target_chunksize = int(target_chunksize * (cftime_nbytes_approx / f64_nbytes)) + target_chunksize = int(target_chunksize * (f64_nbytes / nbytes_approx)) return target_chunksize, output_dtype diff --git a/xarray/tests/test_namedarray.py b/xarray/tests/test_namedarray.py index 537cd824767..ba2368cb5fd 100644 --- a/xarray/tests/test_namedarray.py +++ b/xarray/tests/test_namedarray.py @@ -6,6 +6,7 @@ from collections.abc import Mapping from typing import TYPE_CHECKING, Any, Generic, cast, overload +import cftime import numpy as np import pytest from packaging.version import Version @@ -18,6 +19,7 @@ _ShapeType_co, ) from xarray.namedarray.core import NamedArray, from_array +from xarray.namedarray.utils import fake_target_chunksize if TYPE_CHECKING: from types import ModuleType @@ -26,6 +28,7 @@ from xarray.namedarray._typing import ( Default, + DuckArray, _AttrsLike, _Dim, _DimsLike, @@ -609,3 +612,40 @@ def test_repr() -> None: # Basic comparison: assert r == " Size: 8B\narray([0], dtype=uint64)" + + +@pytest.mark.parametrize( + "input_array, expected_chunksize_faked", + [ + (np.arange(100).reshape(10, 10), 1024), + (np.arange(100).reshape(10, 10).astype(np.float32), 2048), + ( + np.array( + [ + cftime.Datetime360Day(2000, month, day, 0, 0, 0, 0) + for month in range(1, 11) + for day in range(1, 11) + ], + dtype=object, + ).reshape(10, 10), + 73, + ), + ], +) +def test_fake_target_chunksize( + input_array: DuckArray[Any], expected_chunksize_faked: int +) -> None: + """ + Check that `fake_target_chunksize` returns the expected chunksize and dtype. + - It pretends to dask we are chunking an array with an 8-byte dtype, ie. a float64. + As such, it wll *double* the amount of memory a 4-byte dtype (like float32) would try to use, + fooling it into actually using the correct amount of memory. For object dtypes, which are + generally larger, it will reduce the effective dask configuration chunksize, reducing the size of + the arrays per chunk such that we get the same amount of memory used. + """ + target_chunksize = 1024 + + faked_chunksize, dtype = fake_target_chunksize(input_array, target_chunksize) # type: ignore[var-annotated] + + assert faked_chunksize == expected_chunksize_faked + assert dtype == np.float64 From 80421ef5616fab2c46396c3dd9a51c7128dea844 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 25 Jul 2025 08:57:37 +0000 Subject: [PATCH 13/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_namedarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_namedarray.py b/xarray/tests/test_namedarray.py index ba2368cb5fd..35c278805f6 100644 --- a/xarray/tests/test_namedarray.py +++ b/xarray/tests/test_namedarray.py @@ -638,7 +638,7 @@ def test_fake_target_chunksize( """ Check that `fake_target_chunksize` returns the expected chunksize and dtype. - It pretends to dask we are chunking an array with an 8-byte dtype, ie. a float64. - As such, it wll *double* the amount of memory a 4-byte dtype (like float32) would try to use, + As such, it will *double* the amount of memory a 4-byte dtype (like float32) would try to use, fooling it into actually using the correct amount of memory. For object dtypes, which are generally larger, it will reduce the effective dask configuration chunksize, reducing the size of the arrays per chunk such that we get the same amount of memory used. From d1f7ad34b84eda3c78b44d5e8f04272d1bb2949d Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 25 Jul 2025 17:53:56 +0800 Subject: [PATCH 14/36] Refactor to move most of the changes into the DaskManager --- xarray/namedarray/daskmanager.py | 11 +++++++++++ xarray/namedarray/parallelcompat.py | 29 +++++++++++++++++++++++++++++ xarray/namedarray/utils.py | 8 +++++++- xarray/structure/chunks.py | 13 ++----------- 4 files changed, 49 insertions(+), 12 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 9b674213efa..f28bef7e930 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -11,8 +11,10 @@ from xarray.namedarray.utils import is_duck_dask_array, module_available if TYPE_CHECKING: + from xarray.core.variable import Variable from xarray.namedarray._typing import ( T_Chunks, + _DType, _DType_co, _NormalizedChunks, duckarray, @@ -317,3 +319,12 @@ def rechunk( # type: ignore[override] ) # type: ignore[no-untyped-call] return data.rechunk(chunks, **kwargs) + + def get_auto_chunk_size(self, var: Variable) -> tuple[int, _DType]: + from dask import config as dask_config + from dask.utils import parse_bytes + + from xarray.namedarray.utils import fake_target_chunksize + + target_chunksize = parse_bytes(dask_config.get("array.chunk-size")) + return fake_target_chunksize(var, target_chunksize=target_chunksize) diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py index c1fe5999ecb..e2fb5e5682f 100644 --- a/xarray/namedarray/parallelcompat.py +++ b/xarray/namedarray/parallelcompat.py @@ -746,3 +746,32 @@ def store( cubed.store """ raise NotImplementedError() + + def get_auto_chunk_size( + self, + var, + ) -> tuple[int, _DType]: + """ + Get the default chunk size for a variable. + + This is used to determine the chunk size when opening a dataset with + ``chunks="auto"`` or when rechunking an array with ``chunks="auto"``. + + Parameters + ---------- + var : xarray.Variable + The variable for which to get the chunk size. + target_chunksize : int, optional + The target chunk size in bytes. If not provided, a default value is used. + + Returns + ------- + chunk_size : int + The chunk size in bytes. + dtype : np.dtype + The data type of the variable. + """ + + raise NotImplementedError( + "get_auto_chunk_size must be implemented by the chunk manager." + ) diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 9fadc85d544..f40b77371d5 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -24,7 +24,9 @@ DaskArray = NDArray # type: ignore[assignment, misc] DaskCollection: Any = NDArray # type: ignore[no-redef] + from xarray.core.variable import Variable from xarray.namedarray._typing import DuckArray, _Dim, _DType, duckarray + from xarray.namedarray.parallelcompat import T_ChunkedArray K = TypeVar("K") @@ -197,13 +199,17 @@ def either_dict_or_kwargs( def fake_target_chunksize( - data: DuckArray[Any], + data: DuckArray[Any] | T_ChunkedArray | Variable, target_chunksize: int, ) -> tuple[int, _DType]: """ Naughty trick - let's get the ratio of our cftime_nbytes, and then compute the ratio of that size to a np.float64. Then we can just adjust our target_chunksize and use the default dask chunking algorithm to get a reasonable chunk size. + + ? I don't think T_chunkedArray or Variable should be necessary, but the calls + ? to this in daskmanager.py requires it to be that. I still need to wrap my head + ? around the typing here a bit more. """ import numpy as np diff --git a/xarray/structure/chunks.py b/xarray/structure/chunks.py index 31463224f95..150635f54d5 100644 --- a/xarray/structure/chunks.py +++ b/xarray/structure/chunks.py @@ -91,18 +91,9 @@ def _get_chunk(var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint): auto_chunks = all(_chunk == "auto" for _chunk in _chunks) if _contains_cftime_datetimes(var) and auto_chunks: - # If we have cftime datetimes, need to preprocess them - we can't pass - # an object dtype into DaskManager.normalize_chunks. - from dask import config as dask_config - from dask.utils import parse_bytes - - from xarray.namedarray.utils import fake_target_chunksize - - target_chunksize = parse_bytes(dask_config.get("array.chunk-size")) - limit, var_dtype = fake_target_chunksize(var, target_chunksize=target_chunksize) + limit, var_dtype = chunkmanager.get_auto_chunk_size(var) else: - var_dtype = var.dtype - limit = None + limit, var_dtype = None, var.dtype chunk_shape = chunkmanager.normalize_chunks( chunk_shape, From 4407185e7ac5a7bf97c867624c3e1b5108926002 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 25 Jul 2025 18:19:12 +0800 Subject: [PATCH 15/36] bare-min tests should pass now? --- xarray/tests/test_namedarray.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/xarray/tests/test_namedarray.py b/xarray/tests/test_namedarray.py index 35c278805f6..d35545d0603 100644 --- a/xarray/tests/test_namedarray.py +++ b/xarray/tests/test_namedarray.py @@ -6,7 +6,6 @@ from collections.abc import Mapping from typing import TYPE_CHECKING, Any, Generic, cast, overload -import cftime import numpy as np import pytest from packaging.version import Version @@ -40,6 +39,13 @@ duckarray, ) +try: + import cftime + + cftime_available = True +except ModuleNotFoundError: + cftime_available = False + class CustomArrayBase(Generic[_ShapeType_co, _DType_co]): def __init__(self, array: duckarray[Any, _DType_co]) -> None: @@ -620,15 +626,21 @@ def test_repr() -> None: (np.arange(100).reshape(10, 10), 1024), (np.arange(100).reshape(10, 10).astype(np.float32), 2048), ( - np.array( - [ - cftime.Datetime360Day(2000, month, day, 0, 0, 0, 0) - for month in range(1, 11) - for day in range(1, 11) - ], - dtype=object, - ).reshape(10, 10), - 73, + pytest.param( + np.array( + [ + cftime.Datetime360Day(2000, month, day, 0, 0, 0, 0) + for month in range(1, 11) + for day in range(1, 11) + ], + dtype=object, + ).reshape(10, 10), + 73, + marks=pytest.mark.xfail( + not cftime_available, + reason="cftime not available, cannot test object dtype with cftime dates", + ), + ) ), ], ) From d8f45b22f0221addf28b362c50187b069c0b5f2c Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Mon, 28 Jul 2025 08:43:22 +0800 Subject: [PATCH 16/36] Deepak's suggestions (think mypy is still going to be angry for now) --- xarray/namedarray/daskmanager.py | 7 ++--- xarray/namedarray/parallelcompat.py | 6 ++-- xarray/namedarray/utils.py | 1 + xarray/structure/chunks.py | 6 ++-- xarray/tests/test_backends.py | 11 ++++---- xarray/tests/test_namedarray.py | 44 ++++++++++++++++++----------- 6 files changed, 41 insertions(+), 34 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index f28bef7e930..36b4ddba1d2 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -320,11 +320,8 @@ def rechunk( # type: ignore[override] return data.rechunk(chunks, **kwargs) - def get_auto_chunk_size(self, var: Variable) -> tuple[int, _DType]: + def get_auto_chunk_size(self, var: Variable) -> int: from dask import config as dask_config from dask.utils import parse_bytes - from xarray.namedarray.utils import fake_target_chunksize - - target_chunksize = parse_bytes(dask_config.get("array.chunk-size")) - return fake_target_chunksize(var, target_chunksize=target_chunksize) + return parse_bytes(dask_config.get("array.chunk-size")) diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py index e2fb5e5682f..9ff5ea79b78 100644 --- a/xarray/namedarray/parallelcompat.py +++ b/xarray/namedarray/parallelcompat.py @@ -749,8 +749,8 @@ def store( def get_auto_chunk_size( self, - var, - ) -> tuple[int, _DType]: + var, #: xarray.Variable, + ) -> int: """ Get the default chunk size for a variable. @@ -768,8 +768,6 @@ def get_auto_chunk_size( ------- chunk_size : int The chunk size in bytes. - dtype : np.dtype - The data type of the variable. """ raise NotImplementedError( diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index f40b77371d5..ababebd8146 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -11,6 +11,7 @@ from packaging.version import Version from xarray.namedarray._typing import ErrorOptionsWithWarn, _DimsLike +from xarray.namedarray.utils import fake_target_chunksize if TYPE_CHECKING: from typing import TypeGuard diff --git a/xarray/structure/chunks.py b/xarray/structure/chunks.py index 150635f54d5..d1ffb2ab9f5 100644 --- a/xarray/structure/chunks.py +++ b/xarray/structure/chunks.py @@ -19,6 +19,7 @@ get_chunked_array_type, guess_chunkmanager, ) +from xarray.namedarray.utils import fake_target_chunksize if TYPE_CHECKING: from xarray.core.dataarray import DataArray @@ -88,10 +89,11 @@ def _get_chunk(var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint): # at this point, so check for # this before we manually construct our chunk # spec- if we've set chunks to auto _chunks = list(chunks.values()) if is_dict_like(chunks) else chunks - auto_chunks = all(_chunk == "auto" for _chunk in _chunks) + auto_chunks = any(_chunk == "auto" for _chunk in _chunks) if _contains_cftime_datetimes(var) and auto_chunks: - limit, var_dtype = chunkmanager.get_auto_chunk_size(var) + limit = chunkmanager.get_auto_chunk_size(var) + limit, var_dtype = fake_target_chunksize(var, limit) else: limit, var_dtype = None, var.dtype diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index fafafd190f3..89b10c2d177 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5427,6 +5427,7 @@ def test_open_multi_dataset(self) -> None: ) as actual: assert_identical(expected, actual) + @requires_cftime def test_open_dataset_cftime_autochunk(self) -> None: """Create a dataset with cftime datetime objects and ensure that auto-chunking works correctly.""" @@ -5449,12 +5450,10 @@ def test_open_dataset_cftime_autochunk(self) -> None: }, {"time": [cftime.Datetime360Day(2005, 12, 1, 12, 0, 0, 0)]}, ) - with create_tmp_file() as tmp: - original.to_netcdf(tmp) - with open_dataset(tmp, chunks="auto") as actual: - assert isinstance(actual.time_bnds.variable.data, da.Array) - assert _contains_cftime_datetimes(actual.time) - assert_identical(original, actual) + with self.roundtrip(original, open_kwargs={"chunks": "auto"}) as actual: + assert isinstance(actual.time_bnds.variable.data, da.Array) + assert _contains_cftime_datetimes(actual.time) + assert_identical(original, actual) # Flaky test. Very open to contributions on fixing this @pytest.mark.flaky diff --git a/xarray/tests/test_namedarray.py b/xarray/tests/test_namedarray.py index d35545d0603..9c8df636529 100644 --- a/xarray/tests/test_namedarray.py +++ b/xarray/tests/test_namedarray.py @@ -19,6 +19,7 @@ ) from xarray.namedarray.core import NamedArray, from_array from xarray.namedarray.utils import fake_target_chunksize +from xarray.tests import requires_cftime if TYPE_CHECKING: from types import ModuleType @@ -625,23 +626,6 @@ def test_repr() -> None: [ (np.arange(100).reshape(10, 10), 1024), (np.arange(100).reshape(10, 10).astype(np.float32), 2048), - ( - pytest.param( - np.array( - [ - cftime.Datetime360Day(2000, month, day, 0, 0, 0, 0) - for month in range(1, 11) - for day in range(1, 11) - ], - dtype=object, - ).reshape(10, 10), - 73, - marks=pytest.mark.xfail( - not cftime_available, - reason="cftime not available, cannot test object dtype with cftime dates", - ), - ) - ), ], ) def test_fake_target_chunksize( @@ -661,3 +645,29 @@ def test_fake_target_chunksize( assert faked_chunksize == expected_chunksize_faked assert dtype == np.float64 + + +@requires_cftime +def test_fake_target_chunksize_cftime() -> None: + """ + Check that `fake_target_chunksize` returns the expected chunksize and dtype. + - It pretends to dask we are chunking an array with an 8-byte dtype, ie. a float64. + - This is the same as the above test, but specifically for a CFTime array case - split for testing reasons + """ + import cftime + + target_chunksize = 1024 + + input_array = np.array( + [ + cftime.Datetime360Day(2000, month, day, 0, 0, 0, 0) + for month in range(1, 11) + for day in range(1, 11) + ], + dtype=object, + ).reshape(10, 10) + + faked_chunksize, dtype = fake_target_chunksize(input_array, target_chunksize) # type: ignore[var-annotated] + + assert faked_chunksize == 73 + assert dtype == np.float64 From 20226c13ecbb65dfe4da99f2077b43a153cfdc14 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Jul 2025 00:44:21 +0000 Subject: [PATCH 17/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/namedarray/daskmanager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 36b4ddba1d2..eb47601800b 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -14,7 +14,6 @@ from xarray.core.variable import Variable from xarray.namedarray._typing import ( T_Chunks, - _DType, _DType_co, _NormalizedChunks, duckarray, From 8485df5fbfc1115ce9ea77da10e51174f1ad5a11 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Mon, 28 Jul 2025 08:53:58 +0800 Subject: [PATCH 18/36] Fix errant line --- xarray/namedarray/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index ababebd8146..f40b77371d5 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -11,7 +11,6 @@ from packaging.version import Version from xarray.namedarray._typing import ErrorOptionsWithWarn, _DimsLike -from xarray.namedarray.utils import fake_target_chunksize if TYPE_CHECKING: from typing import TypeGuard From 2c2787756d052662821447520976ed180b21155a Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Mon, 28 Jul 2025 09:56:46 +0800 Subject: [PATCH 19/36] Clean up `DaskManager.rechunk` a bit - maybe possible to remove more added code? --- xarray/namedarray/daskmanager.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index eb47601800b..09fca8d5191 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -299,15 +299,14 @@ def rechunk( # type: ignore[override] """ if _contains_cftime_datetimes(data): - from dask import config as dask_config from dask.array.core import normalize_chunks - from dask.utils import parse_bytes from xarray.namedarray.utils import fake_target_chunksize - target_chunksize = parse_bytes(dask_config.get("array.chunk-size")) + limit = self.get_auto_chunk_size(data) + limit, var_dtype = fake_target_chunksize( # type: ignore[var-annotated] - data, target_chunksize=target_chunksize + data, target_chunksize=limit ) chunks = normalize_chunks( From 098326196375c5e13faba43a98ed53e7bd69b44e Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Mon, 28 Jul 2025 10:00:53 +0800 Subject: [PATCH 20/36] Remove unused import --- xarray/tests/test_namedarray.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/xarray/tests/test_namedarray.py b/xarray/tests/test_namedarray.py index 9c8df636529..e3fe70a5846 100644 --- a/xarray/tests/test_namedarray.py +++ b/xarray/tests/test_namedarray.py @@ -40,13 +40,6 @@ duckarray, ) -try: - import cftime - - cftime_available = True -except ModuleNotFoundError: - cftime_available = False - class CustomArrayBase(Generic[_ShapeType_co, _DType_co]): def __init__(self, array: duckarray[Any, _DType_co]) -> None: From 6c93bc489ae83682ddfc7cb6ad8ac4de945147b1 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 8 Aug 2025 12:07:04 +0800 Subject: [PATCH 21/36] Fix a couple of type errors --- xarray/namedarray/daskmanager.py | 6 +++--- xarray/namedarray/utils.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 09fca8d5191..1a67327652a 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -18,7 +18,7 @@ _NormalizedChunks, duckarray, ) - from xarray.namedarray.parallelcompat import _Chunks + from xarray.namedarray.parallelcompat import ChunkedArrayMixinProtocol, _Chunks try: from dask.array import Array as DaskArray @@ -268,9 +268,9 @@ def shuffle( raise NotImplementedError("Only chunks='auto' is supported at present.") return dask.array.shuffle(x, indexer, axis, chunks="auto") - def rechunk( # type: ignore[override] + def rechunk( self, - data: T_ChunkedArray, + data: ChunkedArrayMixinProtocol, chunks: _NormalizedChunks | tuple[int, ...] | _Chunks, **kwargs: Any, ) -> Any: diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index f40b77371d5..9efb6f8388c 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -24,7 +24,6 @@ DaskArray = NDArray # type: ignore[assignment, misc] DaskCollection: Any = NDArray # type: ignore[no-redef] - from xarray.core.variable import Variable from xarray.namedarray._typing import DuckArray, _Dim, _DType, duckarray from xarray.namedarray.parallelcompat import T_ChunkedArray @@ -199,7 +198,7 @@ def either_dict_or_kwargs( def fake_target_chunksize( - data: DuckArray[Any] | T_ChunkedArray | Variable, + data: DuckArray[Any] | T_ChunkedArray, target_chunksize: int, ) -> tuple[int, _DType]: """ @@ -220,7 +219,7 @@ def fake_target_chunksize( if data.dtype == object: nbytes_approx: int = sys.getsizeof(first_n_items(data, 1)) # type: ignore[no-untyped-call] else: - nbytes_approx = data[0].itemsize + nbytes_approx = data.dtype.itemsize f64_nbytes = output_dtype.itemsize # Should be 8 bytes From 74bc0ea449f4be037ee70d4915b710045c33938a Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 8 Aug 2025 14:19:52 +0800 Subject: [PATCH 22/36] Mypy & tests passing locally --- xarray/namedarray/daskmanager.py | 12 +++++------- xarray/namedarray/parallelcompat.py | 3 --- xarray/namedarray/utils.py | 14 ++++++++++---- xarray/structure/chunks.py | 9 ++++----- xarray/tests/test_namedarray.py | 4 ++-- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 1a67327652a..10bc81e39c0 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -11,14 +11,14 @@ from xarray.namedarray.utils import is_duck_dask_array, module_available if TYPE_CHECKING: - from xarray.core.variable import Variable from xarray.namedarray._typing import ( T_Chunks, + _Chunks, _DType_co, _NormalizedChunks, duckarray, ) - from xarray.namedarray.parallelcompat import ChunkedArrayMixinProtocol, _Chunks + from xarray.namedarray.parallelcompat import ChunkedArrayMixinProtocol try: from dask.array import Array as DaskArray @@ -303,11 +303,9 @@ def rechunk( from xarray.namedarray.utils import fake_target_chunksize - limit = self.get_auto_chunk_size(data) + limit = self.get_auto_chunk_size() - limit, var_dtype = fake_target_chunksize( # type: ignore[var-annotated] - data, target_chunksize=limit - ) + limit, var_dtype = fake_target_chunksize(data, target_chunksize=limit) chunks = normalize_chunks( chunks, @@ -318,7 +316,7 @@ def rechunk( return data.rechunk(chunks, **kwargs) - def get_auto_chunk_size(self, var: Variable) -> int: + def get_auto_chunk_size(self) -> int: from dask import config as dask_config from dask.utils import parse_bytes diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py index 9ff5ea79b78..fd6669b8f6b 100644 --- a/xarray/namedarray/parallelcompat.py +++ b/xarray/namedarray/parallelcompat.py @@ -749,7 +749,6 @@ def store( def get_auto_chunk_size( self, - var, #: xarray.Variable, ) -> int: """ Get the default chunk size for a variable. @@ -759,8 +758,6 @@ def get_auto_chunk_size( Parameters ---------- - var : xarray.Variable - The variable for which to get the chunk size. target_chunksize : int, optional The target chunk size in bytes. If not provided, a default value is used. diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 9efb6f8388c..41a80de0d17 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -24,7 +24,8 @@ DaskArray = NDArray # type: ignore[assignment, misc] DaskCollection: Any = NDArray # type: ignore[no-redef] - from xarray.namedarray._typing import DuckArray, _Dim, _DType, duckarray + from xarray.core.variable import Variable + from xarray.namedarray._typing import _Dim, duckarray from xarray.namedarray.parallelcompat import T_ChunkedArray @@ -198,9 +199,10 @@ def either_dict_or_kwargs( def fake_target_chunksize( - data: DuckArray[Any] | T_ChunkedArray, + data: Variable | T_ChunkedArray, target_chunksize: int, -) -> tuple[int, _DType]: + no_op: bool = False, +) -> tuple[int, np.dtype[Any]]: """ Naughty trick - let's get the ratio of our cftime_nbytes, and then compute the ratio of that size to a np.float64. Then we can just adjust our target_chunksize @@ -210,11 +212,15 @@ def fake_target_chunksize( ? to this in daskmanager.py requires it to be that. I still need to wrap my head ? around the typing here a bit more. """ + + if no_op: + return target_chunksize, data.dtype + import numpy as np from xarray.core.formatting import first_n_items - output_dtype: _DType = np.dtype(np.float64) # type: ignore[assignment] + output_dtype = np.dtype(np.float64) if data.dtype == object: nbytes_approx: int = sys.getsizeof(first_n_items(data, 1)) # type: ignore[no-untyped-call] diff --git a/xarray/structure/chunks.py b/xarray/structure/chunks.py index d1ffb2ab9f5..5c2ab2c44f7 100644 --- a/xarray/structure/chunks.py +++ b/xarray/structure/chunks.py @@ -25,6 +25,7 @@ from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset from xarray.core.types import T_ChunkDim + from xarray.core.variable import IndexVariable, Variable MissingCoreDimOptions = Literal["raise", "copy", "drop"] @@ -91,11 +92,9 @@ def _get_chunk(var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint): _chunks = list(chunks.values()) if is_dict_like(chunks) else chunks auto_chunks = any(_chunk == "auto" for _chunk in _chunks) - if _contains_cftime_datetimes(var) and auto_chunks: - limit = chunkmanager.get_auto_chunk_size(var) - limit, var_dtype = fake_target_chunksize(var, limit) - else: - limit, var_dtype = None, var.dtype + limit = chunkmanager.get_auto_chunk_size() + no_op = not (_contains_cftime_datetimes(var) and auto_chunks) + limit, var_dtype = fake_target_chunksize(var, limit, no_op) chunk_shape = chunkmanager.normalize_chunks( chunk_shape, diff --git a/xarray/tests/test_namedarray.py b/xarray/tests/test_namedarray.py index e3fe70a5846..a10555c8145 100644 --- a/xarray/tests/test_namedarray.py +++ b/xarray/tests/test_namedarray.py @@ -634,7 +634,7 @@ def test_fake_target_chunksize( """ target_chunksize = 1024 - faked_chunksize, dtype = fake_target_chunksize(input_array, target_chunksize) # type: ignore[var-annotated] + faked_chunksize, dtype = fake_target_chunksize(input_array, target_chunksize) # type: ignore[arg-type] assert faked_chunksize == expected_chunksize_faked assert dtype == np.float64 @@ -660,7 +660,7 @@ def test_fake_target_chunksize_cftime() -> None: dtype=object, ).reshape(10, 10) - faked_chunksize, dtype = fake_target_chunksize(input_array, target_chunksize) # type: ignore[var-annotated] + faked_chunksize, dtype = fake_target_chunksize(input_array, target_chunksize) # type: ignore[arg-type] assert faked_chunksize == 73 assert dtype == np.float64 From b5933ed2d6e0b070b97b544d6c80e62c152a2ed4 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Mon, 25 Aug 2025 08:04:45 +0800 Subject: [PATCH 23/36] Deepak's comments --- xarray/namedarray/utils.py | 13 +------------ xarray/structure/chunks.py | 12 ++---------- xarray/tests/test_backends.py | 3 +-- xarray/tests/test_dask.py | 26 ++++++++++++++++++++++++-- 4 files changed, 28 insertions(+), 26 deletions(-) diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 41a80de0d17..903b568566a 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -201,23 +201,12 @@ def either_dict_or_kwargs( def fake_target_chunksize( data: Variable | T_ChunkedArray, target_chunksize: int, - no_op: bool = False, ) -> tuple[int, np.dtype[Any]]: """ Naughty trick - let's get the ratio of our cftime_nbytes, and then compute the ratio of that size to a np.float64. Then we can just adjust our target_chunksize and use the default dask chunking algorithm to get a reasonable chunk size. - - ? I don't think T_chunkedArray or Variable should be necessary, but the calls - ? to this in daskmanager.py requires it to be that. I still need to wrap my head - ? around the typing here a bit more. """ - - if no_op: - return target_chunksize, data.dtype - - import numpy as np - from xarray.core.formatting import first_n_items output_dtype = np.dtype(np.float64) @@ -227,7 +216,7 @@ def fake_target_chunksize( else: nbytes_approx = data.dtype.itemsize - f64_nbytes = output_dtype.itemsize # Should be 8 bytes + f64_nbytes = output_dtype.itemsize target_chunksize = int(target_chunksize * (f64_nbytes / nbytes_approx)) diff --git a/xarray/structure/chunks.py b/xarray/structure/chunks.py index 5c2ab2c44f7..dd8fd87de39 100644 --- a/xarray/structure/chunks.py +++ b/xarray/structure/chunks.py @@ -11,8 +11,7 @@ from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union, overload from xarray.core import utils -from xarray.core.common import _contains_cftime_datetimes -from xarray.core.utils import emit_user_level_warning, is_dict_like +from xarray.core.utils import emit_user_level_warning from xarray.core.variable import IndexVariable, Variable from xarray.namedarray.parallelcompat import ( ChunkManagerEntrypoint, @@ -86,15 +85,8 @@ def _get_chunk(var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint): for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape, strict=True) ) - # Chunks can be either dict-like or tuple-like (according to type annotations) - # at this point, so check for # this before we manually construct our chunk - # spec- if we've set chunks to auto - _chunks = list(chunks.values()) if is_dict_like(chunks) else chunks - auto_chunks = any(_chunk == "auto" for _chunk in _chunks) - limit = chunkmanager.get_auto_chunk_size() - no_op = not (_contains_cftime_datetimes(var) and auto_chunks) - limit, var_dtype = fake_target_chunksize(var, limit, no_op) + limit, var_dtype = fake_target_chunksize(var, limit) chunk_shape = chunkmanager.normalize_chunks( chunk_shape, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5020b9114f4..fa46d7e15d6 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -60,6 +60,7 @@ from xarray.coding.variables import SerializationWarning from xarray.conventions import encode_dataset_coordinates from xarray.core import indexing +from xarray.core.common import _contains_cftime_datetimes from xarray.core.indexes import PandasIndex from xarray.core.options import set_options from xarray.core.types import PDDatetimeUnitOptions @@ -6067,8 +6068,6 @@ def test_open_dataset_cftime_autochunk(self) -> None: ensure that auto-chunking works correctly.""" import cftime - from xarray.core.common import _contains_cftime_datetimes - original = xr.Dataset( { "foo": ("time", [0.0]), diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index c4bd13fafca..e209250ed48 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1176,8 +1176,30 @@ def test_auto_chunk_da(obj): assert actual.chunks == expected.chunks -@pytest.mark.parametrize("obj", [make_da_cftime()]) -def test_auto_chunk_da_cftime(obj): +def test_auto_chunk_da_cftime(): + def make_da_cftime(): + yrs = np.arange(2000, 2120) + cftime_dates = xr.date_range( + start=f"{yrs[0]}-01-01", + end=f"{yrs[-1]}-12-31", + freq="1YE", + use_cftime=True, + ) + yr_array = np.tile(cftime_dates.values, (10, 1)) + da = xr.DataArray( + yr_array, + dims=["x", "t"], + coords={"x": np.arange(10), "t": cftime_dates}, + name="a", + ).chunk({"x": 4, "t": 5}) + da.x.attrs["long_name"] = "x" + da.attrs["test"] = "test" + da.coords["c2"] = 0.5 + da.coords["ndcoord"] = da.x * 2 + + return da + + obj = make_da_cftime() actual = obj.chunk("auto").data expected = obj.data.rechunk({0: 10, 1: 120}) np.testing.assert_array_equal(actual, expected) From 9443815a29332b94e8e8d207deabf347c6070710 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 18 Sep 2025 08:45:07 -0600 Subject: [PATCH 24/36] Edits --- xarray/namedarray/daskmanager.py | 51 ---------------------------- xarray/tests/test_dask.py | 58 ++++++-------------------------- 2 files changed, 10 insertions(+), 99 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 10bc81e39c0..eb01a150c18 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -5,7 +5,6 @@ import numpy as np -from xarray.core.common import _contains_cftime_datetimes from xarray.core.indexing import ImplicitToExplicitIndexingAdapter from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray from xarray.namedarray.utils import is_duck_dask_array, module_available @@ -13,12 +12,10 @@ if TYPE_CHECKING: from xarray.namedarray._typing import ( T_Chunks, - _Chunks, _DType_co, _NormalizedChunks, duckarray, ) - from xarray.namedarray.parallelcompat import ChunkedArrayMixinProtocol try: from dask.array import Array as DaskArray @@ -268,54 +265,6 @@ def shuffle( raise NotImplementedError("Only chunks='auto' is supported at present.") return dask.array.shuffle(x, indexer, axis, chunks="auto") - def rechunk( - self, - data: ChunkedArrayMixinProtocol, - chunks: _NormalizedChunks | tuple[int, ...] | _Chunks, - **kwargs: Any, - ) -> Any: - """ - Changes the chunking pattern of the given array. - - Called when the .chunk method is called on an xarray object that is already chunked. - - Parameters - ---------- - data : dask array - Array to be rechunked. - chunks : int, tuple, dict or str, optional - The new block dimensions to create. -1 indicates the full size of the - corresponding dimension. Default is "auto" which automatically - determines chunk sizes. - - Returns - ------- - chunked array - - See Also - -------- - dask.array.Array.rechunk - cubed.Array.rechunk - """ - - if _contains_cftime_datetimes(data): - from dask.array.core import normalize_chunks - - from xarray.namedarray.utils import fake_target_chunksize - - limit = self.get_auto_chunk_size() - - limit, var_dtype = fake_target_chunksize(data, target_chunksize=limit) - - chunks = normalize_chunks( - chunks, - shape=data.shape, # type: ignore[attr-defined] - dtype=var_dtype, - limit=limit, - ) # type: ignore[no-untyped-call] - - return data.rechunk(chunks, **kwargs) - def get_auto_chunk_size(self) -> int: from dask import config as dask_config from dask.utils import parse_bytes diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index e209250ed48..13c16b14478 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1072,29 +1072,6 @@ def make_da(): return da -def make_da_cftime(): - yrs = np.arange(2000, 2120) - cftime_dates = xr.date_range( - start=f"{yrs[0]}-01-01", - end=f"{yrs[-1]}-12-31", - freq="1YE", - use_cftime=True, - ) - yr_array = np.tile(cftime_dates.values, (10, 1)) - da = xr.DataArray( - yr_array, - dims=["x", "t"], - coords={"x": np.arange(10), "t": cftime_dates}, - name="a", - ).chunk({"x": 4, "t": 5}) - da.x.attrs["long_name"] = "x" - da.attrs["test"] = "test" - da.coords["c2"] = 0.5 - da.coords["ndcoord"] = da.x * 2 - - return da - - def make_ds(): map_ds = xr.Dataset() map_ds["a"] = make_da() @@ -1177,31 +1154,16 @@ def test_auto_chunk_da(obj): def test_auto_chunk_da_cftime(): - def make_da_cftime(): - yrs = np.arange(2000, 2120) - cftime_dates = xr.date_range( - start=f"{yrs[0]}-01-01", - end=f"{yrs[-1]}-12-31", - freq="1YE", - use_cftime=True, - ) - yr_array = np.tile(cftime_dates.values, (10, 1)) - da = xr.DataArray( - yr_array, - dims=["x", "t"], - coords={"x": np.arange(10), "t": cftime_dates}, - name="a", - ).chunk({"x": 4, "t": 5}) - da.x.attrs["long_name"] = "x" - da.attrs["test"] = "test" - da.coords["c2"] = 0.5 - da.coords["ndcoord"] = da.x * 2 - - return da - - obj = make_da_cftime() - actual = obj.chunk("auto").data - expected = obj.data.rechunk({0: 10, 1: 120}) + yrs = np.arange(2000, 2120) + cftime_dates = xr.date_range( + start=f"{yrs[0]}-01-01", end=f"{yrs[-1]}-12-31", freq="1YE", use_cftime=True + ) + yr_array = np.tile(cftime_dates.values, (10, 1)) + da = xr.DataArray( + yr_array, dims=["x", "t"], coords={"x": np.arange(10), "t": cftime_dates} + ).chunk({"x": 4, "t": 5}) + actual = da.chunk("auto").data + expected = da.data.rechunk({0: 10, 1: 120}) np.testing.assert_array_equal(actual, expected) assert actual.chunks == expected.chunks From 85ebafdc38d21f52fce373be1604fea0a9c29de5 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Tue, 23 Sep 2025 08:17:12 +0800 Subject: [PATCH 25/36] Start refactoring `get_chunk` into named_array - xfail marker on the auto_chunk_da_cftime for now until we get everything else working --- xarray/backends/api.py | 3 +- xarray/namedarray/utils.py | 67 +++++++++++++++++++++++++++++++++++++- xarray/structure/chunks.py | 62 ++--------------------------------- xarray/tests/test_dask.py | 3 ++ 4 files changed, 73 insertions(+), 62 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 37b5d819ae5..cfe1dcd0c13 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -35,7 +35,8 @@ from xarray.core.utils import emit_user_level_warning, is_remote_uri from xarray.namedarray.daskmanager import DaskManager from xarray.namedarray.parallelcompat import guess_chunkmanager -from xarray.structure.chunks import _get_chunk, _maybe_chunk +from xarray.namedarray.utils import _get_chunk +from xarray.structure.chunks import _maybe_chunk from xarray.structure.combine import ( _infer_concat_order_from_positions, _nested_combine, diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 903b568566a..717f29b7ffa 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -5,6 +5,7 @@ import warnings from collections.abc import Hashable, Iterable, Iterator, Mapping from functools import lru_cache +from numbers import Number from typing import TYPE_CHECKING, Any, TypeVar, cast import numpy as np @@ -24,9 +25,10 @@ DaskArray = NDArray # type: ignore[assignment, misc] DaskCollection: Any = NDArray # type: ignore[no-redef] + from xarray.core.types import T_ChunkDim from xarray.core.variable import Variable from xarray.namedarray._typing import _Dim, duckarray - from xarray.namedarray.parallelcompat import T_ChunkedArray + from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray K = TypeVar("K") @@ -198,6 +200,69 @@ def either_dict_or_kwargs( return pos_kwargs +def _get_chunk( + var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint +) -> Mapping[Any, T_ChunkDim]: + """ + Return map from each dim to chunk sizes, accounting for backend's preferred chunks. + """ + import itertools + + from xarray.core.utils import emit_user_level_warning + from xarray.core.variable import IndexVariable + from xarray.structure.chunks import _get_breaks_cached + + if isinstance(var, IndexVariable): + return {} + dims = var.dims + shape = var.shape + + # Determine the explicit requested chunks. + preferred_chunks = var.encoding.get("preferred_chunks", {}) + preferred_chunk_shape = tuple( + itertools.starmap(preferred_chunks.get, zip(dims, shape, strict=True)) + ) + if isinstance(chunks, Number) or (chunks == "auto"): + chunks = dict.fromkeys(dims, chunks) + chunk_shape = tuple( + chunks.get(dim, None) or preferred_chunk_sizes + for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape, strict=True) + ) + + limit = chunkmanager.get_auto_chunk_size() + limit, var_dtype = fake_target_chunksize(var, limit) + + chunk_shape = chunkmanager.normalize_chunks( + chunk_shape, + shape=shape, + dtype=var_dtype, + limit=limit, + previous_chunks=preferred_chunk_shape, + ) + + # Warn where requested chunks break preferred chunks, provided that the variable + # contains data. + if var.size: + for dim, size, chunk_sizes in zip(dims, shape, chunk_shape, strict=True): + try: + preferred_chunk_sizes = preferred_chunks[dim] + except KeyError: + continue + disagreement = _get_breaks_cached( + size=size, + chunk_sizes=chunk_sizes, + preferred_chunk_sizes=preferred_chunk_sizes, + ) + if disagreement: + emit_user_level_warning( + "The specified chunks separate the stored chunks along " + f'dimension "{dim}" starting at index {disagreement}. This could ' + "degrade performance. Instead, consider rechunking after loading.", + ) + + return dict(zip(dims, chunk_shape, strict=True)) + + def fake_target_chunksize( data: Variable | T_ChunkedArray, target_chunksize: int, diff --git a/xarray/structure/chunks.py b/xarray/structure/chunks.py index dd8fd87de39..5e45b3da9ad 100644 --- a/xarray/structure/chunks.py +++ b/xarray/structure/chunks.py @@ -7,24 +7,21 @@ import itertools from collections.abc import Hashable, Mapping from functools import lru_cache -from numbers import Number from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union, overload from xarray.core import utils -from xarray.core.utils import emit_user_level_warning -from xarray.core.variable import IndexVariable, Variable +from xarray.core.variable import Variable from xarray.namedarray.parallelcompat import ( ChunkManagerEntrypoint, get_chunked_array_type, guess_chunkmanager, ) -from xarray.namedarray.utils import fake_target_chunksize if TYPE_CHECKING: from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset from xarray.core.types import T_ChunkDim - from xarray.core.variable import IndexVariable, Variable + from xarray.core.variable import Variable MissingCoreDimOptions = Literal["raise", "copy", "drop"] @@ -64,61 +61,6 @@ def _get_breaks_cached( return None -def _get_chunk(var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint): - """ - Return map from each dim to chunk sizes, accounting for backend's preferred chunks. - """ - if isinstance(var, IndexVariable): - return {} - dims = var.dims - shape = var.shape - - # Determine the explicit requested chunks. - preferred_chunks = var.encoding.get("preferred_chunks", {}) - preferred_chunk_shape = tuple( - itertools.starmap(preferred_chunks.get, zip(dims, shape, strict=True)) - ) - if isinstance(chunks, Number) or (chunks == "auto"): - chunks = dict.fromkeys(dims, chunks) - chunk_shape = tuple( - chunks.get(dim, None) or preferred_chunk_sizes - for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape, strict=True) - ) - - limit = chunkmanager.get_auto_chunk_size() - limit, var_dtype = fake_target_chunksize(var, limit) - - chunk_shape = chunkmanager.normalize_chunks( - chunk_shape, - shape=shape, - dtype=var_dtype, - limit=limit, - previous_chunks=preferred_chunk_shape, - ) - - # Warn where requested chunks break preferred chunks, provided that the variable - # contains data. - if var.size: - for dim, size, chunk_sizes in zip(dims, shape, chunk_shape, strict=True): - try: - preferred_chunk_sizes = preferred_chunks[dim] - except KeyError: - continue - disagreement = _get_breaks_cached( - size=size, - chunk_sizes=chunk_sizes, - preferred_chunk_sizes=preferred_chunk_sizes, - ) - if disagreement: - emit_user_level_warning( - "The specified chunks separate the stored chunks along " - f'dimension "{dim}" starting at index {disagreement}. This could ' - "degrade performance. Instead, consider rechunking after loading.", - ) - - return dict(zip(dims, chunk_shape, strict=True)) - - def _maybe_chunk( name: Hashable, var: Variable, diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index ccbfc06eeb0..ec654161e84 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1161,6 +1161,9 @@ def test_auto_chunk_da(obj): assert actual.chunks == expected.chunks +@pytest.mark.xfail( + reason="Mid refactoring - https://github.com/pydata/xarray/pull/10527#issuecomment-3308246670" +) def test_auto_chunk_da_cftime(): yrs = np.arange(2000, 2120) cftime_dates = xr.date_range( From e2627c6021ac3993a935a589d1d10d7b277a9a0e Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Tue, 23 Sep 2025 08:37:00 +0800 Subject: [PATCH 26/36] Optimise `fake_target_chunksize` (@dcherian suggestion) --- xarray/namedarray/utils.py | 31 ++++++++++++++++++++----------- xarray/tests/test_namedarray.py | 13 ++++++++----- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 717f29b7ffa..10cae371878 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -200,7 +200,7 @@ def either_dict_or_kwargs( return pos_kwargs -def _get_chunk( +def _get_chunk( # type: ignore[no-untyped-def] var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint ) -> Mapping[Any, T_ChunkDim]: """ @@ -265,27 +265,36 @@ def _get_chunk( def fake_target_chunksize( data: Variable | T_ChunkedArray, - target_chunksize: int, + limit: int, ) -> tuple[int, np.dtype[Any]]: """ - Naughty trick - let's get the ratio of our cftime_nbytes, and then compute - the ratio of that size to a np.float64. Then we can just adjust our target_chunksize - and use the default dask chunking algorithm to get a reasonable chunk size. + The `normalize_chunks` algorithm takes a size `limit` in bytes, but will not + work for object dtypes. So we rescale the `limit` to an appropriate one based + on `float64` dtype, and pass that to `normalize_chunks`. + + Arguments + --------- + data : Variable or ChunkedArray + The data for which we want to determine chunk sizes. + limit : int + The target chunk size in bytes. Passed to the chunk manager's `normalize_chunks` method. """ + + # Short circuit for non-object dtypes + if data.dtype != object: + return limit, data.dtype + from xarray.core.formatting import first_n_items output_dtype = np.dtype(np.float64) - if data.dtype == object: - nbytes_approx: int = sys.getsizeof(first_n_items(data, 1)) # type: ignore[no-untyped-call] - else: - nbytes_approx = data.dtype.itemsize + nbytes_approx: int = sys.getsizeof(first_n_items(data, 1)) # type: ignore[no-untyped-call] f64_nbytes = output_dtype.itemsize - target_chunksize = int(target_chunksize * (f64_nbytes / nbytes_approx)) + limit = int(limit * (f64_nbytes / nbytes_approx)) - return target_chunksize, output_dtype + return limit, output_dtype class ReprObject: diff --git a/xarray/tests/test_namedarray.py b/xarray/tests/test_namedarray.py index b07211bfe69..b752f8de83f 100644 --- a/xarray/tests/test_namedarray.py +++ b/xarray/tests/test_namedarray.py @@ -615,14 +615,17 @@ def test_repr() -> None: @pytest.mark.parametrize( - "input_array, expected_chunksize_faked", + "input_array, expected_chunksize_faked, expected_dtype", [ - (np.arange(100).reshape(10, 10), 1024), - (np.arange(100).reshape(10, 10).astype(np.float32), 2048), + (np.arange(100).reshape(10, 10), 1024, np.int64), + (np.arange(100).reshape(10, 10).astype(np.float32), 1024, np.float32), + (np.arange(100).reshape(10, 10).astype(object), 73, np.float64), ], ) def test_fake_target_chunksize( - input_array: DuckArray[Any], expected_chunksize_faked: int + input_array: DuckArray[Any], + expected_chunksize_faked: int, + expected_dtype: DTypeLike, ) -> None: """ Check that `fake_target_chunksize` returns the expected chunksize and dtype. @@ -637,7 +640,7 @@ def test_fake_target_chunksize( faked_chunksize, dtype = fake_target_chunksize(input_array, target_chunksize) # type: ignore[arg-type] assert faked_chunksize == expected_chunksize_faked - assert dtype == np.float64 + assert dtype == expected_dtype @requires_cftime From 0bca828e00c569e6d16ebb2d57bd927a07b8121a Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Tue, 23 Sep 2025 11:11:32 +0800 Subject: [PATCH 27/36] WIP --- xarray/core/dataarray.py | 2 ++ xarray/namedarray/daskmanager.py | 12 ++++++++++++ xarray/tests/test_dask.py | 3 --- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d6f5ba149ca..ba1194b90c6 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1489,6 +1489,8 @@ def chunk( else: chunk_mapping = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") + breakpoint() + ds = self._to_temp_dataset().chunk( chunk_mapping, name_prefix=name_prefix, diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index eb01a150c18..f2ed8bbbaf1 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -270,3 +270,15 @@ def get_auto_chunk_size(self) -> int: from dask.utils import parse_bytes return parse_bytes(dask_config.get("array.chunk-size")) + + def rechunk( + self, + data: DaskArray, + chunks: T_Chunks | _NormalizedChunks, + **kwargs: Any, + ) -> DaskArray: + from xarray.namedarray.utils import _get_chunk + + if data.dtype == object: + chunks = _get_chunk(data, chunks, self) + return data.rechunk(chunks, **kwargs) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index ec654161e84..ccbfc06eeb0 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1161,9 +1161,6 @@ def test_auto_chunk_da(obj): assert actual.chunks == expected.chunks -@pytest.mark.xfail( - reason="Mid refactoring - https://github.com/pydata/xarray/pull/10527#issuecomment-3308246670" -) def test_auto_chunk_da_cftime(): yrs = np.arange(2000, 2120) cftime_dates = xr.date_range( From a930a65c04487cd6865e25fce69b3d9c99d4d785 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Tue, 23 Sep 2025 14:55:05 +0800 Subject: [PATCH 28/36] Everything seems to be working - some type issues though I think --- xarray/backends/api.py | 7 ++++++- xarray/core/dataarray.py | 2 -- xarray/namedarray/daskmanager.py | 4 ++-- xarray/namedarray/utils.py | 16 ++++++++++++---- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index cfe1dcd0c13..70abf65e166 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -245,7 +245,12 @@ def _chunk_ds( variables = {} for name, var in backend_ds.variables.items(): - var_chunks = _get_chunk(var, chunks, chunkmanager) + var_chunks = _get_chunk( + var, + chunks, + chunkmanager, + preferred_chunks=var.encoding.get("preferred_chunks", {}), + ) variables[name] = _maybe_chunk( name, var, diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index ba1194b90c6..d6f5ba149ca 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1489,8 +1489,6 @@ def chunk( else: chunk_mapping = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") - breakpoint() - ds = self._to_temp_dataset().chunk( chunk_mapping, name_prefix=name_prefix, diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index f2ed8bbbaf1..8d6620ce282 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -279,6 +279,6 @@ def rechunk( ) -> DaskArray: from xarray.namedarray.utils import _get_chunk - if data.dtype == object: - chunks = _get_chunk(data, chunks, self) + if data.dtype.hasobject: + chunks = _get_chunk(data, chunks, self, preferred_chunks={}) return data.rechunk(chunks, **kwargs) diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 10cae371878..6bea6583ded 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -201,7 +201,11 @@ def either_dict_or_kwargs( def _get_chunk( # type: ignore[no-untyped-def] - var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint + var: Variable | T_ChunkedArray, + chunks, + chunkmanager: ChunkManagerEntrypoint, + *, + preferred_chunks, ) -> Mapping[Any, T_ChunkDim]: """ Return map from each dim to chunk sizes, accounting for backend's preferred chunks. @@ -214,11 +218,15 @@ def _get_chunk( # type: ignore[no-untyped-def] if isinstance(var, IndexVariable): return {} - dims = var.dims + + if not is_duck_array(var): + dims = var.dims + else: + dims = chunks.keys() + shape = var.shape # Determine the explicit requested chunks. - preferred_chunks = var.encoding.get("preferred_chunks", {}) preferred_chunk_shape = tuple( itertools.starmap(preferred_chunks.get, zip(dims, shape, strict=True)) ) @@ -281,7 +289,7 @@ def fake_target_chunksize( """ # Short circuit for non-object dtypes - if data.dtype != object: + if not data.dtype.hasobject: return limit, data.dtype from xarray.core.formatting import first_n_items From cbcb640ab93658aa8c6e750392329b8f74bc298f Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Tue, 23 Sep 2025 17:03:28 +0800 Subject: [PATCH 29/36] object => cftime - zarr failures... --- xarray/namedarray/daskmanager.py | 3 ++- xarray/namedarray/utils.py | 4 +++- xarray/tests/test_namedarray.py | 1 - 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 8d6620ce282..ad9157938e9 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -5,6 +5,7 @@ import numpy as np +from xarray.core.common import _contains_cftime_datetimes from xarray.core.indexing import ImplicitToExplicitIndexingAdapter from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray from xarray.namedarray.utils import is_duck_dask_array, module_available @@ -279,6 +280,6 @@ def rechunk( ) -> DaskArray: from xarray.namedarray.utils import _get_chunk - if data.dtype.hasobject: + if _contains_cftime_datetimes(data): chunks = _get_chunk(data, chunks, self, preferred_chunks={}) return data.rechunk(chunks, **kwargs) diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 6bea6583ded..a2c1baa5998 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -289,7 +289,9 @@ def fake_target_chunksize( """ # Short circuit for non-object dtypes - if not data.dtype.hasobject: + from xarray.core.common import _contains_cftime_datetimes + + if not _contains_cftime_datetimes(data): return limit, data.dtype from xarray.core.formatting import first_n_items diff --git a/xarray/tests/test_namedarray.py b/xarray/tests/test_namedarray.py index b752f8de83f..2a09a4ef4c1 100644 --- a/xarray/tests/test_namedarray.py +++ b/xarray/tests/test_namedarray.py @@ -619,7 +619,6 @@ def test_repr() -> None: [ (np.arange(100).reshape(10, 10), 1024, np.int64), (np.arange(100).reshape(10, 10).astype(np.float32), 1024, np.float32), - (np.arange(100).reshape(10, 10).astype(object), 73, np.float64), ], ) def test_fake_target_chunksize( From 3f0d3aabc955ac6c67895460f337661dd3547f85 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Wed, 24 Sep 2025 07:40:23 +0800 Subject: [PATCH 30/36] Fix typing --- xarray/namedarray/utils.py | 14 +++++++------- xarray/tests/test_namedarray.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index a2c1baa5998..d63bb2c717f 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -27,8 +27,8 @@ from xarray.core.types import T_ChunkDim from xarray.core.variable import Variable - from xarray.namedarray._typing import _Dim, duckarray - from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray + from xarray.namedarray._typing import DuckArray, _Dim, duckarray + from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint K = TypeVar("K") @@ -201,9 +201,9 @@ def either_dict_or_kwargs( def _get_chunk( # type: ignore[no-untyped-def] - var: Variable | T_ChunkedArray, + var: Variable | DuckArray[Any], chunks, - chunkmanager: ChunkManagerEntrypoint, + chunkmanager: ChunkManagerEntrypoint[Any], *, preferred_chunks, ) -> Mapping[Any, T_ChunkDim]: @@ -219,7 +219,7 @@ def _get_chunk( # type: ignore[no-untyped-def] if isinstance(var, IndexVariable): return {} - if not is_duck_array(var): + if isinstance(var, Variable): dims = var.dims else: dims = chunks.keys() @@ -250,7 +250,7 @@ def _get_chunk( # type: ignore[no-untyped-def] # Warn where requested chunks break preferred chunks, provided that the variable # contains data. - if var.size: + if var.size: # type: ignore[union-attr] # DuckArray protocol doesn't include 'size' - should it? for dim, size, chunk_sizes in zip(dims, shape, chunk_shape, strict=True): try: preferred_chunk_sizes = preferred_chunks[dim] @@ -272,7 +272,7 @@ def _get_chunk( # type: ignore[no-untyped-def] def fake_target_chunksize( - data: Variable | T_ChunkedArray, + data: Variable | DuckArray[Any], limit: int, ) -> tuple[int, np.dtype[Any]]: """ diff --git a/xarray/tests/test_namedarray.py b/xarray/tests/test_namedarray.py index 2a09a4ef4c1..06cefd99094 100644 --- a/xarray/tests/test_namedarray.py +++ b/xarray/tests/test_namedarray.py @@ -636,7 +636,7 @@ def test_fake_target_chunksize( """ target_chunksize = 1024 - faked_chunksize, dtype = fake_target_chunksize(input_array, target_chunksize) # type: ignore[arg-type] + faked_chunksize, dtype = fake_target_chunksize(input_array, target_chunksize) assert faked_chunksize == expected_chunksize_faked assert dtype == expected_dtype From e944eb49565fb81ed69e97aa5538b5e4876d12c6 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Wed, 24 Sep 2025 07:52:02 +0800 Subject: [PATCH 31/36] Don't just import Variable in typing clause --- xarray/namedarray/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index d63bb2c717f..03bade78aaa 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -213,7 +213,7 @@ def _get_chunk( # type: ignore[no-untyped-def] import itertools from xarray.core.utils import emit_user_level_warning - from xarray.core.variable import IndexVariable + from xarray.core.variable import IndexVariable, Variable from xarray.structure.chunks import _get_breaks_cached if isinstance(var, IndexVariable): From 92bb538558f89e2c70b38245b3468971e99ffb97 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 13 Oct 2025 11:40:31 -0600 Subject: [PATCH 32/36] Cleanup --- xarray/namedarray/utils.py | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 03bade78aaa..6ec902bc23f 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import importlib +import itertools import sys import warnings from collections.abc import Hashable, Iterable, Iterator, Mapping @@ -210,8 +211,6 @@ def _get_chunk( # type: ignore[no-untyped-def] """ Return map from each dim to chunk sizes, accounting for backend's preferred chunks. """ - import itertools - from xarray.core.utils import emit_user_level_warning from xarray.core.variable import IndexVariable, Variable from xarray.structure.chunks import _get_breaks_cached @@ -237,8 +236,7 @@ def _get_chunk( # type: ignore[no-untyped-def] for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape, strict=True) ) - limit = chunkmanager.get_auto_chunk_size() - limit, var_dtype = fake_target_chunksize(var, limit) + limit, var_dtype = fake_target_chunksize(var, chunkmanager.get_auto_chunk_size()) chunk_shape = chunkmanager.normalize_chunks( chunk_shape, @@ -252,21 +250,18 @@ def _get_chunk( # type: ignore[no-untyped-def] # contains data. if var.size: # type: ignore[union-attr] # DuckArray protocol doesn't include 'size' - should it? for dim, size, chunk_sizes in zip(dims, shape, chunk_shape, strict=True): - try: - preferred_chunk_sizes = preferred_chunks[dim] - except KeyError: - continue - disagreement = _get_breaks_cached( - size=size, - chunk_sizes=chunk_sizes, - preferred_chunk_sizes=preferred_chunk_sizes, - ) - if disagreement: - emit_user_level_warning( - "The specified chunks separate the stored chunks along " - f'dimension "{dim}" starting at index {disagreement}. This could ' - "degrade performance. Instead, consider rechunking after loading.", + if preferred_chunk_sizes := preferred_chunks.get(dim): + disagreement = _get_breaks_cached( + size=size, + chunk_sizes=chunk_sizes, + preferred_chunk_sizes=preferred_chunk_sizes, ) + if disagreement: + emit_user_level_warning( + "The specified chunks separate the stored chunks along " + f'dimension "{dim}" starting at index {disagreement}. This could ' + "degrade performance. Instead, consider rechunking after loading.", + ) return dict(zip(dims, chunk_shape, strict=True)) From 1e3a015dc2a9759593d0855a991518ab2b6a4473 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 13 Oct 2025 11:44:48 -0600 Subject: [PATCH 33/36] Remove Variable handling --- xarray/backends/api.py | 3 ++- xarray/namedarray/utils.py | 24 ++++++++---------------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index f52c3af9b7f..eb0b3e6ec78 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -246,10 +246,11 @@ def _chunk_ds( variables = {} for name, var in backend_ds.variables.items(): var_chunks = _get_chunk( - var, + var._data, chunks, chunkmanager, preferred_chunks=var.encoding.get("preferred_chunks", {}), + dims=var.dims, ) variables[name] = _maybe_chunk( name, diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 6ec902bc23f..c3a6e932ad3 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -27,7 +27,6 @@ DaskCollection: Any = NDArray # type: ignore[no-redef] from xarray.core.types import T_ChunkDim - from xarray.core.variable import Variable from xarray.namedarray._typing import DuckArray, _Dim, duckarray from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint @@ -202,28 +201,21 @@ def either_dict_or_kwargs( def _get_chunk( # type: ignore[no-untyped-def] - var: Variable | DuckArray[Any], + data: DuckArray[Any], chunks, chunkmanager: ChunkManagerEntrypoint[Any], *, preferred_chunks, + dims=None, ) -> Mapping[Any, T_ChunkDim]: """ Return map from each dim to chunk sizes, accounting for backend's preferred chunks. """ from xarray.core.utils import emit_user_level_warning - from xarray.core.variable import IndexVariable, Variable from xarray.structure.chunks import _get_breaks_cached - if isinstance(var, IndexVariable): - return {} - - if isinstance(var, Variable): - dims = var.dims - else: - dims = chunks.keys() - - shape = var.shape + dims = dims or chunks.keys() + shape = data.shape # Determine the explicit requested chunks. preferred_chunk_shape = tuple( @@ -236,19 +228,19 @@ def _get_chunk( # type: ignore[no-untyped-def] for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape, strict=True) ) - limit, var_dtype = fake_target_chunksize(var, chunkmanager.get_auto_chunk_size()) + limit, dtype = fake_target_chunksize(data, chunkmanager.get_auto_chunk_size()) chunk_shape = chunkmanager.normalize_chunks( chunk_shape, shape=shape, - dtype=var_dtype, + dtype=dtype, limit=limit, previous_chunks=preferred_chunk_shape, ) # Warn where requested chunks break preferred chunks, provided that the variable # contains data. - if var.size: # type: ignore[union-attr] # DuckArray protocol doesn't include 'size' - should it? + if data.size: # type: ignore[union-attr] # DuckArray protocol doesn't include 'size' - should it? for dim, size, chunk_sizes in zip(dims, shape, chunk_shape, strict=True): if preferred_chunk_sizes := preferred_chunks.get(dim): disagreement = _get_breaks_cached( @@ -267,7 +259,7 @@ def _get_chunk( # type: ignore[no-untyped-def] def fake_target_chunksize( - data: Variable | DuckArray[Any], + data: DuckArray[Any], limit: int, ) -> tuple[int, np.dtype[Any]]: """ From 861cc5730e09ce39eb20ccc0c40e89dc758d5fe1 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 13 Oct 2025 12:04:12 -0600 Subject: [PATCH 34/36] Try more --- xarray/namedarray/daskmanager.py | 13 ------------- xarray/namedarray/parallelcompat.py | 7 ++++++- xarray/namedarray/utils.py | 7 ++++++- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index ad9157938e9..eb01a150c18 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -5,7 +5,6 @@ import numpy as np -from xarray.core.common import _contains_cftime_datetimes from xarray.core.indexing import ImplicitToExplicitIndexingAdapter from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray from xarray.namedarray.utils import is_duck_dask_array, module_available @@ -271,15 +270,3 @@ def get_auto_chunk_size(self) -> int: from dask.utils import parse_bytes return parse_bytes(dask_config.get("array.chunk-size")) - - def rechunk( - self, - data: DaskArray, - chunks: T_Chunks | _NormalizedChunks, - **kwargs: Any, - ) -> DaskArray: - from xarray.namedarray.utils import _get_chunk - - if _contains_cftime_datetimes(data): - chunks = _get_chunk(data, chunks, self, preferred_chunks={}) - return data.rechunk(chunks, **kwargs) diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py index fd6669b8f6b..544c9763ecd 100644 --- a/xarray/namedarray/parallelcompat.py +++ b/xarray/namedarray/parallelcompat.py @@ -346,6 +346,11 @@ def rechunk( dask.array.Array.rechunk cubed.Array.rechunk """ + from xarray.core.common import _contains_cftime_datetimes + from xarray.namedarray.utils import _get_chunk + + if _contains_cftime_datetimes(data): + chunks = _get_chunk(data, chunks, self, preferred_chunks={}) return data.rechunk(chunks, **kwargs) @abstractmethod @@ -768,5 +773,5 @@ def get_auto_chunk_size( """ raise NotImplementedError( - "get_auto_chunk_size must be implemented by the chunk manager." + "For 'auto' rechunking of cftime arrays, get_auto_chunk_size must be implemented by the chunk manager" ) diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index c3a6e932ad3..48d43c4f2c5 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -211,6 +211,7 @@ def _get_chunk( # type: ignore[no-untyped-def] """ Return map from each dim to chunk sizes, accounting for backend's preferred chunks. """ + from xarray.core.common import _contains_cftime_datetimes from xarray.core.utils import emit_user_level_warning from xarray.structure.chunks import _get_breaks_cached @@ -228,7 +229,11 @@ def _get_chunk( # type: ignore[no-untyped-def] for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape, strict=True) ) - limit, dtype = fake_target_chunksize(data, chunkmanager.get_auto_chunk_size()) + if _contains_cftime_datetimes(data): + limit, dtype = fake_target_chunksize(data, chunkmanager.get_auto_chunk_size()) + else: + limit = None + dtype = data.dtype chunk_shape = chunkmanager.normalize_chunks( chunk_shape, From 16ccc784a0cf15129512c5f3049c535240732f5f Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 13 Oct 2025 13:11:42 -0600 Subject: [PATCH 35/36] bugfix --- xarray/backends/api.py | 3 +++ xarray/namedarray/utils.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index eb0b3e6ec78..dfb5810dea7 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -245,6 +245,9 @@ def _chunk_ds( variables = {} for name, var in backend_ds.variables.items(): + if var._in_memory: + variables[name] = var + continue var_chunks = _get_chunk( var._data, chunks, diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 48d43c4f2c5..61e7bfc273b 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -215,7 +215,7 @@ def _get_chunk( # type: ignore[no-untyped-def] from xarray.core.utils import emit_user_level_warning from xarray.structure.chunks import _get_breaks_cached - dims = dims or chunks.keys() + dims = chunks.keys() if dims is None else dims shape = data.shape # Determine the explicit requested chunks. From 1bd2f32db6d07486c24b4710fc9a50d927908fe3 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 13 Oct 2025 13:31:07 -0600 Subject: [PATCH 36/36] typing --- xarray/namedarray/parallelcompat.py | 6 ++++-- xarray/namedarray/utils.py | 3 ++- xarray/tests/test_namedarray.py | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py index 544c9763ecd..cd57a8d4487 100644 --- a/xarray/namedarray/parallelcompat.py +++ b/xarray/namedarray/parallelcompat.py @@ -350,8 +350,10 @@ def rechunk( from xarray.namedarray.utils import _get_chunk if _contains_cftime_datetimes(data): - chunks = _get_chunk(data, chunks, self, preferred_chunks={}) - return data.rechunk(chunks, **kwargs) + chunks2 = _get_chunk(data, chunks, self, preferred_chunks={}) # type: ignore[arg-type] + else: + chunks2 = chunks # type: ignore[assignment] + return data.rechunk(chunks2, **kwargs) @abstractmethod def compute( diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 61e7bfc273b..3490a76aa8d 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -229,6 +229,7 @@ def _get_chunk( # type: ignore[no-untyped-def] for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape, strict=True) ) + limit: int | None if _contains_cftime_datetimes(data): limit, dtype = fake_target_chunksize(data, chunkmanager.get_auto_chunk_size()) else: @@ -245,7 +246,7 @@ def _get_chunk( # type: ignore[no-untyped-def] # Warn where requested chunks break preferred chunks, provided that the variable # contains data. - if data.size: # type: ignore[union-attr] # DuckArray protocol doesn't include 'size' - should it? + if data.size: # type: ignore[unused-ignore,attr-defined] # DuckArray protocol doesn't include 'size' - should it? for dim, size, chunk_sizes in zip(dims, shape, chunk_shape, strict=True): if preferred_chunk_sizes := preferred_chunks.get(dim): disagreement = _get_breaks_cached( diff --git a/xarray/tests/test_namedarray.py b/xarray/tests/test_namedarray.py index 96e6fd7f675..568f8ec70f2 100644 --- a/xarray/tests/test_namedarray.py +++ b/xarray/tests/test_namedarray.py @@ -662,7 +662,7 @@ def test_fake_target_chunksize_cftime() -> None: dtype=object, ).reshape(10, 10) - faked_chunksize, dtype = fake_target_chunksize(input_array, target_chunksize) # type: ignore[arg-type] + faked_chunksize, dtype = fake_target_chunksize(input_array, target_chunksize) # type: ignore[arg-type,unused-ignore] assert faked_chunksize == 73 assert dtype == np.float64