Skip to content

Commit 2aa143f

Browse files
committed
Change the meaning of "auto" to default to the "preserve" behavior
1 parent 67ff2f3 commit 2aa143f

File tree

9 files changed

+59
-52
lines changed

9 files changed

+59
-52
lines changed

properties/test_parallelcompat.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def test_preserve_all_chunks(
2020
target = 1024 * 1024
2121

2222
actual = ChunkManagerEntrypoint.preserve_chunks(
23-
chunks=("preserve",) * len(shape),
23+
chunks=("auto",) * len(shape),
2424
shape=shape,
2525
target=target,
2626
typesize=typesize,
@@ -47,7 +47,7 @@ def test_preserve_some_chunks(
4747
target = 2 * 1024 * 1024
4848

4949
actual = ChunkManagerEntrypoint.preserve_chunks(
50-
chunks=(first_chunk, *["preserve" for _ in range(len(shape) - 1)]),
50+
chunks=(first_chunk, *["auto" for _ in range(len(shape) - 1)]),
5151
shape=shape,
5252
target=target,
5353
typesize=typesize,

xarray/backends/api.py

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -259,11 +259,11 @@ def _chunk_ds(
259259
name,
260260
var,
261261
var_chunks,
262+
chunkmanager,
262263
overwrite_encoded_chunks=overwrite_encoded_chunks,
263264
name_prefix=name_prefix,
264265
token=token,
265266
inline_array=inline_array,
266-
chunked_array_type=chunkmanager,
267267
from_array_kwargs=from_array_kwargs.copy(),
268268
just_use_token=True,
269269
)
@@ -292,9 +292,9 @@ def _dataset_from_backend_dataset(
292292
create_default_indexes,
293293
**extra_tokens,
294294
):
295-
if not isinstance(chunks, int | dict) and chunks not in {None, "auto", "preserve"}:
295+
if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
296296
raise ValueError(
297-
f"chunks must be an int, dict, 'auto', 'preserve', or None. Instead found {chunks}."
297+
f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}."
298298
)
299299

300300
_protect_dataset_variables_inplace(backend_ds, cache)
@@ -430,14 +430,14 @@ def open_dataset(
430430
"netcdf4" over "h5netcdf" over "scipy" (customizable via
431431
``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend
432432
class (a subclass of ``BackendEntrypoint``) can also be used.
433-
chunks : int, dict, 'auto', 'preserve' or None, default: None
433+
chunks : int, dict, 'auto', 'dask-auto' or None, default: None
434434
If provided, used to load the data into dask arrays.
435435
436-
- ``chunks="auto"`` will use dask ``auto`` chunking taking into account the
437-
engine preferred chunks.
438-
- ``chunks="preserve"`` will use a chunking scheme that never splits encoded
439-
chunks. If encoded chunks are small then "preserve" takes multiples of them
436+
- ``chunks="auto"`` will use a chunking scheme that never splits encoded
437+
chunks. If encoded chunks are small then "auto" takes multiples of them
440438
over the largest dimension.
439+
- ``chunks="dask-auto"`` will use dask ``auto`` chunking taking into account the
440+
engine preferred chunks.
441441
- ``chunks=None`` skips using dask. This uses xarray's internally private
442442
:ref:`lazy indexing classes <internal design.lazy indexing>`,
443443
but data is eagerly loaded into memory as numpy arrays when accessed.
@@ -677,14 +677,14 @@ def open_dataarray(
677677
"netcdf4" over "h5netcdf" over "scipy" (customizable via
678678
``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend
679679
class (a subclass of ``BackendEntrypoint``) can also be used.
680-
chunks : int, dict, 'auto', 'preserve', or None, default: None
680+
chunks : int, dict, 'auto', 'dask-auto', or None, default: None
681681
If provided, used to load the data into dask arrays.
682682
683-
- ``chunks='auto'`` will use dask ``auto`` chunking taking into account the
684-
engine preferred chunks.
685-
- ``chunks="preserve"`` will use a chunking scheme that never splits encoded
686-
chunks. If encoded chunks are small then "preserve" takes multiples of them
683+
- ``chunks="auto"`` will use a chunking scheme that never splits encoded
684+
chunks. If encoded chunks are small then "auto" takes multiples of them
687685
over the largest dimension.
686+
- ``chunks="dask-auto"`` will use dask ``auto`` chunking taking into account the
687+
engine preferred chunks.
688688
- ``chunks=None`` skips using dask. This uses xarray's internally private
689689
:ref:`lazy indexing classes <internal design.lazy indexing>`,
690690
but data is eagerly loaded into memory as numpy arrays when accessed.
@@ -906,13 +906,13 @@ def open_datatree(
906906
"h5netcdf" over "netcdf4" (customizable via ``netcdf_engine_order`` in
907907
``xarray.set_options()``). A custom backend class (a subclass of
908908
``BackendEntrypoint``) can also be used.
909-
chunks : int, dict, 'auto', preserve, or None, default: None
909+
chunks : int, dict, 'auto', 'dask-auto', or None, default: None
910910
If provided, used to load the data into dask arrays.
911911
912-
- ``chunks="auto"`` will use dask ``auto`` chunking taking into account the
912+
- ``chunks="dask-auto"`` will use dask ``auto`` chunking taking into account the
913913
engine preferred chunks.
914-
- ``chunks="preserve"`` will use a chunking scheme that never splits encoded
915-
chunks. If encoded chunks are small then "preserve" takes multiples of them
914+
- ``chunks="auto"`` will use a chunking scheme that never splits encoded
915+
chunks. If encoded chunks are small then "auto" takes multiples of them
916916
over the largest dimension.
917917
- ``chunks=None`` skips using dask. This uses xarray's internally private
918918
:ref:`lazy indexing classes <internal design.lazy indexing>`,
@@ -1155,14 +1155,14 @@ def open_groups(
11551155
``xarray.set_options()``). A custom backend class (a subclass of
11561156
``BackendEntrypoint``) can also be used.
11571157
can also be used.
1158-
chunks : int, dict, 'auto', 'preserve', or None, default: None
1158+
chunks : int, dict, 'auto', 'dask-auto', or None, default: None
11591159
If provided, used to load the data into dask arrays.
11601160
1161-
- ``chunks="auto"`` will use dask ``auto`` chunking taking into account the
1162-
engine preferred chunks.
1163-
- ``chunks="preserve"`` will use a chunking scheme that never splits encoded
1164-
chunks. If encoded chunks are small then "preserve" takes multiples of them
1161+
- ``chunks="auto"`` will use a chunking scheme that never splits encoded
1162+
chunks. If encoded chunks are small then "auto" takes multiples of them
11651163
over the largest dimension.
1164+
- ``chunks="dask-auto"`` will use dask ``auto`` chunking taking into account the
1165+
engine preferred chunks.
11661166
- ``chunks=None`` skips using dask. This uses xarray's internally private
11671167
:ref:`lazy indexing classes <internal design.lazy indexing>`,
11681168
but data is eagerly loaded into memory as numpy arrays when accessed.
@@ -1430,7 +1430,7 @@ def open_mfdataset(
14301430
concatenation along more than one dimension is desired, then ``paths`` must be a
14311431
nested list-of-lists (see ``combine_nested`` for details). (A string glob will
14321432
be expanded to a 1-dimensional list.)
1433-
chunks : int, dict, 'auto', 'preserve', or None, optional
1433+
chunks : int, dict, 'auto', 'dask-auto', or None, optional
14341434
Dictionary with keys given by dimension names and values given by chunk sizes.
14351435
In general, these should divide the dimensions of each dataset. If int, chunk
14361436
each dimension by ``chunks``. By default, chunks will be chosen to match the

xarray/backends/zarr.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1499,15 +1499,15 @@ def open_zarr(
14991499
Array synchronizer provided to zarr
15001500
group : str, optional
15011501
Group path. (a.k.a. `path` in zarr terminology.)
1502-
chunks : int, dict, "auto", "preserve", or None, optional
1502+
chunks : int, dict, "auto", "dask-auto", or None, optional
15031503
Used to load the data into dask arrays. Default behavior is to use
15041504
``chunks={}`` if dask is available, otherwise ``chunks=None``.
15051505
1506-
- ``chunks='auto'`` will use dask ``auto`` chunking taking into account the
1507-
engine preferred chunks.
1508-
- ``chunks="preserve"`` will use a chunking scheme that never splits encoded
1509-
chunks. If encoded chunks are small then "preserve" takes multiples of them
1506+
- ``chunks="auto"`` will use a chunking scheme that never splits encoded
1507+
chunks. If encoded chunks are small then "auto" takes multiples of them
15101508
over the largest dimension.
1509+
- ``chunks="dask-auto"`` will use dask ``auto`` chunking taking into account the
1510+
engine preferred chunks.
15111511
- ``chunks=None`` skips using dask. This uses xarray's internally private
15121512
:ref:`lazy indexing classes <internal design.lazy indexing>`,
15131513
but data is eagerly loaded into memory as numpy arrays when accessed.

xarray/core/dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2639,11 +2639,11 @@ def _resolve_resampler(name: Hashable, resampler: Resampler) -> tuple[int, ...]:
26392639
k,
26402640
v,
26412641
chunks_mapping_ints,
2642+
chunkmanager,
26422643
token,
26432644
lock,
26442645
name_prefix,
26452646
inline_array=inline_array,
2646-
chunked_array_type=chunkmanager,
26472647
from_array_kwargs=from_array_kwargs.copy(),
26482648
)
26492649
for k, v in self.variables.items()

xarray/namedarray/_typing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def dtype(self) -> _DType_co: ...
7474
_NormalizedChunks = tuple[tuple[int, ...], ...]
7575
# FYI in some cases we don't allow `None`, which this doesn't take account of.
7676
# FYI the `str` is for a size string, e.g. "16MB", supported by dask.
77-
T_ChunkDim: TypeAlias = str | int | Literal["auto", "preserve"] | tuple[int, ...] | None # noqa: PYI051
77+
T_ChunkDim: TypeAlias = str | int | Literal["auto"] | tuple[int, ...] | None # noqa: PYI051
7878
# We allow the tuple form of this (though arguably we could transition to named dims only)
7979
T_Chunks: TypeAlias = T_ChunkDim | Mapping[Any, T_ChunkDim]
8080

xarray/namedarray/parallelcompat.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -850,7 +850,7 @@ def preserve_chunks(
850850
of tuple of ints.
851851
"""
852852
new_chunks = [*previous_chunks]
853-
auto_dims = [c == "preserve" for c in chunks]
853+
auto_dims = [c == "auto" for c in chunks]
854854
max_chunks = np.array(shape)
855855
for i, previous_chunk in enumerate(previous_chunks):
856856
chunk = chunks[i]
@@ -869,8 +869,8 @@ def preserve_chunks(
869869
max_chunks[i] = max(previous_chunk)
870870

871871
if isinstance(previous_chunk, int):
872-
# preserve, None or () means we want to track previous chunk
873-
if chunk == "preserve" or not chunk:
872+
# auto, None or () means we want to track previous chunk
873+
if chunk == "auto" or not chunk:
874874
max_chunks[i] = previous_chunk
875875
# otherwise use the explicitly provided chunk
876876
else:

xarray/namedarray/utils.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -236,12 +236,7 @@ def _get_chunk( # type: ignore[no-untyped-def]
236236
limit = None
237237
dtype = data.dtype
238238

239-
if any(c == "preserve" for c in chunk_shape) and any(
240-
c == "auto" for c in chunk_shape
241-
):
242-
raise ValueError('chunks cannot use a combination of "auto" and "preserve"')
243-
244-
if shape and preferred_chunk_shape and any(c == "preserve" for c in chunk_shape):
239+
if shape and preferred_chunk_shape and any(c == "auto" for c in chunk_shape):
245240
chunk_shape = chunkmanager.preserve_chunks(
246241
chunk_shape,
247242
shape=shape,

xarray/structure/chunks.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
from xarray.namedarray.parallelcompat import (
1515
ChunkManagerEntrypoint,
1616
get_chunked_array_type,
17-
guess_chunkmanager,
1817
)
1918

2019
if TYPE_CHECKING:
@@ -65,12 +64,12 @@ def _maybe_chunk(
6564
name: Hashable,
6665
var: Variable,
6766
chunks: Mapping[Any, T_ChunkDim] | None,
67+
chunkmanager: ChunkManagerEntrypoint,
6868
token=None,
6969
lock=None,
7070
name_prefix: str = "xarray-",
7171
overwrite_encoded_chunks: bool = False,
7272
inline_array: bool = False,
73-
chunked_array_type: str | ChunkManagerEntrypoint | None = None,
7473
from_array_kwargs=None,
7574
just_use_token=False,
7675
) -> Variable:
@@ -80,10 +79,24 @@ def _maybe_chunk(
8079
chunks = {dim: chunks[dim] for dim in var.dims if dim in chunks}
8180

8281
if var.ndim:
83-
chunked_array_type = guess_chunkmanager(
84-
chunked_array_type
85-
) # coerce string to ChunkManagerEntrypoint type
86-
if isinstance(chunked_array_type, DaskManager):
82+
if (
83+
var.shape
84+
and var.chunks
85+
and chunks
86+
and any(c == "auto" for c in chunks.values())
87+
):
88+
chunk_shape = chunkmanager.preserve_chunks(
89+
tuple(chunks.get(dim, ()) for dim in var.dims),
90+
shape=var.shape,
91+
target=chunkmanager.get_auto_chunk_size(),
92+
typesize=getattr(var.dtype, "itemsize", 8),
93+
previous_chunks=var.chunks,
94+
)
95+
chunks = {
96+
dim: chunk_shape[i] for i, dim in enumerate(var.dims) if dim in chunks
97+
}
98+
99+
if isinstance(chunkmanager, DaskManager):
87100
if not just_use_token:
88101
from dask.base import tokenize
89102

@@ -104,7 +117,7 @@ def _maybe_chunk(
104117

105118
var = var.chunk(
106119
chunks,
107-
chunked_array_type=chunked_array_type,
120+
chunked_array_type=chunkmanager,
108121
from_array_kwargs=from_array_kwargs,
109122
)
110123

xarray/tests/test_backends.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7405,7 +7405,6 @@ def test_open_dataset_chunking_zarr(chunks, tmp_path: Path) -> None:
74057405
@pytest.mark.parametrize(
74067406
"chunks", ["auto", -1, {}, {"x": "auto"}, {"x": -1}, {"x": "auto", "y": -1}]
74077407
)
7408-
@pytest.mark.filterwarnings("ignore:The specified chunks separate")
74097408
def test_chunking_consistency(chunks, tmp_path: Path) -> None:
74107409
encoded_chunks: dict[str, Any] = {}
74117410
dask_arr = da.from_array(
@@ -7439,12 +7438,12 @@ def test_chunking_consistency(chunks, tmp_path: Path) -> None:
74397438
@pytest.mark.parametrize(
74407439
"chunks,expected",
74417440
[
7442-
("preserve", (160, 500)),
7441+
("auto", (160, 500)),
74437442
(-1, (500, 500)),
74447443
({}, (10, 10)),
7445-
({"x": "preserve"}, (500, 10)),
7444+
({"x": "auto"}, (500, 10)),
74467445
({"x": -1}, (500, 10)),
7447-
({"x": "preserve", "y": -1}, (160, 500)),
7446+
({"x": "auto", "y": -1}, (160, 500)),
74487447
],
74497448
)
74507449
def test_open_dataset_chunking_zarr_with_preserve(

0 commit comments

Comments
 (0)