Skip to content

Commit 508ef75

Browse files
committed
Improve consistency and engine keyword argument for to_netcdf()
This PR introduces a bug fix and a breaking change: 1. The default backend ``engine`` used by `Dataset.to_netcdf` and `DataTree.to_netcdf` is now chosen consistently with `open_dataset` and `open_datatree`, using whichever netCDF libraries are available and preferring netCDF4 to h5netcdf to scipy. Previously, `DataTree.to_netcdf` was hard-coded to use h5netcdf. 2. The return value of `Dataset.to_netcdf` without ``path`` is now a ``memoryview`` object instead of ``bytes``. This removes an unnecessary memory copy and ensures consistency when using either ``engine="scipy"`` or ``engine="h5netcdf"``. Fixes #10654
1 parent efb6bb6 commit 508ef75

11 files changed

Lines changed: 178 additions & 172 deletions

File tree

doc/whats-new.rst

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,19 @@ New Features
2424
Breaking changes
2525
~~~~~~~~~~~~~~~~
2626

27+
- The default backend ``engine`` used by :py:meth:`Dataset.to_netcdf`
28+
and :py:meth:`DataTree.to_netcdf` is now chosen consistently with
29+
:py:func:`open_dataset` and :py:func:`open_datatree`, using whichever netCDF
30+
libraries are available and preferring netCDF4 to h5netcdf to scipy
31+
(:issue:`10654`). Previously, :py:meth:`DataTree.to_netcdf` was hard-coded to
32+
use h5netcdf.
33+
By `Stephan Hoyer <https://github.com/shoyer>`_.
34+
- The return value of :py:meth:`Dataset.to_netcdf` without ``path`` is
35+
now a ``memoryview`` object instead of ``bytes``. This removes an unnecessary
36+
memory copy and ensures consistency when using either ``engine="scipy"`` or
37+
``engine="h5netcdf"``. If you need a bytes object, simply wrap the return
38+
value of ``to_netcdf()`` with ``bytes()``.
39+
By `Stephan Hoyer <https://github.com/shoyer>`_.
2740

2841
Deprecations
2942
~~~~~~~~~~~~
@@ -41,6 +54,7 @@ Bug fixes
4154
(:issue:`10637`).
4255
By `Stephan Hoyer <https://github.com/shoyer>`_.
4356

57+
4458
Documentation
4559
~~~~~~~~~~~~~
4660

xarray/backends/api.py

Lines changed: 39 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from typing import (
1818
TYPE_CHECKING,
1919
Any,
20-
Final,
2120
Literal,
2221
TypeVar,
2322
Union,
@@ -98,69 +97,44 @@
9897
DATAARRAY_NAME = "__xarray_dataarray_name__"
9998
DATAARRAY_VARIABLE = "__xarray_dataarray_variable__"
10099

101-
ENGINES = {
102-
"netcdf4": backends.NetCDF4DataStore.open,
103-
"scipy": backends.ScipyDataStore,
104-
"pydap": backends.PydapDataStore.open,
105-
"h5netcdf": backends.H5NetCDFStore.open,
106-
"zarr": backends.ZarrStore.open_group,
107-
}
108-
109-
110-
def _get_default_engine_remote_uri() -> Literal["netcdf4", "pydap"]:
111-
engine: Literal["netcdf4", "pydap"]
112-
try:
113-
import netCDF4 # noqa: F401
114-
115-
engine = "netcdf4"
116-
except ImportError: # pragma: no cover
117-
try:
118-
import pydap # noqa: F401
119100

120-
engine = "pydap"
121-
except ImportError as err:
122-
raise ValueError(
123-
"netCDF4 or pydap is required for accessing remote datasets via OPeNDAP"
124-
) from err
125-
return engine
126-
127-
128-
def _get_default_engine_gz() -> Literal["scipy"]:
129-
try:
130-
import scipy # noqa: F401
101+
def get_default_netcdf_write_engine(
102+
format: T_NetcdfTypes | None,
103+
to_file_or_memoryview: bool,
104+
) -> Literal["netcdf4", "h5netcdf", "scipy"]:
105+
"""Return the default netCDF library to use for writing a netCDF file."""
106+
module_names = {
107+
"netcdf4": "netCDF4",
108+
"scipy": "scipy",
109+
"h5netcdf": "h5netcdf",
110+
}
131111

132-
engine: Final = "scipy"
133-
except ImportError as err: # pragma: no cover
134-
raise ValueError("scipy is required for accessing .gz files") from err
135-
return engine
112+
candidates = list(plugins.NETCDF_BACKENDS_ORDER)
136113

114+
if format is not None:
115+
if format.upper().startswith("NETCDF3"):
116+
candidates.remove("h5netcdf")
117+
elif format.upper().startswith("NETCDF4"):
118+
candidates.remove("scipy")
119+
else:
120+
raise ValueError(f"unexpected {format=}")
137121

138-
def _get_default_engine_netcdf() -> Literal["netcdf4", "h5netcdf", "scipy"]:
139-
candidates: list[tuple[str, str]] = [
140-
("netcdf4", "netCDF4"),
141-
("h5netcdf", "h5netcdf"),
142-
("scipy", "scipy.io.netcdf"),
143-
]
122+
if to_file_or_memoryview:
123+
candidates.remove("netcdf4")
144124

145-
for engine, module_name in candidates:
125+
for engine in candidates:
126+
module_name = module_names[engine]
146127
if importlib.util.find_spec(module_name) is not None:
147128
return cast(Literal["netcdf4", "h5netcdf", "scipy"], engine)
148129

130+
format_str = f" with {format=}" if format is not None else ""
131+
libraries = ", ".join(module_names[c] for c in candidates)
149132
raise ValueError(
150-
"cannot read or write NetCDF files because none of "
151-
"'netCDF4-python', 'h5netcdf', or 'scipy' are installed"
133+
f"cannot write NetCDF files{format_str} because none of the suitable "
134+
f"backend libraries ({libraries}) are installed"
152135
)
153136

154137

155-
def _get_default_engine(path: str, allow_remote: bool = False) -> T_NetcdfEngine:
156-
if allow_remote and is_remote_uri(path):
157-
return _get_default_engine_remote_uri() # type: ignore[return-value]
158-
elif path.endswith(".gz"):
159-
return _get_default_engine_gz()
160-
else:
161-
return _get_default_engine_netcdf()
162-
163-
164138
def _validate_dataset_names(dataset: Dataset) -> None:
165139
"""DataArray.name and Dataset keys must be a string or None"""
166140

@@ -1958,7 +1932,7 @@ def to_netcdf(
19581932
multifile: Literal[False] = False,
19591933
invalid_netcdf: bool = False,
19601934
auto_complex: bool | None = None,
1961-
) -> bytes | memoryview: ...
1935+
) -> memoryview: ...
19621936

19631937

19641938
# compute=False returns dask.Delayed
@@ -2051,7 +2025,7 @@ def to_netcdf(
20512025
multifile: bool = False,
20522026
invalid_netcdf: bool = False,
20532027
auto_complex: bool | None = None,
2054-
) -> tuple[ArrayWriter, AbstractDataStore] | bytes | memoryview | Delayed | None: ...
2028+
) -> tuple[ArrayWriter, AbstractDataStore] | memoryview | Delayed | None: ...
20552029

20562030

20572031
def to_netcdf(
@@ -2067,41 +2041,22 @@ def to_netcdf(
20672041
multifile: bool = False,
20682042
invalid_netcdf: bool = False,
20692043
auto_complex: bool | None = None,
2070-
) -> tuple[ArrayWriter, AbstractDataStore] | bytes | memoryview | Delayed | None:
2044+
) -> tuple[ArrayWriter, AbstractDataStore] | memoryview | Delayed | None:
20712045
"""This function creates an appropriate datastore for writing a dataset to
20722046
disk as a netCDF file
20732047
20742048
See `Dataset.to_netcdf` for full API docs.
20752049
20762050
The ``multifile`` argument is only for the private use of save_mfdataset.
20772051
"""
2078-
if isinstance(path_or_file, os.PathLike):
2079-
path_or_file = os.fspath(path_or_file)
2080-
20812052
if encoding is None:
20822053
encoding = {}
20832054

2084-
if isinstance(path_or_file, str):
2085-
if engine is None:
2086-
engine = _get_default_engine(path_or_file)
2087-
path_or_file = _normalize_path(path_or_file)
2088-
else:
2089-
# writing to bytes/memoryview or a file-like object
2090-
if engine is None:
2091-
# TODO: only use 'scipy' if format is None or a netCDF3 format
2092-
engine = "scipy"
2093-
elif engine not in ("scipy", "h5netcdf"):
2094-
raise ValueError(
2095-
"invalid engine for creating bytes/memoryview or writing to a "
2096-
f"file-like object with to_netcdf: {engine!r}. Only "
2097-
"engine=None, engine='scipy' and engine='h5netcdf' is "
2098-
"supported."
2099-
)
2100-
if not compute:
2101-
raise NotImplementedError(
2102-
"to_netcdf() with compute=False is not yet implemented when "
2103-
"returning bytes"
2104-
)
2055+
path_or_file = _normalize_path(path_or_file)
2056+
2057+
if engine is None:
2058+
to_file_or_memoryview = not isinstance(path_or_file, str)
2059+
engine = get_default_netcdf_write_engine(format, to_file_or_memoryview)
21052060

21062061
# validate Dataset keys, DataArray names, and attr keys/values
21072062
_validate_dataset_names(dataset)
@@ -2121,6 +2076,11 @@ def to_netcdf(
21212076
)
21222077

21232078
if path_or_file is None:
2079+
if not compute:
2080+
raise NotImplementedError(
2081+
"to_netcdf() with compute=False is not yet implemented when "
2082+
"returning a memoryview"
2083+
)
21242084
target = BytesIOProxy()
21252085
else:
21262086
target = path_or_file # type: ignore[assignment]

xarray/backends/plugins.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from xarray.backends.common import AbstractDataStore
1919
from xarray.core.types import ReadBuffer
2020

21-
STANDARD_BACKENDS_ORDER = ["netcdf4", "h5netcdf", "scipy"]
21+
NETCDF_BACKENDS_ORDER = ["netcdf4", "h5netcdf", "scipy"]
2222

2323

2424
def remove_duplicates(entrypoints: EntryPoints) -> list[EntryPoint]:
@@ -92,7 +92,7 @@ def sort_backends(
9292
backend_entrypoints: dict[str, type[BackendEntrypoint]],
9393
) -> dict[str, type[BackendEntrypoint]]:
9494
ordered_backends_entrypoints = {}
95-
for be_name in STANDARD_BACKENDS_ORDER:
95+
for be_name in NETCDF_BACKENDS_ORDER:
9696
if backend_entrypoints.get(be_name) is not None:
9797
ordered_backends_entrypoints[be_name] = backend_entrypoints.pop(be_name)
9898
ordered_backends_entrypoints.update(

xarray/backends/scipy_.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
Frozen,
3131
FrozenDict,
3232
close_on_error,
33-
emit_user_level_warning,
3433
module_available,
3534
try_read_magic_number_from_file_or_path,
3635
)
@@ -169,20 +168,9 @@ def __init__(
169168
self.lock = ensure_lock(lock)
170169

171170
if isinstance(filename_or_obj, BytesIOProxy):
172-
emit_user_level_warning(
173-
"return value of to_netcdf() without a target for "
174-
"engine='scipy' is currently bytes, but will switch to "
175-
"memoryview in a future version of Xarray. To silence this "
176-
"warning, use the following pattern or switch to "
177-
"to_netcdf(engine='h5netcdf'):\n"
178-
" target = io.BytesIO()\n"
179-
" dataset.to_netcdf(target)\n"
180-
" result = target.getbuffer()",
181-
FutureWarning,
182-
)
183171
source = filename_or_obj
184172
filename_or_obj = io.BytesIO()
185-
source.getvalue = filename_or_obj.getvalue
173+
source.getvalue = filename_or_obj.getbuffer
186174

187175
if isinstance(filename_or_obj, str): # path
188176
manager = CachingFileManager(

xarray/core/dataarray.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4067,7 +4067,7 @@ def to_netcdf(
40674067
compute: bool = True,
40684068
invalid_netcdf: bool = False,
40694069
auto_complex: bool | None = None,
4070-
) -> bytes | memoryview: ...
4070+
) -> memoryview: ...
40714071

40724072
# compute=False returns dask.Delayed
40734073
@overload
@@ -4131,17 +4131,15 @@ def to_netcdf(
41314131
compute: bool = True,
41324132
invalid_netcdf: bool = False,
41334133
auto_complex: bool | None = None,
4134-
) -> bytes | memoryview | Delayed | None:
4134+
) -> memoryview | Delayed | None:
41354135
"""Write DataArray contents to a netCDF file.
41364136
41374137
Parameters
41384138
----------
4139-
path : str, path-like or None, optional
4140-
Path to which to save this dataset. File-like objects are only
4141-
supported by the scipy engine. If no path is provided, this
4142-
function returns the resulting netCDF file as bytes; in this case,
4143-
we need to use scipy, which does not support netCDF version 4 (the
4144-
default format becomes NETCDF3_64BIT).
4139+
path : str, path-like, file-like or None, optional
4140+
Path to which to save this DataArray, or a file-like object to write
4141+
it to (which must support read and write and be seekable) or None
4142+
(default) to return in-memory bytes as a memoryview.
41454143
mode : {"w", "a"}, default: "w"
41464144
Write ('w') or append ('a') mode. If mode='w', any existing file at
41474145
this location will be overwritten. If mode='a', existing variables
@@ -4201,7 +4199,7 @@ def to_netcdf(
42014199
42024200
Returns
42034201
-------
4204-
* ``bytes`` or ``memoryview`` if path is None
4202+
* ``memoryview`` if path is None
42054203
* ``dask.delayed.Delayed`` if compute is False
42064204
* None otherwise
42074205

xarray/core/dataset.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1953,7 +1953,7 @@ def to_netcdf(
19531953
compute: bool = True,
19541954
invalid_netcdf: bool = False,
19551955
auto_complex: bool | None = None,
1956-
) -> bytes | memoryview: ...
1956+
) -> memoryview: ...
19571957

19581958
# compute=False returns dask.Delayed
19591959
@overload
@@ -2017,17 +2017,15 @@ def to_netcdf(
20172017
compute: bool = True,
20182018
invalid_netcdf: bool = False,
20192019
auto_complex: bool | None = None,
2020-
) -> bytes | memoryview | Delayed | None:
2020+
) -> memoryview | Delayed | None:
20212021
"""Write dataset contents to a netCDF file.
20222022
20232023
Parameters
20242024
----------
2025-
path : str, path-like or file-like, optional
2026-
Path to which to save this dataset. File-like objects are only
2027-
supported by the scipy engine. If no path is provided, this
2028-
function returns the resulting netCDF file as bytes; in this case,
2029-
we need to use scipy, which does not support netCDF version 4 (the
2030-
default format becomes NETCDF3_64BIT).
2025+
path : str, path-like, file-like or None, optional
2026+
Path to which to save this dataset, or a file-like object to write
2027+
it to (which must support read and write and be seekable) or None
2028+
(default) to return in-memory bytes as a memoryview.
20312029
mode : {"w", "a"}, default: "w"
20322030
Write ('w') or append ('a') mode. If mode='w', any existing file at
20332031
this location will be overwritten. If mode='a', existing variables
@@ -2089,7 +2087,7 @@ def to_netcdf(
20892087
20902088
Returns
20912089
-------
2092-
* ``bytes`` or ``memoryview`` if path is None
2090+
* ``memoryview`` if path is None
20932091
* ``dask.delayed.Delayed`` if compute is False
20942092
* ``None`` otherwise
20952093

xarray/core/datatree_io.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
from typing import TYPE_CHECKING, Any, Literal, get_args
77

88
from xarray.backends.api import (
9+
_normalize_path,
910
delayed_close_after_writes,
1011
dump_to_store,
12+
get_default_netcdf_write_engine,
1113
get_writable_netcdf_store,
1214
get_writable_zarr_store,
1315
)
@@ -50,8 +52,14 @@ def _datatree_to_netcdf(
5052
"DataTree.to_netcdf only supports the netcdf4 and h5netcdf engines"
5153
)
5254

55+
filepath = _normalize_path(filepath)
56+
5357
if engine is None:
54-
engine = "h5netcdf"
58+
to_file_or_memoryview = not isinstance(filepath, str)
59+
engine = get_default_netcdf_write_engine(
60+
format="NETCDF4", # required for supporting groups
61+
to_file_or_memoryview=to_file_or_memoryview,
62+
) # type: ignore[assignment]
5563

5664
if group is not None:
5765
raise NotImplementedError(
@@ -70,6 +78,11 @@ def _datatree_to_netcdf(
7078
)
7179

7280
if filepath is None:
81+
if not compute:
82+
raise NotImplementedError(
83+
"to_netcdf() with compute=False is not yet implemented when "
84+
"returning bytes"
85+
)
7386
# No need to use BytesIOProxy here because the legacy scipy backend
7487
# cannot write netCDF files with groups
7588
target = io.BytesIO()

0 commit comments

Comments
 (0)