Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ci/environment-upstream-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dependencies:
- itables
- matplotlib
- netcdf4 >=1.5.5,!=1.6.1
- pandas >=2.1.0
- pandas >=2.1.0,<3.0.0
- pip
- polars>=1.24.0,<1.33.0
- pooch
Expand Down
2 changes: 1 addition & 1 deletion ci/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ dependencies:
- itables
- matplotlib
- netcdf4 >=1.5.5,!=1.6.1
- pandas >=2.1.0
- pandas >=2.1.0, <3.0.0
- pip
- polars>=1.24.0,<1.33.0
- pooch
Expand Down
30 changes: 28 additions & 2 deletions intake_esm/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,11 @@ def _open_dataset(
# How should we handle concat_dim, and other xr.open_mfdataset kwargs?
xarray_open_kwargs.update(preprocess=preprocess)
xarray_open_kwargs.update(parallel=True)
ds = xr.open_mfdataset(url, **xarray_open_kwargs)
# ds = xr.open_mfdataset(url, **xarray_open_kwargs)
ds = _open_dataset_try_auto(url, xr.open_mfdataset, xarray_open_kwargs)
else:
ds = xr.open_dataset(url, **xarray_open_kwargs)
# ds = xr.open_dataset(url, **xarray_open_kwargs)
ds = _open_dataset_try_auto(url, xr.open_dataset, xarray_open_kwargs)
if preprocess is not None:
ds = preprocess(ds)

Expand Down Expand Up @@ -137,6 +139,30 @@ def _open_dataset(
return ds


def _open_dataset_try_auto(url, func: typing.Callable, xarray_open_kwargs) -> xr.Dataset:
    """
    Open a dataset with the given kwargs, retrying with on-disk chunks if autochunking fails.

    The first attempt calls ``func(url, **xarray_open_kwargs)`` unchanged. If that raises the
    dask "auto rechunking with object dtype" ``NotImplementedError``, the call is retried once
    with ``chunks={}``. This handles datasets containing things like string variables, which
    can't be autochunked (autochunking of object data is restricted to cftime arrays only).

    Attempting to autochunk, rather than always using disk chunks (`{}`), is advantageous as it
    is generally quite a lot more performant. Unfortunately, there is no straightforward way to
    detect which variables within a dataset can and can't be autochunked without opening it.

    Parameters
    ----------
    url
        Passed positionally, as the first argument, to ``func``.
    func : Callable
        The opener to call, e.g. ``xr.open_dataset`` or ``xr.open_mfdataset``.
    xarray_open_kwargs : dict
        Keyword arguments forwarded to ``func``. This mapping is never modified; the retry
        uses a copy with ``chunks`` overridden.

    Returns
    -------
    xr.Dataset
        Whatever ``func`` returns, from the first attempt or from the retry.

    Raises
    ------
    NotImplementedError
        Re-raised untouched when the message is not the object-dtype autochunking failure.
    """
    # Substring of the error message used to recognise the autochunking failure.
    object_dtype_autochunk_msg = (
        'Can not use auto rechunking with object dtype. We are unable to estimate the size in bytes of object data'
    )
    try:
        return func(url, **xarray_open_kwargs)
    except NotImplementedError as exc:
        if object_dtype_autochunk_msg not in str(exc):
            # Some unrelated NotImplementedError: propagate it with its original traceback.
            raise
        # Build a fresh kwargs dict for the retry so the caller's dict (which may be reused
        # for other datasets) keeps its original `chunks` setting.
        fallback_kwargs = {**xarray_open_kwargs, 'chunks': {}}
        return func(url, **fallback_kwargs)


def _update_attrs(*, additional_attrs, ds):
additional_attrs = additional_attrs or {}
if additional_attrs:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ fsspec>=2024.12
intake>=2.0.0
itables
netCDF4>=1.5.5
pandas>=2.1.0
pandas>=2.1.0,<3.0.0
polars>=1.24.0,<1.33.0
pydantic>=2.0
pydap!=3.5.5
Expand Down
2 changes: 1 addition & 1 deletion tests/test_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def test_open_dataset_kerchunk(kerchunk_file=kerchunk_file):
# chunking
xarray_open_kwargs = _get_xarray_open_kwargs(
'reference',
dict(engine='zarr', consolidated=False, drop_variables='crs'),
dict(engine='zarr', consolidated=False),
storage_options={
'remote_protocol': 's3',
'remote_options': {'anon': True, 'asynchronous': _zarr_async()},
Expand Down
Loading