diff --git a/ci/environment-upstream-dev.yml b/ci/environment-upstream-dev.yml index d6c3834d..d8e11285 100644 --- a/ci/environment-upstream-dev.yml +++ b/ci/environment-upstream-dev.yml @@ -15,7 +15,7 @@ dependencies: - itables - matplotlib - netcdf4 >=1.5.5,!=1.6.1 - - pandas >=2.1.0 + - pandas >=2.1.0,<3.0.0 - pip - polars>=1.24.0,<1.33.0 - pooch diff --git a/ci/environment.yml b/ci/environment.yml index 98fad538..3c54a113 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -16,7 +16,7 @@ dependencies: - itables - matplotlib - netcdf4 >=1.5.5,!=1.6.1 - - pandas >=2.1.0 + - pandas >=2.1.0, <3.0.0 - pip - polars>=1.24.0,<1.33.0 - pooch diff --git a/intake_esm/source.py b/intake_esm/source.py index 2104c513..4fb13f2d 100644 --- a/intake_esm/source.py +++ b/intake_esm/source.py @@ -104,9 +104,11 @@ def _open_dataset( # How should we handle concat_dim, and other xr.open_mfdataset kwargs? xarray_open_kwargs.update(preprocess=preprocess) xarray_open_kwargs.update(parallel=True) - ds = xr.open_mfdataset(url, **xarray_open_kwargs) + # ds = xr.open_mfdataset(url, **xarray_open_kwargs) + ds = _open_dataset_try_auto(url, xr.open_mfdataset, xarray_open_kwargs) else: - ds = xr.open_dataset(url, **xarray_open_kwargs) + # ds = xr.open_dataset(url, **xarray_open_kwargs) + ds = _open_dataset_try_auto(url, xr.open_dataset, xarray_open_kwargs) if preprocess is not None: ds = preprocess(ds) @@ -137,6 +139,30 @@ def _open_dataset( return ds +def _open_dataset_try_auto(url, func: typing.Callable, xarray_open_kwargs) -> xr.Dataset: + """ + Try to open a dataset with chunks set to auto. If we fail because dask doesn't know how to chunk + it, set chunks to `{}` and retry. Handles cases where datasets contain things like string variables, + which can't be autochunked, as that's restricted to cftime arrays only. + + Attempting to autochunk, rather than always using disk chunks (`{}`) is advantageous as it is generally + quite a lot more performant. Unfortunately, there is no straightforward way to detect which variables + within a dataset can and can't be autochunked without opening it. + """ + try: + ds = func(url, **xarray_open_kwargs) + except NotImplementedError as exc: + if ( + 'Can not use auto rechunking with object dtype. We are unable to estimate the size in bytes of object data' + in str(exc) + ): + xarray_open_kwargs['chunks'] = {} + ds = func(url, **xarray_open_kwargs) + else: + raise exc + return ds + + def _update_attrs(*, additional_attrs, ds): additional_attrs = additional_attrs or {} if additional_attrs: diff --git a/requirements.txt b/requirements.txt index 95039357..70b5c288 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ fsspec>=2024.12 intake>=2.0.0 itables netCDF4>=1.5.5 -pandas>=2.1.0 +pandas>=2.1.0,<3.0.0 polars>=1.24.0,<1.33.0 pydantic>=2.0 pydap!=3.5.5 diff --git a/tests/test_source.py b/tests/test_source.py index 32e47ae9..c4e3faa6 100644 --- a/tests/test_source.py +++ b/tests/test_source.py @@ -95,7 +95,7 @@ def test_open_dataset_kerchunk(kerchunk_file=kerchunk_file): # chunking xarray_open_kwargs = _get_xarray_open_kwargs( 'reference', - dict(engine='zarr', consolidated=False, drop_variables='crs'), + dict(engine='zarr', consolidated=False), storage_options={ 'remote_protocol': 's3', 'remote_options': {'anon': True, 'asynchronous': _zarr_async()},