From ab8f91c436c97a631e48cb5675880d9c54a652d1 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 5 Feb 2026 11:09:05 +0800 Subject: [PATCH 1/2] - Add fallback to {} chunking strategy if dataset contains non-cftime object variables - Update test to ensure we fall back to working chunking strategy in case above - Change pandas requirements - pin to less than 3.0.0 --- ci/environment.yml | 2 +- intake_esm/source.py | 30 ++++++++++++++++++++++++++++-- requirements.txt | 2 +- tests/test_source.py | 2 +- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index 98fad538..3c54a113 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -16,7 +16,7 @@ dependencies: - itables - matplotlib - netcdf4 >=1.5.5,!=1.6.1 - - pandas >=2.1.0 + - pandas >=2.1.0, <3.0.0 - pip - polars>=1.24.0,<1.33.0 - pooch diff --git a/intake_esm/source.py b/intake_esm/source.py index 2104c513..4fb13f2d 100644 --- a/intake_esm/source.py +++ b/intake_esm/source.py @@ -104,9 +104,11 @@ def _open_dataset( # How should we handle concat_dim, and other xr.open_mfdataset kwargs? xarray_open_kwargs.update(preprocess=preprocess) xarray_open_kwargs.update(parallel=True) - ds = xr.open_mfdataset(url, **xarray_open_kwargs) + # ds = xr.open_mfdataset(url, **xarray_open_kwargs) + ds = _open_dataset_try_auto(url, xr.open_mfdataset, xarray_open_kwargs) else: - ds = xr.open_dataset(url, **xarray_open_kwargs) + # ds = xr.open_dataset(url, **xarray_open_kwargs) + ds = _open_dataset_try_auto(url, xr.open_dataset, xarray_open_kwargs) if preprocess is not None: ds = preprocess(ds) @@ -137,6 +139,30 @@ def _open_dataset( return ds +def _open_dataset_try_auto(url, func: typing.Callable, xarray_open_kwargs) -> xr.Dataset: + """ + Try to open a dataset with chunks set to auto. If we fail because dask doesn't know how to chunk + it, set chunks to `{}` and retry. 
Handles cases where datasets contain things like string variables, + which can't be autochunked, as object-dtype autochunking is restricted to cftime arrays only. + + Attempting to autochunk, rather than always using disk chunks (`{}`), is advantageous as it is generally + quite a lot more performant. Unfortunately, there is no straightforward way to detect which variables + within a dataset can and can't be autochunked without opening it. + """ + try: + ds = func(url, **xarray_open_kwargs) + except NotImplementedError as exc: + if ( + 'Can not use auto rechunking with object dtype. We are unable to estimate the size in bytes of object data' + in str(exc) + ): + xarray_open_kwargs['chunks'] = {} + ds = func(url, **xarray_open_kwargs) + else: + raise exc + return ds + + def _update_attrs(*, additional_attrs, ds): additional_attrs = additional_attrs or {} if additional_attrs: diff --git a/requirements.txt b/requirements.txt index 95039357..70b5c288 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ fsspec>=2024.12 intake>=2.0.0 itables netCDF4>=1.5.5 -pandas>=2.1.0 +pandas>=2.1.0,<3.0.0 polars>=1.24.0,<1.33.0 pydantic>=2.0 pydap!=3.5.5 diff --git a/tests/test_source.py b/tests/test_source.py index 32e47ae9..c4e3faa6 100644 --- a/tests/test_source.py +++ b/tests/test_source.py @@ -95,7 +95,7 @@ def test_open_dataset_kerchunk(kerchunk_file=kerchunk_file): # chunking xarray_open_kwargs = _get_xarray_open_kwargs( 'reference', - dict(engine='zarr', consolidated=False, drop_variables='crs'), + dict(engine='zarr', consolidated=False), storage_options={ 'remote_protocol': 's3', 'remote_options': {'anon': True, 'asynchronous': _zarr_async()}, From b8a9da8d67f766f5d1adf5a08c014918543d7ea6 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Tue, 17 Mar 2026 09:26:06 +0800 Subject: [PATCH 2/2] Pin pandas in `ci/environment-upstream-dev.yml` --- ci/environment-upstream-dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment-upstream-dev.yml 
b/ci/environment-upstream-dev.yml index d6c3834d..d8e11285 100644 --- a/ci/environment-upstream-dev.yml +++ b/ci/environment-upstream-dev.yml @@ -15,7 +15,7 @@ dependencies: - itables - matplotlib - netcdf4 >=1.5.5,!=1.6.1 - - pandas >=2.1.0 + - pandas >=2.1.0,<3.0.0 - pip - polars>=1.24.0,<1.33.0 - pooch