From bebdfe9fe340b44e24c8bc62b930c9f2ed63024d Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 17 Oct 2025 16:06:14 -0400 Subject: [PATCH 1/4] iterable_columns in spec --- CHANGELOG.md | 6 ++++++ docs/source/reference/esm-catalog-spec.md | 1 + intake_esm/cat.py | 11 +++++++++++ intake_esm/core.py | 7 +++++++ 4 files changed, 25 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a887dd72..48857b15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ [Full Changelog](https://github.com/intake/intake-esm/compare/v2025.2.3...v2025.7.9) +## vUNRELEASED + +### New features added + +- New ``iterable_columns`` field in the ESM catalog spec to specify which columns should be read as iterables (tuples). Deprecates argument ``columns_with_iterable`` of the ``esm_datastore`` by @aulemahal in https://github.com/intake/intake-esm/pull/752 + ## v2025.7.9 ### New features added diff --git a/docs/source/reference/esm-catalog-spec.md b/docs/source/reference/esm-catalog-spec.md index 44c44d3b..7ba69665 100644 --- a/docs/source/reference/esm-catalog-spec.md +++ b/docs/source/reference/esm-catalog-spec.md @@ -67,6 +67,7 @@ They should be either [URIs](https://en.wikipedia.org/wiki/Uniform_Resource_Iden | description | string | **REQUIRED.** Detailed multi-line description to fully explain the catalog. [CommonMark 0.28](http://commonmark.org/) syntax MAY be used for rich text representation. | | catalog_file | string | **REQUIRED.** Path to a the CSV file with the catalog contents. | | catalog_dict | array | If specified, it is mutually exclusive with `catalog_file`. An array of dictionaries that represents the data that would otherwise be in the csv. | +| iterable_columns | array | A list of column names that contain iterable values instead of scalar ones. | | attributes | [[Attribute Object](#attribute-object)] | **REQUIRED.** A list of attribute columns in the data set. 
| | assets | [Assets Object](#assets-object) | **REQUIRED.** Description of how the assets (data files) are referenced in the CSV catalog file. | | aggregation_control | [Aggregation Control Object](#aggregation-control-object) | **OPTIONAL.** Description of how to support aggregation of multiple assets into a single xarray data set. | diff --git a/intake_esm/cat.py b/intake_esm/cat.py index 662fd43e..f461423e 100644 --- a/intake_esm/cat.py +++ b/intake_esm/cat.py @@ -1,5 +1,6 @@ from __future__ import annotations +import ast import builtins import datetime import enum @@ -115,6 +116,7 @@ class ESMCatalogModel(pydantic.BaseModel): id: str = '' catalog_dict: list[dict] | None = None catalog_file: pydantic.StrictStr | None = None + iterable_columns: set[pydantic.StrictStr] | None = None description: pydantic.StrictStr | None = None title: pydantic.StrictStr | None = None last_updated: datetime.datetime | datetime.date | None = None @@ -320,6 +322,15 @@ def _df_from_file( csv_path = f'{os.path.dirname(_mapper.root)}/{cat.catalog_file}' cat.catalog_file = csv_path + if self.iterable_columns: + converter = ast.literal_eval + read_kwargs.setdefault('converters', {}) + for col in self.iterable_columns: + if read_kwargs['converters'].setdefault(col, converter) != converter: + raise ValueError( + f"Cannot provide converter for '{col}' via `read_kwargs` when '{col}' is also specified in `iterable_columns`" + ) + reader = CatalogFileDataReader(cat.catalog_file, storage_options, **read_kwargs) self._iterable_dtype_map = reader.dtype_map return reader.frames diff --git a/intake_esm/core.py b/intake_esm/core.py index 26b72d5b..31818822 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -120,6 +120,13 @@ def __init__( read_kwargs = read_kwargs or {} if columns_with_iterables: + warnings.warn( + "columns_with_iterables is deprecated as an argument to esm_datastore " + "and will be removed in a future version. 
Please set 'iterable_columns' " r"in the catalog's json definition or pass read_kwargs={'converters': {COL: ast.literal_eval}}.", DeprecationWarning, stacklevel=2 + ) converter = ast.literal_eval read_kwargs.setdefault('converters', {}) for col in columns_with_iterables: From e083d380645efeaf65141186eee474ab1779a28d Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 17 Oct 2025 16:22:01 -0400 Subject: [PATCH 2/4] Add a test --- CHANGELOG.md | 2 +- intake_esm/core.py | 4 ++-- tests/test_core.py | 2 ++ tests/utils.py | 3 +++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 48857b15..ccbb577d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ ### New features added -- New ``iterable_columns`` field in the ESM catalog spec to specify which columns should be read as iterables (tuples). Deprecates argument ``columns_with_iterable`` of the ``esm_datastore`` by @aulemahal in https://github.com/intake/intake-esm/pull/752 +- New `iterable_columns` field in the ESM catalog spec to specify which columns should be read as iterables (tuples). Deprecates argument `columns_with_iterables` of the `esm_datastore` by @aulemahal in https://github.com/intake/intake-esm/pull/752 ## v2025.7.9 diff --git a/intake_esm/core.py b/intake_esm/core.py index 31818822..79e7ea18 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -121,11 +121,11 @@ def __init__( read_kwargs = read_kwargs or {} if columns_with_iterables: warnings.warn( - "columns_with_iterables is deprecated as an argument to esm_datastore " + 'columns_with_iterables is deprecated as an argument to esm_datastore ' "and will be removed in a future version. 
Please set 'iterable_columns' " r"in the catalog's json definition or pass read_kwargs={'converters': {COL: ast.literal_eval}}.", DeprecationWarning, - stacklevel=2 + stacklevel=2, ) converter = ast.literal_eval read_kwargs.setdefault('converters', {}) diff --git a/tests/test_core.py b/tests/test_core.py index 6d4f5bc5..cb24442f 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -28,6 +28,7 @@ cdf_cat_sample_cmip6_noagg, mixed_cat_sample_cmip6, multi_variable_cat, + multi_variable_hard_cat, opendap_cat_sample_noaa, sample_df, sample_esmcat_data, @@ -158,6 +159,7 @@ def test_catalog_init_back_compat(capsys, obj, sep, read_kwargs, read_csv_kwargs [ (multi_variable_cat, {'converters': {'variable': ast.literal_eval}}, None), (multi_variable_cat, None, ['variable']), + (multi_variable_hard_cat, None, None), ], ) def test_columns_with_iterables(capsys, obj, read_kwargs, columns_with_iterables): diff --git a/tests/utils.py b/tests/utils.py index 7096f1b9..e8e78ffd 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -7,6 +7,9 @@ zarr_cat_pangeo_cmip6 = 'https://storage.googleapis.com/cmip6/pangeo-cmip6.json' cdf_cat_sample_cmip6 = os.path.join(here, 'sample-catalogs/cmip6-netcdf.json') multi_variable_cat = os.path.join(here, 'sample-catalogs/multi-variable-catalog.json') +multi_variable_hard_cat = os.path.join( + here, 'sample-catalogs/multi-variable-hardcoded-catalog.json' +) cdf_cat_sample_cmip5 = os.path.join(here, 'sample-catalogs/cmip5-netcdf.json') cdf_cat_sample_cmip5_pq = os.path.join(here, 'sample-catalogs/cmip5-netcdf-parquet.json') cdf_cat_sample_cesmle = os.path.join(here, 'sample-catalogs/cesm1-lens-netcdf.json') From c83923d839cd79aaf0e54e467c79c2320810ef12 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 17 Oct 2025 16:29:58 -0400 Subject: [PATCH 3/4] Remove deprecation add note to docstring --- intake_esm/core.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/intake_esm/core.py b/intake_esm/core.py index 
79e7ea18..167b90e7 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -59,6 +59,7 @@ class esm_datastore(Catalog): A list of columns in the csv file containing iterables. Values in columns specified here will be converted with `ast.literal_eval` when :py:func:`~pandas.read_csv` is called (i.e., this is a shortcut to passing converters to `read_kwargs`). + Catalogs might also have such columns configured in their ``iterable_columns`` field. storage_options : dict, optional Parameters passed to the backend file-system such as Google Cloud Storage, Amazon Web Service S3. @@ -120,13 +121,6 @@ def __init__( read_kwargs = read_kwargs or {} if columns_with_iterables: - warnings.warn( - 'columns_with_iterables is deprecated as an argument to esm_datastore ' - "and will be removed in a future version. Please set 'iterable_columns' " - r"in the catalog's json definition or pass read_kwargs={'converters': {COL: ast.literal_eval}}.", - DeprecationWarning, - stacklevel=2, - ) converter = ast.literal_eval read_kwargs.setdefault('converters', {}) for col in columns_with_iterables: From 06997681108f867fa1188cc194470e6330e8bd36 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 17 Oct 2025 17:09:09 -0400 Subject: [PATCH 4/4] add new test json --- .../multi-variable-hardcoded-catalog.json | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 tests/sample-catalogs/multi-variable-hardcoded-catalog.json diff --git a/tests/sample-catalogs/multi-variable-hardcoded-catalog.json b/tests/sample-catalogs/multi-variable-hardcoded-catalog.json new file mode 100644 index 00000000..1b9fa393 --- /dev/null +++ b/tests/sample-catalogs/multi-variable-hardcoded-catalog.json @@ -0,0 +1,55 @@ +{ + "esmcat_version": "0.1.0", + "id": "sample-multi-variable-cesm1-lens", + "description": "This is a sample ESM catalog emulating multi variable/history files for CESM1-LENS", + "catalog_file": "multi-variable-catalog.csv", + "iterable_columns": ["variable"], + 
"attributes": [ + { + "column_name": "experiment", + "vocabulary": "" + }, + { + "column_name": "case", + "vocabulary": "" + }, + { + "column_name": "component", + "vocabulary": "" + }, + { + "column_name": "stream", + "vocabulary": "" + }, + { "column_name": "variable", "vocabulary": "" }, + { + "column_name": "member_id", + "vocabulary": "" + } + ], + "assets": { + "column_name": "path", + "format": "netcdf" + }, + + "aggregation_control": { + "variable_column_name": "variable", + "groupby_attrs": ["component", "experiment", "stream"], + "aggregations": [ + { + "type": "join_new", + "attribute_name": "member_id", + "options": { "coords": "minimal", "compat": "override" } + }, + { + "type": "join_existing", + "attribute_name": "time_range", + "options": { "dim": "time" } + }, + { + "type": "union", + "attribute_name": "variable" + } + ] + } +}