diff --git a/CHANGELOG.md b/CHANGELOG.md index a887dd72..ccbb577d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ [Full Changelog](https://github.com/intake/intake-esm/compare/v2025.2.3...v2025.7.9) +## vUNRELEASED + +### New features added + +- New `iterable_columns` field in the ESM catalog spec to specify which columns should be read as iterables (tuples). Deprecates argument `columns_with_iterables` of the `esm_datastore` by @aulemahal in https://github.com/intake/intake-esm/pull/752 + ## v2025.7.9 ### New features added diff --git a/docs/source/reference/esm-catalog-spec.md b/docs/source/reference/esm-catalog-spec.md index 44c44d3b..7ba69665 100644 --- a/docs/source/reference/esm-catalog-spec.md +++ b/docs/source/reference/esm-catalog-spec.md @@ -67,6 +67,7 @@ They should be either [URIs](https://en.wikipedia.org/wiki/Uniform_Resource_Iden | description | string | **REQUIRED.** Detailed multi-line description to fully explain the catalog. [CommonMark 0.28](http://commonmark.org/) syntax MAY be used for rich text representation. | | catalog_file | string | **REQUIRED.** Path to a the CSV file with the catalog contents. | | catalog_dict | array | If specified, it is mutually exclusive with `catalog_file`. An array of dictionaries that represents the data that would otherwise be in the csv. | +| iterable_columns | array | A list of column names that contain iterable values instead of scalar ones. | | attributes | [[Attribute Object](#attribute-object)] | **REQUIRED.** A list of attribute columns in the data set. | | assets | [Assets Object](#assets-object) | **REQUIRED.** Description of how the assets (data files) are referenced in the CSV catalog file. | | aggregation_control | [Aggregation Control Object](#aggregation-control-object) | **OPTIONAL.** Description of how to support aggregation of multiple assets into a single xarray data set. 
| diff --git a/intake_esm/cat.py b/intake_esm/cat.py index 662fd43e..f461423e 100644 --- a/intake_esm/cat.py +++ b/intake_esm/cat.py @@ -1,5 +1,6 @@ from __future__ import annotations +import ast import builtins import datetime import enum @@ -115,6 +116,7 @@ class ESMCatalogModel(pydantic.BaseModel): id: str = '' catalog_dict: list[dict] | None = None catalog_file: pydantic.StrictStr | None = None + iterable_columns: set[pydantic.StrictStr] | None = None description: pydantic.StrictStr | None = None title: pydantic.StrictStr | None = None last_updated: datetime.datetime | datetime.date | None = None @@ -320,6 +322,15 @@ def _df_from_file( csv_path = f'{os.path.dirname(_mapper.root)}/{cat.catalog_file}' cat.catalog_file = csv_path + if self.iterable_columns: + converter = ast.literal_eval + read_kwargs.setdefault('converters', {}) + for col in self.iterable_columns: + if read_kwargs['converters'].setdefault(col, converter) != converter: + raise ValueError( + f"Cannot provide converter for '{col}' via `read_kwargs` when '{col}' is also specified in `iterable_columns`" + ) + reader = CatalogFileDataReader(cat.catalog_file, storage_options, **read_kwargs) self._iterable_dtype_map = reader.dtype_map return reader.frames diff --git a/intake_esm/core.py b/intake_esm/core.py index 26b72d5b..167b90e7 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -59,6 +59,7 @@ class esm_datastore(Catalog): A list of columns in the csv file containing iterables. Values in columns specified here will be converted with `ast.literal_eval` when :py:func:`~pandas.read_csv` is called (i.e., this is a shortcut to passing converters to `read_kwargs`). + Catalogs might also have such columns configured in their ``iterable_columns`` field. storage_options : dict, optional Parameters passed to the backend file-system such as Google Cloud Storage, Amazon Web Service S3. 
diff --git a/tests/sample-catalogs/multi-variable-hardcoded-catalog.json b/tests/sample-catalogs/multi-variable-hardcoded-catalog.json new file mode 100644 index 00000000..1b9fa393 --- /dev/null +++ b/tests/sample-catalogs/multi-variable-hardcoded-catalog.json @@ -0,0 +1,55 @@ +{ + "esmcat_version": "0.1.0", + "id": "sample-multi-variable-cesm1-lens", + "description": "This is a sample ESM catalog emulating multi variable/history files for CESM1-LENS", + "catalog_file": "multi-variable-catalog.csv", + "iterable_columns": ["variable"], + "attributes": [ + { + "column_name": "experiment", + "vocabulary": "" + }, + { + "column_name": "case", + "vocabulary": "" + }, + { + "column_name": "component", + "vocabulary": "" + }, + { + "column_name": "stream", + "vocabulary": "" + }, + { "column_name": "variable", "vocabulary": "" }, + { + "column_name": "member_id", + "vocabulary": "" + } + ], + "assets": { + "column_name": "path", + "format": "netcdf" + }, + + "aggregation_control": { + "variable_column_name": "variable", + "groupby_attrs": ["component", "experiment", "stream"], + "aggregations": [ + { + "type": "join_new", + "attribute_name": "member_id", + "options": { "coords": "minimal", "compat": "override" } + }, + { + "type": "join_existing", + "attribute_name": "time_range", + "options": { "dim": "time" } + }, + { + "type": "union", + "attribute_name": "variable" + } + ] + } +} diff --git a/tests/test_core.py b/tests/test_core.py index 6d4f5bc5..cb24442f 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -28,6 +28,7 @@ cdf_cat_sample_cmip6_noagg, mixed_cat_sample_cmip6, multi_variable_cat, + multi_variable_hard_cat, opendap_cat_sample_noaa, sample_df, sample_esmcat_data, @@ -158,6 +159,7 @@ def test_catalog_init_back_compat(capsys, obj, sep, read_kwargs, read_csv_kwargs [ (multi_variable_cat, {'converters': {'variable': ast.literal_eval}}, None), (multi_variable_cat, None, ['variable']), + (multi_variable_hard_cat, None, None), ], ) def 
test_columns_with_iterables(capsys, obj, read_kwargs, columns_with_iterables): diff --git a/tests/utils.py b/tests/utils.py index 7096f1b9..e8e78ffd 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -7,6 +7,9 @@ zarr_cat_pangeo_cmip6 = 'https://storage.googleapis.com/cmip6/pangeo-cmip6.json' cdf_cat_sample_cmip6 = os.path.join(here, 'sample-catalogs/cmip6-netcdf.json') multi_variable_cat = os.path.join(here, 'sample-catalogs/multi-variable-catalog.json') +multi_variable_hard_cat = os.path.join( + here, 'sample-catalogs/multi-variable-hardcoded-catalog.json' +) cdf_cat_sample_cmip5 = os.path.join(here, 'sample-catalogs/cmip5-netcdf.json') cdf_cat_sample_cmip5_pq = os.path.join(here, 'sample-catalogs/cmip5-netcdf-parquet.json') cdf_cat_sample_cesmle = os.path.join(here, 'sample-catalogs/cesm1-lens-netcdf.json')