From 1b2f68c1d408aac9971ccf66e59922f06aa03baf Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 23 May 2025 18:44:41 +0200 Subject: [PATCH 001/162] Add cli, xarray and extractions --- isimip_utils/cli.py | 28 +++++++ isimip_utils/exceptions.py | 6 ++ isimip_utils/extractions.py | 156 ++++++++++++++++++++++++++++++++++++ isimip_utils/xarray.py | 80 ++++++++++++++++++ pyproject.toml | 13 ++- 5 files changed, 279 insertions(+), 4 deletions(-) create mode 100644 isimip_utils/cli.py create mode 100644 isimip_utils/extractions.py create mode 100644 isimip_utils/xarray.py diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py new file mode 100644 index 0000000..ff52c54 --- /dev/null +++ b/isimip_utils/cli.py @@ -0,0 +1,28 @@ +import logging +from pathlib import Path + +from dotenv import load_dotenv +from rich.logging import RichHandler + + +def setup_env(): + load_dotenv(Path().cwd() / '.env') + + +def setup_logs(log_level='WARN', log_file=None): + log_level = log_level.upper() + + root_logger = logging.getLogger() + root_logger.setLevel(log_level) + + rich_handler = RichHandler() + rich_handler.setLevel(log_level) + + root_logger.addHandler(RichHandler()) + + if log_file is not None: + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(log_level) + file_handler.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s %(name)s: %(message)s')) + + root_logger.addHandler(file_handler) diff --git a/isimip_utils/exceptions.py b/isimip_utils/exceptions.py index 6019e6f..031f4d7 100644 --- a/isimip_utils/exceptions.py +++ b/isimip_utils/exceptions.py @@ -1,3 +1,9 @@ +class ExtractionError(RuntimeError): + pass + +class ValidationError(RuntimeError): + pass + class DidNotMatch(RuntimeError): pass diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py new file mode 100644 index 0000000..4b1fbab --- /dev/null +++ b/isimip_utils/extractions.py @@ -0,0 +1,156 @@ +import logging + +import cftime +import numpy as np +import xarray as xr + 
+from isimip_utils.exceptions import ExtractionError, ValidationError + +logger = logging.getLogger(__name__) + + +def select_time(ds, timestamp): + logger.info(f'select time time={timestamp}') + return ds.sel(time=compute_time(ds, timestamp), method='nearest') + + +def select_period(ds, start, end): + logger.info(f'select period start={start} end={end}') + units = ds.coords['time'].attrs['units'] + calendar = ds.coords['time'].attrs['calendar'] + + start_time = cftime.date2num(start, units=units, calendar=calendar) if start else None + end_time = cftime.date2num(end, units=units, calendar=calendar) if end else None + + ds = ds.sel(time=slice(start_time, end_time)) + + if 'time' not in ds.sizes: + raise ExtractionError('No time axis remains after selecting period.') + + return ds + + +def select_point(ds, lat, lon): + logger.info(f'select point lat={lat} lon={lon}') + validate_lat(lat) + validate_lon(lon) + return ds.sel(lat=lat, lon=lon, method='nearest') + + +def select_bbox(ds, west, east, south, north): + logger.info(f'cutout bbox west={west} east={east} south={south} east={north}') + + validate_lat(south) + validate_lat(north) + validate_lon(west) + validate_lon(east) + + lat_slice = slice(north, south) if ds.lon.values[1] > ds.lon.values[0] else slice(south, north) + lon_slice = slice(west, east) + + ds = ds.sel(lat=lat_slice, lon=lon_slice) + + if 'lat' not in ds.sizes: + raise ExtractionError('No lat axis remains after selecting bbox.') + elif 'lon' not in ds.sizes: + raise ExtractionError('No lon axis remains after selecting bbox.') + + return ds + + +def mask_bbox(ds, west, east, south, north): + logger.info(f'cutout bbox west={west} east={east} south={south} east={north}') + + validate_lat(south) + validate_lat(north) + validate_lon(west) + validate_lon(east) + + lat = ds['lat'] + lon = ds['lon'] + + if west > east: + lon_mask = (lon >= west) | (lon <= east) + else: + lon_mask = (lon >= west) & (lon <= east) + + lat_mask = (lat >= south) & (lat <= north) 
+ + mask = lat_mask & lon_mask + + ds = ds.where(mask) + + return ds + + +def mask_mask(ds, mask_ds, mask_var='mask'): + logger.info('select mask') + return ds.where(mask_ds[mask_var] == 1) + + +def compute_mean(ds, weights=None): + logger.info('compute mean') + if weights is None: + logger.warn('no weights provided, using latitude-dependent weights') + weights = np.sin(np.deg2rad(ds.lat + 0.25)) - np.sin(np.deg2rad(ds.lat - 0.25)) + return ds.weighted(weights).mean(dim=('lat', 'lon'), skipna=True).astype(np.float32) + + +def count_values(ds): + logger.info('count values') + return ds.count(dim=('lat', 'lon')).astype(np.float32) + + +def copy_attrs(ds1, ds2): + ds2['time'].attrs = ds1['time'].attrs + for var in ds1.data_vars: + ds2[var].attrs = ds1[var].attrs + return ds2 + + +def concat_extraction(ds1, ds2): + if ds1 is None: + return ds2.copy() + elif 'time' not in ds2.sizes: + return ds1 + else: + # apply offset when time units or calendar diverges + offset = compute_offset(ds1, ds2) + if offset is not None: + ds2 = ds2.assign_coords(time=ds2['time'] + offset) + + return xr.concat([ds1, ds2], 'time') + + +def compute_time(ds, timestamp): + units = ds.coords['time'].attrs['units'] + calendar = ds.coords['time'].attrs['calendar'] + return cftime.date2num(timestamp, units=units, calendar=calendar) if timestamp else None + + +def compute_offset(ds1, ds2): + units1 = ds1.coords['time'].attrs['units'] + units2 = ds2.coords['time'].attrs['units'] + calendar1 = ds1.coords['time'].attrs['calendar'] + calendar2 = ds2.coords['time'].attrs['calendar'] + + if units1 != units2 or calendar1 != calendar2: + start_time = ds2['time'][0] + start_date = cftime.num2date(start_time, units=units2, calendar=calendar2) + offset = cftime.date2num(start_date, units=units1, calendar=calendar1) - start_time + logger.debug(f'time axis diverges "{units1}"/"{units2}" "{calendar1}"/"{calendar2}" offset={offset.values}') + return offset + + +def validate_lat(lat): + if lat < -90: + raise 
ValidationError(f'lat={lat} must be > -90') + elif lat > 90: + raise ValidationError(f'lat={lat} must be < 90') + + +def validate_lon(lon): + if lon < -180: + raise ValidationError(f'lon={lon} must be > -180') + elif lon > 180: + raise ValidationError(f'lon={lon} must be < 180') diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py new file mode 100644 index 0000000..d331cbd --- /dev/null +++ b/isimip_utils/xarray.py @@ -0,0 +1,80 @@ +import logging + +import cftime +import numpy as np +import xarray as xr + +logger = logging.getLogger(__name__) + + +def open_dataset(path, decode_cf=False, load=False): + if not load: + logger.info(f'open {path.absolute()}') + else: + logger.info(f'load {path.absolute()}') + + try: + ds = xr.open_dataset(path, decode_cf=decode_cf) + except ValueError: + # workaround for non standard times (e.g. growing seasons) + ds = xr.open_dataset(path, decode_cf=decode_cf, decode_times=False) + + if ds['time'].units.startswith('growing seasons'): + units = ds['time'].units.replace('growing seasons', 'common_years') + times = cftime.num2date(ds['time'], units, calendar='365_day') + ds['time'] = times + + if load: + ds.load() + + return ds + + +def load_dataset(path, decode_cf=False): + return open_dataset(path, decode_cf=False, load=True) + + +def write_dataset(ds, path): + logger.info(f'write {path.absolute()}') + path.parent.mkdir(exist_ok=True, parents=True) + + for coord in ds.coords: + ds.coords[coord].attrs["_FillValue"] = 1.e+20 + + for var in ds.data_vars: + ds.data_vars[var].attrs["_FillValue"] = 1.e+20 + + # reorder the variables + ds = ds[[*ds.coords, *ds.data_vars]] + + ds.to_netcdf(path, format='NETCDF4_CLASSIC') + + +def get_var_name(ds): + return next(iter(ds.data_vars)) + + +def convert_to_dataframe(ds): + ds.coords['time'] = ds.coords['time'].astype('datetime64[ns]') + return ds.to_dataframe().reset_index() + + +def create_mask(ds, df, layer): + import shapely.geometry + logger.info('create mask') + + df_row = 
df.iloc[layer] + geometry = shapely.geometry.mapping(df_row['geometry']) + + ds_lat = ds.coords['lat'] + ds_lon = ds.coords['lon'] + mask_ds = xr.Dataset( + data_vars={ + 'mask': (('lat', 'lon'), np.ones((ds_lat.size, ds_lon.size), dtype=np.float32)) + }, + coords={'lat': ds_lat, 'lon': ds_lon} + ) + mask_ds.rio.write_crs(df.crs, inplace=True) + mask_ds = mask_ds.rio.clip([geometry], drop=False) + mask_ds = mask_ds.drop_vars('spatial_ref') + return mask_ds diff --git a/pyproject.toml b/pyproject.toml index 165fc61..b55bf06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,16 +13,21 @@ license = { file = "LICENSE" } classifiers = [ 'Operating System :: OS Independent', 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', ] dependencies = [ - "colorlog", + "shapely", + "geopandas", "netCDF4", "python-dotenv", - "requests" + "requests", + "rich", + "rioxarray", + "xarray", ] dynamic = ["version"] @@ -44,7 +49,7 @@ packages = ["isimip_utils"] version = { attr = "isimip_utils.__version__" } [tool.ruff] -target-version = "py38" +target-version = "py39" line-length = 120 select = [ "B", # flake8-bugbear From c97f06e22abf5af757ed4da47d1f08539d8a0118 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 27 May 2025 23:23:41 +0200 Subject: [PATCH 002/162] Add plot and get/set_attrs --- isimip_utils/extractions.py | 18 ++++---- isimip_utils/plot.py | 83 +++++++++++++++++++++++++++++++++++++ isimip_utils/xarray.py | 30 ++++++++++++++ pyproject.toml | 15 ++++++- 4 files changed, 135 insertions(+), 11 deletions(-) create mode 100644 isimip_utils/plot.py diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py index 4b1fbab..1717353 100644 --- a/isimip_utils/extractions.py +++ b/isimip_utils/extractions.py @@ -11,7 
+11,12 @@ def select_time(ds, timestamp): logger.info(f'select time time={timestamp}') - return ds.sel(time=compute_time(ds, timestamp), method='nearest') + time = compute_time(ds, timestamp) + if time < 0 or time > ds['time'].max(): + logger.warn(f'Selected time={time} is outside the dataset.') + return None + else: + return ds.sel(time=time, method='nearest') def select_period(ds, start, end): @@ -84,15 +89,17 @@ def mask_bbox(ds, west, east, south, north): def mask_mask(ds, mask_ds, mask_var='mask'): - logger.info('select mask') + logger.info('mask mask') return ds.where(mask_ds[mask_var] == 1) def compute_mean(ds, weights=None): logger.info('compute mean') + if weights is None: logger.warn('no weights provided, using latitude-dependent weights') weights = np.sin(np.deg2rad(ds.lat + 0.25)) - np.sin(np.deg2rad(ds.lat - 0.25)) + return ds.weighted(weights).mean(dim=('lat', 'lon'), skipna=True).astype(np.float32) @@ -101,13 +108,6 @@ def count_values(ds): return ds.count(dim=('lat', 'lon')).astype(np.float32) -def copy_attrs(ds1, ds2): - ds2['time'].attrs = ds1['time'].attrs - for var in ds1.data_vars: - ds2[var].attrs = ds1[var].attrs - return ds2 - - def concat_extraction(ds1, ds2): if ds1 is None: return ds2.copy() diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py new file mode 100644 index 0000000..8e5b323 --- /dev/null +++ b/isimip_utils/plot.py @@ -0,0 +1,83 @@ +import logging + +import altair as alt +import numpy as np + +from isimip_utils.xarray import convert_to_dataframe, get_var_name, get_var_units + +logger = logging.getLogger(__name__) + + +def enable_vegafusion(): + alt.data_transformers.enable('vegafusion') + + +def plot_line(ds, title=None): + var_name = get_var_name(ds) + var_units = get_var_units(ds) + var_title = f'{var_name} [{var_units}]' + + df = convert_to_dataframe(ds) + + return alt.Chart(df).mark_line().encode( + alt.X( + 'time:T', + title='time' + ), + alt.Y( + f'{var_name}:Q', + title=var_title, + ) + ).properties( + title=title, 
+ ) + +def plot_map(ds, scale_factor=1, bin_size=1, color_bin=None, color_scale=None): + lon_size = len(ds['lon']) + lat_size = len(ds['lat']) + + lon_bin = float(abs(ds['lon'][1] - ds['lon'][0])) * bin_size + lat_bin = float(abs(ds['lat'][1] - ds['lat'][0])) * bin_size + + lon_domain = float(min(ds['lon']) - 0.5 * lon_bin), float(max(ds['lon']) + 0.5 * lon_bin) + lat_domain = float(min(ds['lat']) - 0.5 * lat_bin), float(max(ds['lat']) + 0.5 * lat_bin) + + lon_ticks = np.linspace(*lon_domain, num=7) + lat_ticks = np.linspace(*lat_domain, num=5) + + width = scale_factor * lon_size + height = scale_factor * lat_size + + var_name = get_var_name(ds) + var_units = get_var_units(ds) + title = f'{var_name} [{var_units}]' + + logger.info(f'plot map title="{title}" size=({width}, {height})') + + df = convert_to_dataframe(ds) + + return alt.Chart(df).mark_rect().encode( + alt.X( + 'lon:Q', + title='lon', + bin=alt.Bin(step=lon_bin), + axis=alt.Axis(values=lon_ticks), + scale=alt.Scale(domain=lon_domain, padding=0, round=True) + ), + alt.Y( + 'lat:Q', + title='lat', + bin=alt.Bin(step=lat_bin), + axis=alt.Axis(values=lat_ticks), + scale=alt.Scale(domain=lat_domain, padding=0, round=True) + ), + alt.Color( + f'{var_name}:Q', + title=title, + bin=color_bin, + scale=color_scale or alt.Scale() + ) + ).properties( + width=width, + height=height + ) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index d331cbd..1f13ca0 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -54,11 +54,41 @@ def get_var_name(ds): return next(iter(ds.data_vars)) +def get_var_units(ds): + var_name = get_var_name(ds) + return ds[var_name].units + + +def get_attrs(ds): + attrs = {} + for coord in ds.coords: + attrs[coord] = ds[coord].attrs + for data_var in ds.data_vars: + attrs[data_var] = ds[data_var].attrs + return attrs + + +def set_attrs(ds, attrs): + for coord in ds.coords: + if coord in attrs: + ds[coord].attrs = attrs[coord] + for data_var in ds.data_vars: + if data_var 
in attrs: + ds[data_var].attrs = attrs[data_var] + + def convert_to_dataframe(ds): ds.coords['time'] = ds.coords['time'].astype('datetime64[ns]') return ds.to_dataframe().reset_index() +def apply_fill_value(ds): + for var in ds.data_vars: + fill_value = ds[var].attrs.get('_FillValue', 1e+20) + ds[var] = ds[var].where(ds[var] != fill_value) + return ds + + def create_mask(ds, df, layer): import shapely.geometry logger.info('create mask') diff --git a/pyproject.toml b/pyproject.toml index b55bf06..3801791 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,8 +20,6 @@ classifiers = [ 'Programming Language :: Python :: 3.13', ] dependencies = [ - "shapely", - "geopandas", "netCDF4", "python-dotenv", "requests", @@ -35,6 +33,19 @@ dynamic = ["version"] Repository = "https://github.com/ISI-MIP/isimip-utils" [project.optional-dependencies] +all = [ + "isimip-utils[plot,shapes]" +] +plot = [ + "altair[all]", +] +shapes = [ + "netCDF4", + "python-dotenv", + "requests", + "rich", + "xarray", +] dev = [ "build", "pre-commit", From 0827f3b57053a50d67123c730d4cf440e275194b Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 28 May 2025 19:59:49 +0200 Subject: [PATCH 003/162] Add pandas and refactor --- isimip_utils/pandas.py | 43 ++++++++++++++++++ isimip_utils/plot.py | 99 +++++++++++++++++++++++++----------------- isimip_utils/xarray.py | 18 +++++--- 3 files changed, 114 insertions(+), 46 deletions(-) create mode 100644 isimip_utils/pandas.py diff --git a/isimip_utils/pandas.py b/isimip_utils/pandas.py new file mode 100644 index 0000000..f34ff83 --- /dev/null +++ b/isimip_utils/pandas.py @@ -0,0 +1,43 @@ +def get_var(df): + return next(iter(df.attrs['data_vars'])) + + +def compute_area(df): + var = get_var(df) + attrs = df.attrs + + df['year'] = df['time'].dt.year + + df = df.groupby('year').agg( + mean=(var, 'mean'), + lower=(var, lambda y: y.mean() - y.std()), + upper=(var, lambda y: y.mean() + y.std()) + ).reset_index() + + df.attrs = attrs + + return df + + +def 
group_by_day(df): + var = get_var(df) + + df['day'] = df['time'].dt.dayofyear + df = df.groupby('day')[var].mean().reset_index() + + return normalize(df, var) + + +def group_by_month(df): + var = get_var(df) + + df['month'] = df['time'].dt.month + df = df.groupby('month')[var].mean().reset_index() + + return normalize(df, var) + + +def normalize(df, var): + mean, std = df[var].mean(), df[var].std() + df[var] = (df[var] - mean) / (std if std > 0 else 1.0) + return df diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 8e5b323..8fc8429 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -3,7 +3,7 @@ import altair as alt import numpy as np -from isimip_utils.xarray import convert_to_dataframe, get_var_name, get_var_units +from isimip_utils.pandas import get_var logger = logging.getLogger(__name__) @@ -12,72 +12,93 @@ def enable_vegafusion(): alt.data_transformers.enable('vegafusion') -def plot_line(ds, title=None): - var_name = get_var_name(ds) - var_units = get_var_units(ds) - var_title = f'{var_name} [{var_units}]' +def save_plot(chart, path, *args, **kwargs): + logger.info(f'save {path.absolute()}') + path.parent.mkdir(exist_ok=True, parents=True) + chart.save(path, *args, **kwargs) - df = convert_to_dataframe(ds) - return alt.Chart(df).mark_line().encode( - alt.X( +def get_title(df): + var = get_var(df) + var_name = df.attrs['data_vars'][var].get('long_name', var) + var_units = df.attrs['data_vars'][var]['units'] + + return f'{var_name} [{var_units}]' + + +def plot_time(df, interpolate=False, x=None, y=None, color=None): + mark_kwargs = {'interpolate': interpolate} if interpolate else {} + + return alt.Chart(df).mark_line(**mark_kwargs).encode( + x=x or alt.X( 'time:T', - title='time' + title='Time' ), - alt.Y( - f'{var_name}:Q', - title=var_title, - ) - ).properties( - title=title, + y=y or alt.Y( + f'{get_var(df)}:Q', + title=get_title(df) + ), + color=color or alt.Color() ) -def plot_map(ds, scale_factor=1, bin_size=1, color_bin=None, 
color_scale=None): - lon_size = len(ds['lon']) - lat_size = len(ds['lat']) - lon_bin = float(abs(ds['lon'][1] - ds['lon'][0])) * bin_size - lat_bin = float(abs(ds['lat'][1] - ds['lat'][0])) * bin_size +def plot_mean(df, x=None, color=None): + base = alt.Chart(df).encode( + x=x or alt.X( + 'year:T', + title='Year' + ), + color=color or alt.Color() + ) + + return base.mark_line(interpolate='step-after').encode( + y=alt.Y( + 'mean:Q', + title=get_title(df) + ) + ) + base.mark_area(interpolate='step-after', opacity=0.5).encode( + y='lower:Q', + y2='upper:Q' + ) - lon_domain = float(min(ds['lon']) - 0.5 * lon_bin), float(max(ds['lon']) + 0.5 * lon_bin) - lat_domain = float(min(ds['lat']) - 0.5 * lat_bin), float(max(ds['lat']) + 0.5 * lat_bin) - lon_ticks = np.linspace(*lon_domain, num=7) - lat_ticks = np.linspace(*lat_domain, num=5) +def plot_map(df, color_scale=None): + lon = np.sort(df['lon'].unique()) + lat = np.sort(df['lat'].unique()) - width = scale_factor * lon_size - height = scale_factor * lat_size + lon_size = len(lon) + lat_size = len(lat) - var_name = get_var_name(ds) - var_units = get_var_units(ds) - title = f'{var_name} [{var_units}]' + lon_bin = float(abs(lon[1] - lon[0])) + lat_bin = float(abs(lat[1] - lat[0])) - logger.info(f'plot map title="{title}" size=({width}, {height})') + lon_domain = (lon.min() - 0.5 * lon_bin, lon.max() + 0.5 * lon_bin) + lat_domain = (lat.min() - 0.5 * lat_bin, lat.max() + 0.5 * lat_bin) - df = convert_to_dataframe(ds) + lon_ticks = np.linspace(lon_domain[0], lon_domain[1], num=7) + lat_ticks = np.linspace(lat_domain[0], lat_domain[1], num=5) return alt.Chart(df).mark_rect().encode( - alt.X( + x=alt.X( 'lon:Q', title='lon', bin=alt.Bin(step=lon_bin), axis=alt.Axis(values=lon_ticks), scale=alt.Scale(domain=lon_domain, padding=0, round=True) ), - alt.Y( + y=alt.Y( 'lat:Q', title='lat', bin=alt.Bin(step=lat_bin), axis=alt.Axis(values=lat_ticks), scale=alt.Scale(domain=lat_domain, padding=0, round=True) ), - alt.Color( - 
f'{var_name}:Q', - title=title, - bin=color_bin, + color=alt.Color( + f'{get_var(df)}:Q', + title=get_title(df), scale=color_scale or alt.Scale() ) ).properties( - width=width, - height=height + width=lon_size, + height=lat_size ) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 1f13ca0..e0d8538 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -54,11 +54,6 @@ def get_var_name(ds): return next(iter(ds.data_vars)) -def get_var_units(ds): - var_name = get_var_name(ds) - return ds[var_name].units - - def get_attrs(ds): attrs = {} for coord in ds.coords: @@ -77,9 +72,18 @@ def set_attrs(ds, attrs): ds[data_var].attrs = attrs[data_var] -def convert_to_dataframe(ds): +def to_dataframe(ds): ds.coords['time'] = ds.coords['time'].astype('datetime64[ns]') - return ds.to_dataframe().reset_index() + + df = ds.to_dataframe().reset_index() + df.attrs['coords'] = { + coord: ds[coord].attrs for coord in ds.coords + } + df.attrs['data_vars'] = { + data_var: ds[data_var].attrs for data_var in ds.data_vars + } + + return df def apply_fill_value(ds): From 0618550959e367d59841415741dc4045cae7f0d6 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 4 Jun 2025 13:41:23 +0200 Subject: [PATCH 004/162] Add xarray.init_dataset and refactor netcdf.init_dataset --- isimip_utils/netcdf.py | 50 ++++++++++++---------- isimip_utils/xarray.py | 97 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 117 insertions(+), 30 deletions(-) diff --git a/isimip_utils/netcdf.py b/isimip_utils/netcdf.py index 8072d94..0bcf20f 100644 --- a/isimip_utils/netcdf.py +++ b/isimip_utils/netcdf.py @@ -19,18 +19,19 @@ def open_dataset_write(file_path): def init_dataset(file_path, diskless=False, lon=720, lat=360, time=True, time_unit='days since 1601-1-1 00:00:00', - time_calendar='proleptic_gregorian', **variables): + time_calendar='proleptic_gregorian', attrs={}, **variables): + # create NetCDF dataset ds = Dataset(file_path, 'w', format='NETCDF4_CLASSIC', 
diskless=diskless) + # create time dimension if time is set if time is not None and time is not False: ds.createDimension('time', None) - d_lon = 360.0 / lon - d_lat = 180.0 / lat - + # create lon and lat dimensions ds.createDimension('lon', lon) ds.createDimension('lat', lat) + # create time variable if time is set if time is not None: time_variable = ds.createVariable('time', 'f8', ('time',), fill_value=FILL_VALUE) time_variable.missing_value = FILL_VALUE @@ -42,37 +43,42 @@ def init_dataset(file_path, diskless=False, lon=720, lat=360, time=True, if isinstance(time, np.ndarray): time_variable[:] = time + # create lon variable + lon_delta = 360.0 / lon lon_variable = ds.createVariable('lon', 'f8', ('lon',), fill_value=FILL_VALUE) lon_variable.missing_value = FILL_VALUE lon_variable.standard_name = 'longitude' lon_variable.long_name = 'Longitude' lon_variable.units = 'degrees_east' lon_variable.axis = 'X' - lon_variable[:] = np.arange(-180 + 0.5 * d_lon, 180, d_lon) + lon_variable[:] = np.arange(-180 + 0.5 * lon_delta, 180, lon_delta) + # create lat variable + lat_delta = 180.0 / lat lat_variable = ds.createVariable('lat', 'f8', ('lat',), fill_value=FILL_VALUE) lat_variable.missing_value = FILL_VALUE lat_variable.standard_name = 'latitude' lat_variable.long_name = 'Latitude' lat_variable.units = 'degrees_north' lat_variable.axis = 'Y' - lat_variable[:] = np.arange(90 - 0.5 * d_lat, -90, -d_lat) - - for variable_name, variable_dict in variables.items(): - long_name = variable_dict.get('long_name') - dtype = variable_dict.get('dtype', 'f8') - dimensions = variable_dict.get('dimensions', ('time', 'lat', 'lon')) - units = variable_dict.get('units') - - if variable_name: - variable = ds.createVariable(variable_name, dtype, dimensions, - fill_value=FILL_VALUE, compression='zlib') - variable.missing_value = FILL_VALUE - variable.standard_name = variable_name - if long_name: - variable.long_name = long_name - if units: - variable.units = units + lat_variable[:] = 
np.arange(90 - 0.5 * lat_delta, -90, -lat_delta) + + # create a data variable for each provided variable + for variable_name, variable in variables.items(): + + dimensions = ('time', 'lat', 'lon') if time is not None else ('lat', 'lon') + var = ds.createVariable(variable_name, variable.dtype, dimensions, + fill_value=FILL_VALUE, compression='zlib') + + # set variable attributes + for key, value in attrs.get(variable_name, {}).items(): + setattr(var, key, value) + + var.missing_value = np.float32(FILL_VALUE) + + # set global attributes + for key, value in attrs.get('global', {}).items(): + setattr(ds, key, value) return ds diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index e0d8538..eca1522 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -1,4 +1,5 @@ import logging +from pathlib import Path import cftime import numpy as np @@ -7,7 +8,75 @@ logger = logging.getLogger(__name__) +def init_dataset(lon=720, lat=360, time=None, + time_unit='days since 1601-1-1 00:00:00', + time_calendar='proleptic_gregorian', attrs={}, **variables): + + # create coordinates + coords = {} + if time is not None: + coords['time'] = time + + lon_delta = 360.0 / lon + lat_delta = 180.0 / lat + + coords['lon'] = np.arange(-180 + 0.5 * lon_delta, 180, lon_delta) + coords['lat'] = np.arange(90 - 0.5 * lat_delta, -90, -lat_delta) + + # create data variables + data_vars = { + var_name: (['time', 'lon', 'lat'], var) + for var_name, var in variables.items() + } + + # create dataset + ds = xr.Dataset(coords=coords, data_vars=data_vars) + + # set time attributes if time is set + if time is not None: + ds.coords['time'].attrs = { + 'standard_name': 'time', + 'long_name': 'Time', + 'units': time_unit, + 'calendar': time_calendar, + 'axis': 'T', + '_FillValue': 1.e+20 + } + + # set lon attributes + ds.coords['lon'].attrs = { + 'standard_name': 'longitude', + 'long_name': 'Longitude', + 'units': 'degrees_east', + 'axis': 'X', + '_FillValue': 1.e+20 + } + + # set lon 
attributes + ds.coords['lat'].attrs = { + 'standard_name': 'latitude', + 'long_name': 'Latitude', + 'units': 'degrees_north', + 'axis': 'Y', + '_FillValue': 1.e+20 + } + + # set variable attributes + for data_var in ds.data_vars: + if data_var in attrs: + ds.data_vars[data_var].attrs.update(attrs[data_var]) + + ds.data_vars[data_var].attrs["_FillValue"] = 1.e+20 + + # set global attributes + ds.attrs = attrs.get('global', {}) + + return ds + + def open_dataset(path, decode_cf=False, load=False): + path = Path(path) + if not load: logger.info(f'open {path.absolute()}') else: @@ -35,19 +104,31 @@ def load_dataset(path, decode_cf=False): def write_dataset(ds, path): - logger.info(f'write {path.absolute()}') + path = Path(path) path.parent.mkdir(exist_ok=True, parents=True) - for coord in ds.coords: - ds.coords[coord].attrs["_FillValue"] = 1.e+20 + logger.info(f'write {path.absolute()}') - for var in ds.data_vars: - ds.data_vars[var].attrs["_FillValue"] = 1.e+20 + add_fill_value(ds) + ds = order_variables(ds) + + ds.to_netcdf(path, format='NETCDF4_CLASSIC', unlimited_dims=['time']) + + +def order_variables(ds): + return ds[[*ds.coords, *ds.data_vars]] - # reorder the variables - ds = ds[[*ds.coords, *ds.data_vars]] - ds.to_netcdf(path, format='NETCDF4_CLASSIC') +def add_fill_value(ds): + for coord in ds.coords: + if '_FillValue' not in ds.coords[coord].attrs: + ds.coords[coord].attrs['_FillValue'] = 1.e+20 + + for data_var in ds.data_vars: + if '_FillValue' not in ds.data_vars[data_var].attrs: + ds.data_vars[data_var].attrs['_FillValue'] = 1.e+20 + if 'missing_value' not in ds.data_vars[data_var].attrs: + ds.data_vars[data_var].attrs['missing_value'] = 1.e+20 def get_var_name(ds): From 17ecb3b7678d73b3b553495248a6d40bccecc493 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 4 Jun 2025 20:10:23 +0200 Subject: [PATCH 005/162] Update dependencies --- pyproject.toml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pyproject.toml 
b/pyproject.toml index 3801791..640cf09 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,8 +24,7 @@ dependencies = [ "python-dotenv", "requests", "rich", - "rioxarray", - "xarray", + "xarray" ] dynamic = ["version"] @@ -38,13 +37,10 @@ all = [ ] plot = [ "altair[all]", + "palettable", ] shapes = [ - "netCDF4", - "python-dotenv", - "requests", - "rich", - "xarray", + "rioxarray", ] dev = [ "build", From 2adc345ad8771e1557cd9c4d384ba46f867ee516 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 4 Jun 2025 20:10:59 +0200 Subject: [PATCH 006/162] Refactor isimip_utils.pandas.compute_mean --- isimip_utils/pandas.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/isimip_utils/pandas.py b/isimip_utils/pandas.py index f34ff83..6e09ee6 100644 --- a/isimip_utils/pandas.py +++ b/isimip_utils/pandas.py @@ -2,18 +2,18 @@ def get_var(df): return next(iter(df.attrs['data_vars'])) -def compute_area(df): +def compute_mean(df, area=True): var = get_var(df) attrs = df.attrs df['year'] = df['time'].dt.year - df = df.groupby('year').agg( - mean=(var, 'mean'), - lower=(var, lambda y: y.mean() - y.std()), - upper=(var, lambda y: y.mean() + y.std()) - ).reset_index() + kwargs = {'mean': (var, 'mean')} + if area: + kwargs['lower'] = (var, lambda y: y.mean() - y.std()) + kwargs['upper'] = (var, lambda y: y.mean() + y.std()) + df = df.groupby('year').agg(**kwargs).reset_index() df.attrs = attrs return df From 3f67af0f304bebc0cf3f3e0ca8d82943499a0d87 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 4 Jun 2025 20:11:17 +0200 Subject: [PATCH 007/162] Add parse_parameters --- isimip_utils/cli.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index ff52c54..9f3fe67 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -1,4 +1,5 @@ import logging +from collections import defaultdict from pathlib import Path from dotenv import load_dotenv @@ -26,3 +27,12 @@ def setup_logs(log_level='WARN', 
log_file=None): file_handler.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s %(name)s: %(message)s')) root_logger.addHandler(file_handler) + + +def parse_parameters(args): + parameters = defaultdict(list) + if args: + for arg_string in args: + key, values_string = arg_string.split('=') + parameters[key] += values_string.split(',') + return parameters From 97117cb96439bc8696d483bac552ae53c3a2209e Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 4 Jun 2025 20:12:12 +0200 Subject: [PATCH 008/162] Add plot_grid --- isimip_utils/plot.py | 66 ++++++++++++++++++++++++++++++++++++++++--- isimip_utils/utils.py | 11 ++++++++ 2 files changed, 73 insertions(+), 4 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 8fc8429..0859064 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -1,9 +1,12 @@ import logging +from pathlib import Path import altair as alt import numpy as np +import pandas as pd from isimip_utils.pandas import get_var +from isimip_utils.utils import get_permutations logger = logging.getLogger(__name__) @@ -13,6 +16,8 @@ def enable_vegafusion(): def save_plot(chart, path, *args, **kwargs): + path = Path(path) + logger.info(f'save {path.absolute()}') path.parent.mkdir(exist_ok=True, parents=True) chart.save(path, *args, **kwargs) @@ -51,16 +56,21 @@ def plot_mean(df, x=None, color=None): color=color or alt.Color() ) - return base.mark_line(interpolate='step-after').encode( + chart = base.mark_line(interpolate='step-after').encode( y=alt.Y( 'mean:Q', title=get_title(df) ) - ) + base.mark_area(interpolate='step-after', opacity=0.5).encode( - y='lower:Q', - y2='upper:Q' ) + if 'lower' in df and 'upper' in df: + chart += base.mark_area(interpolate='step-after', opacity=0.5).encode( + y='lower:Q', + y2='upper:Q' + ) + + return chart + def plot_map(df, color_scale=None): lon = np.sort(df['lon'].unique()) @@ -102,3 +112,51 @@ def plot_map(df, color_scale=None): width=lon_size, height=lat_size ) + + +def 
plot_empty(x=None, y=None): + return alt.Chart(pd.DataFrame({'time': [], 'y': []})).mark_point().encode( + x=x or alt.X('time:T', title=None), + y=y or alt.Y('y:Q', title=None) + ) + + +def plot_grid(parameters, plots, empty=None): + keys = list(parameters.keys()) + + rows = [] + prev_permutation = None + for permutation in get_permutations(parameters): + # start a new row + if prev_permutation is None or permutation[0] != prev_permutation[0]: + row = [] + rows.append((f'{keys[0]} = {permutation[0]}', row)) + + # start a new column + if prev_permutation is None or permutation[1] != prev_permutation[1]: + column = [] + row.append((f'{keys[1]} = {permutation[1]}', column)) + + column.append(plots.get(permutation, empty or plot_empty())) + + prev_permutation = permutation + + chart = alt.vconcat(*[ + alt.hconcat(*[ + alt.layer(*column, title=column_title).resolve_scale( + x='shared', + y='shared', + color='shared' + ) + for column_title, column in row + ], title=row_title).resolve_scale( + x='shared', + y='shared' + ) + for row_title, row in rows + ]).resolve_scale( + x='shared', + y='shared' + ) + + return chart diff --git a/isimip_utils/utils.py b/isimip_utils/utils.py index fcb07b4..948ce66 100644 --- a/isimip_utils/utils.py +++ b/isimip_utils/utils.py @@ -1,3 +1,6 @@ +from itertools import product + + def parse_filelist(filelist_file): if filelist_file: with open(filelist_file) as f: @@ -24,3 +27,11 @@ def include_path(include, path): return False else: return True + + +def get_permutations(parameters): + return list(product(*parameters.values())) + + +def get_placeholders(parameters, permutation): + return dict(zip(parameters.keys(), permutation)) From 0b4ce40c65247ec85f561e97ecdaa5a869f7ebf6 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 5 Jun 2025 17:47:59 +0200 Subject: [PATCH 009/162] Refactor ArgumentParser, use toml instead of config, and move to cli --- isimip_utils/cli.py | 73 +++++++++++++++++++++++++++++++++++++----- isimip_utils/parser.py | 62 
----------------------------------- pyproject.toml | 2 +- 3 files changed, 66 insertions(+), 71 deletions(-) delete mode 100644 isimip_utils/parser.py diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 9f3fe67..9ef33ff 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -1,5 +1,7 @@ +import argparse import logging -from collections import defaultdict +import os +import tomllib from pathlib import Path from dotenv import load_dotenv @@ -29,10 +31,65 @@ def setup_logs(log_level='WARN', log_file=None): root_logger.addHandler(file_handler) -def parse_parameters(args): - parameters = defaultdict(list) - if args: - for arg_string in args: - key, values_string = arg_string.split('=') - parameters[key] += values_string.split(',') - return parameters +def parse_dict(string): + key, values = string.split('=') + return { + key.strip(): [value.strip() for value in values.split(',')] + } + + +def parse_list(string): + return [value.strip() for value in string.split(',')] + + +class ArgumentParser(argparse.ArgumentParser): + + config_files = [ + 'isimip.toml', + '~/.isimip.toml', + '/etc/isimip.toml', + ] + + def parse_args(self, *args): + # parse the command line arguments with the default namespace + # obtained from the config file and the environment + return super().parse_args(*args, namespace=self.build_default_args()) + + def get_defaults(self): + defaults = {} + for action in self._actions: + if not action.required and action.dest != 'help': + defaults[action.dest] = action.default + + defaults.update(vars(self.build_default_args())) + return defaults + + def read_config(self): + for config_file in self.config_files: + config_path = Path(config_file).expanduser() + if config_path.is_file(): + with open(config_path, 'rb') as fp: + data = tomllib.load(fp) + if self.prog in data: + return data[self.prog] + return {} + + def build_default_args(self): + # read config file + config = self.read_config() + + # init the default namespace + default_args = 
argparse.Namespace() + + for action in self._actions: + if not action.required and action.dest != 'help': + key = action.dest + key_upper = key.upper() + if os.getenv(key_upper): + # if the attribute is in the environment, take the value + setattr(default_args, key, os.getenv(key_upper)) + elif config and key in config: + # if the attribute is in the config file, take it from there + setattr(default_args, key, config.get(key)) + + return default_args diff --git a/isimip_utils/parser.py b/isimip_utils/parser.py deleted file mode 100644 index 07b36f7..0000000 --- a/isimip_utils/parser.py +++ /dev/null @@ -1,62 +0,0 @@ -import argparse -import configparser -import os -from pathlib import Path - -from dotenv import load_dotenv - - -class ArgumentParser(argparse.ArgumentParser): - - config_file = None - default_config_files = [ - 'isimip.conf', - '~/.isimip.conf', - '/etc/isimip.conf' - ] - - def parse_args(self, *args): - # parse the command line arguments with the default namespace - # obtained from the config file and the environment - return super().parse_args(*args, namespace=self.build_default_args()) - - def get_defaults(self): - defaults = {} - for action in self._actions: - if not action.required and action.dest != 'help': - defaults[action.dest] = action.default - - defaults.update(vars(self.build_default_args())) - return defaults - - def read_config(self): - config_files = [self.config_file] if self.config_file else self.default_config_files - for config_file in config_files: - config_path = Path(config_file).expanduser() - config = configparser.ConfigParser() - config.read(config_path) - if self.prog in config: - return config[self.prog] - - def build_default_args(self): - # setup env from .env file - load_dotenv(Path().cwd() / '.env') - - # read config file - config = self.read_config() - - # init the default namespace - default_args = argparse.Namespace() - - for action in self._actions: - if not action.required and action.dest != 'help': - key = 
action.dest - key_upper = key.upper() - if os.getenv(key_upper): - # if the attribute is in the environment, take the value - setattr(default_args, key, os.getenv(key_upper)) - elif config and key in config: - # if the attribute is in the config file, take it from there - setattr(default_args, key, config.get(key)) - - return default_args diff --git a/pyproject.toml b/pyproject.toml index 640cf09..f541a37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ packages = ["isimip_utils"] version = { attr = "isimip_utils.__version__" } [tool.ruff] -target-version = "py39" +target-version = "py311" line-length = 120 select = [ "B", # flake8-bugbear From 9c7ac3fbde01f57e2ed6cafd7a647c6aea0a9069 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 5 Jun 2025 17:48:50 +0200 Subject: [PATCH 010/162] Remove config.py --- isimip_utils/config.py | 47 ------------------------------------------ 1 file changed, 47 deletions(-) delete mode 100644 isimip_utils/config.py diff --git a/isimip_utils/config.py b/isimip_utils/config.py deleted file mode 100644 index cf58ec0..0000000 --- a/isimip_utils/config.py +++ /dev/null @@ -1,47 +0,0 @@ -import logging -from pathlib import Path - -from colorlog import ColoredFormatter, StreamHandler - - -class Settings: - - _shared_state = {} - - def __init__(self): - self.__dict__ = self._shared_state - - def __str__(self): - return str(self.args) - - def setup(self, args): - # reset the shared state - self.__dict__ = self._shared_state = {} - - # assign args to settings object - self.args = {key.upper(): value for key, value in args.items()} - - # setup logs - try: - self.LOG_LEVEL = self.LOG_LEVEL.upper() - self.LOG_FILE = Path(self.LOG_FILE).expanduser() if self.LOG_FILE else None - - if self.LOG_FILE: - logging.basicConfig(level=self.LOG_LEVEL, filename=self.LOG_FILE, - format='[%(asctime)s] %(levelname)s %(name)s: %(message)s') - else: - formatter = ColoredFormatter('%(log_color)s[%(asctime)s] %(levelname)s %(name)s: 
%(message)s') - handler = StreamHandler() - handler.setFormatter(formatter) - logging.basicConfig(level=self.LOG_LEVEL, handlers=[handler]) - - except AttributeError: - pass - - def __getattr__(self, name): - # this function catches all properties and returns the values in the self.args dict, e.g. - # settings.FOO -> settings.args['FOO'] - try: - return self.args[name] - except KeyError as e: - raise AttributeError from e From c731066caf224657d0b6f40c88eb33b2e76c9081 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 5 Jun 2025 18:41:20 +0200 Subject: [PATCH 011/162] Refactor fetch.py --- isimip_utils/fetch.py | 216 ++++++++++++++++++------------------------ 1 file changed, 93 insertions(+), 123 deletions(-) diff --git a/isimip_utils/fetch.py b/isimip_utils/fetch.py index d56c1d5..d6b2524 100644 --- a/isimip_utils/fetch.py +++ b/isimip_utils/fetch.py @@ -11,156 +11,126 @@ logger = logging.getLogger(__name__) - -def fetch_definitions(bases, path): - path_components = Path(path).parts - for i in range(len(path_components), 0, -1): - definitions_path = Path('definitions').joinpath(os.sep.join(path_components[:i+1])).with_suffix('.json') - definitions_json = fetch_json(bases, definitions_path, extend_base='output') - - if definitions_json: - logger.debug('definitions_path = %s', definitions_path) - logger.debug('definitions_json = %s', definitions_json) - - definitions = {} - for definition_name, definition in definitions_json.items(): - # convert the definitions to dicts if they are lists - if isinstance(definition, list): - definitions[definition_name] = { - row['specifier']: row for row in definition - } - else: - definitions[definition_name] = definition - - logger.debug('definitions = %s', definitions) - return definitions +PROTOCOL_LOCATIONS = [ + 'https://protocol.isimip.org', + 'https://protocol2.isimip.org', +] + +def fetch_definitions(path, protocol_locations=PROTOCOL_LOCATIONS): + if isinstance(protocol_locations, str): + protocol_locations = 
[protocol_locations] + + for protocol_location in protocol_locations: + for definitions_path, definitions_json in find_json(protocol_location, 'definitions', path): + if definitions_json: + logger.debug('definitions_path = %s', definitions_path) + logger.debug('definitions_json = %s', definitions_json) + + definitions = {} + for definition_name, definition in definitions_json.items(): + # convert the definitions to dicts if they are lists + if isinstance(definition, list): + definitions[definition_name] = { + row['specifier']: row for row in definition + } + else: + definitions[definition_name] = definition + + logger.debug('definitions = %s', definitions) + return definitions raise NotFound(f'no definitions found for {path}') -def fetch_pattern(bases, path): - path_components = Path(path).parts - for i in range(len(path_components), 0, -1): - pattern_path = Path('pattern').joinpath(os.sep.join(path_components[:i+1]) + '.json') - pattern_json = fetch_json(bases, pattern_path, extend_base='output') - - if pattern_json: - logger.debug('pattern_path = %s', pattern_path) - logger.debug('pattern_json = %s', pattern_json) - - if not all([ - isinstance(pattern_json['path'], str), - isinstance(pattern_json['file'], str), - isinstance(pattern_json['dataset'], str), - isinstance(pattern_json['suffix'], list) - ]): - break - - pattern = { - 'path': re.compile(pattern_json['path']), - 'file': re.compile(pattern_json['file']), - 'dataset': re.compile(pattern_json['dataset']), - 'suffix': pattern_json['suffix'], - 'specifiers': pattern_json.get('specifiers', []), - 'specifiers_map': pattern_json.get('specifiers_map', {}) - } - - logger.debug('pattern = %s', pattern) - - return pattern +def fetch_pattern(path, protocol_locations=PROTOCOL_LOCATIONS): + if isinstance(protocol_locations, str): + protocol_locations = [protocol_locations] - raise NotFound(f'no pattern found for {path}') + for protocol_location in protocol_locations: + for pattern_path, pattern_json in 
find_json(protocol_location, 'pattern', path): + if pattern_json: + logger.debug('pattern_path = %s', pattern_path) + logger.debug('pattern_json = %s', pattern_json) + if not all([ + isinstance(pattern_json['path'], str), + isinstance(pattern_json['file'], str), + isinstance(pattern_json['dataset'], str), + isinstance(pattern_json['suffix'], list) + ]): + break -def fetch_schema(bases, path): - path_components = Path(path).parts - for i in range(len(path_components), 0, -1): - schema_path = Path('schema').joinpath(os.sep.join(path_components[:i+1])).with_suffix('.json') - schema_json = fetch_json(bases, schema_path, extend_base='output') + pattern = { + 'path': re.compile(pattern_json['path']), + 'file': re.compile(pattern_json['file']), + 'dataset': re.compile(pattern_json['dataset']), + 'suffix': pattern_json['suffix'], + 'specifiers': pattern_json.get('specifiers', []), + 'specifiers_map': pattern_json.get('specifiers_map', {}) + } - if schema_json: - logger.debug('schema_path = %s', schema_path) - logger.debug('schema_json = %s', schema_json) - return schema_json + logger.debug('pattern = %s', pattern) - raise NotFound(f'no schema found for {path}') + return pattern + raise NotFound(f'no pattern found for {path}') -def fetch_tree(bases, path): - path_components = Path(path).parts - for i in range(len(path_components), 0, -1): - tree_path = Path('tree').joinpath(os.sep.join(path_components[:i+1])).with_suffix('.json') - tree_json = fetch_json(bases, tree_path, extend_base='output') - if tree_json: - logger.debug('tree_path = %s', tree_path) - logger.debug('tree_json = %s', tree_json) - return tree_json +def fetch_schema(path, protocol_locations=PROTOCOL_LOCATIONS): + if isinstance(protocol_locations, str): + protocol_locations = [protocol_locations] - raise NotFound(f'no tree found for {path}') + for protocol_location in protocol_locations: + for schema_path, schema_json in find_json(protocol_location, 'pattern', path): + if schema_json: + 
logger.debug('schema_path = %s', schema_path) + logger.debug('schema_json = %s', schema_json) + return schema_json + raise NotFound(f'no schema found for {path}') -def fetch_resource(location): - return fetch_json([location]) +def fetch_tree(path, protocol_locations=PROTOCOL_LOCATIONS): + if isinstance(protocol_locations, str): + protocol_locations = [protocol_locations] -def fetch_json(bases, path=None, extend_base=None): - for base in bases: - if urlparse(base).scheme: - if path is not None: - json_url = base.rstrip('/') + '/' + path.as_posix() - else: - json_url = base.rstrip('/') + for protocol_location in protocol_locations: + for tree_path, tree_json in find_json(protocol_location, 'pattern', path): + if tree_json: + logger.debug('tree_path = %s', tree_path) + logger.debug('tree_json = %s', tree_json) + return tree_json - logger.debug('json_url = %s', json_url) + raise NotFound(f'no tree found for {path}') - try: - response = requests.get(json_url) - except requests.exceptions.ConnectionError: - return None - if response.status_code == 200: - return response.json() +def find_json(protocol_location, sub_location, path): + path_components = Path(path).parts + for i in range(len(path_components), 0, -1): + current_path = Path(os.sep.join(path_components[:i+1])).with_suffix('.json') + if urlparse(protocol_location).scheme: + yield current_path, fetch_json(f'{protocol_location}/{sub_location}/{current_path}') else: - json_path = Path(base).expanduser() - if extend_base is not None: - json_path /= extend_base - if path is not None: - json_path /= path - - logger.debug('json_path = %s', json_path) + yield current_path, load_json(Path(protocol_location) / 'output' / sub_location / current_path) - if json_path.exists(): - return json.loads(open(json_path).read()) +def fetch_json(location): + logger.debug('location = %s', location) -def fetch_file(bases, path=None, extend_base=None): - for base in bases: - if urlparse(base).scheme: - if path is not None: - file_url = 
base.rstrip('/') + '/' + path.as_posix() - else: - file_url = base.rstrip('/') + try: + response = requests.get(location) + except requests.exceptions.ConnectionError: + return None - logger.debug('file_url = %s', file_url) + if response.status_code == 200: + return response.json() - try: - response = requests.get(file_url) - except requests.exceptions.ConnectionError: - return None - if response.status_code == 200: - return response.content - - else: - file_path = Path(base).expanduser() - if extend_base is not None: - file_path /= extend_base - if path is not None: - file_path /= path +def load_json(path): + path = Path(path).expanduser() - logger.debug('file_path = %s', file_path) + logger.debug('path = %s', path) - if file_path.exists(): - return file_path.read() + if path.exists(): + return json.loads(open(path).read()) From 67d70759dc58da526877be9edd7a30cb2ad226aa Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 6 Jun 2025 15:46:24 +0200 Subject: [PATCH 012/162] Update utils.py --- isimip_utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isimip_utils/utils.py b/isimip_utils/utils.py index 948ce66..5fae051 100644 --- a/isimip_utils/utils.py +++ b/isimip_utils/utils.py @@ -34,4 +34,4 @@ def get_permutations(parameters): def get_placeholders(parameters, permutation): - return dict(zip(parameters.keys(), permutation)) + return dict(zip(parameters.keys(), permutation, strict=True)) From 967e522b48075a1bf6fb89aca6a89c1d85e5e19d Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 4 Jun 2025 20:45:51 +0200 Subject: [PATCH 013/162] Fix dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index f541a37..70448a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ plot = [ "palettable", ] shapes = [ + "geopandas", "rioxarray", ] dev = [ From 15cc5a4e4bfd76c4002bb98969eea865ab6fdf1c Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 20 Aug 2025 16:43:08 
+0200 Subject: [PATCH 014/162] Fix pandas.conpute_mean --- isimip_utils/pandas.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/isimip_utils/pandas.py b/isimip_utils/pandas.py index 6e09ee6..81b85ee 100644 --- a/isimip_utils/pandas.py +++ b/isimip_utils/pandas.py @@ -1,3 +1,6 @@ +import numpy as np + + def get_var(df): return next(iter(df.attrs['data_vars'])) @@ -14,6 +17,7 @@ def compute_mean(df, area=True): kwargs['upper'] = (var, lambda y: y.mean() + y.std()) df = df.groupby('year').agg(**kwargs).reset_index() + df['mean'] = df['mean'].astype(np.float64) df.attrs = attrs return df From ee94b0e6074cfdd001bd0a8bfe58e4d11cffe117 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 1 Sep 2025 16:57:27 +0200 Subject: [PATCH 015/162] Add join_parameters function --- isimip_utils/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/isimip_utils/utils.py b/isimip_utils/utils.py index 5fae051..a4c70a6 100644 --- a/isimip_utils/utils.py +++ b/isimip_utils/utils.py @@ -35,3 +35,10 @@ def get_permutations(parameters): def get_placeholders(parameters, permutation): return dict(zip(parameters.keys(), permutation, strict=True)) + + +def join_parameters(parameters, max_count=5, max_label='various'): + return { + key: (max_label if len(values) > max_count else '+'.join(values)) + for key, values in parameters.items() + } From c4f45f6d4aac1805825e8fc6df57900b5e8ff040 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 14 Aug 2025 15:52:50 +0200 Subject: [PATCH 016/162] Add default_color --- isimip_utils/plot.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 0859064..357eefe 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -11,6 +11,9 @@ logger = logging.getLogger(__name__) +default_color = alt.Color('steelblue') + + def enable_vegafusion(): alt.data_transformers.enable('vegafusion') @@ -43,7 +46,7 @@ def plot_time(df, interpolate=False, x=None, y=None, color=None): 
f'{get_var(df)}:Q', title=get_title(df) ), - color=color or alt.Color() + color=color or default_color ) @@ -53,7 +56,7 @@ def plot_mean(df, x=None, color=None): 'year:T', title='Year' ), - color=color or alt.Color() + color=color or default_color ) chart = base.mark_line(interpolate='step-after').encode( From 1b24269dfeef57ef40b402c108bf38322decb593 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 2 Sep 2025 14:35:26 +0200 Subject: [PATCH 017/162] Add compute_temporal_average and rename compute_spatial_average --- isimip_utils/extractions.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py index 1717353..3c5691f 100644 --- a/isimip_utils/extractions.py +++ b/isimip_utils/extractions.py @@ -93,8 +93,8 @@ def mask_mask(ds, mask_ds, mask_var='mask'): return ds.where(mask_ds[mask_var] == 1) -def compute_mean(ds, weights=None): - logger.info('compute mean') +def compute_spatial_average(ds, weights=None): + logger.info('compute spatial average') if weights is None: logger.warn('no weights provided, using latitude-dependent weights') @@ -103,6 +103,11 @@ def compute_mean(ds, weights=None): return ds.weighted(weights).mean(dim=('lat', 'lon'), skipna=True).astype(np.float32) +def compute_temporal_average(ds): + logger.info('compute temporal average') + return ds.mean(dim='time', skipna=True).astype(np.float32) + + def count_values(ds): logger.info('count values') return ds.count(dim=('lat', 'lon')).astype(np.float32) From 8071ab33572420b64d0a159c476851578fa8be81 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 2 Sep 2025 14:36:07 +0200 Subject: [PATCH 018/162] Refactor plot.py --- isimip_utils/pandas.py | 71 +++++++++++++++---- isimip_utils/plot.py | 156 ++++++++++++++++++++--------------------- 2 files changed, 135 insertions(+), 92 deletions(-) diff --git a/isimip_utils/pandas.py b/isimip_utils/pandas.py index 81b85ee..bdfea4d 100644 --- a/isimip_utils/pandas.py +++ 
b/isimip_utils/pandas.py @@ -1,47 +1,90 @@ -import numpy as np +def get_coord(df): + return next(iter(df.attrs['coords'])) -def get_var(df): +def get_coord_label(df): + coord = get_coord(df) + name = df.attrs['coords'][coord].get('long_name', coord) + units = df.attrs['coords'][coord].get('units') + return f'{name} [{units}]' if units else name + + +def get_coord_axis(df): + coord = get_coord(df) + return df.attrs['coords'][coord].get('axis') + + +def get_data_var(df): return next(iter(df.attrs['data_vars'])) -def compute_mean(df, area=True): - var = get_var(df) +def get_data_var_label(df): + data_var = get_data_var(df) + data_var_name = df.attrs['data_vars'][data_var].get('long_name', data_var) + data_var_units = df.attrs['data_vars'][data_var].get('units') + return f'{data_var_name} [{data_var_units}]' if data_var_units else data_var_name + + +def compute_average(df, area=True): + data_var = get_data_var(df) + data_var_long_name = df.attrs['data_vars'][data_var].get('long_name') + data_var_units = df.attrs['data_vars'][data_var].get('units') + attrs = df.attrs df['year'] = df['time'].dt.year - kwargs = {'mean': (var, 'mean')} + kwargs = {'mean': (data_var, 'mean')} if area: - kwargs['lower'] = (var, lambda y: y.mean() - y.std()) - kwargs['upper'] = (var, lambda y: y.mean() + y.std()) + kwargs['lower'] = (data_var, lambda y: y.mean() - y.std()) + kwargs['upper'] = (data_var, lambda y: y.mean() + y.std()) df = df.groupby('year').agg(**kwargs).reset_index() - df['mean'] = df['mean'].astype(np.float64) + + # cast to double + df['mean'] = df['mean'].astype('float64') + if area: + df['lower'] = df['lower'].astype('float64') + df['upper'] = df['upper'].astype('float64') + + # update attrs df.attrs = attrs + df.attrs['coords'] = {'year': {'long_name': 'Year', 'axis': 'T'}} + df.attrs['data_vars'] = { 'mean': {} } + if data_var_long_name: + df.attrs['data_vars']['mean']['long_name'] = f'Average {data_var_long_name.lower()}' + if data_var_units: + 
df.attrs['data_vars']['mean']['units'] = data_var_units return df def group_by_day(df): - var = get_var(df) + data_var = get_data_var(df) df['day'] = df['time'].dt.dayofyear - df = df.groupby('day')[var].mean().reset_index() + df = df.groupby('day')[data_var].mean().reset_index() + df.attrs['coords'] = {'day': { 'long_name': 'Day of the year'}} - return normalize(df, var) + return normalize(df, data_var) def group_by_month(df): - var = get_var(df) + data_var = get_data_var(df) df['month'] = df['time'].dt.month - df = df.groupby('month')[var].mean().reset_index() + df = df.groupby('month')[data_var].mean().reset_index() + df.attrs['coords'] = {'month': {'long_name': 'Month of the year'}} - return normalize(df, var) + return normalize(df, data_var) def normalize(df, var): mean, std = df[var].mean(), df[var].std() df[var] = (df[var] - mean) / (std if std > 0 else 1.0) return df + + +def create_label(df, labels): + df['label'] = ' '.join(labels) + return df diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 357eefe..c34fb9e 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -5,17 +5,23 @@ import numpy as np import pandas as pd -from isimip_utils.pandas import get_var +from isimip_utils.pandas import get_coord, get_coord_axis, get_coord_label, get_data_var, get_data_var_label from isimip_utils.utils import get_permutations logger = logging.getLogger(__name__) -default_color = alt.Color('steelblue') +def default_color_theme(): + return { + "config": { + "mark": {"color": "steelblue"} + } + } -def enable_vegafusion(): - alt.data_transformers.enable('vegafusion') +alt.data_transformers.enable('vegafusion') +alt.themes.register("default_color_theme", default_color_theme) +alt.themes.enable("default_color_theme") def save_plot(chart, path, *args, **kwargs): @@ -26,121 +32,109 @@ def save_plot(chart, path, *args, **kwargs): chart.save(path, *args, **kwargs) -def get_title(df): - var = get_var(df) - var_name = 
df.attrs['data_vars'][var].get('long_name', var) - var_units = df.attrs['data_vars'][var]['units'] +def plot_line(df, x=None, y=None, color=None, empty=False, **mark_kwargs): + if not x: + x_field = get_coord(df) + x_label = get_coord_label(df) + x_type = 'T' if get_coord_axis(df) == 'T' else 'Q' + x = alt.X(f'{x_field}:{x_type}', title=x_label) - return f'{var_name} [{var_units}]' + if not y: + y_field = get_data_var(df) + y_label = get_data_var_label(df) + y = alt.Y(f'{y_field}:Q', title=y_label) + if not color: + color = alt.Color() -def plot_time(df, interpolate=False, x=None, y=None, color=None): - mark_kwargs = {'interpolate': interpolate} if interpolate else {} - - return alt.Chart(df).mark_line(**mark_kwargs).encode( - x=x or alt.X( - 'time:T', - title='Time' - ), - y=y or alt.Y( - f'{get_var(df)}:Q', - title=get_title(df) - ), - color=color or default_color - ) + if empty: + df = pd.DataFrame({x.to_dict().get('field'): [], y.to_dict().get('field'): []}) + # the base chart contains only the x axis + base = alt.Chart(df).mark_line(**mark_kwargs).encode(x=x) -def plot_mean(df, x=None, color=None): - base = alt.Chart(df).encode( - x=x or alt.X( - 'year:T', - title='Year' - ), - color=color or default_color - ) - - chart = base.mark_line(interpolate='step-after').encode( - y=alt.Y( - 'mean:Q', - title=get_title(df) - ) - ) + chart = base.mark_line(**mark_kwargs).encode(y=y, color=color) if 'lower' in df and 'upper' in df: - chart += base.mark_area(interpolate='step-after', opacity=0.5).encode( + chart += base.mark_area(**mark_kwargs, opacity=0.5).encode( y='lower:Q', - y2='upper:Q' + y2='upper:Q', + color=color ) return chart -def plot_map(df, color_scale=None): - lon = np.sort(df['lon'].unique()) - lat = np.sort(df['lat'].unique()) - - lon_size = len(lon) - lat_size = len(lat) +def plot_map(df, x=None, y=None, color=None, empty=False): + if not x: + lon = np.sort(df['lon'].unique()) + lon_size = len(lon) + lon_bin = float(abs(lon[1] - lon[0])) + lon_domain = 
(lon.min() - 0.5 * lon_bin, lon.max() + 0.5 * lon_bin) + lon_ticks = np.linspace(lon_domain[0], lon_domain[1], num=7) - lon_bin = float(abs(lon[1] - lon[0])) - lat_bin = float(abs(lat[1] - lat[0])) - - lon_domain = (lon.min() - 0.5 * lon_bin, lon.max() + 0.5 * lon_bin) - lat_domain = (lat.min() - 0.5 * lat_bin, lat.max() + 0.5 * lat_bin) - - lon_ticks = np.linspace(lon_domain[0], lon_domain[1], num=7) - lat_ticks = np.linspace(lat_domain[0], lat_domain[1], num=5) - - return alt.Chart(df).mark_rect().encode( - x=alt.X( + x = alt.X( 'lon:Q', title='lon', bin=alt.Bin(step=lon_bin), axis=alt.Axis(values=lon_ticks), scale=alt.Scale(domain=lon_domain, padding=0, round=True) - ), - y=alt.Y( + ) + + if not y: + lat = np.sort(df['lat'].unique()) + lat_size = len(lat) + lat_bin = float(abs(lat[1] - lat[0])) + lat_domain = (lat.min() - 0.5 * lat_bin, lat.max() + 0.5 * lat_bin) + lat_ticks = np.linspace(lat_domain[0], lat_domain[1], num=5) + + y = alt.Y( 'lat:Q', title='lat', bin=alt.Bin(step=lat_bin), axis=alt.Axis(values=lat_ticks), scale=alt.Scale(domain=lat_domain, padding=0, round=True) - ), - color=alt.Color( - f'{get_var(df)}:Q', - title=get_title(df), - scale=color_scale or alt.Scale() ) - ).properties( - width=lon_size, - height=lat_size - ) + if not color: + color_field = get_data_var(df) + color_label = get_data_var_label(df) -def plot_empty(x=None, y=None): - return alt.Chart(pd.DataFrame({'time': [], 'y': []})).mark_point().encode( - x=x or alt.X('time:T', title=None), - y=y or alt.Y('y:Q', title=None) - ) + color = alt.Color( + f'{color_field}:Q', + title=color_label, + scale=alt.Scale() + ) + if empty: + df = pd.DataFrame({'lon': [], 'lat': [], color.to_dict().get('field'): []}) + + return alt.Chart(df).mark_rect().encode(x=x, y=y, color=color).properties( + width=lon_size, + height=lat_size + ) -def plot_grid(parameters, plots, empty=None): - keys = list(parameters.keys()) +def plot_grid(parameters, plots, empty_plot, layer=True): rows = [] prev_permutation = 
None for permutation in get_permutations(parameters): + # start a new row if prev_permutation is None or permutation[0] != prev_permutation[0]: row = [] - rows.append((f'{keys[0]} = {permutation[0]}', row)) + rows.append((permutation[0], row)) # start a new column if prev_permutation is None or permutation[1] != prev_permutation[1]: column = [] - row.append((f'{keys[1]} = {permutation[1]}', column)) + row.append((permutation[1], column)) - column.append(plots.get(permutation, empty or plot_empty())) + plot = plots.get(permutation, empty_plot) + if not layer: + plot = plot.properties(title=' '.join(permutation[2:])) + + column.append(plot) prev_permutation = permutation @@ -151,6 +145,12 @@ def plot_grid(parameters, plots, empty=None): y='shared', color='shared' ) + if layer else + alt.vconcat(*column, title=column_title).resolve_scale( + x='shared', + y='shared', + color='shared' + ) for column_title, column in row ], title=row_title).resolve_scale( x='shared', From 6c0b6c783578a8b4b79db7e672931d1e84145306 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 2 Sep 2025 14:39:22 +0200 Subject: [PATCH 019/162] Refactor xarray.py --- isimip_utils/xarray.py | 47 ++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index eca1522..d19b32b 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -109,32 +109,19 @@ def write_dataset(ds, path): logger.info(f'write {path.absolute()}') - add_fill_value(ds) + ds = add_fill_value_to_attrs(ds) ds = order_variables(ds) - ds.to_netcdf(path, format='NETCDF4_CLASSIC', unlimited_dims=['time']) + # time should be an unlimited dimension + unlimited_dims = ['time'] if 'time' in ds.dims else [] + + ds.to_netcdf(path, format='NETCDF4_CLASSIC', unlimited_dims=unlimited_dims) def order_variables(ds): return ds[[*ds.coords, *ds.data_vars]] -def add_fill_value(ds): - for coord in ds.coords: - if '_FillValue' not in 
ds.coords[coord].attrs: - ds.coords[coord].attrs['_FillValue'] = 1.e+20 - - for data_var in ds.data_vars: - if '_FillValue' not in ds.data_vars[data_var].attrs: - ds.data_vars[data_var].attrs['_FillValue'] = 1.e+20 - if 'missing_value' not in ds.data_vars[data_var].attrs: - ds.data_vars[data_var].attrs['missing_value'] = 1.e+20 - - -def get_var_name(ds): - return next(iter(ds.data_vars)) - - def get_attrs(ds): attrs = {} for coord in ds.coords: @@ -151,10 +138,30 @@ def set_attrs(ds, attrs): for data_var in ds.data_vars: if data_var in attrs: ds[data_var].attrs = attrs[data_var] + return ds + + +def add_fill_value_to_attrs(ds): + for coord in ds.coords: + if '_FillValue' not in ds.coords[coord].attrs: + ds.coords[coord].attrs['_FillValue'] = 1.e+20 + + for data_var in ds.data_vars: + if '_FillValue' not in ds.data_vars[data_var].attrs: + ds.data_vars[data_var].attrs['_FillValue'] = 1.e+20 + if 'missing_value' not in ds.data_vars[data_var].attrs: + ds.data_vars[data_var].attrs['missing_value'] = 1.e+20 + return ds def to_dataframe(ds): - ds.coords['time'] = ds.coords['time'].astype('datetime64[ns]') + if 'time' in ds.coords: + ds.coords['time'] = ds.coords['time'].astype('datetime64[ns]') + + ds = ds.assign({ + data_var: ds[data_var].astype('float64') + for data_var in ds.data_vars + }) df = ds.to_dataframe().reset_index() df.attrs['coords'] = { @@ -167,7 +174,7 @@ def to_dataframe(ds): return df -def apply_fill_value(ds): +def set_fill_value_to_nan(ds): for var in ds.data_vars: fill_value = ds[var].attrs.get('_FillValue', 1e+20) ds[var] = ds[var].where(ds[var] != fill_value) From 92002bb3bc9b2e62e840aed97b1add6ccbf512dc Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 11 Sep 2025 10:09:36 +0200 Subject: [PATCH 020/162] Add cache to open_dataset --- isimip_utils/xarray.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index d19b32b..4ea75e3 100644 --- 
a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -7,6 +7,8 @@ logger = logging.getLogger(__name__) +_dataset_cache = {} + def init_dataset(lon=720, lat=360, time=None, time_unit='days since 1601-1-1 00:00:00', @@ -74,9 +76,15 @@ def init_dataset(lon=720, lat=360, time=None, return ds -def open_dataset(path, decode_cf=False, load=False): +def open_dataset(path, decode_cf=False, load=False, cache=False): path = Path(path) + if load and cache: + key = (path, decode_cf) + if key in _dataset_cache: + logger.info(f'use cached {path.absolute()}') + return _dataset_cache[key] + if not load: logger.info(f'open {path.absolute()}') else: @@ -96,6 +104,9 @@ def open_dataset(path, decode_cf=False, load=False): if load: ds.load() + if load and cache: + _dataset_cache[key] = ds + return ds @@ -103,6 +114,10 @@ def load_dataset(path, decode_cf=False): return open_dataset(path, decode_cf=False, load=True) +def cache_dataset(path, decode_cf=False): + return open_dataset(path, decode_cf=False, load=True, cache=True) + + def write_dataset(ds, path): path = Path(path) path.parent.mkdir(exist_ok=True, parents=True) @@ -118,6 +133,11 @@ def write_dataset(ds, path): ds.to_netcdf(path, format='NETCDF4_CLASSIC', unlimited_dims=unlimited_dims) +def clear_cache(): + for key in _dataset_cache.keys(): + del _dataset_cache[key] + + def order_variables(ds): return ds[[*ds.coords, *ds.data_vars]] From 9cca986ffa00a14eacf0ff1db1b476e4e05ebb2a Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 11 Sep 2025 10:09:45 +0200 Subject: [PATCH 021/162] Add files module --- isimip_utils/files.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 isimip_utils/files.py diff --git a/isimip_utils/files.py b/isimip_utils/files.py new file mode 100644 index 0000000..977a98c --- /dev/null +++ b/isimip_utils/files.py @@ -0,0 +1,11 @@ +import re + + +def find_files(base_path, pattern): + files = [] + for path in sorted(base_path.rglob("*")): + match = re.search(str(pattern), str(path)) + 
if match: + files.append(dict(path=path, **match.groupdict())) + + return files From 21f2bdf1abd36dabd4cf0f6df7451ca3cff3fcc9 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 11 Sep 2025 10:09:56 +0200 Subject: [PATCH 022/162] Fix plot_grid --- isimip_utils/plot.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index c34fb9e..9e4d855 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -119,14 +119,13 @@ def plot_grid(parameters, plots, empty_plot, layer=True): rows = [] prev_permutation = None for permutation in get_permutations(parameters): - - # start a new row if prev_permutation is None or permutation[0] != prev_permutation[0]: - row = [] + # start a new row + column = [] + row = [(permutation[1], column)] rows.append((permutation[0], row)) - - # start a new column - if prev_permutation is None or permutation[1] != prev_permutation[1]: + elif prev_permutation is None or permutation[1] != prev_permutation[1]: + # start a new column column = [] row.append((permutation[1], column)) From a3316ffb5fa098567d87430b494a3a3ab280a208 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 11 Sep 2025 10:10:10 +0200 Subject: [PATCH 023/162] Add copy_placeholders and update_year utils --- isimip_utils/utils.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/isimip_utils/utils.py b/isimip_utils/utils.py index a4c70a6..780bd08 100644 --- a/isimip_utils/utils.py +++ b/isimip_utils/utils.py @@ -42,3 +42,26 @@ def join_parameters(parameters, max_count=5, max_label='various'): key: (max_label if len(values) > max_count else '+'.join(values)) for key, values in parameters.items() } + + +def copy_placeholders(*placeholder_args, **kwargs): + placeholders = { + key: value + for placeholder_arg in placeholder_args + for key, value in placeholder_arg.items() + } + placeholders.update(**kwargs) + return placeholders + + +def update_year(placeholders, key, year, 
operator): + if operator not in ('<', '>'): + raise RuntimeError(f'operator "{operator}" not supported') + + current = placeholders.get(key) + if ( + (current is None) or + (operator == '>' and int(current) < int(year)) or + (operator == '<' and int(current) > int(year)) + ): + placeholders[key] = year From ffae8f18430c8628b7ab84de88ae6a2df424a64e Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 11 Sep 2025 20:29:44 +0200 Subject: [PATCH 024/162] Ignore case in find_files --- isimip_utils/files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isimip_utils/files.py b/isimip_utils/files.py index 977a98c..deb646a 100644 --- a/isimip_utils/files.py +++ b/isimip_utils/files.py @@ -4,7 +4,7 @@ def find_files(base_path, pattern): files = [] for path in sorted(base_path.rglob("*")): - match = re.search(str(pattern), str(path)) + match = re.search(str(pattern), str(path), re.IGNORECASE) if match: files.append(dict(path=path, **match.groupdict())) From 662f5e4d6ea039cde8610c07756028b7c552d522 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 12 Sep 2025 13:46:33 +0200 Subject: [PATCH 025/162] Fix plot_grid --- isimip_utils/plot.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 9e4d855..9c06dff 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -118,16 +118,20 @@ def plot_map(df, x=None, y=None, color=None, empty=False): def plot_grid(parameters, plots, empty_plot, layer=True): rows = [] prev_permutation = None + for permutation in get_permutations(parameters): - if prev_permutation is None or permutation[0] != prev_permutation[0]: + row_title = permutation[0] if len(permutation) > 0 else '' + column_title = permutation[1] if len(permutation) > 1 else '' + + if prev_permutation is None or (len(permutation) > 0 and permutation[0] != prev_permutation[0]): # start a new row column = [] - row = [(permutation[1], column)] - rows.append((permutation[0], 
row)) - elif prev_permutation is None or permutation[1] != prev_permutation[1]: + row = [(row_title, column)] + rows.append((column_title, row)) + elif prev_permutation is None or (len(permutation) > 1 and permutation[1] != prev_permutation[1]): # start a new column column = [] - row.append((permutation[1], column)) + row.append((column_title, column)) plot = plots.get(permutation, empty_plot) if not layer: From acacc294d20cb9b61a967e26b13a2f5b97f16b41 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 24 Sep 2025 15:47:14 +0200 Subject: [PATCH 026/162] Update logging --- isimip_utils/extractions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py index 3c5691f..1e8fa23 100644 --- a/isimip_utils/extractions.py +++ b/isimip_utils/extractions.py @@ -89,7 +89,7 @@ def mask_bbox(ds, west, east, south, north): def mask_mask(ds, mask_ds, mask_var='mask'): - logger.info('mask mask') + logger.info(f'mask {mask_var}') return ds.where(mask_ds[mask_var] == 1) From dcaeb9c3fc9214aa391567c9d8869c013e31df3c Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 24 Sep 2025 16:52:02 +0200 Subject: [PATCH 027/162] Fix plot_grid --- isimip_utils/plot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 9c06dff..90f79e6 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -126,8 +126,8 @@ def plot_grid(parameters, plots, empty_plot, layer=True): if prev_permutation is None or (len(permutation) > 0 and permutation[0] != prev_permutation[0]): # start a new row column = [] - row = [(row_title, column)] - rows.append((column_title, row)) + row = [(column_title, column)] + rows.append((row_title, row)) elif prev_permutation is None or (len(permutation) > 1 and permutation[1] != prev_permutation[1]): # start a new column column = [] From d3063ed046bc3fa19e15d4614bf7c52ffd63659e Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 25 
Sep 2025 17:10:34 +0200 Subject: [PATCH 028/162] Add get_plot_title --- isimip_utils/plot.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 90f79e6..b267c22 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -32,6 +32,14 @@ def save_plot(chart, path, *args, **kwargs): chart.save(path, *args, **kwargs) +def get_plot_title(permutation): + return { + "text": ' Β· '.join(permutation), + "fontSize": 16, + "dy": -10 + } + + def plot_line(df, x=None, y=None, color=None, empty=False, **mark_kwargs): if not x: x_field = get_coord(df) From ce347ea8ee14010631a32aee7b919660dc28cf5f Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 25 Sep 2025 17:10:48 +0200 Subject: [PATCH 029/162] Ensure log_file path --- isimip_utils/cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 9ef33ff..4994b24 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -24,6 +24,8 @@ def setup_logs(log_level='WARN', log_file=None): root_logger.addHandler(RichHandler()) if log_file is not None: + Path(log_file).parent.mkdir(exist_ok=True, parents=True) + file_handler = logging.FileHandler(log_file) file_handler.setLevel(log_level) file_handler.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s %(name)s: %(message)s')) From 275cfc424a6b5978d43b4304cfbe950b9a7af87d Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 25 Sep 2025 18:59:57 +0200 Subject: [PATCH 030/162] Add log_console option --- isimip_utils/cli.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 4994b24..9762eea 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -12,16 +12,17 @@ def setup_env(): load_dotenv(Path().cwd() / '.env') -def setup_logs(log_level='WARN', log_file=None): +def setup_logs(log_level='WARN', log_file=None, log_console=True): log_level = log_level.upper() root_logger = logging.getLogger() 
root_logger.setLevel(log_level) - rich_handler = RichHandler() - rich_handler.setLevel(log_level) + if log_console: + rich_handler = RichHandler() + rich_handler.setLevel(log_level) - root_logger.addHandler(RichHandler()) + root_logger.addHandler(RichHandler()) if log_file is not None: Path(log_file).parent.mkdir(exist_ok=True, parents=True) From b9b2580ed01ab8a2e60ba8cb5837ef508f79a92f Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 7 Oct 2025 13:29:14 +0200 Subject: [PATCH 031/162] Move normalize out of the group functions in pandas.py --- isimip_utils/pandas.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/isimip_utils/pandas.py b/isimip_utils/pandas.py index bdfea4d..a70d892 100644 --- a/isimip_utils/pandas.py +++ b/isimip_utils/pandas.py @@ -66,7 +66,7 @@ def group_by_day(df): df = df.groupby('day')[data_var].mean().reset_index() df.attrs['coords'] = {'day': { 'long_name': 'Day of the year'}} - return normalize(df, data_var) + return df def group_by_month(df): @@ -76,12 +76,19 @@ def group_by_month(df): df = df.groupby('month')[data_var].mean().reset_index() df.attrs['coords'] = {'month': {'long_name': 'Month of the year'}} - return normalize(df, data_var) + return df + +def normalize(df): + data_var = get_data_var(df) + data_var_long_name = df.attrs['data_vars'][data_var].get('long_name') + + mean, std = df[data_var].mean(), df[data_var].std() + df[data_var] = (df[data_var] - mean) / (std if std > 0 else 1.0) + if data_var_long_name: + df.attrs['data_vars'][data_var]['long_name'] = f'Normalized {data_var_long_name.lower()}' + del df.attrs['data_vars'][data_var]['units'] -def normalize(df, var): - mean, std = df[var].mean(), df[var].std() - df[var] = (df[var] - mean) / (std if std > 0 else 1.0) return df From 92d795c1fe1768ed9a65a69fe01410cfe1d496da Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 7 Oct 2025 15:32:33 +0200 Subject: [PATCH 032/162] Add resolve_scale arguments to plot_grid --- 
isimip_utils/plot.py | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index b267c22..deb0fe5 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -123,7 +123,7 @@ def plot_map(df, x=None, y=None, color=None, empty=False): ) -def plot_grid(parameters, plots, empty_plot, layer=True): +def plot_grid(parameters, plots, empty_plot, layer=True, x='shared', y='shared', color='shared'): rows = [] prev_permutation = None @@ -151,26 +151,12 @@ def plot_grid(parameters, plots, empty_plot, layer=True): chart = alt.vconcat(*[ alt.hconcat(*[ - alt.layer(*column, title=column_title).resolve_scale( - x='shared', - y='shared', - color='shared' - ) + alt.layer(*column, title=column_title) if layer else - alt.vconcat(*column, title=column_title).resolve_scale( - x='shared', - y='shared', - color='shared' - ) + alt.vconcat(*column, title=column_title).resolve_scale(x=x, y=y, color=color) for column_title, column in row - ], title=row_title).resolve_scale( - x='shared', - y='shared' - ) + ], title=row_title).resolve_scale(x=x, y=y) for row_title, row in rows - ]).resolve_scale( - x='shared', - y='shared' - ) + ]).resolve_scale(x=x, y=y) return chart From 97e491d309720bf3647bed01c16d8e21c78936b3 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 7 Oct 2025 15:33:04 +0200 Subject: [PATCH 033/162] Refactor plot_line --- isimip_utils/plot.py | 46 ++++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index deb0fe5..6fc638e 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -40,23 +40,45 @@ def get_plot_title(permutation): } -def plot_line(df, x=None, y=None, color=None, empty=False, **mark_kwargs): - if not x: - x_field = get_coord(df) - x_label = get_coord_label(df) - x_type = 'T' if get_coord_axis(df) == 'T' else 'Q' - x = alt.X(f'{x_field}:{x_type}', title=x_label) +def 
plot_line(df, x_field=None, x_label=None, x_type=None, + y_field=None, y_label=None, y_type=None, y_format=None, + color_field=None, color_type=None, color_range=None, + legend=True, empty=False, **mark_kwargs): + + x_field = x_field or get_coord(df) + x_label = x_label or get_coord_label(df) + x_type = x_type or ('T' if get_coord_axis(df) == 'T' else 'Q') + x = alt.X( + f'{x_field}:{x_type}', + title=x_label + ) - if not y: - y_field = get_data_var(df) - y_label = get_data_var_label(df) - y = alt.Y(f'{y_field}:Q', title=y_label) + y_field = y_field or get_data_var(df) + y_label = y_label or get_data_var_label(df) + y_type = y_type or 'Q' + y = alt.Y( + f'{y_field}:{y_type}', + title=y_label, + axis=alt.Axis(format=y_format) if y_format else alt.Axis(), + scale=alt.Scale(zero=False, nice=False) + ) - if not color: + if empty: color = alt.Color() + else: + color_field = color_field or 'label' + color_type = color_type or 'N' + color = alt.Color( + f'{color_field}:{color_type}', + scale=alt.Scale(range=color_range) if color_range else alt.Scale(), + legend=alt.Legend(title='Legend', padding=10) if legend else None + ) if empty: - df = pd.DataFrame({x.to_dict().get('field'): [], y.to_dict().get('field'): []}) + df = pd.DataFrame({ + x_field: df[x_field], + y_field: np.full_like(df[y_field], np.nan, dtype=float) + }) # the base chart contains only the x axis base = alt.Chart(df).mark_line(**mark_kwargs).encode(x=x) From 373321bd784db44cbd660bab8d0c9ac239788b9d Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 7 Oct 2025 15:50:11 +0200 Subject: [PATCH 034/162] Refactor plot_map --- isimip_utils/plot.py | 80 +++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 6fc638e..884b765 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -95,49 +95,59 @@ def plot_line(df, x_field=None, x_label=None, x_type=None, return chart -def plot_map(df, x=None, y=None, 
color=None, empty=False): - if not x: - lon = np.sort(df['lon'].unique()) - lon_size = len(lon) - lon_bin = float(abs(lon[1] - lon[0])) - lon_domain = (lon.min() - 0.5 * lon_bin, lon.max() + 0.5 * lon_bin) - lon_ticks = np.linspace(lon_domain[0], lon_domain[1], num=7) - - x = alt.X( - 'lon:Q', - title='lon', - bin=alt.Bin(step=lon_bin), - axis=alt.Axis(values=lon_ticks), - scale=alt.Scale(domain=lon_domain, padding=0, round=True) - ) +def plot_map(df, color_field=None, color_type=None, color_range=None, color_label=None, color_format=None, + bin_size=1, legend=True, empty=False): + lon = np.sort(df['lon'].unique()) + lon_size = len(lon) + lon_bin = float(abs(lon[1] - lon[0])) * bin_size + lon_domain = (lon.min() - 0.5 * lon_bin, lon.max() + 0.5 * lon_bin) + lon_ticks = np.linspace(lon_domain[0], lon_domain[1], num=7) - if not y: - lat = np.sort(df['lat'].unique()) - lat_size = len(lat) - lat_bin = float(abs(lat[1] - lat[0])) - lat_domain = (lat.min() - 0.5 * lat_bin, lat.max() + 0.5 * lat_bin) - lat_ticks = np.linspace(lat_domain[0], lat_domain[1], num=5) - - y = alt.Y( - 'lat:Q', - title='lat', - bin=alt.Bin(step=lat_bin), - axis=alt.Axis(values=lat_ticks), - scale=alt.Scale(domain=lat_domain, padding=0, round=True) - ) + x = alt.X( + 'lon:Q', + title='lon', + bin=alt.Bin(step=lon_bin), + axis=alt.Axis(values=lon_ticks), + scale=alt.Scale(domain=lon_domain, padding=0, round=True) + ) + + lat = np.sort(df['lat'].unique()) + lat_size = len(lat) + lat_bin = float(abs(lat[1] - lat[0])) * bin_size + lat_domain = (lat.min() - 0.5 * lat_bin, lat.max() + 0.5 * lat_bin) + lat_ticks = np.linspace(lat_domain[0], lat_domain[1], num=5) + + y = alt.Y( + 'lat:Q', + title='lat', + bin=alt.Bin(step=lat_bin), + axis=alt.Axis(values=lat_ticks), + scale=alt.Scale(domain=lat_domain, padding=0, round=True) + ) - if not color: - color_field = get_data_var(df) - color_label = get_data_var_label(df) + if empty: + color = alt.Color() + else: + color_field = color_field or get_data_var(df) 
+ color_type = color_type or 'Q' + color_label = color_label or get_data_var_label(df) + + color_legend_args = {} + if color_format: + color_legend_args['format'] = color_format color = alt.Color( - f'{color_field}:Q', + f'{color_field}:{color_type}', title=color_label, - scale=alt.Scale() + scale=alt.Scale(range=color_range) if color_range else alt.Scale(), + legend=alt.Legend(padding=10, **color_legend_args) if legend else None ) if empty: - df = pd.DataFrame({'lon': [], 'lat': [], color.to_dict().get('field'): []}) + df = pd.DataFrame({ + 'lon': [], + 'lat': [] + }) return alt.Chart(df).mark_rect().encode(x=x, y=y, color=color).properties( width=lon_size, From 64c9949243d8bc7e289cfd63fe39bf86542197d6 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 7 Oct 2025 16:09:02 +0200 Subject: [PATCH 035/162] Update pre-commit config --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 419443e..259dd5d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: hooks: - id: check-hooks-apply - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v6.0.0 hooks: - id: check-ast - id: check-yaml @@ -11,7 +11,7 @@ repos: - id: trailing-whitespace - id: debug-statements - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.284 + rev: v0.13.3 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] From ceb20cf2cdee4949429ff84451d07beaab355496 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 8 Oct 2025 17:15:59 +0200 Subject: [PATCH 036/162] Add save_index to plot.py --- isimip_utils/plot.py | 61 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 884b765..ec85768 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -1,3 +1,4 @@ +import json import logging from pathlib import Path @@ -32,6 +33,66 @@ def 
save_plot(chart, path, *args, **kwargs): chart.save(path, *args, **kwargs) +def save_index(index_path): + index_json = json.dumps([ + str(p.name) for p in index_path.parent.iterdir() if p.suffix in ['.svg', '.png'] + ], indent=2).replace('\n', '\n ') + + logger.info(f'save {index_path.absolute()}') + index_path.with_suffix('.html').write_text(r''' + + + + + + + + +
+ + + + +
+
+ +
+ + +'''.replace(r'{{ index_json }}', index_json).strip()) + + def get_plot_title(permutation): return { "text": ' Β· '.join(permutation), From c796101ebffa52205dbde61b28f47002c3457319 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 9 Oct 2025 15:37:05 +0200 Subject: [PATCH 037/162] Add log_rich argument to setup logs --- isimip_utils/cli.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 9762eea..c06c8d8 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -12,24 +12,28 @@ def setup_env(): load_dotenv(Path().cwd() / '.env') -def setup_logs(log_level='WARN', log_file=None, log_console=True): +def setup_logs(log_level='WARN', log_file=None, log_console=True, log_rich=True): log_level = log_level.upper() root_logger = logging.getLogger() root_logger.setLevel(log_level) if log_console: - rich_handler = RichHandler() - rich_handler.setLevel(log_level) + if log_rich: + console_handler = RichHandler() + else: + console_handler = logging.StreamHandler() + console_handler.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s')) - root_logger.addHandler(RichHandler()) + console_handler.setLevel(log_level) + root_logger.addHandler(console_handler) if log_file is not None: Path(log_file).parent.mkdir(exist_ok=True, parents=True) file_handler = logging.FileHandler(log_file) file_handler.setLevel(log_level) - file_handler.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s %(name)s: %(message)s')) + file_handler.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s')) root_logger.addHandler(file_handler) From 5e7cc18bb22e73de9d5ce7a75322cdbca7e88958 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 14 Oct 2025 10:25:59 +0200 Subject: [PATCH 038/162] Add set_nan_to_fill_value to write_dataset --- isimip_utils/xarray.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 
4ea75e3..2070b83 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -125,6 +125,7 @@ def write_dataset(ds, path): logger.info(f'write {path.absolute()}') ds = add_fill_value_to_attrs(ds) + ds = set_nan_to_fill_value(ds) ds = order_variables(ds) # time should be an unlimited dimension @@ -201,6 +202,13 @@ def set_fill_value_to_nan(ds): return ds +def set_nan_to_fill_value(ds): + for var in ds.data_vars: + fill_value = ds[var].attrs.get('_FillValue', 1e+20) + ds[var] = ds[var].where(~np.isnan(ds[var]), fill_value) + return ds + + def create_mask(ds, df, layer): import shapely.geometry logger.info('create mask') From 70dd97fa78b4205284df236f240714fb3cee8c80 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 22 Oct 2025 12:58:15 +0200 Subject: [PATCH 039/162] Add config.py --- isimip_utils/cli.py | 41 ++++++++++++++++++++++++++++++++++++-- isimip_utils/config.py | 41 ++++++++++++++++++++++++++++++++++++++ isimip_utils/exceptions.py | 7 +++++++ isimip_utils/fetch.py | 4 ++-- pyproject.toml | 4 +++- 5 files changed, 92 insertions(+), 5 deletions(-) create mode 100644 isimip_utils/config.py diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index c06c8d8..dd6604c 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -2,11 +2,14 @@ import logging import os import tomllib +from datetime import datetime from pathlib import Path from dotenv import load_dotenv from rich.logging import RichHandler +from .exceptions import ConfigError + def setup_env(): load_dotenv(Path().cwd() / '.env') @@ -49,6 +52,17 @@ def parse_list(string): return [value.strip() for value in string.split(',')] +def parse_version(value): + try: + return datetime.strptime(value, '%Y%m%d') + except ValueError as e: + raise argparse.ArgumentTypeError('incorrect format, should be YYYYMMDD') from e + + +def parse_path(value): + return Path(value).expanduser() + + class ArgumentParser(argparse.ArgumentParser): config_files = [ @@ -92,11 +106,34 @@ def build_default_args(self): if 
not action.required and action.dest != 'help': key = action.dest key_upper = key.upper() + + value = None + if os.getenv(key_upper): # if the attribute is in the environment, take the value - setattr(default_args, key, os.getenv(key_upper)) + value = os.getenv(key_upper) + # setattr(default_args, key, ) elif config and key in config: # if the attribute is in the config file, take it from there - setattr(default_args, key, config.get(key)) + value = config.get(key) + + if value is not None: + # apply action.type + if action.type is not None: + try: + value = action.type(value) + except argparse.ArgumentTypeError as e: + raise ConfigError(f'argument "{key}": {e}') from e + + # check action.action + if action.const and value not in [True, False]: + raise ConfigError(f'argument "{key}": invalid choice "{value}" (choose true or false)') + + # check action.choices + if action.choices and value not in action.choices: + raise ConfigError(f'argument "{key}": invalid choice "{value}" (choose from {action.choices})') + + # add the key and value to the default_args + setattr(default_args, key, value) return default_args diff --git a/isimip_utils/config.py b/isimip_utils/config.py new file mode 100644 index 0000000..8d1cb59 --- /dev/null +++ b/isimip_utils/config.py @@ -0,0 +1,41 @@ +import logging + +logger = logging.getLogger(__name__) + + +class Singleton: + _instance = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + cls._instance.data = {} + return cls._instance + + +class Settings(Singleton): + _settings = {} + + def __repr__(self): + return str(self._settings) + + def __getattr__(self, name): + if name in self._settings.keys(): + return self._settings[name] + + def __setattr__(self, name, value): + if name.startswith('_'): + # allow normal attribute for internal data + super().__setattr__(name, value) + else: + self._settings[name] = value + + def dict(self): + return self._settings + + @classmethod + def from_dict(cls, 
values): + instance = cls() + instance._settings = {key.upper(): value for key, value in values.items()} + logger.debug('settings = %s', instance) + return instance diff --git a/isimip_utils/exceptions.py b/isimip_utils/exceptions.py index 031f4d7..69726c2 100644 --- a/isimip_utils/exceptions.py +++ b/isimip_utils/exceptions.py @@ -1,11 +1,18 @@ class ExtractionError(RuntimeError): pass + class ValidationError(RuntimeError): pass + class DidNotMatch(RuntimeError): pass + class NotFound(RuntimeError): pass + + +class ConfigError(RuntimeError): + pass diff --git a/isimip_utils/fetch.py b/isimip_utils/fetch.py index d6b2524..51a5c86 100644 --- a/isimip_utils/fetch.py +++ b/isimip_utils/fetch.py @@ -81,7 +81,7 @@ def fetch_schema(path, protocol_locations=PROTOCOL_LOCATIONS): protocol_locations = [protocol_locations] for protocol_location in protocol_locations: - for schema_path, schema_json in find_json(protocol_location, 'pattern', path): + for schema_path, schema_json in find_json(protocol_location, 'schema', path): if schema_json: logger.debug('schema_path = %s', schema_path) logger.debug('schema_json = %s', schema_json) @@ -95,7 +95,7 @@ def fetch_tree(path, protocol_locations=PROTOCOL_LOCATIONS): protocol_locations = [protocol_locations] for protocol_location in protocol_locations: - for tree_path, tree_json in find_json(protocol_location, 'pattern', path): + for tree_path, tree_json in find_json(protocol_location, 'tree', path): if tree_json: logger.debug('tree_path = %s', tree_path) logger.debug('tree_json = %s', tree_json) diff --git a/pyproject.toml b/pyproject.toml index 70448a3..dd304a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,6 +59,8 @@ version = { attr = "isimip_utils.__version__" } [tool.ruff] target-version = "py311" line-length = 120 + +[tool.ruff.lint] select = [ "B", # flake8-bugbear "C4", # flake8-comprehensions @@ -78,7 +80,7 @@ ignore = [ "RUF012", # mutable-class-default ] -[tool.ruff.isort] +[tool.ruff.lint.isort] known-first-party 
= [ "isimip_utils" ] From 277255f63354d07b225aef92defe83bd923cbccf Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 22 Oct 2025 19:17:46 +0200 Subject: [PATCH 040/162] Move Singleton to utils --- isimip_utils/config.py | 12 ++---------- isimip_utils/utils.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/isimip_utils/config.py b/isimip_utils/config.py index 8d1cb59..4272990 100644 --- a/isimip_utils/config.py +++ b/isimip_utils/config.py @@ -1,16 +1,8 @@ import logging -logger = logging.getLogger(__name__) - +from .utils import Singleton -class Singleton: - _instance = None - - def __new__(cls): - if cls._instance is None: - cls._instance = super().__new__(cls) - cls._instance.data = {} - return cls._instance +logger = logging.getLogger(__name__) class Settings(Singleton): diff --git a/isimip_utils/utils.py b/isimip_utils/utils.py index 780bd08..187dcb3 100644 --- a/isimip_utils/utils.py +++ b/isimip_utils/utils.py @@ -1,6 +1,16 @@ from itertools import product +class Singleton: + _instance = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + cls._instance.data = {} + return cls._instance + + def parse_filelist(filelist_file): if filelist_file: with open(filelist_file) as f: From 261a4211230831301c188ea01d7527110633ba66 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 22 Oct 2025 19:19:27 +0200 Subject: [PATCH 041/162] Add inverse argument to mask_mask --- isimip_utils/extractions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py index 1e8fa23..ad3dae9 100644 --- a/isimip_utils/extractions.py +++ b/isimip_utils/extractions.py @@ -88,9 +88,9 @@ def mask_bbox(ds, west, east, south, north): return ds -def mask_mask(ds, mask_ds, mask_var='mask'): +def mask_mask(ds, mask_ds, mask_var='mask', inverse=False): logger.info(f'mask {mask_var}') - return ds.where(mask_ds[mask_var] == 1) + return 
ds.where(mask_ds[mask_var] == 0 if inverse else 1) def compute_spatial_average(ds, weights=None): From ac428ae715d5bdc4ae40dc8ea06d763920ee0a46 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 22 Oct 2025 19:20:25 +0200 Subject: [PATCH 042/162] Remove load_dataset and cache_dataset --- isimip_utils/xarray.py | 71 +++++++++++++----------------------------- 1 file changed, 22 insertions(+), 49 deletions(-) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 2070b83..e39695c 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -7,8 +7,6 @@ logger = logging.getLogger(__name__) -_dataset_cache = {} - def init_dataset(lon=720, lat=360, time=None, time_unit='days since 1601-1-1 00:00:00', @@ -76,19 +74,10 @@ def init_dataset(lon=720, lat=360, time=None, return ds -def open_dataset(path, decode_cf=False, load=False, cache=False): +def open_dataset(path, decode_cf=False, load=False): path = Path(path) - if load and cache: - key = (path, decode_cf) - if key in _dataset_cache: - logger.info(f'use cached {path.absolute()}') - return _dataset_cache[key] - - if not load: - logger.info(f'open {path.absolute()}') - else: - logger.info(f'load {path.absolute()}') + logger.info(f'load {path.absolute()}' if load else f'open {path.absolute()}') try: ds = xr.open_dataset(path, decode_cf=decode_cf) @@ -104,20 +93,9 @@ def open_dataset(path, decode_cf=False, load=False, cache=False): if load: ds.load() - if load and cache: - _dataset_cache[key] = ds - return ds -def load_dataset(path, decode_cf=False): - return open_dataset(path, decode_cf=False, load=True) - - -def cache_dataset(path, decode_cf=False): - return open_dataset(path, decode_cf=False, load=True, cache=True) - - def write_dataset(ds, path): path = Path(path) path.parent.mkdir(exist_ok=True, parents=True) @@ -134,11 +112,6 @@ def write_dataset(ds, path): ds.to_netcdf(path, format='NETCDF4_CLASSIC', unlimited_dims=unlimited_dims) -def clear_cache(): - for key in _dataset_cache.keys(): - del 
_dataset_cache[key] - - def order_variables(ds): return ds[[*ds.coords, *ds.data_vars]] @@ -175,26 +148,6 @@ def add_fill_value_to_attrs(ds): return ds -def to_dataframe(ds): - if 'time' in ds.coords: - ds.coords['time'] = ds.coords['time'].astype('datetime64[ns]') - - ds = ds.assign({ - data_var: ds[data_var].astype('float64') - for data_var in ds.data_vars - }) - - df = ds.to_dataframe().reset_index() - df.attrs['coords'] = { - coord: ds[coord].attrs for coord in ds.coords - } - df.attrs['data_vars'] = { - data_var: ds[data_var].attrs for data_var in ds.data_vars - } - - return df - - def set_fill_value_to_nan(ds): for var in ds.data_vars: fill_value = ds[var].attrs.get('_FillValue', 1e+20) @@ -228,3 +181,23 @@ def create_mask(ds, df, layer): mask_ds = mask_ds.rio.clip([geometry], drop=False) mask_ds = mask_ds.drop_vars('spatial_ref') return mask_ds + + +def to_dataframe(ds): + if 'time' in ds.coords: + ds.coords['time'] = ds.coords['time'].astype('datetime64[ns]') + + ds = ds.assign({ + data_var: ds[data_var].astype('float64') + for data_var in ds.data_vars + }) + + df = ds.to_dataframe().reset_index() + df.attrs['coords'] = { + coord: ds[coord].attrs for coord in ds.coords + } + df.attrs['data_vars'] = { + data_var: ds[data_var].attrs for data_var in ds.data_vars + } + + return df From c4cd4041a8fba515461424823779344eabaa0e5d Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 24 Oct 2025 17:21:20 +0200 Subject: [PATCH 043/162] Update README.md --- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3fb074c..77d5cb6 100644 --- a/README.md +++ b/README.md @@ -15,12 +15,19 @@ This package contains common functionality for different ISIMIP tools, namely: It comprises of: * `isimip_utils.checksum`: Functions to compute the SHA-512 checksum of a file. +* `isimip_utils.cli`: Command-line interface utilities for argument parsing and configuration. 
* `isimip_utils.config`: A settings class to combine input from `argparse`, the environment (via `python-dotenv`) and config files. +* `isimip_utils.decorators`: Decorators including a cached property implementation. * `isimip_utils.exceptions`: Custom exceptions for ISIMIP tools. +* `isimip_utils.extractions`: Data extraction and manipulation utilities for xarray datasets. * `isimip_utils.fetch`: Functions to fetch files from the machine-actionable ISIMIP protocols. -* `isimip_utils.netcdf`: Functions to open and read NetCDF files. -* `isimip_utils.patterns`: Functions to match the file names and extract the ISIMIP specifiers. +* `isimip_utils.files`: File search utilities with regex pattern matching. +* `isimip_utils.netcdf`: Functions to open and read NetCDF files using netCDF4. +* `isimip_utils.pandas`: DataFrame utilities for ISIMIP data processing. +* `isimip_utils.patterns`: Functions to match file names and extract ISIMIP specifiers. +* `isimip_utils.plot`: Plotting utilities using Altair for data visualization. * `isimip_utils.utils`: Additional utility functions. +* `isimip_utils.xarray`: Functions for working with xarray datasets. 
Setup From f813707f1e6981132dec16f0e5f354f12f826c50 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 27 Oct 2025 13:49:00 +0100 Subject: [PATCH 044/162] Add docstrings and types --- isimip_utils/checksum.py | 27 +++++- isimip_utils/cli.py | 78 +++++++++++++--- isimip_utils/config.py | 34 +++++-- isimip_utils/decorators.py | 22 +++-- isimip_utils/exceptions.py | 8 ++ isimip_utils/extractions.py | 176 +++++++++++++++++++++++++++++++++--- isimip_utils/fetch.py | 92 +++++++++++++++++-- isimip_utils/files.py | 13 ++- isimip_utils/netcdf.py | 116 +++++++++++++++++++++--- isimip_utils/pandas.py | 106 ++++++++++++++++++++-- isimip_utils/patterns.py | 89 ++++++++++++++++-- isimip_utils/plot.py | 102 ++++++++++++++++++--- isimip_utils/utils.py | 108 ++++++++++++++++++++-- isimip_utils/xarray.py | 142 ++++++++++++++++++++++++++--- 14 files changed, 1000 insertions(+), 113 deletions(-) diff --git a/isimip_utils/checksum.py b/isimip_utils/checksum.py index 8155f41..80c03d7 100644 --- a/isimip_utils/checksum.py +++ b/isimip_utils/checksum.py @@ -1,5 +1,7 @@ +"""Checksum computation utilities for file integrity verification.""" import hashlib import logging +from pathlib import Path logger = logging.getLogger(__name__) @@ -7,7 +9,16 @@ CHECKSUM_TYPE = 'sha512' -def get_checksum(abspath, checksum_type=CHECKSUM_TYPE): +def get_checksum(abspath: str | Path, checksum_type: str = CHECKSUM_TYPE) -> str: + """Compute the checksum of a file. + + Args: + abspath (str | Path): Absolute path to the file to checksum. + checksum_type (str): Type of checksum algorithm to use (default: 'sha512'). + + Returns: + The hexadecimal digest string of the file's checksum. + """ m = hashlib.new(checksum_type) with open(abspath, 'rb') as f: # read and update in blocks of 64K @@ -16,9 +27,19 @@ def get_checksum(abspath, checksum_type=CHECKSUM_TYPE): return m.hexdigest() -def get_checksum_type(): +def get_checksum_type() -> str: + """Get the default checksum type. 
+ + Returns: + The default checksum algorithm name (e.g., 'sha512'). + """ return CHECKSUM_TYPE -def get_checksum_suffix(): +def get_checksum_suffix() -> str: + """Get the file suffix for checksum files. + + Returns: + The checksum file extension (e.g., '.sha512'). + """ return '.' + CHECKSUM_TYPE diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index dd6604c..060c91e 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -1,3 +1,4 @@ +"""Command-line interface utilities for ISIMIP tools.""" import argparse import logging import os @@ -11,11 +12,21 @@ from .exceptions import ConfigError -def setup_env(): +def setup_env() -> None: + """Load environment variables from .env file in current working directory.""" load_dotenv(Path().cwd() / '.env') -def setup_logs(log_level='WARN', log_file=None, log_console=True, log_rich=True): +def setup_logs(log_level: str = 'WARN', log_file: str | None = None, + log_console: bool = True, log_rich: bool = True) -> None: + """Configure logging with console and/or file handlers. + + Args: + log_level (str): Logging level (default: 'WARN'). + log_file (str | None): Path to log file, or None for no file logging (default: None). + log_console (bool): Whether to log to console (default: True). + log_rich (bool): Whether to use RichHandler for console logging (default: True). + """ log_level = log_level.upper() root_logger = logging.getLogger() @@ -41,29 +52,74 @@ def setup_logs(log_level='WARN', log_file=None, log_console=True, log_rich=True) root_logger.addHandler(file_handler) -def parse_dict(string): +def parse_dict(string: str) -> dict[str, list[str]]: + """Parse a string in format 'key=value1,value2' into a dictionary. + + Args: + string (str): String to parse in format 'key=value1,value2,value3'. + + Returns: + Dictionary with single key mapping to list of values. 
+ """ key, values = string.split('=') return { key.strip(): [value.strip() for value in values.split(',')] } -def parse_list(string): +def parse_list(string: str) -> list[str]: + """Parse a comma-separated string into a list. + + Args: + string (str): Comma-separated string to parse. + + Returns: + List of stripped values. + """ return [value.strip() for value in string.split(',')] -def parse_version(value): +def parse_version(value: str) -> datetime: + """Parse a version string in YYYYMMDD format. + + Args: + value (str): Version string in YYYYMMDD format. + + Returns: + Parsed datetime object. + + Raises: + argparse.ArgumentTypeError: If format is incorrect. + """ try: return datetime.strptime(value, '%Y%m%d') except ValueError as e: raise argparse.ArgumentTypeError('incorrect format, should be YYYYMMDD') from e -def parse_path(value): +def parse_path(value: str) -> Path: + """Parse and expand a path string. + + Args: + value (str): Path string to parse. + + Returns: + Expanded Path object. + """ return Path(value).expanduser() class ArgumentParser(argparse.ArgumentParser): + """Extended ArgumentParser that reads defaults from config files and environment. + + Supports reading configuration from TOML files in the following order: + + - `./isimip.toml` + - `~/.isimip.toml` + - `/etc/isimip.toml` + + Environment variables (uppercase) override config file values. 
+ """ config_files = [ 'isimip.toml', @@ -71,12 +127,10 @@ class ArgumentParser(argparse.ArgumentParser): '/etc/isimip.toml', ] - def parse_args(self, *args): - # parse the command line arguments with the default namespace - # obtained from the config file and the environment + def parse_args(self, *args) -> argparse.Namespace: return super().parse_args(*args, namespace=self.build_default_args()) - def get_defaults(self): + def get_defaults(self) -> dict: defaults = {} for action in self._actions: if not action.required and action.dest != 'help': @@ -85,7 +139,7 @@ def get_defaults(self): defaults.update(vars(self.build_default_args())) return defaults - def read_config(self): + def read_config(self) -> dict: for config_file in self.config_files: config_path = Path(config_file).expanduser() if config_path.is_file(): @@ -95,7 +149,7 @@ def read_config(self): return data[self.prog] return {} - def build_default_args(self): + def build_default_args(self) -> argparse.Namespace: # read config file config = self.read_config() diff --git a/isimip_utils/config.py b/isimip_utils/config.py index 4272990..8f3b3af 100644 --- a/isimip_utils/config.py +++ b/isimip_utils/config.py @@ -1,4 +1,6 @@ +"""Configuration management for ISIMIP tools.""" import logging +from typing import Any from .utils import Singleton @@ -6,27 +8,47 @@ class Settings(Singleton): - _settings = {} + """Singleton settings class for managing application configuration. - def __repr__(self): + This class provides a centralized settings store that combines input from + argparse, environment variables, and config files. Settings are stored as + uppercase keys and can be accessed as attributes. 
+ """ + _settings: dict[str, Any] = {} + + def __repr__(self) -> str: return str(self._settings) - def __getattr__(self, name): + def __getattr__(self, name: str) -> Any: if name in self._settings.keys(): return self._settings[name] - def __setattr__(self, name, value): + def __setattr__(self, name: str, value: Any) -> None: if name.startswith('_'): # allow normal attribute for internal data super().__setattr__(name, value) else: self._settings[name] = value - def dict(self): + def dict(self) -> dict[str, Any]: + """Return the settings as a dictionary. + + Returns: + Dictionary of all settings. + """ return self._settings @classmethod - def from_dict(cls, values): + def from_dict(cls, values: dict[str, Any]) -> 'Settings': + """Create a Settings instance from a dictionary. + + Args: + values (dict[str, Any]): Dictionary of setting key-value pairs. + + Returns: + A Settings instance populated with the provided values. + All keys are converted to uppercase. + """ instance = cls() instance._settings = {key.upper(): value for key, value in values.items()} logger.debug('settings = %s', instance) diff --git a/isimip_utils/decorators.py b/isimip_utils/decorators.py index c3b165f..4b988aa 100644 --- a/isimip_utils/decorators.py +++ b/isimip_utils/decorators.py @@ -1,23 +1,29 @@ -""" -Simplified version of Django'd cached_property -https://github.com/django/django/blob/main/django/utils/functional.py -""" +from collections.abc import Callable +from typing import Any class cached_property: + """Decorator that converts a method into a cached property. - name = None + The property value is computed once and then cached as an instance attribute. + Subsequent accesses return the cached value without re-computing. - def __init__(self, func): + Simplified version of + [Django's cached_property](https://github.com/django/django/blob/main/django/utils/functional.py). 
+ """ + + name: str | None = None + + def __init__(self, func: Callable) -> None: self.func = func - def __set_name__(self, owner, name): + def __set_name__(self, owner: type, name: str) -> None: if self.name is None: self.name = name else: raise TypeError("Cannot assign the same cached_property to two different names") - def __get__(self, instance, cls=None): + def __get__(self, instance: Any, cls: type | None = None) -> Any: if instance is None: return self value = instance.__dict__[self.name] = self.func(instance) diff --git a/isimip_utils/exceptions.py b/isimip_utils/exceptions.py index 69726c2..7d67629 100644 --- a/isimip_utils/exceptions.py +++ b/isimip_utils/exceptions.py @@ -1,18 +1,26 @@ +"""Custom exceptions for ISIMIP tools.""" + + class ExtractionError(RuntimeError): + """Raised when data extraction operations fail.""" pass class ValidationError(RuntimeError): + """Raised when data validation fails.""" pass class DidNotMatch(RuntimeError): + """Raised when a pattern does not match the expected format.""" pass class NotFound(RuntimeError): + """Raised when a required resource or file is not found.""" pass class ConfigError(RuntimeError): + """Raised when there is an error in configuration.""" pass diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py index ad3dae9..708158a 100644 --- a/isimip_utils/extractions.py +++ b/isimip_utils/extractions.py @@ -1,4 +1,6 @@ +"""Data extraction and manipulation utilities for xarray datasets.""" import logging +from datetime import datetime import cftime import numpy as np @@ -9,7 +11,16 @@ logger = logging.getLogger(__name__) -def select_time(ds, timestamp): +def select_time(ds: xr.Dataset, timestamp: datetime) -> xr.Dataset | None: + """Select a single time point from a dataset. + + Args: + ds (xr.Dataset): Dataset with time dimension. + timestamp (datetime): Timestamp to select. + + Returns: + Dataset at the selected time, or None if timestamp is outside range. 
+ """ logger.info(f'select time time={timestamp}') time = compute_time(ds, timestamp) if time < 0 or time > ds['time'].max(): @@ -19,7 +30,20 @@ def select_time(ds, timestamp): return ds.sel(time=time, method='nearest') -def select_period(ds, start, end): +def select_period(ds: xr.Dataset, start: datetime | None, end: datetime | None) -> xr.Dataset: + """Select a time period from a dataset. + + Args: + ds (xr.Dataset): Dataset with time dimension. + start (datetime | None): Start of period, or None for beginning. + end (datetime | None): End of period, or None for end. + + Returns: + Dataset with time dimension sliced to the period. + + Raises: + ExtractionError: If no time axis remains after selection. + """ logger.info(f'select period start={start} end={end}') units = ds.coords['time'].attrs['units'] calendar = ds.coords['time'].attrs['calendar'] @@ -35,14 +59,43 @@ def select_period(ds, start, end): return ds -def select_point(ds, lat, lon): +def select_point(ds: xr.Dataset, lat: float, lon: float) -> xr.Dataset: + """Select a single geographic point from a dataset. + + Args: + ds (xr.Dataset): Dataset with lat/lon dimensions. + lat (float): Latitude (-90 to 90). + lon (float): Longitude (-180 to 180). + + Returns: + Dataset at the nearest grid point. + + Raises: + ValidationError: If lat/lon are out of valid range. + """ logger.info(f'select point lat={lat} lon={lon}') validate_lat(lat) validate_lon(lon) return ds.sel(lat=lat, lon=lon, method='nearest') -def select_bbox(ds, west, east, south, north): +def select_bbox(ds: xr.Dataset, west: float, east: float, south: float, north: float) -> xr.Dataset: + """Select a bounding box region from a dataset. + + Args: + ds (xr.Dataset): Dataset with lat/lon dimensions. + west (float): Western longitude boundary (-180 to 180). + east (float): Eastern longitude boundary (-180 to 180). + south (float): Southern latitude boundary (-90 to 90). + north (float): Northern latitude boundary (-90 to 90). 
+ + Returns: + Dataset with lat/lon dimensions sliced to the bounding box. + + Raises: + ValidationError: If coordinates are out of valid range. + ExtractionError: If no lat or lon axis remains after selection. + """ logger.info(f'cutout bbox west={west} east={east} south={south} east={north}') validate_lat(south) @@ -63,7 +116,22 @@ def select_bbox(ds, west, east, south, north): return ds -def mask_bbox(ds, west, east, south, north): +def mask_bbox(ds: xr.Dataset, west: float, east: float, south: float, north: float) -> xr.Dataset: + """Mask a dataset to a bounding box, setting values outside to NaN. + + Args: + ds (xr.Dataset): Dataset with lat/lon dimensions. + west (float): Western longitude boundary (-180 to 180). + east (float): Eastern longitude boundary (-180 to 180). + south (float): Southern latitude boundary (-90 to 90). + north (float): Northern latitude boundary (-90 to 90). + + Returns: + Dataset with values outside bounding box masked as NaN. + + Raises: + ValidationError: If coordinates are out of valid range. + """ logger.info(f'cutout bbox west={west} east={east} south={south} east={north}') validate_lat(south) @@ -88,12 +156,33 @@ def mask_bbox(ds, west, east, south, north): return ds -def mask_mask(ds, mask_ds, mask_var='mask', inverse=False): +def mask_mask(ds: xr.Dataset, mask_ds: xr.Dataset, mask_var: str = 'mask', + inverse: bool = False) -> xr.Dataset: + """Apply a mask dataset to another dataset. + + Args: + ds (xr.Dataset): Dataset to mask. + mask_ds (xr.Dataset): Dataset containing mask variable. + mask_var (str): Name of mask variable (default: 'mask'). + inverse (bool): Whether to invert the mask (default: False). + + Returns: + Masked dataset with values where mask is 1 (or 0 if inverse=True). 
+ """ logger.info(f'mask {mask_var}') return ds.where(mask_ds[mask_var] == 0 if inverse else 1) -def compute_spatial_average(ds, weights=None): +def compute_spatial_average(ds: xr.Dataset, weights: xr.DataArray | None = None) -> xr.Dataset: + """Compute spatial average over lat/lon dimensions. + + Args: + ds (xr.Dataset): Dataset with lat/lon dimensions. + weights (xr.DataArray | None): Weights for averaging. If None, uses latitude-dependent weights. + + Returns: + Dataset with lat/lon dimensions averaged out. + """ logger.info('compute spatial average') if weights is None: @@ -103,17 +192,42 @@ def compute_spatial_average(ds, weights=None): return ds.weighted(weights).mean(dim=('lat', 'lon'), skipna=True).astype(np.float32) -def compute_temporal_average(ds): +def compute_temporal_average(ds: xr.Dataset) -> xr.Dataset: + """Compute temporal average over time dimension. + + Args: + ds (xr.Dataset): Dataset with time dimension. + + Returns: + Dataset with time dimension averaged out. + """ logger.info('compute temporal average') return ds.mean(dim='time', skipna=True).astype(np.float32) -def count_values(ds): +def count_values(ds: xr.Dataset) -> xr.Dataset: + """Count non-NaN values over lat/lon dimensions. + + Args: + ds (xr.Dataset): Dataset with lat/lon dimensions. + + Returns: + Dataset with count of non-NaN values per time step. + """ logger.info('count values') return ds.count(dim=('lat', 'lon')).astype(np.float32) -def concat_extraction(ds1, ds2): +def concat_extraction(ds1: xr.Dataset | None, ds2: xr.Dataset) -> xr.Dataset: + """Concatenate two datasets along time dimension with offset correction. + + Args: + ds1 (xr.Dataset | None): First dataset, or None. + ds2 (xr.Dataset): Second dataset to concatenate. + + Returns: + Concatenated dataset, or copy of ds2 if ds1 is None. 
+ """ if ds1 is None: return ds2.copy() elif 'time' not in ds2.sizes: @@ -127,13 +241,31 @@ def concat_extraction(ds1, ds2): return xr.concat([ds1, ds2], 'time') -def compute_time(ds, timestamp): +def compute_time(ds: xr.Dataset, timestamp: datetime | None) -> float | None: + """Convert a datetime to numeric time value for dataset. + + Args: + ds (xr.Dataset): Dataset with time coordinate containing units and calendar. + timestamp (datetime | None): Timestamp to convert, or None. + + Returns: + Numeric time value in dataset's units, or None if timestamp is None. + """ units = ds.coords['time'].attrs['units'] calendar = ds.coords['time'].attrs['calendar'] return cftime.date2num(timestamp, units=units, calendar=calendar) if timestamp else None -def compute_offset(ds1, ds2): +def compute_offset(ds1: xr.Dataset, ds2: xr.Dataset) -> xr.DataArray | None: + """Compute time offset between two datasets with different time units. + + Args: + ds1 (xr.Dataset): First dataset with time coordinate. + ds2 (xr.Dataset): Second dataset with time coordinate. + + Returns: + Time offset to apply to ds2, or None if units/calendars match. + """ units1 = ds1.coords['time'].attrs['units'] units2 = ds2.coords['time'].attrs['units'] calendar1 = ds1.coords['time'].attrs['calendar'] @@ -147,14 +279,30 @@ def compute_offset(ds1, ds2): return offset -def validate_lat(lat): +def validate_lat(lat: float) -> None: + """Validate latitude value is within valid range. + + Args: + lat (float): Latitude value to validate. + + Raises: + ValidationError: If latitude is outside -90 to 90 range. + """ if lat < -90: raise ValidationError(f'lat={lat} must be > -90') elif lat > 90: raise ValidationError(f'lat={lat} must be < 90') -def validate_lon(lon): +def validate_lon(lon: float) -> None: + """Validate longitude value is within valid range. + + Args: + lon (float): Longitude value to validate. + + Raises: + ValidationError: If longitude is outside -180 to 180 range. 
+ """ if lon < -180: raise ValidationError(f'lon={lon} must be > -180') elif lon > 180: diff --git a/isimip_utils/fetch.py b/isimip_utils/fetch.py index 51a5c86..5989865 100644 --- a/isimip_utils/fetch.py +++ b/isimip_utils/fetch.py @@ -1,8 +1,11 @@ +"""Functions to fetch files from machine-actionable ISIMIP protocols.""" import json import logging import os import re +from collections.abc import Generator from pathlib import Path +from typing import Any from urllib.parse import urlparse import requests @@ -16,7 +19,19 @@ 'https://protocol2.isimip.org', ] -def fetch_definitions(path, protocol_locations=PROTOCOL_LOCATIONS): +def fetch_definitions(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> dict[str, Any]: + """Fetch definitions from ISIMIP protocol locations. + + Args: + path (str | Path): Path to search for definitions. + protocol_locations (str | list[str]): List of protocol locations to search (default: https://protocol.isimip.org). + + Returns: + Dictionary of definitions with specifiers as keys. + + Raises: + NotFound: If no definitions are found for the given path. + """ if isinstance(protocol_locations, str): protocol_locations = [protocol_locations] @@ -42,7 +57,20 @@ def fetch_definitions(path, protocol_locations=PROTOCOL_LOCATIONS): raise NotFound(f'no definitions found for {path}') -def fetch_pattern(path, protocol_locations=PROTOCOL_LOCATIONS): +def fetch_pattern(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> dict[str, Any]: + """Fetch pattern definitions from ISIMIP protocol locations. + + Args: + path (str | Path): Path to search for patterns. + protocol_locations (str | list[str]): List of protocol locations to search (default: https://protocol.isimip.org). + + Returns: + Dictionary containing compiled regex patterns for 'path', 'file', 'dataset', + and lists of 'suffix', 'specifiers', and 'specifiers_map'. + + Raises: + NotFound: If no pattern is found for the given path. 
+ """ if isinstance(protocol_locations, str): protocol_locations = [protocol_locations] @@ -76,7 +104,19 @@ def fetch_pattern(path, protocol_locations=PROTOCOL_LOCATIONS): raise NotFound(f'no pattern found for {path}') -def fetch_schema(path, protocol_locations=PROTOCOL_LOCATIONS): +def fetch_schema(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> Any: + """Fetch schema from ISIMIP protocol locations. + + Args: + path (str | Path): Path to search for schema. + protocol_locations (str | list[str]): List of protocol locations to search (default: https://protocol.isimip.org). + + Returns: + Schema JSON object. + + Raises: + NotFound: If no schema is found for the given path. + """ if isinstance(protocol_locations, str): protocol_locations = [protocol_locations] @@ -90,7 +130,19 @@ def fetch_schema(path, protocol_locations=PROTOCOL_LOCATIONS): raise NotFound(f'no schema found for {path}') -def fetch_tree(path, protocol_locations=PROTOCOL_LOCATIONS): +def fetch_tree(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> Any: + """Fetch tree structure from ISIMIP protocol locations. + + Args: + path (str | Path): Path to search for tree structure. + protocol_locations (str | list[str]): List of protocol locations to search (default: https://protocol.isimip.org). + + Returns: + Tree JSON object. + + Raises: + NotFound: If no tree is found for the given path. + """ if isinstance(protocol_locations, str): protocol_locations = [protocol_locations] @@ -104,7 +156,17 @@ def fetch_tree(path, protocol_locations=PROTOCOL_LOCATIONS): raise NotFound(f'no tree found for {path}') -def find_json(protocol_location, sub_location, path): +def find_json(protocol_location: str, sub_location: str, path: str | Path) -> Generator[tuple[Path, Any], None, None]: + """Find JSON files in protocol locations by traversing path components. + + Args: + protocol_location (str): Base protocol location URL or path. 
+ sub_location (str): Subdirectory within protocol location (e.g., 'definitions', 'pattern'). + path (str | Path): Path to search for JSON files. + + Yields: + Tuples of (current_path, json_content) for each path component. + """ path_components = Path(path).parts for i in range(len(path_components), 0, -1): current_path = Path(os.sep.join(path_components[:i+1])).with_suffix('.json') @@ -115,7 +177,15 @@ def find_json(protocol_location, sub_location, path): yield current_path, load_json(Path(protocol_location) / 'output' / sub_location / current_path) -def fetch_json(location): +def fetch_json(location: str) -> Any | None: + """Fetch JSON content from a URL. + + Args: + location (str): URL to fetch JSON from. + + Returns: + Parsed JSON object, or None if request fails or status is not 200. + """ logger.debug('location = %s', location) try: @@ -127,7 +197,15 @@ def fetch_json(location): return response.json() -def load_json(path): +def load_json(path: str | Path) -> Any | None: + """Load JSON content from a local file. + + Args: + path (str | Path): Path to the JSON file. + + Returns: + Parsed JSON object, or None if file doesn't exist. + """ path = Path(path).expanduser() logger.debug('path = %s', path) diff --git a/isimip_utils/files.py b/isimip_utils/files.py index deb646a..9732389 100644 --- a/isimip_utils/files.py +++ b/isimip_utils/files.py @@ -1,7 +1,18 @@ +"""File search utilities for ISIMIP tools.""" import re +from pathlib import Path -def find_files(base_path, pattern): +def find_files(base_path: Path, pattern: str) -> list[dict]: + """Find files matching a regex pattern in a directory tree. + + Args: + base_path (Path): Base directory to search in. + pattern (str): Regular expression pattern to match against file paths. + + Returns: + List of dictionaries containing 'path' and any named groups from the regex match. 
+ """ files = [] for path in sorted(base_path.rglob("*")): match = re.search(str(pattern), str(path), re.IGNORECASE) diff --git a/isimip_utils/netcdf.py b/isimip_utils/netcdf.py index 0bcf20f..ee3aad7 100644 --- a/isimip_utils/netcdf.py +++ b/isimip_utils/netcdf.py @@ -1,4 +1,7 @@ +"""Functions to open and read NetCDF files using netCDF4.""" from datetime import datetime +from pathlib import Path +from typing import Any import numpy as np from netCDF4 import Dataset @@ -9,17 +12,49 @@ INT_TYPES = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64] -def open_dataset_read(file_path): +def open_dataset(file_path: str | Path) -> Dataset: + """Open a NetCDF dataset in read-only mode. + + Args: + file_path (str | Path): Path to the NetCDF file. + + Returns: + NetCDF4 Dataset object opened in read mode. + """ return Dataset(file_path, 'r') -def open_dataset_write(file_path): +def open_dataset_write(file_path: str | Path) -> Dataset: + """Open a NetCDF dataset in read/write mode. + + Args: + file_path (str | Path): Path to the NetCDF file. + + Returns: + NetCDF4 Dataset object opened in read/write mode. + """ return Dataset(file_path, 'r+') -def init_dataset(file_path, diskless=False, lon=720, lat=360, time=True, - time_unit='days since 1601-1-1 00:00:00', - time_calendar='proleptic_gregorian', attrs={}, **variables): +def init_dataset(file_path: str | Path, diskless: bool = False, lon: int = 720, lat: int = 360, + time: None | np.ndarray = None, time_unit: str = 'days since 1601-1-1 00:00:00', + time_calendar: str = 'proleptic_gregorian', attrs: dict = {}, **variables: Any) -> Dataset: + """Initialize a new NetCDF4 dataset with standard dimensions and variables. + + Args: + file_path (str | Path): Path where the NetCDF file will be created. + diskless (bool): If True, create dataset in memory (default: False). + lon (int): Number of longitude points (default: 720). + lat (int): Number of latitude points (default: 360). 
+ time (None | np.ndarray): Time dimension configuration (default: None). + time_unit (str): Units for the time dimension (default: 'days since 1601-1-1 00:00:00'). + time_calendar (str): Calendar type for time dimension (default: 'proleptic_gregorian'). + attrs (dict): Dictionary of attributes for variables and global attributes. + **variables (Any): Data variables to create in the dataset. + + Returns: + Initialized NetCDF4 Dataset object. + """ # create NetCDF dataset ds = Dataset(file_path, 'w', format='NETCDF4_CLASSIC', diskless=diskless) @@ -83,11 +118,27 @@ def init_dataset(file_path, diskless=False, lon=720, lat=360, time=True, return ds -def get_data_model(dataset): +def get_data_model(dataset: Dataset) -> str: + """Get the data model of a NetCDF dataset. + + Args: + dataset (Dataset): NetCDF4 Dataset object. + + Returns: + String representing the data model (e.g., 'NETCDF4', 'NETCDF4_CLASSIC'). + """ return dataset.data_model -def get_dimensions(dataset): +def get_dimensions(dataset: Dataset) -> dict[str, int]: + """Get dimensions from a NetCDF dataset. + + Args: + dataset (Dataset): NetCDF4 Dataset object. + + Returns: + Dictionary mapping dimension names to their sizes. + """ dimensions = {} for dimension_name, dimension in dataset.dimensions.items(): dimensions[dimension_name] = dimension.size @@ -95,7 +146,16 @@ def get_dimensions(dataset): return dimensions -def get_variables(dataset, convert=False): +def get_variables(dataset: Dataset, convert: bool = False) -> dict[str, Any]: + """Get variables and their attributes from a NetCDF dataset. + + Args: + dataset (Dataset): NetCDF4 Dataset object. + convert (bool): If True, convert numpy types to Python types (default: False). + + Returns: + Dictionary mapping variable names to their attributes and dimensions. 
+ """ variables = {} for variable_name, variable in dataset.variables.items(): @@ -111,7 +171,16 @@ def get_variables(dataset, convert=False): return variables -def get_global_attributes(dataset, convert=False): +def get_global_attributes(dataset: Dataset, convert: bool = False) -> dict[str, Any]: + """Get global attributes from a NetCDF dataset. + + Args: + dataset (Dataset): NetCDF4 Dataset object. + convert (bool): If True, convert numpy types to Python types (default: False). + + Returns: + Dictionary of global attributes. + """ if convert: global_attributes = {} for key, value in dataset.__dict__.items(): @@ -122,7 +191,15 @@ def get_global_attributes(dataset, convert=False): return global_attributes -def convert_attribute(value): +def convert_attribute(value: Any) -> Any: + """Convert numpy types to Python native types. + + Args: + value (Any): Value to convert (may be numpy array, float, int, or other type). + + Returns: + Converted value with Python native types. + """ if type(value) in LIST_TYPES: value = [convert_attribute(v) for v in value] elif type(value) in FLOAT_TYPES: @@ -132,7 +209,14 @@ def convert_attribute(value): return value -def update_global_attributes(dataset, set_attributes={}, delete_attributes=[]): +def update_global_attributes(dataset: Dataset, set_attributes: dict = {}, delete_attributes: list = []) -> None: + """Update global attributes of a NetCDF dataset. + + Args: + dataset (Dataset): NetCDF4 Dataset object. + set_attributes (dict): Dictionary of attributes to set or update. + delete_attributes (list): List of attribute names to delete. + """ for attr in dataset.__dict__: if attr in delete_attributes: dataset.delncattr(attr) @@ -141,7 +225,15 @@ def update_global_attributes(dataset, set_attributes={}, delete_attributes=[]): dataset.setncattr(attr, value2string(value)) -def value2string(value): +def value2string(value: Any) -> str: + """Convert a value to string representation. + + Args: + value (Any): Value to convert. 
Datetime objects get ISO format with 'Z' suffix. + + Returns: + String representation of the value. + """ if isinstance(value, datetime): return value.isoformat() + 'Z', else: diff --git a/isimip_utils/pandas.py b/isimip_utils/pandas.py index a70d892..0cb4f11 100644 --- a/isimip_utils/pandas.py +++ b/isimip_utils/pandas.py @@ -1,31 +1,84 @@ -def get_coord(df): +"""Pandas DataFrame utilities for ISIMIP data.""" +import pandas as pd + + +def get_coord(df: pd.DataFrame) -> str: + """Get the first coordinate name from DataFrame attributes. + + Args: + df (pd.DataFrame): DataFrame with 'coords' in attrs. + + Returns: + Name of the first coordinate. + """ return next(iter(df.attrs['coords'])) -def get_coord_label(df): +def get_coord_label(df: pd.DataFrame) -> str: + """Get a formatted label for the coordinate with units. + + Args: + df (pd.DataFrame): DataFrame with 'coords' in attrs. + + Returns: + Formatted string like "Coordinate Name [units]" or just the name if no units. + """ coord = get_coord(df) name = df.attrs['coords'][coord].get('long_name', coord) units = df.attrs['coords'][coord].get('units') return f'{name} [{units}]' if units else name -def get_coord_axis(df): +def get_coord_axis(df: pd.DataFrame) -> str | None: + """Get the axis attribute for the coordinate. + + Args: + df (pd.DataFrame): DataFrame with 'coords' in attrs. + + Returns: + Axis attribute (e.g., 'T', 'X', 'Y'), or None if not set. + """ coord = get_coord(df) return df.attrs['coords'][coord].get('axis') -def get_data_var(df): +def get_data_var(df: pd.DataFrame) -> str: + """Get the first data variable name from DataFrame attributes. + + Args: + df (pd.DataFrame): DataFrame with 'data_vars' in attrs. + + Returns: + Name of the first data variable. + """ return next(iter(df.attrs['data_vars'])) -def get_data_var_label(df): +def get_data_var_label(df: pd.DataFrame) -> str: + """Get a formatted label for the data variable with units. 
+ + Args: + df (pd.DataFrame): DataFrame with 'data_vars' in attrs. + + Returns: + Formatted string like "Variable Name [units]" or just the name if no units. + """ data_var = get_data_var(df) data_var_name = df.attrs['data_vars'][data_var].get('long_name', data_var) data_var_units = df.attrs['data_vars'][data_var].get('units') return f'{data_var_name} [{data_var_units}]' if data_var_units else data_var_name -def compute_average(df, area=True): +def compute_average(df: pd.DataFrame, area: bool = True) -> pd.DataFrame: + """Compute yearly average with optional standard deviation bounds. + + Args: + df (pd.DataFrame): DataFrame with time column and data variable. + area (bool): Whether to include lower/upper bounds using std (default: True). + + Returns: + DataFrame with yearly aggregated data. + """ data_var = get_data_var(df) data_var_long_name = df.attrs['data_vars'][data_var].get('long_name') data_var_units = df.attrs['data_vars'][data_var].get('units') @@ -59,7 +112,15 @@ def compute_average(df, area=True): return df -def group_by_day(df): +def group_by_day(df: pd.DataFrame) -> pd.DataFrame: + """Group data by day of year and compute mean. + + Args: + df (pd.DataFrame): DataFrame with time column and data variable. + + Returns: + DataFrame grouped by day of year (1-365/366). + """ data_var = get_data_var(df) df['day'] = df['time'].dt.dayofyear @@ -69,7 +130,15 @@ def group_by_day(df): return df -def group_by_month(df): +def group_by_month(df: pd.DataFrame) -> pd.DataFrame: + """Group data by month and compute mean. + + Args: + df (pd.DataFrame): DataFrame with time column and data variable. + + Returns: + DataFrame grouped by month (1-12). + """ data_var = get_data_var(df) df['month'] = df['time'].dt.month @@ -79,7 +148,15 @@ def group_by_month(df): return df -def normalize(df): +def normalize(df: pd.DataFrame) -> pd.DataFrame: + """Normalize data variable using z-score normalization. + + Args: + df (pd.DataFrame): DataFrame with data variable to normalize. 
+ + Returns: + DataFrame with normalized data variable (mean=0, std=1). + """ data_var = get_data_var(df) data_var_long_name = df.attrs['data_vars'][data_var].get('long_name') @@ -92,6 +169,15 @@ def normalize(df): return df -def create_label(df, labels): +def create_label(df: pd.DataFrame, labels: list[str]) -> pd.DataFrame: + """Add a label column to DataFrame by joining label strings. + + Args: + df (pd.DataFrame): DataFrame to add label to. + labels (list[str]): List of label strings to join with spaces. + + Returns: + DataFrame with added 'label' column. + """ df['label'] = ' '.join(labels) return df diff --git a/isimip_utils/patterns.py b/isimip_utils/patterns.py index 7cd302f..e3c2c1b 100644 --- a/isimip_utils/patterns.py +++ b/isimip_utils/patterns.py @@ -1,3 +1,4 @@ +"""Functions to match file names and extract ISIMIP specifiers.""" import logging import re from pathlib import Path @@ -9,15 +10,54 @@ year_pattern = re.compile(r'^\d{4}$') -def match_dataset_path(pattern, dataset_path): +def match_dataset_path(pattern: dict, dataset_path: Path) -> tuple[Path, dict]: + """Match a dataset path against a pattern. + + Args: + pattern (dict): Pattern dictionary containing regex patterns. + dataset_path (Path): Path to the dataset to match. + + Returns: + Tuple of (matched_path, specifiers_dict). + + Raises: + DidNotMatch: If the path doesn't match the pattern. + """ return match_path(pattern, dataset_path, filename_pattern_key='dataset') -def match_file_path(pattern, file_path): +def match_file_path(pattern: dict, file_path: Path) -> tuple[Path, dict]: + """Match a file path against a pattern. + + Args: + pattern (dict): Pattern dictionary containing regex patterns. + file_path (Path): Path to the file to match. + + Returns: + Tuple of (matched_path, specifiers_dict). + + Raises: + DidNotMatch: If the path doesn't match the pattern. 
+ """ return match_path(pattern, file_path) -def match_path(pattern, path, dirname_pattern_key='path', filename_pattern_key='file'): +def match_path(pattern: dict, path: Path, dirname_pattern_key: str = 'path', + filename_pattern_key: str = 'file') -> tuple[Path, dict]: + """Match both directory and filename components of a path against patterns. + + Args: + pattern (dict): Pattern dictionary containing regex patterns and specifiers. + path (Path): Path object to match. + dirname_pattern_key (str): Key in pattern dict for directory pattern (default: 'path'). + filename_pattern_key (str): Key in pattern dict for filename pattern (default: 'file'). + + Returns: + Tuple of (matched_path, specifiers_dict) containing extracted specifiers. + + Raises: + DidNotMatch: If dirname and filename specifiers conflict. + """ dirname_pattern = pattern[dirname_pattern_key] filename_pattern = pattern[filename_pattern_key] @@ -52,15 +92,52 @@ def match_path(pattern, path, dirname_pattern_key='path', filename_pattern_key=' return path, specifiers -def match_dataset(pattern, path): +def match_dataset(pattern: dict, path: Path) -> tuple[Path, dict]: + """Match a dataset name against a pattern. + + Args: + pattern (dict): Pattern dictionary containing regex patterns. + path (Path): Path object with dataset name. + + Returns: + Tuple of (matched_path, specifiers_dict). + + Raises: + DidNotMatch: If the dataset name doesn't match the pattern. + """ return match_string(pattern['dataset'], path.name) -def match_file(pattern, path): +def match_file(pattern: dict, path: Path) -> tuple[Path, dict]: + """Match a file name against a pattern. + + Args: + pattern (dict): Pattern dictionary containing regex patterns. + path (Path): Path object with file name. + + Returns: + Tuple of (matched_path, specifiers_dict). + + Raises: + DidNotMatch: If the file name doesn't match the pattern. 
+ """ return match_string(pattern['file'], path.name) -def match_string(pattern, string): +def match_string(pattern: re.Pattern, string: str) -> tuple[Path, dict]: + """Match a string against a regex pattern and extract specifiers. + + Args: + pattern (re.Pattern): Compiled regex pattern with named groups. + string (str): String to match against the pattern. + + Returns: + Tuple of (Path of matched portion, specifiers_dict). + Year values (4-digit numbers) are converted to integers. + + Raises: + DidNotMatch: If the string doesn't match the pattern. + """ logger.debug(pattern.pattern) logger.debug(string) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index ec85768..9d02f2e 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -1,6 +1,8 @@ +"""Plotting utilities using Altair for ISIMIP data visualization.""" import json import logging from pathlib import Path +from typing import Any import altair as alt import numpy as np @@ -12,7 +14,7 @@ logger = logging.getLogger(__name__) -def default_color_theme(): +def default_color_theme() -> dict: return { "config": { "mark": {"color": "steelblue"} @@ -25,7 +27,15 @@ def default_color_theme(): alt.themes.enable("default_color_theme") -def save_plot(chart, path, *args, **kwargs): +def save_plot(chart: alt.Chart, path: str | Path, *args: Any, **kwargs: Any) -> None: + """Save an Altair chart to a file. + + Args: + chart (alt.Chart): Altair chart to save. + path (str | Path): Output file path. + *args (Any): Additional positional arguments for chart.save(). + **kwargs (Any): Additional keyword arguments for chart.save(). + """ path = Path(path) logger.info(f'save {path.absolute()}') @@ -33,7 +43,14 @@ def save_plot(chart, path, *args, **kwargs): chart.save(path, *args, **kwargs) -def save_index(index_path): +def save_index(index_path: Path) -> None: + """Save an HTML index file for browsing plot images. + + Creates an interactive HTML page for viewing SVG/PNG files in a directory. 
+ + Args: + index_path (Path): Path where the index.html file will be saved. + """ index_json = json.dumps([ str(p.name) for p in index_path.parent.iterdir() if p.suffix in ['.svg', '.png'] ], indent=2).replace('\n', '\n ') @@ -93,7 +110,15 @@ def save_index(index_path): '''.replace(r'{{ index_json }}', index_json).strip()) -def get_plot_title(permutation): +def get_plot_title(permutation: tuple) -> dict: + """Create a plot title from a permutation tuple. + + Args: + permutation (tuple): Tuple of strings to join as title. + + Returns: + Dictionary with Altair title configuration. + """ return { "text": ' Β· '.join(permutation), "fontSize": 16, @@ -101,10 +126,32 @@ def get_plot_title(permutation): } -def plot_line(df, x_field=None, x_label=None, x_type=None, - y_field=None, y_label=None, y_type=None, y_format=None, - color_field=None, color_type=None, color_range=None, - legend=True, empty=False, **mark_kwargs): +def plot_line(df: pd.DataFrame, x_field: str | None = None, x_label: str | None = None, + x_type: str | None = None, y_field: str | None = None, y_label: str | None = None, + y_type: str | None = None, y_format: str | None = None, color_field: str | None = None, + color_type: str | None = None, color_range: list | None = None, legend: bool = True, + empty: bool = False, **mark_kwargs: Any) -> alt.Chart: + """Create a line plot from a DataFrame. + + Args: + df (pd.DataFrame): DataFrame to plot. + x_field (str | None): Column name for x-axis (default: auto-detect from attrs). + x_label (str | None): Label for x-axis (default: auto-detect from attrs). + x_type (str | None): Altair type for x-axis (default: 'T' for time, 'Q' for quantitative). + y_field (str | None): Column name for y-axis (default: auto-detect from attrs). + y_label (str | None): Label for y-axis (default: auto-detect from attrs). + y_type (str | None): Altair type for y-axis (default: 'Q'). + y_format (str | None): Format string for y-axis values. 
+ color_field (str | None): Column name for color encoding (default: 'label'). + color_type (str | None): Altair type for color (default: 'N'). + color_range (list | None): Custom color range for scale. + legend (bool): Whether to show legend (default: True). + empty (bool): Whether to create an empty plot with NaN values (default: False). + **mark_kwargs (Any): Additional keyword arguments for mark_line(). + + Returns: + Altair Chart object with line plot (and optional area for lower/upper bounds). + """ x_field = x_field or get_coord(df) x_label = x_label or get_coord_label(df) @@ -156,8 +203,26 @@ def plot_line(df, x_field=None, x_label=None, x_type=None, return chart -def plot_map(df, color_field=None, color_type=None, color_range=None, color_label=None, color_format=None, - bin_size=1, legend=True, empty=False): +def plot_map(df: pd.DataFrame, color_field: str | None = None, color_type: str | None = None, + color_range: list | None = None, color_label: str | None = None, + color_format: str | None = None, bin_size: int = 1, legend: bool = True, + empty: bool = False) -> alt.Chart: + """Create a geographic map plot from a DataFrame with lat/lon coordinates. + + Args: + df (pd.DataFrame): DataFrame with 'lat' and 'lon' columns. + color_field (str | None): Column name for color encoding (default: auto-detect from attrs). + color_type (str | None): Altair type for color (default: 'Q'). + color_range (list | None): Custom color range for scale. + color_label (str | None): Label for color legend (default: auto-detect from attrs). + color_format (str | None): Format string for color legend values. + bin_size (int): Bin size for aggregating grid cells (default: 1). + legend (bool): Whether to show legend (default: True). + empty (bool): Whether to create an empty plot (default: False). + + Returns: + Altair Chart object with rectangular heatmap. 
+ """ lon = np.sort(df['lon'].unique()) lon_size = len(lon) lon_bin = float(abs(lon[1] - lon[0])) * bin_size @@ -216,7 +281,22 @@ def plot_map(df, color_field=None, color_type=None, color_range=None, color_labe ) -def plot_grid(parameters, plots, empty_plot, layer=True, x='shared', y='shared', color='shared'): +def plot_grid(parameters: dict, plots: dict, empty_plot: alt.Chart, layer: bool = True, + x: str = 'shared', y: str = 'shared', color: str = 'shared') -> alt.Chart: + """Create a grid of plots organized by parameter permutations. + + Args: + parameters (dict): Dictionary of parameters with lists of values. + plots (dict): Dictionary mapping permutation tuples to Chart objects. + empty_plot (alt.Chart): Chart to use when a permutation has no data. + layer (bool): Whether to layer plots or concatenate vertically (default: True). + x (str): Scale resolution for x-axis ('shared', 'independent', default: 'shared'). + y (str): Scale resolution for y-axis ('shared', 'independent', default: 'shared'). + color (str): Scale resolution for color ('shared', 'independent', default: 'shared'). + + Returns: + Altair Chart object with grid layout. + """ rows = [] prev_permutation = None diff --git a/isimip_utils/utils.py b/isimip_utils/utils.py index 187dcb3..8aef931 100644 --- a/isimip_utils/utils.py +++ b/isimip_utils/utils.py @@ -1,17 +1,34 @@ +"""Additional utility functions for ISIMIP tools.""" from itertools import product +from pathlib import Path +from typing import Any class Singleton: - _instance = None + """Base class for implementing the singleton pattern. - def __new__(cls): + Ensures only one instance of a class exists. Subclasses will share + a single instance with a 'data' attribute initialized as an empty dict. 
+ """ + _instance: Any = None + + def __new__(cls) -> 'Singleton': if cls._instance is None: cls._instance = super().__new__(cls) cls._instance.data = {} return cls._instance -def parse_filelist(filelist_file): +def parse_filelist(filelist_file: str | Path | None) -> set[str] | None: + """Parse a filelist file into a set of file paths. + + Args: + filelist_file (str | Path | None): Path to file containing list of paths (one per line). + Lines starting with '#' are treated as comments. + + Returns: + Set of file paths, or None if filelist_file is None/empty. + """ if filelist_file: with open(filelist_file) as f: filelist = {line for line in f.read().splitlines() if (line and not line.startswith('#'))} @@ -21,7 +38,17 @@ def parse_filelist(filelist_file): return filelist -def exclude_path(exclude, path): +def exclude_path(exclude: list[str] | None, path: Path | str) -> bool: + """Check if a path should be excluded based on exclude patterns. + + Args: + exclude (list[str] | None): List of exclude patterns (strings). Path is excluded if it + starts with any pattern. + path (Path | str): Path to check for exclusion. + + Returns: + True if path should be excluded, False otherwise. + """ if exclude: for exclude_string in exclude: if str(path).startswith(exclude_string): @@ -29,7 +56,17 @@ def exclude_path(exclude, path): return False -def include_path(include, path): +def include_path(include: list[str] | None, path: Path | str) -> bool: + """Check if a path should be included based on include patterns. + + Args: + include (list[str] | None): List of include patterns (strings). Path is included if it + starts with any pattern, or if include list is None/empty. + path (Path | str): Path to check for inclusion. + + Returns: + True if path should be included, False otherwise. 
+ """ if include: for include_string in include: if str(path).startswith(include_string): @@ -39,22 +76,59 @@ def include_path(include, path): return True -def get_permutations(parameters): +def get_permutations(parameters: dict[str, list]) -> list[tuple]: + """Generate all permutations from parameter value lists. + + Args: + parameters (dict[str, list]): Dictionary mapping parameter names to lists of values. + + Returns: + List of tuples representing all possible combinations of parameter values. + """ return list(product(*parameters.values())) -def get_placeholders(parameters, permutation): +def get_placeholders(parameters: dict[str, list], permutation: tuple) -> dict: + """Convert a permutation tuple into a dictionary of placeholders. + + Args: + parameters (dict[str, list]): Dictionary mapping parameter names to lists of values. + permutation (tuple): Tuple of values representing one permutation. + + Returns: + Dictionary mapping parameter names to their values in this permutation. + """ return dict(zip(parameters.keys(), permutation, strict=True)) -def join_parameters(parameters, max_count=5, max_label='various'): +def join_parameters(parameters: dict[str, list[str]], max_count: int = 5, + max_label: str = 'various') -> dict[str, str]: + """Join parameter values into strings, with fallback for large value sets. + + Args: + parameters (dict[str, list[str]]): Dictionary mapping parameter names to lists of values. + max_count (int): Maximum number of values to join (default: 5). + max_label (str): Label to use when value count exceeds max_count (default: 'various'). + + Returns: + Dictionary mapping parameter names to joined strings or max_label. + """ return { key: (max_label if len(values) > max_count else '+'.join(values)) for key, values in parameters.items() } -def copy_placeholders(*placeholder_args, **kwargs): +def copy_placeholders(*placeholder_args: dict, **kwargs: Any) -> dict: + """Merge multiple placeholder dictionaries and additional kwargs. 
+ + Args: + *placeholder_args (dict): Variable number of placeholder dictionaries to merge. + **kwargs (Any): Additional key-value pairs to add to the result. + + Returns: + Dictionary containing all merged placeholders. + """ placeholders = { key: value for placeholder_arg in placeholder_args @@ -64,7 +138,21 @@ def copy_placeholders(*placeholder_args, **kwargs): return placeholders -def update_year(placeholders, key, year, operator): +def update_year(placeholders: dict, key: str, year: int | str, operator: str) -> None: + """Update a year placeholder based on comparison operator. + + Args: + placeholders (dict): Dictionary of placeholders to update. + key (str): Key in placeholders dictionary to update. + year (int | str): Year value to compare/set. + operator (str): Comparison operator ('<' for minimum, '>' for maximum). + + Raises: + RuntimeError: If operator is not '<' or '>'. + + Note: + Updates placeholders[key] in-place if condition is met. + """ if operator not in ('<', '>'): raise RuntimeError(f'operator "{operator}" not supported') diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index e39695c..20f930c 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -1,16 +1,33 @@ +"""Functions for working with xarray datasets for ISIMIP data.""" import logging from pathlib import Path import cftime import numpy as np +import pandas as pd import xarray as xr logger = logging.getLogger(__name__) -def init_dataset(lon=720, lat=360, time=None, - time_unit='days since 1601-1-1 00:00:00', - time_calendar='proleptic_gregorian', attrs={}, **variables): +def init_dataset(lon: int = 720, lat: int = 360, time: np.array | None = None, + time_unit: str = 'days since 1601-1-1 00:00:00', + time_calendar: str = 'proleptic_gregorian', + attrs: dict = {}, **variables: list[np.array]) -> xr.Dataset: + """Initialize a new xarray dataset with standard ISIMIP dimensions. + + Args: + lon (int): Number of longitude points (default: 720). 
+ lat (int): Number of latitude points (default: 360). + time (np.array | None): Time coordinate array, or None to omit time dimension (default: None). + time_unit (str): Units for the time coordinate (default: 'days since 1601-1-1 00:00:00'). + time_calendar (str): Calendar type for time coordinate (default: 'proleptic_gregorian'). + attrs (dict): Dictionary of attributes for variables and global attributes. + **variables (list[np.array]): Data variables to include in the dataset. + + Returns: + Initialized xarray Dataset with coordinates and data variables. + """ # create coordinates coords = {} @@ -74,7 +91,21 @@ def init_dataset(lon=720, lat=360, time=None, return ds -def open_dataset(path, decode_cf=False, load=False): +def open_dataset(path: str | Path, decode_cf: bool = False, load: bool = False) -> xr.Dataset: + """Open a NetCDF dataset using xarray. + + Args: + path (str | Path): Path to the NetCDF file. + decode_cf (bool): Whether to decode CF conventions (default: False). + load (bool): Whether to load data into memory immediately (default: False). + + Returns: + Xarray Dataset object. + + Note: + Handles non-standard time units like 'growing seasons' by converting + them to 'common_years' with a 365_day calendar. + """ path = Path(path) logger.info(f'load {path.absolute()}' if load else f'open {path.absolute()}') @@ -96,7 +127,17 @@ def open_dataset(path, decode_cf=False, load=False): return ds -def write_dataset(ds, path): +def write_dataset(ds: xr.Dataset, path: str | Path): + """Write an xarray dataset to a NetCDF file. + + Args: + ds (xr.Dataset): Xarray Dataset to write. + path (str | Path): Path where the NetCDF file will be written. + + Note: + Automatically adds fill values, converts NaN to fill values, + orders variables, and sets time as unlimited dimension. 
+ """ path = Path(path) path.parent.mkdir(exist_ok=True, parents=True) @@ -112,11 +153,27 @@ def write_dataset(ds, path): ds.to_netcdf(path, format='NETCDF4_CLASSIC', unlimited_dims=unlimited_dims) -def order_variables(ds): +def order_variables(ds: xr.Dataset) -> xr.Dataset: + """Reorder dataset variables with coordinates first, then data variables. + + Args: + ds (xr.Dataset): Xarray Dataset to reorder. + + Returns: + Dataset with reordered variables. + """ return ds[[*ds.coords, *ds.data_vars]] -def get_attrs(ds): +def get_attrs(ds: xr.Dataset) -> dict: + """Get all attributes from coordinates and data variables. + + Args: + ds (xr.Dataset): Xarray Dataset. + + Returns: + Dictionary mapping variable names to their attributes. + """ attrs = {} for coord in ds.coords: attrs[coord] = ds[coord].attrs @@ -125,7 +182,16 @@ def get_attrs(ds): return attrs -def set_attrs(ds, attrs): +def set_attrs(ds: xr.Dataset, attrs: dict) -> xr.Dataset: + """Set attributes on coordinates and data variables. + + Args: + ds (xr.Dataset): Xarray Dataset to modify. + attrs (dict): Dictionary mapping variable names to their attributes. + + Returns: + Modified dataset with updated attributes. + """ for coord in ds.coords: if coord in attrs: ds[coord].attrs = attrs[coord] @@ -135,7 +201,15 @@ def set_attrs(ds, attrs): return ds -def add_fill_value_to_attrs(ds): +def add_fill_value_to_attrs(ds: xr.Dataset) -> xr.Dataset: + """Add _FillValue and missing_value attributes if not present. + + Args: + ds (xr.Dataset): Xarray Dataset to modify. + + Returns: + Dataset with fill value attributes added (default: 1.e+20). + """ for coord in ds.coords: if '_FillValue' not in ds.coords[coord].attrs: ds.coords[coord].attrs['_FillValue'] = 1.e+20 @@ -148,21 +222,50 @@ def add_fill_value_to_attrs(ds): return ds -def set_fill_value_to_nan(ds): +def set_fill_value_to_nan(ds: xr.Dataset) -> xr.Dataset: + """Replace fill values with NaN in data variables. + + Args: + ds (xr.Dataset): Xarray Dataset to modify. 
+ + Returns: + Dataset with fill values replaced by NaN. + """ for var in ds.data_vars: fill_value = ds[var].attrs.get('_FillValue', 1e+20) ds[var] = ds[var].where(ds[var] != fill_value) return ds -def set_nan_to_fill_value(ds): +def set_nan_to_fill_value(ds: xr.Dataset) -> xr.Dataset: + """Replace NaN values with fill values in data variables. + + Args: + ds (xr.Dataset): Xarray Dataset to modify. + + Returns: + Dataset with NaN values replaced by fill values. + """ for var in ds.data_vars: fill_value = ds[var].attrs.get('_FillValue', 1e+20) ds[var] = ds[var].where(~np.isnan(ds[var]), fill_value) return ds -def create_mask(ds, df, layer): +def create_mask(ds: xr.Dataset, df: pd.DataFrame, layer: int) -> xr.Dataset: + """Create a spatial mask from a geometry layer. + + Args: + ds (xr.Dataset): Xarray Dataset with lat/lon coordinates. + df (pd.DataFrame): GeoDataFrame with geometry column. + layer (int): Index of the layer to use from the GeoDataFrame. + + Returns: + Xarray dataset with a 'mask' variable clipped to the geometry. + + Note: + Requires geopandas and rioxarray to be installed. + """ import shapely.geometry logger.info('create mask') @@ -183,7 +286,20 @@ def create_mask(ds, df, layer): return mask_ds -def to_dataframe(ds): +def to_dataframe(ds: xr.Dataset) -> pd.DataFrame: + """Convert an xarray Dataset to a pandas DataFrame. + + Args: + ds (xr.Dataset): Xarray Dataset to convert. + + Returns: + Pandas DataFrame with coordinates as columns and data variables as columns. + Attributes are preserved in df.attrs['coords'] and df.attrs['data_vars']. + + Note: + Time coordinates are converted to datetime64[ns] format. + Data variables are converted to float64. 
+ """ if 'time' in ds.coords: ds.coords['time'] = ds.coords['time'].astype('datetime64[ns]') From 3232c3cc971c0fa3d1906c1e4f0943885dacd330 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 27 Oct 2025 14:04:17 +0100 Subject: [PATCH 045/162] Add mkdocs documentation --- docs/api/checksum.md | 3 +++ docs/api/cli.md | 3 +++ docs/api/config.md | 3 +++ docs/api/decorators.md | 3 +++ docs/api/exceptions.md | 3 +++ docs/api/extractions.md | 3 +++ docs/api/fetch.md | 3 +++ docs/api/files.md | 3 +++ docs/api/netcdf.md | 3 +++ docs/api/pandas.md | 3 +++ docs/api/patterns.md | 3 +++ docs/api/plot.md | 3 +++ docs/api/utils.md | 3 +++ docs/api/xarray.md | 3 +++ docs/index.md | 53 +++++++++++++++++++++++++++++++++++++++++ docs/releases.md | 24 ++++++++----------- mkdocs.yml | 33 +++++++++++++++++++++++++ pyproject.toml | 5 ++++ 18 files changed, 143 insertions(+), 14 deletions(-) create mode 100644 docs/api/checksum.md create mode 100644 docs/api/cli.md create mode 100644 docs/api/config.md create mode 100644 docs/api/decorators.md create mode 100644 docs/api/exceptions.md create mode 100644 docs/api/extractions.md create mode 100644 docs/api/fetch.md create mode 100644 docs/api/files.md create mode 100644 docs/api/netcdf.md create mode 100644 docs/api/pandas.md create mode 100644 docs/api/patterns.md create mode 100644 docs/api/plot.md create mode 100644 docs/api/utils.md create mode 100644 docs/api/xarray.md create mode 100644 docs/index.md create mode 100644 mkdocs.yml diff --git a/docs/api/checksum.md b/docs/api/checksum.md new file mode 100644 index 0000000..53f5adf --- /dev/null +++ b/docs/api/checksum.md @@ -0,0 +1,3 @@ +# isimip_utils.checksum + +::: isimip_utils.checksum diff --git a/docs/api/cli.md b/docs/api/cli.md new file mode 100644 index 0000000..9066d91 --- /dev/null +++ b/docs/api/cli.md @@ -0,0 +1,3 @@ +# isimip_utils.cli + +::: isimip_utils.cli diff --git a/docs/api/config.md b/docs/api/config.md new file mode 100644 index 0000000..e6e2d51 --- /dev/null 
+++ b/docs/api/config.md @@ -0,0 +1,3 @@ +# isimip_utils.config + +::: isimip_utils.config diff --git a/docs/api/decorators.md b/docs/api/decorators.md new file mode 100644 index 0000000..0747bff --- /dev/null +++ b/docs/api/decorators.md @@ -0,0 +1,3 @@ +# isimip_utils.decorators + +::: isimip_utils.decorators diff --git a/docs/api/exceptions.md b/docs/api/exceptions.md new file mode 100644 index 0000000..3362831 --- /dev/null +++ b/docs/api/exceptions.md @@ -0,0 +1,3 @@ +# isimip_utils.exceptions + +::: isimip_utils.exceptions diff --git a/docs/api/extractions.md b/docs/api/extractions.md new file mode 100644 index 0000000..bc0be87 --- /dev/null +++ b/docs/api/extractions.md @@ -0,0 +1,3 @@ +# isimip_utils.extractions + +::: isimip_utils.extractions diff --git a/docs/api/fetch.md b/docs/api/fetch.md new file mode 100644 index 0000000..91317da --- /dev/null +++ b/docs/api/fetch.md @@ -0,0 +1,3 @@ +# isimip_utils.fetch + +::: isimip_utils.fetch diff --git a/docs/api/files.md b/docs/api/files.md new file mode 100644 index 0000000..e8a29c0 --- /dev/null +++ b/docs/api/files.md @@ -0,0 +1,3 @@ +# isimip_utils.files + +::: isimip_utils.files diff --git a/docs/api/netcdf.md b/docs/api/netcdf.md new file mode 100644 index 0000000..f47af95 --- /dev/null +++ b/docs/api/netcdf.md @@ -0,0 +1,3 @@ +# isimip_utils.netcdf + +::: isimip_utils.netcdf diff --git a/docs/api/pandas.md b/docs/api/pandas.md new file mode 100644 index 0000000..37731a9 --- /dev/null +++ b/docs/api/pandas.md @@ -0,0 +1,3 @@ +# isimip_utils.pandas + +::: isimip_utils.pandas diff --git a/docs/api/patterns.md b/docs/api/patterns.md new file mode 100644 index 0000000..e6e8bcf --- /dev/null +++ b/docs/api/patterns.md @@ -0,0 +1,3 @@ +# isimip_utils.patterns + +::: isimip_utils.patterns diff --git a/docs/api/plot.md b/docs/api/plot.md new file mode 100644 index 0000000..5f4aa30 --- /dev/null +++ b/docs/api/plot.md @@ -0,0 +1,3 @@ +# isimip_utils.plot + +::: isimip_utils.plot diff --git a/docs/api/utils.md 
b/docs/api/utils.md new file mode 100644 index 0000000..adb212f --- /dev/null +++ b/docs/api/utils.md @@ -0,0 +1,3 @@ +# isimip_utils.utils + +::: isimip_utils.utils diff --git a/docs/api/xarray.md b/docs/api/xarray.md new file mode 100644 index 0000000..243f615 --- /dev/null +++ b/docs/api/xarray.md @@ -0,0 +1,3 @@ +# isimip_utils.xarray + +::: isimip_utils.xarray diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..89da52e --- /dev/null +++ b/docs/index.md @@ -0,0 +1,53 @@ +ISIMIP utils +============ + +Overview +-------- + +This package contains common functionality for different ISIMIP tools, namely: + +* https://github.com/ISI-MIP/isimip-publisher +* https://github.com/ISI-MIP/isimip-qa +* https://github.com/ISI-MIP/isimip-qc + +It comprises of: + +* [`isimip_utils.checksum`](api/checksum.md): Functions to compute the SHA-512 checksum of a file. +* [`isimip_utils.cli`](api/cli.md): Command-line interface utilities for argument parsing and configuration. +* [`isimip_utils.config`](api/config.md): A settings class to combine input from `argparse`, the environment (via `python-dotenv`) and config files. +* [`isimip_utils.decorators`](api/decorators.md): Decorators including a cached property implementation. +* [`isimip_utils.exceptions`](api/exceptions.md): Custom exceptions for ISIMIP tools. +* [`isimip_utils.extractions`](api/extractions.md): Data extraction and manipulation utilities for xarray datasets. +* [`isimip_utils.fetch`](api/fetch.md): Functions to fetch files from the machine-actionable ISIMIP protocols. +* [`isimip_utils.files`](api/files.md): File search utilities with regex pattern matching. +* [`isimip_utils.netcdf`](api/netcdf.md): Functions to open and read NetCDF files using netCDF4. +* [`isimip_utils.pandas`](api/pandas.md): DataFrame utilities for ISIMIP data processing. +* [`isimip_utils.patterns`](api/patterns.md): Functions to match file names and extract ISIMIP specifiers. 
+* [`isimip_utils.plot`](api/plot.md): Plotting utilities using Altair for data visualization. +* [`isimip_utils.utils`](api/utils.md): Additional utility functions. +* [`isimip_utils.xarray`](api/xarray.md): Functions for working with xarray datasets. + + +Setup +----- + +Working on the package requires a running Python3 on your system. Installing those prerequisites is covered [here](https://github.com/ISI-MIP/isimip-utils/blob/master/docs/releases.md). + +The package itself can be installed via pip: + +``` +pip install isimip-utils +``` + +The package can also be installed directly from GitHub: + +``` +pip install git+https://github.com/ISI-MIP/isimip-utils +``` + +For a development setup, the repo should be cloned and installed in *editable* mode: + +``` +git clone git@github.com:ISI-MIP/isimip-utils +pip install -e isimip-utils +``` diff --git a/docs/releases.md b/docs/releases.md index de98902..9cab732 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -1,10 +1,9 @@ Releases ======== -Requirements ------------- +[PyPI](https://pypi.org/) releases of this repository are done using the following steps: -Install `build` and `twine` +### Install `build` and `twine` ``` pip install build twine @@ -23,8 +22,8 @@ username: ... password: ... ``` -Prepare repo ------------- + +### Prepare repo 1) Ensure tests are passing. @@ -43,8 +42,7 @@ Prepare repo ``` -Release on Test PyPI --------------------- +### Release on Test PyPI 1) Upload with `twine` to Test PyPI: @@ -52,11 +50,10 @@ Release on Test PyPI twine upload -r testpypi dist/* ``` -2) Check at https://test.pypi.org/project/isimip-utils/. +2) Check at <https://test.pypi.org/project/isimip-utils/>. -Release on PyPI ---------------- +### Release on PyPI 1) Upload with `twine` to PyPI: @@ -64,14 +61,13 @@ Release on PyPI twine upload dist/* ``` -2) Check at https://pypi.org/project/isimip-utils/. +2) Check at <https://pypi.org/project/isimip-utils/>. -Create release on GitHub ------------------------- +### Create release on GitHub 1) Commit local changes. 2) Push changes. -3) Create release on https://github.com/ISI-MIP/isimip-utils/releases).
-3) Create release on https://github.com/ISI-MIP/isimip-utils/releases). +3) Create release on <https://github.com/ISI-MIP/isimip-utils/releases>. diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..2bf4e07 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,33 @@ +site_name: ISIMIP utils + +theme: + name: material + +plugins: +- mkdocstrings: + handlers: + python: + options: + show_source: false + show_bases: false + members_order: source + +nav: + - README: index.md + - Prerequisites: prerequisites.md + - Releases: releases.md + - API reference: + - isimip_utils.checksum: api/checksum.md + - isimip_utils.cli: api/cli.md + - isimip_utils.config: api/config.md + - isimip_utils.decorators: api/decorators.md + - isimip_utils.exceptions: api/exceptions.md + - isimip_utils.extractions: api/extractions.md + - isimip_utils.fetch: api/fetch.md + - isimip_utils.files: api/files.md + - isimip_utils.netcdf: api/netcdf.md + - isimip_utils.pandas: api/pandas.md + - isimip_utils.patterns: api/patterns.md + - isimip_utils.plot: api/plot.md + - isimip_utils.utils: api/utils.md + - isimip_utils.xarray: api/xarray.md diff --git a/pyproject.toml b/pyproject.toml index dd304a2..4c4ff7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,11 @@ dev = [ "ruff", "twine", ] +docs = [ + "mkdocs", + "mkdocs-material", + "mkdocstrings-python", +] [tool.setuptools] packages = ["isimip_utils"] From ae46cee36cbb62e12e0ef7d6c854fbc38184aa1e Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 27 Oct 2025 17:13:41 +0100 Subject: [PATCH 046/162] Fix ArgumentParser --- isimip_utils/cli.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 060c91e..f65fac9 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -166,7 +166,13 @@ def build_default_args(self) -> argparse.Namespace: if os.getenv(key_upper): # if the attribute is in the environment, take the value value = os.getenv(key_upper) - # setattr(default_args, key, ) + if value.lower() == 'true': +
value = True + elif value.lower() == 'false': + value = False + elif value.lower() == 'none': + value = None + elif config and key in config: # if the attribute is in the config file, take it from there value = config.get(key) From 6ea510fd240b1be23b3a87f4cbacf645600b3869 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 27 Oct 2025 17:14:10 +0100 Subject: [PATCH 047/162] Add load_dataset again --- isimip_utils/xarray.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 20f930c..4d52a83 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -127,6 +127,25 @@ def open_dataset(path: str | Path, decode_cf: bool = False, load: bool = False) return ds +def load_dataset(path: str | Path, decode_cf: bool = False) -> xr.Dataset: + """Open a NetCDF dataset using xarray and load data into memory immediately. + + Args: + path (str | Path): Path to the NetCDF file. + decode_cf (bool): Whether to decode CF conventions (default: False). + + Returns: + Xarray Dataset object. + + Note: + Handles non-standard time units like 'growing seasons' by converting + them to 'common_years' with a 365_day calendar. + + This is a shortcut for `open_dataset(path, decode_cf, load=True)`. + """ + return open_dataset(path, decode_cf, load=True) + + def write_dataset(ds: xr.Dataset, path: str | Path): """Write an xarray dataset to a NetCDF file. 
From a269689f2d48d6647879646be6d5d8d5201eceeb Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 27 Oct 2025 17:14:33 +0100 Subject: [PATCH 048/162] Move cached_property to utils.py --- isimip_utils/decorators.py | 30 ------------------------------ isimip_utils/utils.py | 36 +++++++++++++++++++++++------------- 2 files changed, 23 insertions(+), 43 deletions(-) delete mode 100644 isimip_utils/decorators.py diff --git a/isimip_utils/decorators.py b/isimip_utils/decorators.py deleted file mode 100644 index 4b988aa..0000000 --- a/isimip_utils/decorators.py +++ /dev/null @@ -1,30 +0,0 @@ -from collections.abc import Callable -from typing import Any - - -class cached_property: - """Decorator that converts a method into a cached property. - - The property value is computed once and then cached as an instance attribute. - Subsequent accesses return the cached value without re-computing. - - Simplified version of - [Django's cached_property](https://github.com/django/django/blob/main/django/utils/functional.py). 
- """ - - name: str | None = None - - def __init__(self, func: Callable) -> None: - self.func = func - - def __set_name__(self, owner: type, name: str) -> None: - if self.name is None: - self.name = name - else: - raise TypeError("Cannot assign the same cached_property to two different names") - - def __get__(self, instance: Any, cls: type | None = None) -> Any: - if instance is None: - return self - value = instance.__dict__[self.name] = self.func(instance) - return value diff --git a/isimip_utils/utils.py b/isimip_utils/utils.py index 8aef931..2067fd2 100644 --- a/isimip_utils/utils.py +++ b/isimip_utils/utils.py @@ -1,4 +1,5 @@ """Additional utility functions for ISIMIP tools.""" +from collections.abc import Callable from itertools import product from pathlib import Path from typing import Any @@ -19,23 +20,32 @@ def __new__(cls) -> 'Singleton': return cls._instance -def parse_filelist(filelist_file: str | Path | None) -> set[str] | None: - """Parse a filelist file into a set of file paths. +class cached_property: + """Decorator that converts a method into a cached property. - Args: - filelist_file (str | Path | None): Path to file containing list of paths (one per line). - Lines starting with '#' are treated as comments. + The property value is computed once and then cached as an instance attribute. + Subsequent accesses return the cached value without re-computing. - Returns: - Set of file paths, or None if filelist_file is None/empty. + Simplified version of + [Django's cached_property](https://github.com/django/django/blob/main/django/utils/functional.py). 
""" - if filelist_file: - with open(filelist_file) as f: - filelist = {line for line in f.read().splitlines() if (line and not line.startswith('#'))} - else: - filelist = None - return filelist + name: str | None = None + + def __init__(self, func: Callable) -> None: + self.func = func + + def __set_name__(self, owner: type, name: str) -> None: + if self.name is None: + self.name = name + else: + raise TypeError("Cannot assign the same cached_property to two different names") + + def __get__(self, instance: Any, cls: type | None = None) -> Any: + if instance is None: + return self + value = instance.__dict__[self.name] = self.func(instance) + return value def exclude_path(exclude: list[str] | None, path: Path | str) -> bool: From 1279953a74aeb8300dadad712e8bd3801a0c6b8a Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 27 Oct 2025 17:17:02 +0100 Subject: [PATCH 049/162] Move find_files to patterns.py --- isimip_utils/files.py | 22 ---------------------- isimip_utils/patterns.py | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 22 deletions(-) delete mode 100644 isimip_utils/files.py diff --git a/isimip_utils/files.py b/isimip_utils/files.py deleted file mode 100644 index 9732389..0000000 --- a/isimip_utils/files.py +++ /dev/null @@ -1,22 +0,0 @@ -"""File search utilities for ISIMIP tools.""" -import re -from pathlib import Path - - -def find_files(base_path: Path, pattern: str) -> list[dict]: - """Find files matching a regex pattern in a directory tree. - - Args: - base_path (Path): Base directory to search in. - pattern (str): Regular expression pattern to match against file paths. - - Returns: - List of dictionaries containing 'path' and any named groups from the regex match. 
- """ - files = [] - for path in sorted(base_path.rglob("*")): - match = re.search(str(pattern), str(path), re.IGNORECASE) - if match: - files.append(dict(path=path, **match.groupdict())) - - return files diff --git a/isimip_utils/patterns.py b/isimip_utils/patterns.py index e3c2c1b..cb3ca94 100644 --- a/isimip_utils/patterns.py +++ b/isimip_utils/patterns.py @@ -1,6 +1,7 @@ """Functions to match file names and extract ISIMIP specifiers.""" import logging import re +from collections.abc import Iterable from pathlib import Path from .exceptions import DidNotMatch @@ -155,3 +156,22 @@ def match_string(pattern: re.Pattern, string: str) -> tuple[Path, dict]: return Path(match.group(0)), specifiers else: raise DidNotMatch(f'No match for {string} ("{pattern.pattern}")') + + +def find_files(pattern: re.Pattern, file_iter: Iterable[Path]) -> list[dict]: + """Find files matching a regex pattern from an iterator. + + Args: + pattern (re.Pattern): Compiled regular expression pattern to match against file paths. + file_iter (Iterable[Path]): Iterator over file paths to search through. + + Returns: + List of dictionaries containing 'path' and any named groups from the regex match. + """ + files = [] + for path in sorted(file_iter): + match = pattern.search(str(path)) + if match: + files.append(dict(path=path, **match.groupdict())) + + return files From e2af16d744608958e69ccd6b5f41e111643d48a3 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 27 Oct 2025 17:17:45 +0100 Subject: [PATCH 050/162] Move parse_filelist to cli.py --- isimip_utils/cli.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index f65fac9..20a7b1d 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -109,6 +109,25 @@ def parse_path(value: str) -> Path: return Path(value).expanduser() +def parse_filelist(filelist_file: str | Path | None) -> set[str] | None: + """Parse a filelist file into a set of file paths. 
+ + Args: + filelist_file (str | Path | None): Path to file containing list of paths (one per line). + Lines starting with '#' are treated as comments. + + Returns: + Set of file paths, or None if filelist_file is None/empty. + """ + if filelist_file: + with open(filelist_file) as f: + filelist = {line for line in f.read().splitlines() if (line and not line.startswith('#'))} + else: + filelist = None + + return filelist + + class ArgumentParser(argparse.ArgumentParser): """Extended ArgumentParser that reads defaults from config files and environment. From 74ffdc8640deb94ea4f29a4cd65eae5c48e134af Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 27 Oct 2025 17:18:06 +0100 Subject: [PATCH 051/162] Fix minor issues --- isimip_utils/cli.py | 7 ++++--- isimip_utils/config.py | 2 +- isimip_utils/fetch.py | 18 ++++++++++++++---- isimip_utils/netcdf.py | 2 +- isimip_utils/xarray.py | 8 ++++---- pyproject.toml | 2 +- 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 20a7b1d..2bdf8f3 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -79,20 +79,21 @@ def parse_list(string: str) -> list[str]: return [value.strip() for value in string.split(',')] -def parse_version(value: str) -> datetime: +def parse_version(value: str) -> str: """Parse a version string in YYYYMMDD format. Args: value (str): Version string in YYYYMMDD format. Returns: - Parsed datetime object. + Version string in YYYYMMDD format. Raises: argparse.ArgumentTypeError: If format is incorrect. 
""" try: - return datetime.strptime(value, '%Y%m%d') + datetime.strptime(value, '%Y%m%d') + return value except ValueError as e: raise argparse.ArgumentTypeError('incorrect format, should be YYYYMMDD') from e diff --git a/isimip_utils/config.py b/isimip_utils/config.py index 8f3b3af..6ae51ae 100644 --- a/isimip_utils/config.py +++ b/isimip_utils/config.py @@ -30,7 +30,7 @@ def __setattr__(self, name: str, value: Any) -> None: else: self._settings[name] = value - def dict(self) -> dict[str, Any]: + def to_dict(self) -> dict[str, Any]: """Return the settings as a dictionary. Returns: diff --git a/isimip_utils/fetch.py b/isimip_utils/fetch.py index 5989865..2cef183 100644 --- a/isimip_utils/fetch.py +++ b/isimip_utils/fetch.py @@ -19,6 +19,7 @@ 'https://protocol2.isimip.org', ] + def fetch_definitions(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> dict[str, Any]: """Fetch definitions from ISIMIP protocol locations. @@ -54,7 +55,7 @@ def fetch_definitions(path: str | Path, protocol_locations: str | list[str] = PR logger.debug('definitions = %s', definitions) return definitions - raise NotFound(f'no definitions found for {path}') + raise NotFound(f'No definitions found for {path}.') def fetch_pattern(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> dict[str, Any]: @@ -101,7 +102,7 @@ def fetch_pattern(path: str | Path, protocol_locations: str | list[str] = PROTOC return pattern - raise NotFound(f'no pattern found for {path}') + raise NotFound(f'No pattern found for {path}.') def fetch_schema(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> Any: @@ -127,7 +128,7 @@ def fetch_schema(path: str | Path, protocol_locations: str | list[str] = PROTOCO logger.debug('schema_json = %s', schema_json) return schema_json - raise NotFound(f'no schema found for {path}') + raise NotFound(f'No schema found for {path}.') def fetch_tree(path: str | Path, protocol_locations: str | list[str] = 
PROTOCOL_LOCATIONS) -> Any: @@ -153,7 +154,16 @@ def fetch_tree(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_ logger.debug('tree_json = %s', tree_json) return tree_json - raise NotFound(f'no tree found for {path}') + raise NotFound(f'No tree found for {path}.') + + +def fetch_resource(resource_location: str | Path) -> dict: + if urlparse(resource_location).scheme: + return fetch_json(resource_location) + else: + return load_json(resource_location) + + raise NotFound(f'No resource found at {resource_location}.') def find_json(protocol_location: str, sub_location: str, path: str | Path) -> Generator[tuple[Path, Any], None, None]: diff --git a/isimip_utils/netcdf.py b/isimip_utils/netcdf.py index ee3aad7..05c08ba 100644 --- a/isimip_utils/netcdf.py +++ b/isimip_utils/netcdf.py @@ -12,7 +12,7 @@ INT_TYPES = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64] -def open_dataset(file_path: str | Path) -> Dataset: +def open_dataset_read(file_path: str | Path) -> Dataset: """Open a NetCDF dataset in read-only mode. Args: diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 4d52a83..5544784 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -10,20 +10,20 @@ logger = logging.getLogger(__name__) -def init_dataset(lon: int = 720, lat: int = 360, time: np.array | None = None, +def init_dataset(lon: int = 720, lat: int = 360, time: np.ndarray | None = None, time_unit: str = 'days since 1601-1-1 00:00:00', time_calendar: str = 'proleptic_gregorian', - attrs: dict = {}, **variables: list[np.array]) -> xr.Dataset: + attrs: dict = {}, **variables: np.ndarray) -> xr.Dataset: """Initialize a new xarray dataset with standard ISIMIP dimensions. Args: lon (int): Number of longitude points (default: 720). lat (int): Number of latitude points (default: 360). - time (np.array | None): Time coordinate array, or None to omit time dimension (default: None). 
+ time (np.ndarray | None): Time coordinate array, or None to omit time dimension (default: None). time_unit (str): Units for the time coordinate (default: 'days since 1601-1-1 00:00:00'). time_calendar (str): Calendar type for time coordinate (default: 'proleptic_gregorian'). attrs (dict): Dictionary of attributes for variables and global attributes. - **variables (list[np.array]): Data variables to include in the dataset. + **variables (np.ndarray): Data variables to include in the dataset. Returns: Initialized xarray Dataset with coordinates and data variables. diff --git a/pyproject.toml b/pyproject.toml index 4c4ff7d..62c7258 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ packages = ["isimip_utils"] version = { attr = "isimip_utils.__version__" } [tool.ruff] -target-version = "py311" +target-version = "py312" line-length = 120 [tool.ruff.lint] From b164d6e41e73faee038cf1148093fdba4968cd21 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 28 Oct 2025 19:07:54 +0100 Subject: [PATCH 052/162] Add show_time and show_path arguments to setup_logs --- isimip_utils/cli.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 2bdf8f3..8f90f27 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -18,7 +18,8 @@ def setup_env() -> None: def setup_logs(log_level: str = 'WARN', log_file: str | None = None, - log_console: bool = True, log_rich: bool = True) -> None: + log_console: bool = True, log_rich: bool = True, + show_time: bool = False, show_path: bool = False) -> None: """Configure logging with console and/or file handlers. Args: @@ -26,6 +27,8 @@ def setup_logs(log_level: str = 'WARN', log_file: str | None = None, log_file (str | None): Path to log file, or None for no file logging (default: None). log_console (bool): Whether to log to console (default: True). log_rich (bool): Whether to use RichHandler for console logging (default: True). 
+ show_time (bool): Whether to show the time in the console logs (default: False). + show_path (bool): Whether to show the path in the console logs (default: False). """ log_level = log_level.upper() @@ -34,10 +37,18 @@ def setup_logs(log_level: str = 'WARN', log_file: str | None = None, if log_console: if log_rich: - console_handler = RichHandler() + console_handler = RichHandler(show_time=show_time, show_path=show_path) else: + fmt = '' + if show_time: + fmt += '[%(asctime)s] ' + fmt += '%(levelname)s - ' + if show_path: + fmt += '%(filename)s:%(lineno)d - ' + fmt += '%(message)s' + console_handler = logging.StreamHandler() - console_handler.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s')) + console_handler.setFormatter(logging.Formatter(fmt)) console_handler.setLevel(log_level) root_logger.addHandler(console_handler) From 8b7074ce7088c0d07aef7762bbf076de943da236 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 29 Oct 2025 18:40:36 +0100 Subject: [PATCH 053/162] Fix netcdf.init_dataset --- isimip_utils/netcdf.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/isimip_utils/netcdf.py b/isimip_utils/netcdf.py index 05c08ba..776d668 100644 --- a/isimip_utils/netcdf.py +++ b/isimip_utils/netcdf.py @@ -109,8 +109,12 @@ def init_dataset(file_path: str | Path, diskless: bool = False, lon: int = 720, for key, value in attrs.get(variable_name, {}).items(): setattr(var, key, value) + # set missing value var.missing_value = np.float32(FILL_VALUE) + # set variable data + var[:] = variable + # set global attributes for key, value in attrs.get('global', {}).items(): setattr(ds, key, value) From de612dcdb4003e87492fd9335d764de9f48f9997 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 29 Oct 2025 19:35:52 +0100 Subject: [PATCH 054/162] Fix concat_extraction --- isimip_utils/extractions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py index 708158a..d5abbd1 
100644 --- a/isimip_utils/extractions.py +++ b/isimip_utils/extractions.py @@ -230,7 +230,7 @@ def concat_extraction(ds1: xr.Dataset | None, ds2: xr.Dataset) -> xr.Dataset: """ if ds1 is None: return ds2.copy() - elif 'time' not in ds2.sizes: + elif not ds2.sizes.get('time'): return ds1 else: # apply offset when time units or calendar diverges From fb24bf5a966f8881bcd346a924543b7ab58afe83 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 30 Oct 2025 19:27:00 +0100 Subject: [PATCH 055/162] Refactor plot.py --- isimip_utils/plot.py | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 9d02f2e..375aacb 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -129,7 +129,8 @@ def get_plot_title(permutation: tuple) -> dict: def plot_line(df: pd.DataFrame, x_field: str | None = None, x_label: str | None = None, x_type: str | None = None, y_field: str | None = None, y_label: str | None = None, y_type: str | None = None, y_format: str | None = None, color_field: str | None = None, - color_type: str | None = None, color_range: list | None = None, legend: bool = True, + color_type: str | None = None, color_domain: list | None = None, color_range: list | None = None, + color_scheme: str | None = None, color_title: str | None = 'Legend', legend: bool = True, empty: bool = False, **mark_kwargs: Any) -> alt.Chart: """Create a line plot from a DataFrame. @@ -144,7 +145,10 @@ def plot_line(df: pd.DataFrame, x_field: str | None = None, x_label: str | None y_format (str | None): Format string for y-axis values. color_field (str | None): Column name for color encoding (default: 'label'). color_type (str | None): Altair type for color (default: 'N'). + color_domain (list | None): Custom color domain. color_range (list | None): Custom color range for scale. + color_scheme (str | None): Custom color scheme for scale. 
+ color_title (str | None): Title for color (default: 'Legend'). legend (bool): Whether to show legend (default: True). empty (bool): Whether to create an empty plot with NaN values (default: False). **mark_kwargs (Any): Additional keyword arguments for mark_line(). @@ -176,10 +180,23 @@ def plot_line(df: pd.DataFrame, x_field: str | None = None, x_label: str | None else: color_field = color_field or 'label' color_type = color_type or 'N' + + color_scale_args = {} + if color_domain: + color_scale_args['domain'] = color_domain + if color_range: + color_scale_args['range'] = color_range + if color_scheme: + color_scale_args['scheme'] = color_scheme + + color_legend_args = {} + if color_title: + color_legend_args['title'] = color_title + color = alt.Color( f'{color_field}:{color_type}', - scale=alt.Scale(range=color_range) if color_range else alt.Scale(), - legend=alt.Legend(title='Legend', padding=10) if legend else None + scale=alt.Scale(**color_scale_args), + legend=alt.Legend(padding=10, **color_legend_args) if legend else None ) if empty: @@ -204,8 +221,8 @@ def plot_line(df: pd.DataFrame, x_field: str | None = None, x_label: str | None def plot_map(df: pd.DataFrame, color_field: str | None = None, color_type: str | None = None, - color_range: list | None = None, color_label: str | None = None, - color_format: str | None = None, bin_size: int = 1, legend: bool = True, + color_domain: list | None = None, color_range: list | None = None, color_scheme: str | None = None, + color_label: str | None = None, color_format: str | None = None, bin_size: int = 1, legend: bool = True, empty: bool = False) -> alt.Chart: """Create a geographic map plot from a DataFrame with lat/lon coordinates. @@ -213,7 +230,9 @@ def plot_map(df: pd.DataFrame, color_field: str | None = None, color_type: str | df (pd.DataFrame): DataFrame with 'lat' and 'lon' columns. color_field (str | None): Column name for color encoding (default: auto-detect from attrs). 
color_type (str | None): Altair type for color (default: 'Q'). + color_domain (list | None): Custom color domain. color_range (list | None): Custom color range for scale. + color_scheme (str | None): Custom color scheme for scale. color_label (str | None): Label for color legend (default: auto-detect from attrs). color_format (str | None): Format string for color legend values. bin_size (int): Bin size for aggregating grid cells (default: 1). @@ -258,6 +277,14 @@ def plot_map(df: pd.DataFrame, color_field: str | None = None, color_type: str | color_type = color_type or 'Q' color_label = color_label or get_data_var_label(df) + color_scale_args = {} + if color_domain: + color_scale_args['domain'] = color_domain + if color_range: + color_scale_args['range'] = color_range + if color_scheme: + color_scale_args['scheme'] = color_scheme + color_legend_args = {} if color_format: color_legend_args['format'] = color_format @@ -265,7 +292,7 @@ def plot_map(df: pd.DataFrame, color_field: str | None = None, color_type: str | color = alt.Color( f'{color_field}:{color_type}', title=color_label, - scale=alt.Scale(range=color_range) if color_range else alt.Scale(), + scale=alt.Scale(**color_scale_args), legend=alt.Legend(padding=10, **color_legend_args) if legend else None ) From 9df3744c8a524e28021a90d4efb608094b53735c Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 4 Nov 2025 16:36:05 +0100 Subject: [PATCH 056/162] Refactor find_json --- isimip_utils/fetch.py | 108 ++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 56 deletions(-) diff --git a/isimip_utils/fetch.py b/isimip_utils/fetch.py index 2cef183..11bd475 100644 --- a/isimip_utils/fetch.py +++ b/isimip_utils/fetch.py @@ -37,23 +37,20 @@ def fetch_definitions(path: str | Path, protocol_locations: str | list[str] = PR protocol_locations = [protocol_locations] for protocol_location in protocol_locations: - for definitions_path, definitions_json in find_json(protocol_location, 'definitions', 
path): - if definitions_json: - logger.debug('definitions_path = %s', definitions_path) - logger.debug('definitions_json = %s', definitions_json) - - definitions = {} - for definition_name, definition in definitions_json.items(): - # convert the definitions to dicts if they are lists - if isinstance(definition, list): - definitions[definition_name] = { - row['specifier']: row for row in definition - } - else: - definitions[definition_name] = definition - - logger.debug('definitions = %s', definitions) - return definitions + definitions_json = find_json(protocol_location, 'definitions', path) + if definitions_json: + definitions = {} + for definition_name, definition in definitions_json.items(): + # convert the definitions to dicts if they are lists + if isinstance(definition, list): + definitions[definition_name] = { + row['specifier']: row for row in definition + } + else: + definitions[definition_name] = definition + + logger.debug('definitions = %s', definitions) + return definitions raise NotFound(f'No definitions found for {path}.') @@ -76,31 +73,28 @@ def fetch_pattern(path: str | Path, protocol_locations: str | list[str] = PROTOC protocol_locations = [protocol_locations] for protocol_location in protocol_locations: - for pattern_path, pattern_json in find_json(protocol_location, 'pattern', path): - if pattern_json: - logger.debug('pattern_path = %s', pattern_path) - logger.debug('pattern_json = %s', pattern_json) - - if not all([ - isinstance(pattern_json['path'], str), - isinstance(pattern_json['file'], str), - isinstance(pattern_json['dataset'], str), - isinstance(pattern_json['suffix'], list) - ]): - break - - pattern = { - 'path': re.compile(pattern_json['path']), - 'file': re.compile(pattern_json['file']), - 'dataset': re.compile(pattern_json['dataset']), - 'suffix': pattern_json['suffix'], - 'specifiers': pattern_json.get('specifiers', []), - 'specifiers_map': pattern_json.get('specifiers_map', {}) - } - - logger.debug('pattern = %s', pattern) - - 
return pattern + pattern_json = find_json(protocol_location, 'pattern', path) + if pattern_json: + if not all([ + isinstance(pattern_json['path'], str), + isinstance(pattern_json['file'], str), + isinstance(pattern_json['dataset'], str), + isinstance(pattern_json['suffix'], list) + ]): + break + + pattern = { + 'path': re.compile(pattern_json['path']), + 'file': re.compile(pattern_json['file']), + 'dataset': re.compile(pattern_json['dataset']), + 'suffix': pattern_json['suffix'], + 'specifiers': pattern_json.get('specifiers', []), + 'specifiers_map': pattern_json.get('specifiers_map', {}) + } + + logger.debug('pattern = %s', pattern) + + return pattern raise NotFound(f'No pattern found for {path}.') @@ -122,11 +116,9 @@ def fetch_schema(path: str | Path, protocol_locations: str | list[str] = PROTOCO protocol_locations = [protocol_locations] for protocol_location in protocol_locations: - for schema_path, schema_json in find_json(protocol_location, 'schema', path): - if schema_json: - logger.debug('schema_path = %s', schema_path) - logger.debug('schema_json = %s', schema_json) - return schema_json + schema_json = find_json(protocol_location, 'schema', path) + if schema_json: + return schema_json raise NotFound(f'No schema found for {path}.') @@ -148,11 +140,9 @@ def fetch_tree(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_ protocol_locations = [protocol_locations] for protocol_location in protocol_locations: - for tree_path, tree_json in find_json(protocol_location, 'tree', path): - if tree_json: - logger.debug('tree_path = %s', tree_path) - logger.debug('tree_json = %s', tree_json) - return tree_json + tree_json = find_json(protocol_location, 'tree', path) + if tree_json: + return tree_json raise NotFound(f'No tree found for {path}.') @@ -174,17 +164,23 @@ def find_json(protocol_location: str, sub_location: str, path: str | Path) -> Ge sub_location (str): Subdirectory within protocol location (e.g., 'definitions', 'pattern'). 
path (str | Path): Path to search for JSON files. - Yields: - Tuples of (current_path, json_content) for each path component. + Returns: + The JSON response from the first matching path. """ path_components = Path(path).parts for i in range(len(path_components), 0, -1): current_path = Path(os.sep.join(path_components[:i+1])).with_suffix('.json') if urlparse(protocol_location).scheme: - yield current_path, fetch_json(f'{protocol_location}/{sub_location}/{current_path}') + data = fetch_json(f'{protocol_location}/{sub_location}/{current_path}') else: - yield current_path, load_json(Path(protocol_location) / 'output' / sub_location / current_path) + data = load_json(Path(protocol_location) / 'output' / sub_location / current_path) + + logger.debug('path = %s', current_path) + logger.debug('data = %s', data) + + if data is not None: + return data def fetch_json(location: str) -> Any | None: From 11e5c5380ec5d1236bbbdb0e19b8b50d9167831d Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 4 Nov 2025 16:49:56 +0100 Subject: [PATCH 057/162] Add tests --- .gitignore | 4 + isimip_utils/tests/test_checksum.py | 17 ++++ isimip_utils/tests/test_cli.py | 93 +++++++++++++++++++++ isimip_utils/tests/test_fetch.py | 123 ++++++++++++++++++++++++++++ pyproject.toml | 9 ++ testing/setup.sh | 35 ++++++++ 6 files changed, 281 insertions(+) create mode 100644 isimip_utils/tests/test_checksum.py create mode 100644 isimip_utils/tests/test_cli.py create mode 100644 isimip_utils/tests/test_fetch.py create mode 100644 testing/setup.sh diff --git a/.gitignore b/.gitignore index 3e122cd..f103619 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,7 @@ __pycache__/ /dist /*.egg-info /.pytest_cache +.aider* + +/testing/datasets +/testing/protocol diff --git a/isimip_utils/tests/test_checksum.py b/isimip_utils/tests/test_checksum.py new file mode 100644 index 0000000..8add02b --- /dev/null +++ b/isimip_utils/tests/test_checksum.py @@ -0,0 +1,17 @@ +from pathlib import Path + +from 
isimip_utils.checksum import get_checksum, get_checksum_suffix, get_checksum_type + + +def test_get_checksum(): + file_path = Path('testing/datasets') / 'ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc' + checksum = get_checksum(file_path) + assert checksum == '30f34d0720b8a6b670d0c093d488a3cd564e232a94d7ebafef99c1d7c18cec5d127fbc663f6378b4b99f9434fa10f71e8413b533c5cc5314d149ab9e2f7cca98' # noqa: E501 + + +def test_get_checksum_type(): + assert get_checksum_type() == 'sha512' + + +def test_get_checksum_suffix(): + assert get_checksum_suffix() == '.sha512' diff --git a/isimip_utils/tests/test_cli.py b/isimip_utils/tests/test_cli.py new file mode 100644 index 0000000..d217a96 --- /dev/null +++ b/isimip_utils/tests/test_cli.py @@ -0,0 +1,93 @@ +import argparse +import os +import tempfile +from pathlib import Path + +import pytest + +from isimip_utils.cli import ArgumentParser, parse_dict, parse_filelist, parse_list, parse_path, parse_version + + +def test_parse_dict(): + result = parse_dict("key=value1,value2") + assert result == {"key": ["value1", "value2"]} + + +def test_parse_list(): + result = parse_list("a,b,c") + assert result == ["a", "b", "c"] + + +def test_parse_version(): + result = parse_version("20230101") + assert result == "20230101" + + +def test_parse_version_invalid(): + with pytest.raises(argparse.ArgumentTypeError): + parse_version("invalid") + + +def test_parse_path(): + result = parse_path("~/test") + assert isinstance(result, Path) + + +def test_parse_filelist(): + with tempfile.NamedTemporaryFile(mode="w", delete=False) as f: + f.write("/path/to/file1\n") + f.write("#comment\n") + f.write("/path/to/file2\n") + temp_file = f.name + + try: + result = parse_filelist(temp_file) + assert "/path/to/file1" in result + assert "/path/to/file2" in result + assert "#comment" not in result + finally: + os.unlink(temp_file) + + +def test_parse_filelist_none(): + result = parse_filelist(None) + assert result is None + + +def 
test_argument_parser(): + parser = ArgumentParser() + parser.add_argument("--test", default="default") + + args = parser.parse_args([]) + assert args.test == "default" + + +def test_argument_parser_with_config(tmp_path): + config_file = tmp_path / "isimip.toml" + config_file.write_text("[test]\ntest = \"config_value\"\n") + + # Temporarily change the config files list to use our test config + original_config_files = ArgumentParser.config_files + ArgumentParser.config_files = [str(config_file)] + + try: + parser = ArgumentParser(prog="test") + parser.add_argument("--test", default="default") + + args = parser.parse_args([]) + assert args.test == "config_value" + finally: + ArgumentParser.config_files = original_config_files + + +def test_argument_parser_with_env(): + os.environ["TEST"] = "env_value" + + try: + parser = ArgumentParser() + parser.add_argument("--test", default="default") + + args = parser.parse_args([]) + assert args.test == "env_value" + finally: + del os.environ["TEST"] diff --git a/isimip_utils/tests/test_fetch.py b/isimip_utils/tests/test_fetch.py new file mode 100644 index 0000000..b3cf868 --- /dev/null +++ b/isimip_utils/tests/test_fetch.py @@ -0,0 +1,123 @@ +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from isimip_utils.fetch import ( + fetch_definitions, + fetch_json, + fetch_pattern, + fetch_schema, + fetch_tree, + find_json, + load_json, +) + +paths = [ + 'ISIMIP3a/OutputData/agriculture/ACEA/gswp3-w5e5.json', + 'ISIMIP3a/OutputData/agriculture/ACEA.json', + 'ISIMIP3a/OutputData/agriculture.json' +] + + +def mock_side_effect(url, *args, **kwargs): + mock_response = MagicMock() + mock_path = Path(url.replace('https://protocol.isimip.org', 'testing/protocol/output')) + + if mock_path.exists(): + with mock_path.open() as fp: + mock_response.status_code = 200 + mock_response.json.return_value = json.load(fp) + else: + mock_response.status_code = 404 + mock_response.json.return_value = None 
+ + return mock_response + + +@pytest.mark.parametrize('path', paths) +def test_fetch_definitions(path): + with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + data = fetch_definitions(path) + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_fetch_definitions_local(path): + data = fetch_definitions(path, 'testing/protocol') + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_fetch_pattern(path): + with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + data = fetch_pattern(path) + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_fetch_pattern_local(path): + data = fetch_pattern(path, 'testing/protocol') + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_fetch_schema(path): + with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + data = fetch_schema(path) + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_fetch_schema_local(path): + data = fetch_schema(path, 'testing/protocol') + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_fetch_tree(path): + with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + data = fetch_tree(path) + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_fetch_tree_local(path): + data = fetch_tree(path, 'testing/protocol') + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_find_json_fetch(path): + with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + data = find_json('https://protocol.isimip.org', 'definitions', path) + assert data is not None + + +@pytest.mark.parametrize('path', paths) +def test_find_json_load(path): + data = find_json('testing/protocol', 'definitions', 
path) + assert data is not None + + +def test_fetch_json(): + with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + data = fetch_json("https://protocol.isimip.org/definitions/ISIMIP3a/OutputData/agriculture.json") + assert data is not None + + +def test_fetch_json_not_found(): + with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + data = fetch_json("https://protocol.isimip.org/definitions/ISIMIP3a/OutputData/agriculture/ACEA.json") + assert data is None + + +def test_load_json(): + data = load_json('testing/protocol/output/definitions/ISIMIP3a/OutputData/agriculture.json') + assert data is not None + + +def test_load_json_not_found(): + data = load_json('testing/protocol/output/definitions/ISIMIP3a/OutputData/agriculture/ACEA.json') + assert data is None diff --git a/pyproject.toml b/pyproject.toml index 62c7258..8fa1cc4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,11 +44,16 @@ shapes = [ "rioxarray", ] dev = [ + "isimip-utils[pytest]", "build", "pre-commit", "ruff", "twine", ] +pytest = [ + "pytest", + "pytest-cov" +] docs = [ "mkdocs", "mkdocs-material", @@ -92,7 +97,11 @@ known-first-party = [ section-order = [ "future", "standard-library", + "pytest", "third-party", "first-party", "local-folder" ] + +[tool.ruff.lint.isort.sections] +pytest = ["pytest"] diff --git a/testing/setup.sh b/testing/setup.sh new file mode 100644 index 0000000..57b5140 --- /dev/null +++ b/testing/setup.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +DATASETS_PATH=testing/datasets + +DATASETS_FILES=( + ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc + ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2021_2030.nc + ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2030_2040.nc + 
ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc +) + +mkdir -p "${DATASETS_PATH}" + +for FILE_PATH in "${DATASETS_FILES[@]}"; do + # Create parent directories for the file + mkdir -p "${DATASETS_PATH}/$(dirname "${FILE_PATH}")" + wget -c "https://files.isimip.org/${FILE_PATH}" -O "${DATASETS_PATH}/${FILE_PATH}" +done + +PROTOCOL_PATH=testing/protocol/output + +PROTOCOL_FILES=( + definitions/ISIMIP3a/OutputData/agriculture.json + pattern/ISIMIP3a/OutputData/agriculture.json + schema/ISIMIP3a/OutputData/agriculture.json + tree/ISIMIP3a/OutputData/agriculture.json +) + +mkdir -p "${PROTOCOL_PATH}" + +for FILE_PATH in "${PROTOCOL_FILES[@]}"; do + # Create parent directories for the file + mkdir -p "${PROTOCOL_PATH}/$(dirname "${FILE_PATH}")" + wget -c "https://protocol.isimip.org/${FILE_PATH}" -O "${PROTOCOL_PATH}/${FILE_PATH}" +done From 52b7d57d1d3de1053bf2b235e8e763d139c7822a Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 12 Nov 2025 18:14:15 +0100 Subject: [PATCH 058/162] Add overwrite to netcdf.init_dataset --- isimip_utils/netcdf.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/isimip_utils/netcdf.py b/isimip_utils/netcdf.py index 776d668..59bdf4f 100644 --- a/isimip_utils/netcdf.py +++ b/isimip_utils/netcdf.py @@ -36,7 +36,7 @@ def open_dataset_write(file_path: str | Path) -> Dataset: return Dataset(file_path, 'r+') -def init_dataset(file_path: str | Path, diskless: bool = False, lon: int = 720, lat: int = 360, +def init_dataset(file_path: str | Path, diskless: bool = False, overwrite: bool = False, lon: int = 720, lat: int = 360, time: None | np.ndarray = None, time_unit: str = 'days since 1601-1-1 00:00:00', time_calendar: str = 'proleptic_gregorian', attrs: dict = {}, **variables: Any) -> Dataset: """Initialize a new NetCDF4 dataset with standard dimensions and variables. 
@@ -44,6 +44,7 @@ def init_dataset(file_path: str | Path, diskless: bool = False, lon: int = 720, Args: file_path (str | Path): Path where the NetCDF file will be created. diskless (bool): If True, create dataset in memory (default: False). + overwrite (bool): If True, overwrite existing dataset (default: False). lon (int): Number of longitude points (default: 720). lat (int): Number of latitude points (default: 360). time (None | np.ndarray): Time dimension configuration (default: None). @@ -55,6 +56,10 @@ def init_dataset(file_path: str | Path, diskless: bool = False, lon: int = 720, Returns: Initialized NetCDF4 Dataset object. """ + # overwrite existing file + if overwrite and file_path.exists(): + file_path.unlink() + # create NetCDF dataset ds = Dataset(file_path, 'w', format='NETCDF4_CLASSIC', diskless=diskless) From c44854ca3c54bcfdd84ccfefcdcdd2943f368711 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 12 Nov 2025 18:15:18 +0100 Subject: [PATCH 059/162] Add test_netcdf.py and fix issues --- .gitignore | 1 + isimip_utils/netcdf.py | 17 +++-- isimip_utils/tests/test_netcdf.py | 120 ++++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+), 7 deletions(-) create mode 100644 isimip_utils/tests/test_netcdf.py diff --git a/.gitignore b/.gitignore index f103619..ea928bd 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,5 @@ __pycache__/ .aider* /testing/datasets +/testing/output /testing/protocol diff --git a/isimip_utils/netcdf.py b/isimip_utils/netcdf.py index 59bdf4f..f1ca5a4 100644 --- a/isimip_utils/netcdf.py +++ b/isimip_utils/netcdf.py @@ -218,7 +218,8 @@ def convert_attribute(value: Any) -> Any: return value -def update_global_attributes(dataset: Dataset, set_attributes: dict = {}, delete_attributes: list = []) -> None: +def update_global_attributes(dataset: Dataset, set_attributes: dict | None = None, + delete_attributes: list | None = None) -> None: """Update global attributes of a NetCDF dataset. 
Args: @@ -226,12 +227,14 @@ def update_global_attributes(dataset: Dataset, set_attributes: dict = {}, delete set_attributes (dict): Dictionary of attributes to set or update. delete_attributes (list): List of attribute names to delete. """ - for attr in dataset.__dict__: - if attr in delete_attributes: - dataset.delncattr(attr) + if delete_attributes is not None: + for attr in dataset.__dict__: + if attr in delete_attributes: + dataset.delncattr(attr) - for attr, value in set_attributes.items(): - dataset.setncattr(attr, value2string(value)) + if set_attributes is not None: + for attr, value in set_attributes.items(): + dataset.setncattr(attr, value2string(value)) def value2string(value: Any) -> str: @@ -244,6 +247,6 @@ def value2string(value: Any) -> str: String representation of the value. """ if isinstance(value, datetime): - return value.isoformat() + 'Z', + return value.isoformat() + 'Z' else: return str(value) diff --git a/isimip_utils/tests/test_netcdf.py b/isimip_utils/tests/test_netcdf.py new file mode 100644 index 0000000..11926e2 --- /dev/null +++ b/isimip_utils/tests/test_netcdf.py @@ -0,0 +1,120 @@ +from datetime import datetime +from pathlib import Path + +import pytest + +import numpy as np +from netCDF4 import Dataset + +from isimip_utils.netcdf import ( + convert_attribute, + get_data_model, + get_dimensions, + get_global_attributes, + get_variables, + init_dataset, + open_dataset_read, + open_dataset_write, + update_global_attributes, + value2string, +) + +landseamask_path = Path('testing/datasets') / 'ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc' +test_path = Path('testing/output') / 'test.nc' +test_path.parent.mkdir(exist_ok=True) + + +def test_open_dataset_read(): + dataset = open_dataset_read(landseamask_path) + assert isinstance(dataset, Dataset) + + +def test_open_dataset_write(): + test_path.unlink(missing_ok=True) + + dataset = open_dataset_write(test_path) + assert isinstance(dataset, Dataset) + + +def test_init_dataset(): 
+ dataset = init_dataset(test_path) + assert isinstance(dataset, Dataset) + + +def test_get_data_model(): + dataset = Dataset(landseamask_path) + data_model = get_data_model(dataset) + assert data_model == 'NETCDF4_CLASSIC' + + +def test_get_dimensions(): + dataset = init_dataset(test_path, overwrite=True) + dimensions = get_dimensions(dataset) + assert list(dimensions.items()) == [ + ('lon', 720), + ('lat', 360) + ] + + +def test_get_variables(): + dataset = init_dataset(test_path, overwrite=True) + variables = get_variables(dataset) + assert [(variable_name, variable['standard_name']) for variable_name, variable in variables.items()] == [ + ('lon', 'longitude'), + ('lat', 'latitude') + ] + + +def test_get_global_attributes(): + dataset = init_dataset(test_path, overwrite=True, attrs={ + 'global': { + 'egg': 'spam', + 'x': np.float32(3.0) + } + }) + global_attrs = get_global_attributes(dataset) + + assert global_attrs['egg'] == 'spam' + assert global_attrs['x'] == np.float32(3.0) + + +@pytest.mark.parametrize('value,return_value', [ + (np.float32(3.0), 3.0), + (np.int32(42), 42), + ([1, 2, 3], [1, 2, 3]), + (np.array([1, 2, 3]), [1, 2, 3]), + ([np.float32(1.0), np.int32(2)], [1.0, 2]) +]) +def test_convert_attribute(value, return_value): + assert convert_attribute(value) == return_value + + +def test_update_global_attributes_set(): + dataset = init_dataset(test_path, overwrite=True) + update_global_attributes(dataset, set_attributes={ + 'egg': 'spam' + }) + + assert dataset.egg == 'spam' + + +def test_update_global_attributes_delete(): + dataset = init_dataset(test_path, overwrite=True, attrs={ + 'global': { + 'egg': 'spam' + } + }) + update_global_attributes(dataset, delete_attributes=['egg']) + + with pytest.raises(AttributeError): + assert dataset.egg + + +@pytest.mark.parametrize('value,string', [ + (datetime(2023, 1, 1, 12, 0, 0), '2023-01-01T12:00:00Z'), + (123, '123'), + ('test', 'test'), + (None, 'None') +]) +def test_value2string(value, string): + assert 
value2string(value) == string From 3b23bafe72813925d93c5408603d3c04c7597188 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 17 Nov 2025 17:43:23 +0100 Subject: [PATCH 060/162] Use decode_cf=True as default --- isimip_utils/xarray.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 5544784..524fd7a 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -91,12 +91,12 @@ def init_dataset(lon: int = 720, lat: int = 360, time: np.ndarray | None = None, return ds -def open_dataset(path: str | Path, decode_cf: bool = False, load: bool = False) -> xr.Dataset: +def open_dataset(path: str | Path, decode_cf: bool = True, load: bool = False) -> xr.Dataset: """Open a NetCDF dataset using xarray. Args: path (str | Path): Path to the NetCDF file. - decode_cf (bool): Whether to decode CF conventions (default: False). + decode_cf (bool): Whether to decode CF conventions (default: True). load (bool): Whether to load data into memory immediately (default: False). Returns: @@ -127,12 +127,12 @@ def open_dataset(path: str | Path, decode_cf: bool = False, load: bool = False) return ds -def load_dataset(path: str | Path, decode_cf: bool = False) -> xr.Dataset: +def load_dataset(path: str | Path, decode_cf: bool = True) -> xr.Dataset: """Open a NetCDF dataset using xarray and load data into memory immediately. Args: path (str | Path): Path to the NetCDF file. - decode_cf (bool): Whether to decode CF conventions (default: False). + decode_cf (bool): Whether to decode CF conventions (default: True). Returns: Xarray Dataset object. 
From 81b57f36fbe0cc26aad3a763e3fe87f2f95cef3a Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 17 Nov 2025 17:43:48 +0100 Subject: [PATCH 061/162] Filter one value coords and data_vars in to_dataframe --- isimip_utils/xarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 524fd7a..49e1e82 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -329,10 +329,10 @@ def to_dataframe(ds: xr.Dataset) -> pd.DataFrame: df = ds.to_dataframe().reset_index() df.attrs['coords'] = { - coord: ds[coord].attrs for coord in ds.coords + coord: ds[coord].attrs for coord in ds.coords if (ds[coord].size > 1) } df.attrs['data_vars'] = { - data_var: ds[data_var].attrs for data_var in ds.data_vars + data_var: ds[data_var].attrs for data_var in ds.data_vars if (ds[data_var].size > 1) } return df From 9c83376f950870d382a97b6ef66aad8ce049b64b Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 17 Nov 2025 17:44:23 +0100 Subject: [PATCH 062/162] Refactor pandas.py --- isimip_utils/pandas.py | 116 +++++++++++++++++++++++++++++++---------- 1 file changed, 88 insertions(+), 28 deletions(-) diff --git a/isimip_utils/pandas.py b/isimip_utils/pandas.py index 0cb4f11..4d8ad13 100644 --- a/isimip_utils/pandas.py +++ b/isimip_utils/pandas.py @@ -2,7 +2,19 @@ import pandas as pd -def get_coord(df: pd.DataFrame) -> str: +def get_coords(df: pd.DataFrame) -> tuple: + """Get the coordinate names from DataFrame attributes. + + Args: + df (pd.DataFrame): DataFrame with 'coords' in attrs. + + Returns: + Name of the coordinates. + """ + return tuple(df.attrs['coords']) + + +def get_first_coord(df: pd.DataFrame) -> str: """Get the first coordinate name from DataFrame attributes. Args: @@ -11,10 +23,27 @@ def get_coord(df: pd.DataFrame) -> str: Returns: Name of the first coordinate. 
""" - return next(iter(df.attrs['coords'])) + return next(iter(get_coords(df))) + +def get_coord_labels(df: pd.DataFrame) -> tuple: + """Get a formatted labels for the coordinates with units. + + Args: + df (pd.DataFrame): DataFrame with 'coords' in attrs. -def get_coord_label(df: pd.DataFrame) -> str: + Returns: + Formatted string like "Coordinate Name [units]" or just the name if no units. + """ + labels = [] + for coord in get_coords(df): + name = df.attrs['coords'][coord].get('long_name', coord) + units = df.attrs['coords'][coord].get('units') + labels.append(f'{name} [{units}]' if units else name) + return tuple(labels) + + +def get_first_coord_label(df: pd.DataFrame) -> str | None: """Get a formatted label for the coordinate with units. Args: @@ -23,14 +52,26 @@ def get_coord_label(df: pd.DataFrame) -> str: Returns: Formatted string like "Coordinate Name [units]" or just the name if no units. """ - coord = get_coord(df) - name = df.attrs['coords'][coord].get('long_name', coord) - units = df.attrs['coords'][coord].get('units') - return f'{name} [{units}]' if units else name + return next(iter(get_coord_labels(df))) + + +def get_coord_axes(df: pd.DataFrame) -> tuple: + """Get the axis attribute for all coordinates. + + Args: + df (pd.DataFrame): DataFrame with 'coords' in attrs. + + Returns: + Axis attribute (e.g., 'T', 'X', 'Y'). + """ + axes = [] + for coord in get_coords(df): + axes.append(df.attrs['coords'][coord].get('axis')) + return tuple(axes) -def get_coord_axis(df: pd.DataFrame) -> str | None: - """Get the axis attribute for the coordinate. +def get_first_coord_axis(df: pd.DataFrame) -> str | None: + """Get the axis attribute for the first coordinate. Args: df (pd.DataFrame): DataFrame with 'coords' in attrs. @@ -38,11 +79,22 @@ def get_coord_axis(df: pd.DataFrame) -> str | None: Returns: Axis attribute (e.g., 'T', 'X', 'Y'), or None if not set. 
""" - coord = get_coord(df) - return df.attrs['coords'][coord].get('axis') + return next(iter(get_coord_axes(df))) + + +def get_data_vars(df: pd.DataFrame) -> tuple: + """Get the data variable names from DataFrame attributes. + + Args: + df (pd.DataFrame): DataFrame with 'data_vars' in attrs. + + Returns: + Names of the data variables. + """ + return tuple(df.attrs['data_vars']) -def get_data_var(df: pd.DataFrame) -> str: +def get_first_data_var(df: pd.DataFrame) -> str: """Get the first data variable name from DataFrame attributes. Args: @@ -51,10 +103,10 @@ def get_data_var(df: pd.DataFrame) -> str: Returns: Name of the first data variable. """ - return next(iter(df.attrs['data_vars'])) + return next(iter(get_data_vars(df))) -def get_data_var_label(df: pd.DataFrame) -> str: +def get_data_var_labels(df: pd.DataFrame) -> str: """Get a formatted label for the data variable with units. Args: @@ -63,13 +115,27 @@ def get_data_var_label(df: pd.DataFrame) -> str: Returns: Formatted string like "Variable Name [units]" or just the name if no units. """ - data_var = get_data_var(df) - data_var_name = df.attrs['data_vars'][data_var].get('long_name', data_var) - data_var_units = df.attrs['data_vars'][data_var].get('units') - return f'{data_var_name} [{data_var_units}]' if data_var_units else data_var_name + labels = [] + for data_var in get_data_vars(df): + data_var_name = df.attrs['data_vars'][data_var].get('long_name', data_var) + data_var_units = df.attrs['data_vars'][data_var].get('units') + labels.append(f'{data_var_name} [{data_var_units}]' if data_var_units else data_var_name) + return tuple(labels) -def compute_average(df: pd.DataFrame, area: bool = True) -> pd.DataFrame: +def get_first_data_var_label(df: pd.DataFrame) -> str: + """Get a formatted label for the data variable with units. + + Args: + df (pd.DataFrame): DataFrame with 'data_vars' in attrs. + + Returns: + Formatted string like "Variable Name [units]" or just the name if no units. 
+ """ + return next(iter(get_data_var_labels(df))) + + +def compute_average(df: pd.DataFrame, data_var: str, area: bool = True) -> pd.DataFrame: """Compute yearly average with optional standard deviation bounds. Args: @@ -79,7 +145,6 @@ def compute_average(df: pd.DataFrame, area: bool = True) -> pd.DataFrame: Returns: DataFrame with yearly aggregated data. """ - data_var = get_data_var(df) data_var_long_name = df.attrs['data_vars'][data_var].get('long_name') data_var_units = df.attrs['data_vars'][data_var].get('units') @@ -112,7 +177,7 @@ def compute_average(df: pd.DataFrame, area: bool = True) -> pd.DataFrame: return df -def group_by_day(df: pd.DataFrame) -> pd.DataFrame: +def group_by_day(df: pd.DataFrame, data_var: str) -> pd.DataFrame: """Group data by day of year and compute mean. Args: @@ -121,8 +186,6 @@ def group_by_day(df: pd.DataFrame) -> pd.DataFrame: Returns: DataFrame grouped by day of year (1-365/366). """ - data_var = get_data_var(df) - df['day'] = df['time'].dt.dayofyear df = df.groupby('day')[data_var].mean().reset_index() df.attrs['coords'] = {'day': { 'long_name': 'Day of the year'}} @@ -130,7 +193,7 @@ def group_by_day(df: pd.DataFrame) -> pd.DataFrame: return df -def group_by_month(df: pd.DataFrame) -> pd.DataFrame: +def group_by_month(df: pd.DataFrame, data_var: str) -> pd.DataFrame: """Group data by month and compute mean. Args: @@ -139,8 +202,6 @@ def group_by_month(df: pd.DataFrame) -> pd.DataFrame: Returns: DataFrame grouped by month (1-12). """ - data_var = get_data_var(df) - df['month'] = df['time'].dt.month df = df.groupby('month')[data_var].mean().reset_index() df.attrs['coords'] = {'month': {'long_name': 'Month of the year'}} @@ -148,7 +209,7 @@ def group_by_month(df: pd.DataFrame) -> pd.DataFrame: return df -def normalize(df: pd.DataFrame) -> pd.DataFrame: +def normalize(df: pd.DataFrame, data_var: str) -> pd.DataFrame: """Normalize data variable using z-score normalization. 
Args: @@ -157,7 +218,6 @@ def normalize(df: pd.DataFrame) -> pd.DataFrame: Returns: DataFrame with normalized data variable (mean=0, std=1). """ - data_var = get_data_var(df) data_var_long_name = df.attrs['data_vars'][data_var].get('long_name') mean, std = df[data_var].mean(), df[data_var].std() From ceef144e2573f273367aee146c9b8963cc74dcdb Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 17 Nov 2025 17:45:01 +0100 Subject: [PATCH 063/162] Use setup.py instead of setup.sh --- .gitignore | 1 + testing/setup.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ testing/setup.sh | 35 -------------------- 3 files changed, 86 insertions(+), 35 deletions(-) create mode 100755 testing/setup.py delete mode 100644 testing/setup.sh diff --git a/.gitignore b/.gitignore index ea928bd..8ab7c8b 100644 --- a/.gitignore +++ b/.gitignore @@ -15,5 +15,6 @@ __pycache__/ .aider* /testing/datasets +/testing/extractions /testing/output /testing/protocol diff --git a/testing/setup.py b/testing/setup.py new file mode 100755 index 0000000..5f37682 --- /dev/null +++ b/testing/setup.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +from pathlib import Path +from subprocess import check_call + +from isimip_utils.extractions import concat_extraction, select_bbox, select_point +from isimip_utils.xarray import open_dataset, write_dataset + +datasets_path = Path("testing/datasets") +extractions_path = Path("testing/extractions") +protocol_path = Path("testing/protocol/output") + +mask_paths = [ + "ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc" +] + +dataset_paths = [ + "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc", + "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2021_2030.nc", + 
"ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2031_2040.nc" +] + +protocol_paths = [ + "definitions/ISIMIP3a/OutputData/agriculture.json", + "pattern/ISIMIP3a/OutputData/agriculture.json", + "schema/ISIMIP3a/OutputData/agriculture.json", + "tree/ISIMIP3a/OutputData/agriculture.json" +] + +bbox = (0, 10, -5, 5) +bbox_path = "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_bbox_daily.nc" # noqa: E501 + +point = (52.395833, 13.061389) +point_path = "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_point_daily.nc" # noqa: E501 + +def main(): + # download_datasets() + # download_protocol() + create_extractions() + + +def download_datasets(): + datasets_path.mkdir(parents=True, exist_ok=True) + + for path in mask_paths + dataset_paths: + file_path = datasets_path / path + file_path.parent.mkdir(parents=True, exist_ok=True) + + url = f"https://files.isimip.org/{path}" + + check_call(['wget', '-c', url, '-O', file_path]) + + +def download_protocol(): + protocol_path.mkdir(parents=True, exist_ok=True) + + for path in protocol_paths: + file_path = protocol_path / path + file_path.parent.mkdir(parents=True, exist_ok=True) + + url = f"https://protocol.isimip.org/{path}" + + check_call(['wget', '-c', url, '-O', file_path]) + + +def create_extractions(): + west, east, south, north = bbox + lat, lon = point + + for path in dataset_paths: + file_path = datasets_path / path + + extraction_bbox = None + extraction_point = None + with open_dataset(file_path) as ds_file: + ds_bbox = select_bbox(ds_file, west, east, south, north) + extraction_bbox = concat_extraction(extraction_bbox, ds_bbox) + + ds_point = select_point(ds_file, lat, lon) + extraction_point = concat_extraction(extraction_point, ds_point) + + write_dataset(extraction_bbox, extractions_path / bbox_path) 
+ write_dataset(extraction_point, extractions_path / point_path) + +if __name__ == "__main__": + main() diff --git a/testing/setup.sh b/testing/setup.sh deleted file mode 100644 index 57b5140..0000000 --- a/testing/setup.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -DATASETS_PATH=testing/datasets - -DATASETS_FILES=( - ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc - ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2021_2030.nc - ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2030_2040.nc - ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc -) - -mkdir -p "${DATASETS_PATH}" - -for FILE_PATH in "${DATASETS_FILES[@]}"; do - # Create parent directories for the file - mkdir -p "${DATASETS_PATH}/$(dirname "${FILE_PATH}")" - wget -c "https://files.isimip.org/${FILE_PATH}" -O "${DATASETS_PATH}/${FILE_PATH}" -done - -PROTOCOL_PATH=testing/protocol/output - -PROTOCOL_FILES=( - definitions/ISIMIP3a/OutputData/agriculture.json - pattern/ISIMIP3a/OutputData/agriculture.json - schema/ISIMIP3a/OutputData/agriculture.json - tree/ISIMIP3a/OutputData/agriculture.json -) - -mkdir -p "${PROTOCOL_PATH}" - -for FILE_PATH in "${PROTOCOL_FILES[@]}"; do - # Create parent directories for the file - mkdir -p "${PROTOCOL_PATH}/$(dirname "${FILE_PATH}")" - wget -c "https://protocol.isimip.org/${FILE_PATH}" -O "${PROTOCOL_PATH}/${FILE_PATH}" -done From 4561a8582a891daccdb6d9dfe3e7dd3ec8e01122 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 17 Nov 2025 17:45:53 +0100 Subject: [PATCH 064/162] Add test_pandas.py --- isimip_utils/tests/test_pandas.py | 167 ++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 isimip_utils/tests/test_pandas.py diff --git a/isimip_utils/tests/test_pandas.py 
b/isimip_utils/tests/test_pandas.py new file mode 100644 index 0000000..805e322 --- /dev/null +++ b/isimip_utils/tests/test_pandas.py @@ -0,0 +1,167 @@ +from pathlib import Path + +import pytest + +from isimip_utils.pandas import ( + compute_average, + create_label, + get_coord_axes, + get_coord_labels, + get_coords, + get_data_var_labels, + get_data_vars, + get_first_coord, + get_first_coord_axis, + get_first_coord_label, + get_first_data_var, + get_first_data_var_label, + group_by_day, + group_by_month, + normalize, +) +from isimip_utils.xarray import open_dataset, to_dataframe + +extractions_path = Path("testing/extractions") + +extractions = { + 'bbox': "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_bbox_daily.nc", # noqa: E501 + 'point': "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_point_daily.nc" # noqa: E501 +} + +@pytest.mark.parametrize('extraction,result', [ + ('bbox', ('lon', 'lat', 'time')), + ('point', ('time', )) +]) +def test_get_coords(extraction, result): + with open_dataset(extractions_path / extractions[extraction]) as ds: + df = to_dataframe(ds) + assert get_coords(df) == result + + +@pytest.mark.parametrize('extraction,result', [ + ('point', 'time') +]) +def test_get_first_coord(extraction, result): + with open_dataset(extractions_path / extractions[extraction]) as ds: + df = to_dataframe(ds) + assert get_first_coord(df) == result + + +@pytest.mark.parametrize('extraction,result', [ + ('bbox', ('Longitude [degrees_east]', 'Latitude [degrees_north]', 'time')), + ('point', ('time', )) +]) +def test_get_coord_labels(extraction, result): + with open_dataset(extractions_path / extractions[extraction]) as ds: + df = to_dataframe(ds) + assert get_coord_labels(df) == result + + +@pytest.mark.parametrize('extraction,result', [ + ('point', 'time') +]) +def test_get_first_coord_label(extraction, result): + with 
open_dataset(extractions_path / extractions[extraction]) as ds: + df = to_dataframe(ds) + assert get_first_coord_label(df) == result + + +@pytest.mark.parametrize('extraction,result', [ + ('bbox', ('X', 'Y', 'T')), + ('point', ('T', )) +]) +def test_get_coord_axes(extraction, result): + with open_dataset(extractions_path / extractions[extraction]) as ds: + df = to_dataframe(ds) + assert get_coord_axes(df) == result + + +@pytest.mark.parametrize('extraction,result', [ + ('point', 'T') +]) +def test_get_first_coord_axis(extraction, result): + with open_dataset(extractions_path / extractions[extraction]) as ds: + df = to_dataframe(ds) + assert get_first_coord_axis(df) == result + + +@pytest.mark.parametrize('extraction,result', [ + ('bbox', ('tas', )), + ('point', ('tas', )) +]) +def test_get_data_vars(extraction, result): + with open_dataset(extractions_path / extractions[extraction]) as ds: + df = to_dataframe(ds) + assert get_data_vars(df) == result + + +@pytest.mark.parametrize('extraction,result', [ + ('point', 'tas') +]) +def test_get_first_data_var(extraction, result): + with open_dataset(extractions_path / extractions[extraction]) as ds: + df = to_dataframe(ds) + assert get_first_data_var(df) == result + + +@pytest.mark.parametrize('extraction,result', [ + ('bbox', ('Near-Surface Air Temperature [K]', )), + ('point', ('Near-Surface Air Temperature [K]', )) +]) +def test_get_data_var_labels(extraction, result): + with open_dataset(extractions_path / extractions[extraction]) as ds: + df = to_dataframe(ds) + assert get_data_var_labels(df) == result + + +@pytest.mark.parametrize('extraction,result', [ + ('point', 'Near-Surface Air Temperature [K]') +]) +def test_get_first_data_var_label(extraction, result): + with open_dataset(extractions_path / extractions[extraction]) as ds: + df = to_dataframe(ds) + assert get_first_data_var_label(df) == result + + +def test_compute_average(): + with open_dataset(extractions_path / extractions['point']) as ds: + df = 
to_dataframe(ds) + df = compute_average(df, 'tas') + + assert df['lower'].between(270, 280).all() + assert df['mean'].between(280, 290).all() + assert df['upper'].between(290, 300).all() + + +def test_group_by_day(): + with open_dataset(extractions_path / extractions['point']) as ds: + df = to_dataframe(ds) + df = group_by_day(df, 'tas') + + assert len(df) == 366 + assert df['tas'].between(270, 300).all() + + +def test_group_by_month(): + with open_dataset(extractions_path / extractions['point']) as ds: + df = to_dataframe(ds) + df = group_by_month(df, 'tas') + + assert len(df) == 12 + assert df['tas'].between(270, 300).all() + + +def test_normalize(): + with open_dataset(extractions_path / extractions['point']) as ds: + df = to_dataframe(ds) + df = normalize(df, 'tas') + + assert df['tas'].between(-4, 4).all() + + +def test_create_label(): + with open_dataset(extractions_path / extractions['point']) as ds: + df = to_dataframe(ds) + df = create_label(df, ['x', 'y', 'z']) + + assert (df['label'] == 'x y z').all() From 685914cee3ce76eb5d6d6c7d9c90b7306f85e992 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 18 Nov 2025 17:31:32 +0100 Subject: [PATCH 065/162] Fix xarray.py --- isimip_utils/xarray.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 49e1e82..a0d40b0 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -11,7 +11,7 @@ def init_dataset(lon: int = 720, lat: int = 360, time: np.ndarray | None = None, - time_unit: str = 'days since 1601-1-1 00:00:00', + time_units: str = 'days since 1601-1-1 00:00:00', time_calendar: str = 'proleptic_gregorian', attrs: dict = {}, **variables: np.ndarray) -> xr.Dataset: """Initialize a new xarray dataset with standard ISIMIP dimensions. @@ -20,7 +20,7 @@ def init_dataset(lon: int = 720, lat: int = 360, time: np.ndarray | None = None, lon (int): Number of longitude points (default: 720). 
lat (int): Number of latitude points (default: 360). time (np.ndarray | None): Time coordinate array, or None to omit time dimension (default: None). - time_unit (str): Units for the time coordinate (default: 'days since 1601-1-1 00:00:00'). + time_units (str): Units for the time coordinate (default: 'days since 1601-1-1 00:00:00'). time_calendar (str): Calendar type for time coordinate (default: 'proleptic_gregorian'). attrs (dict): Dictionary of attributes for variables and global attributes. **variables (np.ndarray): Data variables to include in the dataset. @@ -30,8 +30,10 @@ def init_dataset(lon: int = 720, lat: int = 360, time: np.ndarray | None = None, """ # create coordinates + dims = ('lat', 'lon') coords = {} if time is not None: + dims = ('time', 'lat', 'lon') coords['time'] = time lon_delta = 360.0 / lon @@ -42,7 +44,7 @@ def init_dataset(lon: int = 720, lat: int = 360, time: np.ndarray | None = None, # create data variables data_vars = { - var_name: (['time', 'lon', 'lat'], var) + var_name: (dims, var) for var_name, var in variables.items() } @@ -54,7 +56,7 @@ def init_dataset(lon: int = 720, lat: int = 360, time: np.ndarray | None = None, ds.coords['time'].attrs = { 'standard_name': 'time', 'long_name': 'Time', - 'units': time_unit, + 'units': time_units, 'calendar': time_calendar, 'axis': 'T', '_FillValue': 1.e+20 From 19dfa8635e9464ccaf156a030db6b886ad273579 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 18 Nov 2025 17:31:48 +0100 Subject: [PATCH 066/162] Add text_xarray.py --- isimip_utils/tests/test_xarray.py | 221 ++++++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) create mode 100644 isimip_utils/tests/test_xarray.py diff --git a/isimip_utils/tests/test_xarray.py b/isimip_utils/tests/test_xarray.py new file mode 100644 index 0000000..57a5607 --- /dev/null +++ b/isimip_utils/tests/test_xarray.py @@ -0,0 +1,221 @@ +from pathlib import Path + +import geopandas as gpd +import numpy as np +import xarray as xr +from shapely.geometry 
import box + +from isimip_utils.netcdf import open_dataset_read +from isimip_utils.xarray import ( + add_fill_value_to_attrs, + create_mask, + get_attrs, + init_dataset, + load_dataset, + open_dataset, + order_variables, + set_attrs, + set_fill_value_to_nan, + set_nan_to_fill_value, + to_dataframe, + write_dataset, +) + +datasets_path = Path('testing/datasets') + +dataset_path = datasets_path / "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc" # noqa: E501 + +landseamask_path = datasets_path / 'ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc' + +test_path = Path('testing/output') / 'test.nc' +test_path.parent.mkdir(exist_ok=True) + + +def test_init_dataset(): + ds = init_dataset() + + assert isinstance(ds, xr.Dataset) + assert ds.sizes['lon'] == 720 + assert ds.sizes['lat'] == 360 + + +def test_init_dataset_args(): + lon_size, lat_size, time_size = 180, 90, 10 + + time = np.arange(time_size, dtype=np.float64) + var = np.random.rand(time_size, lat_size, lon_size).astype(np.float64) + + time_units = 'days since 2000-01-01 00:00:00' + time_calendar = '365_day' + + attrs = { + 'var': { + 'long_name': 'Variable' + } + } + + ds = init_dataset(lon=lon_size, lat=lat_size, time=time, + time_units=time_units, time_calendar=time_calendar, + attrs=attrs, var=var) + + assert isinstance(ds, xr.Dataset) + assert ds.sizes['lon'] == lon_size + assert ds.sizes['lat'] == lat_size + + assert ds['time'].units == time_units + assert ds['time'].calendar == time_calendar + + assert np.array_equal(ds['var'].values, var) + assert ds['var'].long_name == attrs['var']['long_name'] + + +def test_open_dataset(): + with open_dataset(dataset_path) as ds: + assert isinstance(ds, xr.Dataset) + assert ds['time'].dtype.type == np.datetime64 + + +def test_open_dataset_no_cf(): + with open_dataset(dataset_path, decode_cf=False) as ds: + assert isinstance(ds, xr.Dataset) + assert 
ds['time'].dtype.type == np.float64 + + +def test_load_dataset(): + with load_dataset(landseamask_path) as ds: + assert isinstance(ds, xr.Dataset) + + +def test_write_dataset(): + test_path.unlink(missing_ok=True) + + ds = init_dataset() + write_dataset(ds, test_path) + + +def test_order_variables(): + test_path.unlink(missing_ok=True) + + ds = init_dataset( + var=np.random.rand(360, 720).astype(np.float64) + ) + ds = ds[[*ds.data_vars, *ds.coords]] + ds.to_netcdf(test_path) + + dataset = open_dataset_read(test_path) + assert tuple(dataset.variables) == ('var', 'lon', 'lat') + + test_path.unlink(missing_ok=True) + + ds = order_variables(ds) + ds.to_netcdf(test_path) + + dataset = open_dataset_read(test_path) + assert tuple(dataset.variables) == ('lon', 'lat', 'var') + + +def test_get_attrs(): + with open_dataset(dataset_path) as ds: + attrs = get_attrs(ds) + assert attrs['lon']['long_name'] == 'Longitude' + assert attrs['lat']['long_name'] == 'Latitude' + assert attrs['tas']['long_name'] == 'Near-Surface Air Temperature' + + +def test_set_attrs(): + with open_dataset(dataset_path) as ds: + attrs = get_attrs(ds) + attrs['tas']['egg'] = 'spam' + set_attrs(ds, attrs) + assert attrs['tas']['egg'] == 'spam' + + +def test_add_fill_value_to_attrs(): + ds = xr.Dataset( + coords={ + 'time': np.arange(10, dtype=np.float64) + }, + data_vars={ + 'var': (['time'], np.ones(10)) + } + ) + add_fill_value_to_attrs(ds) + assert ds['time'].attrs['_FillValue'] == 1e20 + assert ds['var'].attrs['_FillValue'] == 1e20 + assert ds['var'].attrs['missing_value'] == 1e20 + + +def test_set_fill_value_to_nan(): + ds = xr.Dataset( + coords={ + 'time': np.arange(10, dtype=np.float64) + }, + data_vars={ + 'var': (['time'], np.ones(10)) + } + ) + ds['var'].values[0] = 1e20 + ds['var'].attrs['_FillValue'] = 1e20 + ds = set_fill_value_to_nan(ds) + assert np.isnan(ds['var'].values[0]) + +def test_set_nan_to_fill_value(): + ds = xr.Dataset( + coords={ + 'time': np.arange(10, dtype=np.float64) + }, + 
data_vars={ + 'var': (['time'], np.ones(10)) + } + ) + ds['var'].values[0] = np.nan + ds['var'].attrs['_FillValue'] = 1e20 + ds = set_nan_to_fill_value(ds) + assert ds['var'].values[0] == 1e20 + + +def test_create_mask(): + ds = init_dataset( + var=np.ones((360, 720)) + ) + + geometry = box(-10, -5, 10, 5) + + df = gpd.GeoDataFrame( + [{'geometry': geometry}], + crs='EPSG:4326' # WGS84 coordinate system + ) + + mask_ds = create_mask(ds, df, layer=0) + + assert mask_ds['lon'].shape == (720, ) + assert mask_ds['lat'].shape == (360, ) + + assert mask_ds['mask'].dims == ('lat', 'lon') + assert mask_ds['mask'].shape == (360, 720) + + inside_region = mask_ds.sel(lat=slice(5, -5), lon=slice(-10, 10)) + assert np.all(inside_region['mask'].values == 1.0) + + outside_regions = [ + mask_ds.sel(lon=slice(90, 5)), + mask_ds.sel(lon=slice(-5, -90)), + mask_ds.sel(lon=slice(10, 180)), + mask_ds.sel(lon=slice(-180, -10)) + ] + for outside_region in outside_regions: + assert np.all(np.isnan(outside_region['mask'].values)) + + +def test_to_dataframe(): + ds = xr.Dataset( + coords={ + 'time': np.arange(10, dtype=np.float64) + }, + data_vars={ + 'var': (['time'], np.ones(10)) + } + ) + df = to_dataframe(ds) + assert np.array_equal(df['time'], ds['time']) + assert np.array_equal(df['var'], ds['var']) From 839c2171cda80f8664e012eb1391093316b18e0d Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 18 Nov 2025 17:55:12 +0100 Subject: [PATCH 067/162] Update test_xarray.py --- .gitignore | 4 +++- isimip_utils/tests/test_xarray.py | 21 ++++++++++++----- testing/setup.py | 38 +++++++++++++++++++------------ 3 files changed, 41 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index 8ab7c8b..a709a47 100644 --- a/.gitignore +++ b/.gitignore @@ -12,7 +12,9 @@ __pycache__/ /dist /*.egg-info /.pytest_cache -.aider* +/.aider* +/.coverage +/htmlcov /testing/datasets /testing/extractions diff --git a/isimip_utils/tests/test_xarray.py b/isimip_utils/tests/test_xarray.py index 
57a5607..b79a306 100644 --- a/isimip_utils/tests/test_xarray.py +++ b/isimip_utils/tests/test_xarray.py @@ -23,7 +23,9 @@ datasets_path = Path('testing/datasets') -dataset_path = datasets_path / "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc" # noqa: E501 +input_path = datasets_path / "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc" # noqa: E501 + +output_path = datasets_path / "ISIMIP3a/OutputData/agriculture/LPJmL/gswp3-w5e5/historical/lpjml_gswp3-w5e5_obsclim_2015soc_default_yield-mai-noirr_global_annual-gs_1901_2016.nc" # noqa: E501 landseamask_path = datasets_path / 'ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc' @@ -70,17 +72,24 @@ def test_init_dataset_args(): def test_open_dataset(): - with open_dataset(dataset_path) as ds: + with open_dataset(input_path) as ds: assert isinstance(ds, xr.Dataset) assert ds['time'].dtype.type == np.datetime64 -def test_open_dataset_no_cf(): - with open_dataset(dataset_path, decode_cf=False) as ds: +def test_open_dataset_decode_cf_false(): + with open_dataset(input_path, decode_cf=False) as ds: assert isinstance(ds, xr.Dataset) assert ds['time'].dtype.type == np.float64 +def test_open_dataset_growing_seasons(): + with open_dataset(output_path) as ds: + assert isinstance(ds, xr.Dataset) + assert isinstance(ds['time'].dtype, object) + assert ds['time'].values[0].isoformat() == '1901-01-01T00:00:00' + + def test_load_dataset(): with load_dataset(landseamask_path) as ds: assert isinstance(ds, xr.Dataset) @@ -115,7 +124,7 @@ def test_order_variables(): def test_get_attrs(): - with open_dataset(dataset_path) as ds: + with open_dataset(input_path) as ds: attrs = get_attrs(ds) assert attrs['lon']['long_name'] == 'Longitude' assert attrs['lat']['long_name'] == 'Latitude' @@ -123,7 +132,7 @@ def test_get_attrs(): def 
test_set_attrs(): - with open_dataset(dataset_path) as ds: + with open_dataset(input_path) as ds: attrs = get_attrs(ds) attrs['tas']['egg'] = 'spam' set_attrs(ds, attrs) diff --git a/testing/setup.py b/testing/setup.py index 5f37682..731beb2 100755 --- a/testing/setup.py +++ b/testing/setup.py @@ -13,12 +13,16 @@ "ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc" ] -dataset_paths = [ +input_paths = [ "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc", "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2021_2030.nc", "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2031_2040.nc" ] +output_paths = [ + "ISIMIP3a/OutputData/agriculture/LPJmL/gswp3-w5e5/historical/lpjml_gswp3-w5e5_obsclim_2015soc_default_yield-mai-noirr_global_annual-gs_1901_2016.nc" +] + protocol_paths = [ "definitions/ISIMIP3a/OutputData/agriculture.json", "pattern/ISIMIP3a/OutputData/agriculture.json", @@ -33,15 +37,15 @@ point_path = "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_point_daily.nc" # noqa: E501 def main(): - # download_datasets() - # download_protocol() + download_datasets() + download_protocol() create_extractions() def download_datasets(): datasets_path.mkdir(parents=True, exist_ok=True) - for path in mask_paths + dataset_paths: + for path in mask_paths + input_paths + output_paths: file_path = datasets_path / path file_path.parent.mkdir(parents=True, exist_ok=True) @@ -66,20 +70,24 @@ def create_extractions(): west, east, south, north = bbox lat, lon = point - for path in dataset_paths: - file_path = datasets_path / path + extraction_bbox_path = extractions_path / bbox_path + extraction_point_path = extractions_path / point_path + + if not 
all([extraction_bbox_path.exists(), extraction_bbox_path.exists()]): + for path in input_paths: + file_path = datasets_path / path - extraction_bbox = None - extraction_point = None - with open_dataset(file_path) as ds_file: - ds_bbox = select_bbox(ds_file, west, east, south, north) - extraction_bbox = concat_extraction(extraction_bbox, ds_bbox) + extraction_bbox = None + extraction_point = None + with open_dataset(file_path) as ds_file: + ds_bbox = select_bbox(ds_file, west, east, south, north) + extraction_bbox = concat_extraction(extraction_bbox, ds_bbox) - ds_point = select_point(ds_file, lat, lon) - extraction_point = concat_extraction(extraction_point, ds_point) + ds_point = select_point(ds_file, lat, lon) + extraction_point = concat_extraction(extraction_point, ds_point) - write_dataset(extraction_bbox, extractions_path / bbox_path) - write_dataset(extraction_point, extractions_path / point_path) + write_dataset(extraction_bbox, extraction_bbox_path) + write_dataset(extraction_point, extraction_point_path) if __name__ == "__main__": main() From 08c32ecc48508b8b9dc3fa1809424a8e7776594e Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 19 Nov 2025 15:15:50 +0100 Subject: [PATCH 068/162] Refactor find_files --- isimip_utils/patterns.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/isimip_utils/patterns.py b/isimip_utils/patterns.py index cb3ca94..3583f74 100644 --- a/isimip_utils/patterns.py +++ b/isimip_utils/patterns.py @@ -170,8 +170,9 @@ def find_files(pattern: re.Pattern, file_iter: Iterable[Path]) -> list[dict]: """ files = [] for path in sorted(file_iter): - match = pattern.search(str(path)) - if match: - files.append(dict(path=path, **match.groupdict())) + try: + files.append(match_string(pattern, path)) + except DidNotMatch: + pass return files From b4a96121fb842c628d0676ce35915dd5d91cbd00 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 19 Nov 2025 15:16:08 +0100 Subject: [PATCH 069/162] Add test_patterns.py --- 
isimip_utils/tests/test_patterns.py | 99 +++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 isimip_utils/tests/test_patterns.py diff --git a/isimip_utils/tests/test_patterns.py b/isimip_utils/tests/test_patterns.py new file mode 100644 index 0000000..27c8f43 --- /dev/null +++ b/isimip_utils/tests/test_patterns.py @@ -0,0 +1,99 @@ +from pathlib import Path + +from isimip_utils.fetch import fetch_pattern +from isimip_utils.patterns import find_files, match_dataset, match_dataset_path, match_file, match_file_path, match_path + +protocol_locations = ['testing/protocol'] +pattern_path = 'ISIMIP3a/OutputData/agriculture.json' + +datasets_path = Path('testing/datasets') +dataset_path = Path('ISIMIP3a/OutputData/agriculture/LPJmL/gswp3-w5e5/historical/lpjml_gswp3-w5e5_obsclim_2015soc_default_yield-mai-noirr_global_annual-gs') # noqa: E501 +file_path = Path('ISIMIP3a/OutputData/agriculture/LPJmL/gswp3-w5e5/historical/lpjml_gswp3-w5e5_obsclim_2015soc_default_yield-mai-noirr_global_annual-gs_1901_2016.nc') # noqa: E501 + +additional_paths = [ + Path("ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc"), + Path("ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2021_2030.nc"), + Path("ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2031_2040.nc") +] + +path_specifiers = { + 'simulation_round': 'ISIMIP3a', + 'product': 'OutputData', + 'sector': 'agriculture', + 'period': 'historical' +} + +dataset_specifiers = { + 'model': 'lpjml', + 'climate_forcing': 'gswp3-w5e5', + 'climate_scenario': 'obsclim', + 'soc_scenario': '2015soc', + 'sens_scenario': 'default', + 'variable': 'yield', + 'crop': 'mai', + 'irrigation': 'noirr', + 'region': 'global', + 'time_step': 'annual-gs' +} + +file_specifiers = { + 
**dataset_specifiers, + 'start_year': 1901, + 'end_year': 2016, +} + + +def test_match_dataset_path(): + pattern = fetch_pattern(pattern_path, protocol_locations) + path, specifiers = match_dataset_path(pattern, datasets_path / dataset_path) + assert str(path) == str(dataset_path) + assert specifiers == {**path_specifiers, **dataset_specifiers} + + +def test_match_file_path(): + pattern = fetch_pattern(pattern_path, protocol_locations) + path, specifiers = match_file_path(pattern, datasets_path / file_path) + assert str(path) == str(file_path) + assert specifiers == {**path_specifiers, **file_specifiers} + + +def test_match_dataset(): + pattern = fetch_pattern(pattern_path, protocol_locations) + path, specifiers = match_dataset(pattern, datasets_path / dataset_path) + assert str(path) == dataset_path.name + assert specifiers == dataset_specifiers + + +def test_match_file(): + pattern = fetch_pattern(pattern_path, protocol_locations) + path, specifiers = match_file(pattern, datasets_path / file_path) + assert str(path) == file_path.name + assert specifiers == file_specifiers + + +def test_match_path(): + pattern = fetch_pattern(pattern_path, protocol_locations) + path, specifiers = match_path(pattern, datasets_path / file_path) + assert str(path) == str(file_path) + assert specifiers == {**path_specifiers, **file_specifiers} + + +def test_match_path_specifiers_map(): + pattern = fetch_pattern(pattern_path, protocol_locations) + pattern['specifiers_map'] = { + 'global': 'spam' + } + path, specifiers = match_path(pattern, datasets_path / file_path) + assert str(path) == str(file_path) + assert specifiers == {**path_specifiers, **file_specifiers, 'region': 'spam'} + + +def test_find_files(): + pattern = fetch_pattern(pattern_path, protocol_locations) + files = [file_path.name] + [path.name for path in additional_paths] + result = find_files(pattern['file'], files) + assert len(result) + assert result == [( + Path(file_path.name), + file_specifiers + )] From 
fc322420bfc6987a96e546b143b7aafeac2996a9 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 19 Nov 2025 17:38:29 +0100 Subject: [PATCH 070/162] Update get_permutations --- isimip_utils/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/isimip_utils/utils.py b/isimip_utils/utils.py index 2067fd2..999b2d9 100644 --- a/isimip_utils/utils.py +++ b/isimip_utils/utils.py @@ -86,16 +86,16 @@ def include_path(include: list[str] | None, path: Path | str) -> bool: return True -def get_permutations(parameters: dict[str, list]) -> list[tuple]: +def get_permutations(parameters: dict[str, list]) -> tuple[tuple]: """Generate all permutations from parameter value lists. Args: parameters (dict[str, list]): Dictionary mapping parameter names to lists of values. Returns: - List of tuples representing all possible combinations of parameter values. + Tuple of tuples representing all possible combinations of parameter values. """ - return list(product(*parameters.values())) + return tuple(product(*parameters.values())) def get_placeholders(parameters: dict[str, list], permutation: tuple) -> dict: From e9ff449a0614a4c5154a6fede6640fc761ae9835 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 19 Nov 2025 17:39:09 +0100 Subject: [PATCH 071/162] Add test_utils.py --- isimip_utils/tests/test_utils.py | 121 +++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 isimip_utils/tests/test_utils.py diff --git a/isimip_utils/tests/test_utils.py b/isimip_utils/tests/test_utils.py new file mode 100644 index 0000000..a46ed02 --- /dev/null +++ b/isimip_utils/tests/test_utils.py @@ -0,0 +1,121 @@ +from isimip_utils.utils import ( + Singleton, + cached_property, + copy_placeholders, + exclude_path, + get_permutations, + get_placeholders, + include_path, + join_parameters, + update_year, +) + +paths = [ + 'a/b/c', + 'a/b/d', + 'a/b/e' +] + +parameters = { + 'model': ['model_a', 'model_b'], + 'variable': ['x', 'y', 'z'] +} + + +def 
test_singleton(): + a = Singleton() + a.egg = 'spam' + + b = Singleton() + assert b.egg == 'spam' + + +def test_cached_property(): + + class Test: + + def __init__(self): + self.counter = 0 + + @cached_property + def egg(self): + self.counter += 1 + return 'spam' + + t = Test() + assert t.egg == 'spam' + assert t.egg == 'spam' + assert t.counter == 1 + + +def test_exclude_path(): + assert exclude_path([], 'a/b/c') is False + assert exclude_path(paths, 'a/b/c') is True + assert exclude_path(paths, 'a/b/cc') is True + assert exclude_path(paths, 'a/b/f') is False + + +def test_include_path(): + assert include_path([], 'a/b/c') is True + assert include_path(paths, 'a/b/c') is True + assert include_path(paths, 'a/b/cc') is True + assert include_path(paths, 'a/b/f') is False + + +def test_get_permutations(): + assert get_permutations(parameters) == ( + ('model_a', 'x'), + ('model_a', 'y'), + ('model_a', 'z'), + ('model_b', 'x'), + ('model_b', 'y'), + ('model_b', 'z') + ) + + +def test_get_placeholders(): + assert get_placeholders(parameters, ('model_a', 'x')) == { + 'model': 'model_a', + 'variable': 'x' + } + + +def test_join_parameters(): + assert join_parameters(parameters) == { + 'model': 'model_a+model_b', + 'variable': 'x+y+z' + } + + +def test_join_parameters_max_count(): + assert join_parameters(parameters, 2) == { + 'model': 'model_a+model_b', + 'variable': 'various' + } + + +def test_join_parameters_max_count_label(): + assert join_parameters(parameters, 2, 'label') == { + 'model': 'model_a+model_b', + 'variable': 'label' + } + + +def test_copy_placeholders(): + assert copy_placeholders({'foo': 'bar'}, {'egg': 'spam'}) == { + 'foo': 'bar', + 'egg': 'spam' + } + + +def test_update_year(): + placeholders = {'year': 2000} + + update_year(placeholders, 'year', 2001, '<') + assert placeholders == {'year': 2000} + + update_year(placeholders, 'year', 2001, '>') + assert placeholders == {'year': 2001} + + update_year(placeholders, 'year', 2000, '<') + assert 
placeholders == {'year': 2000} From e3f57971317b25b3a3269a738056e9a319517fae Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 21 Nov 2025 22:29:06 +0100 Subject: [PATCH 072/162] Refactor extractions.py --- isimip_utils/extractions.py | 105 ++++++++---------------------------- isimip_utils/netcdf.py | 24 ++++++++- isimip_utils/utils.py | 37 +++++++++++++ isimip_utils/xarray.py | 39 ++++++++++++++ 4 files changed, 120 insertions(+), 85 deletions(-) diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py index d5abbd1..16999dc 100644 --- a/isimip_utils/extractions.py +++ b/isimip_utils/extractions.py @@ -2,11 +2,12 @@ import logging from datetime import datetime -import cftime import numpy as np import xarray as xr -from isimip_utils.exceptions import ExtractionError, ValidationError +from isimip_utils.exceptions import ExtractionError +from isimip_utils.utils import validate_lat, validate_lon +from isimip_utils.xarray import compute_offset, compute_time logger = logging.getLogger(__name__) @@ -22,12 +23,16 @@ def select_time(ds: xr.Dataset, timestamp: datetime) -> xr.Dataset | None: Dataset at the selected time, or None if timestamp is outside range. """ logger.info(f'select time time={timestamp}') - time = compute_time(ds, timestamp) - if time < 0 or time > ds['time'].max(): + if ds.time.encoding.get('units'): + time = np.datetime64(timestamp) + else: + time = compute_time(ds, timestamp) + + if time < ds['time'].min() or time > ds['time'].max(): logger.warn(f'Selected time={time} is outside the dataset.') return None - else: - return ds.sel(time=time, method='nearest') + + return ds.sel(time=time, method='nearest') def select_period(ds: xr.Dataset, start: datetime | None, end: datetime | None) -> xr.Dataset: @@ -45,11 +50,10 @@ def select_period(ds: xr.Dataset, start: datetime | None, end: datetime | None) ExtractionError: If no time axis remains after selection. 
""" logger.info(f'select period start={start} end={end}') - units = ds.coords['time'].attrs['units'] - calendar = ds.coords['time'].attrs['calendar'] - - start_time = cftime.date2num(start, units=units, calendar=calendar) if start else None - end_time = cftime.date2num(end, units=units, calendar=calendar) if end else None + if ds.time.encoding.get('units'): + start_time, end_time = np.datetime64(start), np.datetime64(end) + else: + start_time, end_time = compute_time(ds, start), compute_time(ds, end) ds = ds.sel(time=slice(start_time, end_time)) @@ -170,7 +174,7 @@ def mask_mask(ds: xr.Dataset, mask_ds: xr.Dataset, mask_var: str = 'mask', Masked dataset with values where mask is 1 (or 0 if inverse=True). """ logger.info(f'mask {mask_var}') - return ds.where(mask_ds[mask_var] == 0 if inverse else 1) + return ds.where(np.isclose(mask_ds[mask_var], 0 if inverse else 1)) def compute_spatial_average(ds: xr.Dataset, weights: xr.DataArray | None = None) -> xr.Dataset: @@ -233,77 +237,10 @@ def concat_extraction(ds1: xr.Dataset | None, ds2: xr.Dataset) -> xr.Dataset: elif not ds2.sizes.get('time'): return ds1 else: - # apply offset when time units or calendar diverges - offset = compute_offset(ds1, ds2) - if offset is not None: - ds2 = ds2.assign_coords(time=ds2['time'] + offset) + if not ds1.time.encoding: + # apply offset when time units or calendar diverges, but only if times where not decoded + offset = compute_offset(ds1, ds2) + if offset is not None: + ds2 = ds2.assign_coords(time=ds2['time'] + offset) return xr.concat([ds1, ds2], 'time') - - -def compute_time(ds: xr.Dataset, timestamp: datetime | None) -> float | None: - """Convert a datetime to numeric time value for dataset. - - Args: - ds (xr.Dataset): Dataset with time coordinate containing units and calendar. - timestamp (datetime | None): Timestamp to convert, or None. - - Returns: - Numeric time value in dataset's units, or None if timestamp is None. 
- """ - units = ds.coords['time'].attrs['units'] - calendar = ds.coords['time'].attrs['calendar'] - return cftime.date2num(timestamp, units=units, calendar=calendar) if timestamp else None - - -def compute_offset(ds1: xr.Dataset, ds2: xr.Dataset) -> xr.DataArray | None: - """Compute time offset between two datasets with different time units. - - Args: - ds1 (xr.Dataset): First dataset with time coordinate. - ds2 (xr.Dataset): Second dataset with time coordinate. - - Returns: - Time offset to apply to ds2, or None if units/calendars match. - """ - units1 = ds1.coords['time'].attrs['units'] - units2 = ds2.coords['time'].attrs['units'] - calendar1 = ds1.coords['time'].attrs['calendar'] - calendar2 = ds2.coords['time'].attrs['calendar'] - - if units1 != units2 or calendar1 != calendar2: - start_time = ds2['time'][0] - start_date = cftime.num2date(start_time, units=units2, calendar=calendar2) - offset = cftime.date2num(start_date, units=units1, calendar=calendar1) - start_time - logger.debug(f'time axis diverges "{units1}"/"{units2}" "{calendar1}"/"{calendar2}" offset={offset.values}') - return offset - - -def validate_lat(lat: float) -> None: - """Validate latitude value is within valid range. - - Args: - lat (float): Latitude value to validate. - - Raises: - ValidationError: If latitude is outside -90 to 90 range. - """ - if lat < -90: - raise ValidationError(f'lat={lat} must be > -90') - elif lat > 90: - raise ValidationError(f'lat={lat} must be < 90') - - -def validate_lon(lon: float) -> None: - """Validate longitude value is within valid range. - - Args: - lon (float): Longitude value to validate. - - Raises: - ValidationError: If longitude is outside -180 to 180 range. 
- """ - if lon < -180: - raise ValidationError(f'lon={lon} must be > -180') - elif lon > 180: - raise ValidationError(f'lon={lon} must be < 180') diff --git a/isimip_utils/netcdf.py b/isimip_utils/netcdf.py index f1ca5a4..2d996de 100644 --- a/isimip_utils/netcdf.py +++ b/isimip_utils/netcdf.py @@ -12,6 +12,18 @@ INT_TYPES = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64] +def open_dataset(file_path: str | Path, mode: str = 'r') -> Dataset: + """Open a NetCDF dataset (just a wrapper for netcdf.Dataset). + + Args: + file_path (str | Path): Path to the NetCDF file. + mode (str): + Returns: + NetCDF4 Dataset object opened in the selected mode. + """ + return Dataset(file_path, mode) + + def open_dataset_read(file_path: str | Path) -> Dataset: """Open a NetCDF dataset in read-only mode. @@ -21,7 +33,7 @@ def open_dataset_read(file_path: str | Path) -> Dataset: Returns: NetCDF4 Dataset object opened in read mode. """ - return Dataset(file_path, 'r') + return open_dataset(file_path) def open_dataset_write(file_path: str | Path) -> Dataset: @@ -200,6 +212,16 @@ def get_global_attributes(dataset: Dataset, convert: bool = False) -> dict[str, return global_attributes +def get_index(dataset: Dataset, lat: float, lon: float) -> tuple[int, int]: + dx = dataset.variables['lon'][1] - dataset.variables['lon'][0] + dy = dataset.variables['lat'][1] - dataset.variables['lat'][0] + + ix = round(float((lon - dataset.variables['lon'][0]) / dx)) + iy = round(float((lat - dataset.variables['lat'][0]) / dy)) + + return ix, iy + + def convert_attribute(value: Any) -> Any: """Convert numpy types to Python native types. diff --git a/isimip_utils/utils.py b/isimip_utils/utils.py index 999b2d9..c0188b4 100644 --- a/isimip_utils/utils.py +++ b/isimip_utils/utils.py @@ -4,6 +4,8 @@ from pathlib import Path from typing import Any +from .exceptions import ValidationError + class Singleton: """Base class for implementing the singleton pattern. 
@@ -86,6 +88,41 @@ def include_path(include: list[str] | None, path: Path | str) -> bool: return True +def validate_lat(lat: float) -> None: + """Validate latitude value is within valid range. + + Args: + lat (float): Latitude value to validate. + + Raises: + ValidationError: If latitude is outside -90 to 90 range. + """ + try: + if lat < -90: + raise ValidationError(f'lat={lat} must be > -90') + elif lat > 90: + raise ValidationError(f'lat={lat} must be < 90') + except TypeError as e: + raise ValidationError(f'lat={lat} is a valid number') from e + + +def validate_lon(lon: float) -> None: + """Validate longitude value is within valid range. + + Args: + lon (float): Longitude value to validate. + + Raises: + ValidationError: If longitude is outside -180 to 180 range. + """ + try: + if lon < -180: + raise ValidationError(f'lon={lon} must be > -180') + elif lon > 180: + raise ValidationError(f'lon={lon} must be < 180') + except TypeError as e: + raise ValidationError(f'lon={lon} is a valid number') from e + def get_permutations(parameters: dict[str, list]) -> tuple[tuple]: """Generate all permutations from parameter value lists. diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index a0d40b0..9049efc 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -1,5 +1,6 @@ """Functions for working with xarray datasets for ISIMIP data.""" import logging +from datetime import datetime from pathlib import Path import cftime @@ -273,6 +274,44 @@ def set_nan_to_fill_value(ds: xr.Dataset) -> xr.Dataset: return ds +def compute_time(ds: xr.Dataset, timestamp: datetime | None) -> float | None: + """Convert a datetime to numeric time value for dataset. + + Args: + ds (xr.Dataset): Dataset with time coordinate containing units and calendar. + timestamp (datetime | None): Timestamp to convert, or None. + + Returns: + Numeric time value in dataset's units, or None if timestamp is None. 
+ """ + units = ds.time.encoding.get('units') or ds.coords['time'].attrs.get('units') + calendar = ds.time.encoding.get('calendar') or ds.coords['time'].attrs.get('calendar') + return cftime.date2num(timestamp, units=units, calendar=calendar) if timestamp else None + + +def compute_offset(ds1: xr.Dataset, ds2: xr.Dataset) -> xr.DataArray | None: + """Compute time offset between two datasets with different time units. + + Args: + ds1 (xr.Dataset): First dataset with time coordinate. + ds2 (xr.Dataset): Second dataset with time coordinate. + + Returns: + Time offset to apply to ds2, or None if units/calendars match. + """ + + units1 = ds1.time.encoding.get('units') or ds1.coords['time'].attrs.get('units') + calendar1 = ds1.time.encoding.get('calendar') or ds1.coords['time'].attrs.get('calendar') + units2 = ds2.time.encoding.get('units') or ds2.coords['time'].attrs.get('units') + calendar2 = ds2.time.encoding.get('calendar') or ds2.coords['time'].attrs.get('calendar') + if units1 != units2 or calendar1 != calendar2: + start_time = ds2['time'][0] + start_date = cftime.num2date(start_time, units=units2, calendar=calendar2) + offset = cftime.date2num(start_date, units=units1, calendar=calendar1) - start_time + logger.debug(f'time axis diverges "{units1}"/"{units2}" "{calendar1}"/"{calendar2}" offset={offset.values}') + return offset + + def create_mask(ds: xr.Dataset, df: pd.DataFrame, layer: int) -> xr.Dataset: """Create a spatial mask from a geometry layer. 
From 1ccf1094303cd04f865cc6528460a7da5b500ffa Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 21 Nov 2025 22:29:33 +0100 Subject: [PATCH 073/162] Refactor tests.py --- .gitignore | 1 + isimip_utils/tests/__init__.py | 0 isimip_utils/tests/constants.py | 36 ++++ isimip_utils/tests/helper.py | 6 + isimip_utils/tests/test_checksum.py | 5 +- isimip_utils/tests/test_netcdf.py | 56 +++++- isimip_utils/tests/test_pandas.py | 43 +++-- isimip_utils/tests/test_patterns.py | 46 +++-- isimip_utils/tests/test_utils.py | 27 +++ isimip_utils/tests/test_xarray.py | 29 ++-- testing/setup.py | 254 +++++++++++++++++++++------- 11 files changed, 375 insertions(+), 128 deletions(-) create mode 100644 isimip_utils/tests/__init__.py create mode 100644 isimip_utils/tests/constants.py create mode 100644 isimip_utils/tests/helper.py diff --git a/.gitignore b/.gitignore index a709a47..9f2e42b 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ __pycache__/ /testing/extractions /testing/output /testing/protocol +/testing/share diff --git a/isimip_utils/tests/__init__.py b/isimip_utils/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/isimip_utils/tests/constants.py b/isimip_utils/tests/constants.py new file mode 100644 index 0000000..b4cb70f --- /dev/null +++ b/isimip_utils/tests/constants.py @@ -0,0 +1,36 @@ +from pathlib import Path + +DATASETS_PATH = Path("testing/datasets") +EXTRACTIONS_PATH = Path("testing/extractions") +OUTPUT_PATH = Path("testing/output") + +PROTOCOL_PATH = Path("testing/protocol/output") +SHARE_PATH = Path("testing/share") + +LANDSEAMASK_PATH = "ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc" + +TAS_PATHS = [ + "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc", + "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2021_2030.nc", + 
"ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2031_2040.nc" +] + +YIELD_PATH = "ISIMIP3a/OutputData/agriculture/LPJmL/gswp3-w5e5/historical/lpjml_gswp3-w5e5_obsclim_2015soc_default_yield-mai-noirr_global_annual-gs_1901_2016.nc" # noqa: E501 + +PROTOCOL_PATHS = [ + "definitions/ISIMIP3a/OutputData/agriculture.json", + "pattern/ISIMIP3a/OutputData/agriculture.json", + "schema/ISIMIP3a/OutputData/agriculture.json", + "tree/ISIMIP3a/OutputData/agriculture.json" +] + +PROTOCOL_LOCATIONS = ['testing/protocol'] +PATTERN_PATH = 'ISIMIP3a/OutputData/agriculture.json' + +DATE = '2018-01-01' +PERIOD = ('2017-01-01', '2018-12-31') + +BBOX = (0, 10, -5, 5) + +POINT = (52.395833, 13.061389) +POINT_INDEX = (386, 75) diff --git a/isimip_utils/tests/helper.py b/isimip_utils/tests/helper.py new file mode 100644 index 0000000..e3886b6 --- /dev/null +++ b/isimip_utils/tests/helper.py @@ -0,0 +1,6 @@ +from subprocess import check_call + + +def call(cmd): + print(cmd) + check_call(cmd, shell=True) diff --git a/isimip_utils/tests/test_checksum.py b/isimip_utils/tests/test_checksum.py index 8add02b..ab9f239 100644 --- a/isimip_utils/tests/test_checksum.py +++ b/isimip_utils/tests/test_checksum.py @@ -1,10 +1,9 @@ -from pathlib import Path - from isimip_utils.checksum import get_checksum, get_checksum_suffix, get_checksum_type +from isimip_utils.tests import constants def test_get_checksum(): - file_path = Path('testing/datasets') / 'ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc' + file_path = constants.DATASETS_PATH / constants.LANDSEAMASK_PATH checksum = get_checksum(file_path) assert checksum == '30f34d0720b8a6b670d0c093d488a3cd564e232a94d7ebafef99c1d7c18cec5d127fbc663f6378b4b99f9434fa10f71e8413b533c5cc5314d149ab9e2f7cca98' # noqa: E501 diff --git a/isimip_utils/tests/test_netcdf.py b/isimip_utils/tests/test_netcdf.py index 11926e2..b7f9cb3 100644 --- a/isimip_utils/tests/test_netcdf.py 
+++ b/isimip_utils/tests/test_netcdf.py @@ -11,25 +11,31 @@ get_data_model, get_dimensions, get_global_attributes, + get_index, get_variables, init_dataset, + open_dataset, open_dataset_read, open_dataset_write, update_global_attributes, value2string, ) +from isimip_utils.tests import constants -landseamask_path = Path('testing/datasets') / 'ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc' -test_path = Path('testing/output') / 'test.nc' -test_path.parent.mkdir(exist_ok=True) + +def test_open_dataset(): + dataset = open_dataset(constants.DATASETS_PATH / constants.LANDSEAMASK_PATH) + assert isinstance(dataset, Dataset) def test_open_dataset_read(): - dataset = open_dataset_read(landseamask_path) + dataset = open_dataset_read(constants.DATASETS_PATH / constants.LANDSEAMASK_PATH) assert isinstance(dataset, Dataset) def test_open_dataset_write(): + test_path = Path('testing/output') / 'test.nc' + test_path.parent.mkdir(exist_ok=True) test_path.unlink(missing_ok=True) dataset = open_dataset_write(test_path) @@ -37,17 +43,41 @@ def test_open_dataset_write(): def test_init_dataset(): + test_path = Path('testing/output') / 'test.nc' + test_path.parent.mkdir(exist_ok=True) + test_path.unlink(missing_ok=True) + dataset = init_dataset(test_path) assert isinstance(dataset, Dataset) +@pytest.mark.parametrize('point,result', [ + ((89.75, -179.75), (0, 0)), + ((89.75, -179.25), (1, 0)), + ((89.25, -179.75), (0, 1)), + ((52.395833, 13.061389), (386, 75)) +]) +def test_get_index(point, result): + test_path = Path('testing/output') / 'test.nc' + test_path.parent.mkdir(exist_ok=True) + test_path.unlink(missing_ok=True) + + lat, lon = point + dataset = init_dataset(test_path, overwrite=True) + assert get_index(dataset, lat, lon) == result + + def test_get_data_model(): - dataset = Dataset(landseamask_path) + dataset = Dataset(constants.DATASETS_PATH / constants.LANDSEAMASK_PATH) data_model = get_data_model(dataset) assert data_model == 'NETCDF4_CLASSIC' def 
test_get_dimensions(): + test_path = Path('testing/output') / 'test.nc' + test_path.parent.mkdir(exist_ok=True) + test_path.unlink(missing_ok=True) + dataset = init_dataset(test_path, overwrite=True) dimensions = get_dimensions(dataset) assert list(dimensions.items()) == [ @@ -57,6 +87,10 @@ def test_get_dimensions(): def test_get_variables(): + test_path = Path('testing/output') / 'test.nc' + test_path.parent.mkdir(exist_ok=True) + test_path.unlink(missing_ok=True) + dataset = init_dataset(test_path, overwrite=True) variables = get_variables(dataset) assert [(variable_name, variable['standard_name']) for variable_name, variable in variables.items()] == [ @@ -66,6 +100,10 @@ def test_get_variables(): def test_get_global_attributes(): + test_path = Path('testing/output') / 'test.nc' + test_path.parent.mkdir(exist_ok=True) + test_path.unlink(missing_ok=True) + dataset = init_dataset(test_path, overwrite=True, attrs={ 'global': { 'egg': 'spam', @@ -90,6 +128,10 @@ def test_convert_attribute(value, return_value): def test_update_global_attributes_set(): + test_path = Path('testing/output') / 'test.nc' + test_path.parent.mkdir(exist_ok=True) + test_path.unlink(missing_ok=True) + dataset = init_dataset(test_path, overwrite=True) update_global_attributes(dataset, set_attributes={ 'egg': 'spam' @@ -99,6 +141,10 @@ def test_update_global_attributes_set(): def test_update_global_attributes_delete(): + test_path = Path('testing/output') / 'test.nc' + test_path.parent.mkdir(exist_ok=True) + test_path.unlink(missing_ok=True) + dataset = init_dataset(test_path, overwrite=True, attrs={ 'global': { 'egg': 'spam' diff --git a/isimip_utils/tests/test_pandas.py b/isimip_utils/tests/test_pandas.py index 805e322..98df640 100644 --- a/isimip_utils/tests/test_pandas.py +++ b/isimip_utils/tests/test_pandas.py @@ -1,5 +1,3 @@ -from pathlib import Path - import pytest from isimip_utils.pandas import ( @@ -19,13 +17,12 @@ group_by_month, normalize, ) +from isimip_utils.tests import constants 
from isimip_utils.xarray import open_dataset, to_dataframe -extractions_path = Path("testing/extractions") - extractions = { - 'bbox': "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_bbox_daily.nc", # noqa: E501 - 'point': "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_point_daily.nc" # noqa: E501 + 'bbox': constants.TAS_PATHS[0].replace('_global_', '_select-bbox-cdo_'), + 'point': constants.TAS_PATHS[0].replace('_global_', '_select-point-cdo_') } @pytest.mark.parametrize('extraction,result', [ @@ -33,7 +30,7 @@ ('point', ('time', )) ]) def test_get_coords(extraction, result): - with open_dataset(extractions_path / extractions[extraction]) as ds: + with open_dataset(constants.EXTRACTIONS_PATH / extractions[extraction]) as ds: df = to_dataframe(ds) assert get_coords(df) == result @@ -42,7 +39,7 @@ def test_get_coords(extraction, result): ('point', 'time') ]) def test_get_first_coord(extraction, result): - with open_dataset(extractions_path / extractions[extraction]) as ds: + with open_dataset(constants.EXTRACTIONS_PATH / extractions[extraction]) as ds: df = to_dataframe(ds) assert get_first_coord(df) == result @@ -52,7 +49,7 @@ def test_get_first_coord(extraction, result): ('point', ('time', )) ]) def test_get_coord_labels(extraction, result): - with open_dataset(extractions_path / extractions[extraction]) as ds: + with open_dataset(constants.EXTRACTIONS_PATH / extractions[extraction]) as ds: df = to_dataframe(ds) assert get_coord_labels(df) == result @@ -61,7 +58,7 @@ def test_get_coord_labels(extraction, result): ('point', 'time') ]) def test_get_first_coord_label(extraction, result): - with open_dataset(extractions_path / extractions[extraction]) as ds: + with open_dataset(constants.EXTRACTIONS_PATH / extractions[extraction]) as ds: df = to_dataframe(ds) assert get_first_coord_label(df) == result @@ -71,7 +68,7 @@ def 
test_get_first_coord_label(extraction, result): ('point', ('T', )) ]) def test_get_coord_axes(extraction, result): - with open_dataset(extractions_path / extractions[extraction]) as ds: + with open_dataset(constants.EXTRACTIONS_PATH / extractions[extraction]) as ds: df = to_dataframe(ds) assert get_coord_axes(df) == result @@ -80,7 +77,7 @@ def test_get_coord_axes(extraction, result): ('point', 'T') ]) def test_get_first_coord_axis(extraction, result): - with open_dataset(extractions_path / extractions[extraction]) as ds: + with open_dataset(constants.EXTRACTIONS_PATH / extractions[extraction]) as ds: df = to_dataframe(ds) assert get_first_coord_axis(df) == result @@ -90,7 +87,7 @@ def test_get_first_coord_axis(extraction, result): ('point', ('tas', )) ]) def test_get_data_vars(extraction, result): - with open_dataset(extractions_path / extractions[extraction]) as ds: + with open_dataset(constants.EXTRACTIONS_PATH / extractions[extraction]) as ds: df = to_dataframe(ds) assert get_data_vars(df) == result @@ -99,7 +96,7 @@ def test_get_data_vars(extraction, result): ('point', 'tas') ]) def test_get_first_data_var(extraction, result): - with open_dataset(extractions_path / extractions[extraction]) as ds: + with open_dataset(constants.EXTRACTIONS_PATH / extractions[extraction]) as ds: df = to_dataframe(ds) assert get_first_data_var(df) == result @@ -109,7 +106,7 @@ def test_get_first_data_var(extraction, result): ('point', ('Near-Surface Air Temperature [K]', )) ]) def test_get_data_var_labels(extraction, result): - with open_dataset(extractions_path / extractions[extraction]) as ds: + with open_dataset(constants.EXTRACTIONS_PATH / extractions[extraction]) as ds: df = to_dataframe(ds) assert get_data_var_labels(df) == result @@ -118,13 +115,13 @@ def test_get_data_var_labels(extraction, result): ('point', 'Near-Surface Air Temperature [K]') ]) def test_get_first_data_var_label(extraction, result): - with open_dataset(extractions_path / extractions[extraction]) as ds: + 
with open_dataset(constants.EXTRACTIONS_PATH / extractions[extraction]) as ds: df = to_dataframe(ds) assert get_first_data_var_label(df) == result def test_compute_average(): - with open_dataset(extractions_path / extractions['point']) as ds: + with open_dataset(constants.EXTRACTIONS_PATH / extractions['point']) as ds: df = to_dataframe(ds) df = compute_average(df, 'tas') @@ -134,25 +131,25 @@ def test_compute_average(): def test_group_by_day(): - with open_dataset(extractions_path / extractions['point']) as ds: + with open_dataset(constants.EXTRACTIONS_PATH / extractions['point']) as ds: df = to_dataframe(ds) df = group_by_day(df, 'tas') assert len(df) == 366 - assert df['tas'].between(270, 300).all() + assert df['tas'].between(260, 300).all() def test_group_by_month(): - with open_dataset(extractions_path / extractions['point']) as ds: + with open_dataset(constants.EXTRACTIONS_PATH / extractions['point']) as ds: df = to_dataframe(ds) df = group_by_month(df, 'tas') assert len(df) == 12 - assert df['tas'].between(270, 300).all() + assert df['tas'].between(260, 300).all() def test_normalize(): - with open_dataset(extractions_path / extractions['point']) as ds: + with open_dataset(constants.EXTRACTIONS_PATH / extractions['point']) as ds: df = to_dataframe(ds) df = normalize(df, 'tas') @@ -160,7 +157,7 @@ def test_normalize(): def test_create_label(): - with open_dataset(extractions_path / extractions['point']) as ds: + with open_dataset(constants.EXTRACTIONS_PATH / extractions['point']) as ds: df = to_dataframe(ds) df = create_label(df, ['x', 'y', 'z']) diff --git a/isimip_utils/tests/test_patterns.py b/isimip_utils/tests/test_patterns.py index 27c8f43..27c5b2f 100644 --- a/isimip_utils/tests/test_patterns.py +++ b/isimip_utils/tests/test_patterns.py @@ -2,19 +2,11 @@ from isimip_utils.fetch import fetch_pattern from isimip_utils.patterns import find_files, match_dataset, match_dataset_path, match_file, match_file_path, match_path +from isimip_utils.tests import 
constants protocol_locations = ['testing/protocol'] -pattern_path = 'ISIMIP3a/OutputData/agriculture.json' - -datasets_path = Path('testing/datasets') -dataset_path = Path('ISIMIP3a/OutputData/agriculture/LPJmL/gswp3-w5e5/historical/lpjml_gswp3-w5e5_obsclim_2015soc_default_yield-mai-noirr_global_annual-gs') # noqa: E501 -file_path = Path('ISIMIP3a/OutputData/agriculture/LPJmL/gswp3-w5e5/historical/lpjml_gswp3-w5e5_obsclim_2015soc_default_yield-mai-noirr_global_annual-gs_1901_2016.nc') # noqa: E501 -additional_paths = [ - Path("ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc"), - Path("ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2021_2030.nc"), - Path("ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2031_2040.nc") -] +pattern_path = 'ISIMIP3a/OutputData/agriculture.json' path_specifiers = { 'simulation_round': 'ISIMIP3a', @@ -44,53 +36,73 @@ def test_match_dataset_path(): + dataset_path = Path(constants.YIELD_PATH.replace('_1901_2016.nc', '')) + pattern = fetch_pattern(pattern_path, protocol_locations) - path, specifiers = match_dataset_path(pattern, datasets_path / dataset_path) + path, specifiers = match_dataset_path(pattern, constants.DATASETS_PATH / dataset_path) + assert str(path) == str(dataset_path) assert specifiers == {**path_specifiers, **dataset_specifiers} def test_match_file_path(): + file_path = Path(constants.YIELD_PATH) + pattern = fetch_pattern(pattern_path, protocol_locations) - path, specifiers = match_file_path(pattern, datasets_path / file_path) + path, specifiers = match_file_path(pattern, constants.DATASETS_PATH / file_path) + assert str(path) == str(file_path) assert specifiers == {**path_specifiers, **file_specifiers} def test_match_dataset(): + dataset_path = 
Path(constants.YIELD_PATH.replace('_1901_2016.nc', '')) + pattern = fetch_pattern(pattern_path, protocol_locations) - path, specifiers = match_dataset(pattern, datasets_path / dataset_path) + path, specifiers = match_dataset(pattern, constants.DATASETS_PATH / dataset_path) + assert str(path) == dataset_path.name assert specifiers == dataset_specifiers def test_match_file(): + file_path = Path(constants.YIELD_PATH) + pattern = fetch_pattern(pattern_path, protocol_locations) - path, specifiers = match_file(pattern, datasets_path / file_path) + path, specifiers = match_file(pattern, constants.DATASETS_PATH / file_path) + assert str(path) == file_path.name assert specifiers == file_specifiers def test_match_path(): + file_path = Path(constants.YIELD_PATH) + pattern = fetch_pattern(pattern_path, protocol_locations) - path, specifiers = match_path(pattern, datasets_path / file_path) + path, specifiers = match_path(pattern, constants.DATASETS_PATH / constants.YIELD_PATH) + assert str(path) == str(file_path) assert specifiers == {**path_specifiers, **file_specifiers} def test_match_path_specifiers_map(): + file_path = Path(constants.YIELD_PATH) + pattern = fetch_pattern(pattern_path, protocol_locations) pattern['specifiers_map'] = { 'global': 'spam' } - path, specifiers = match_path(pattern, datasets_path / file_path) + path, specifiers = match_path(pattern, constants.DATASETS_PATH / file_path) + assert str(path) == str(file_path) assert specifiers == {**path_specifiers, **file_specifiers, 'region': 'spam'} def test_find_files(): + file_path = Path(constants.YIELD_PATH) + files = [file_path.name] + [Path(path).name for path in constants.TAS_PATHS] + pattern = fetch_pattern(pattern_path, protocol_locations) - files = [file_path.name] + [path.name for path in additional_paths] result = find_files(pattern['file'], files) assert len(result) assert result == [( diff --git a/isimip_utils/tests/test_utils.py b/isimip_utils/tests/test_utils.py index a46ed02..46c1d79 100644 --- 
a/isimip_utils/tests/test_utils.py +++ b/isimip_utils/tests/test_utils.py @@ -1,3 +1,6 @@ +import pytest + +from isimip_utils.exceptions import ValidationError from isimip_utils.utils import ( Singleton, cached_property, @@ -8,6 +11,8 @@ include_path, join_parameters, update_year, + validate_lat, + validate_lon, ) paths = [ @@ -48,6 +53,28 @@ def egg(self): assert t.counter == 1 +@pytest.mark.parametrize('lat', (-90.0, -45.5, 0, 45, 90)) +def test_validate_lat(lat): + validate_lat(lat) + + +@pytest.mark.parametrize('lat', (-91, 91, None, '', 'none')) +def test_validate_lat_error(lat): + with pytest.raises(ValidationError): + validate_lat(lat) + + +@pytest.mark.parametrize('lon', (-180.0, -45.5, 0, 45, 180)) +def test_validate_lon(lon): + validate_lon(lon) + + +@pytest.mark.parametrize('lon', (-181, 181, None, '', 'none')) +def test_validate_lon_error(lon): + with pytest.raises(ValidationError): + validate_lon(lon) + + def test_exclude_path(): assert exclude_path([], 'a/b/c') is False assert exclude_path(paths, 'a/b/c') is True diff --git a/isimip_utils/tests/test_xarray.py b/isimip_utils/tests/test_xarray.py index b79a306..38ebfb1 100644 --- a/isimip_utils/tests/test_xarray.py +++ b/isimip_utils/tests/test_xarray.py @@ -1,11 +1,10 @@ -from pathlib import Path - import geopandas as gpd import numpy as np import xarray as xr from shapely.geometry import box from isimip_utils.netcdf import open_dataset_read +from isimip_utils.tests import constants from isimip_utils.xarray import ( add_fill_value_to_attrs, create_mask, @@ -21,17 +20,6 @@ write_dataset, ) -datasets_path = Path('testing/datasets') - -input_path = datasets_path / "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc" # noqa: E501 - -output_path = datasets_path / "ISIMIP3a/OutputData/agriculture/LPJmL/gswp3-w5e5/historical/lpjml_gswp3-w5e5_obsclim_2015soc_default_yield-mai-noirr_global_annual-gs_1901_2016.nc" # noqa: 
E501 - -landseamask_path = datasets_path / 'ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc' - -test_path = Path('testing/output') / 'test.nc' -test_path.parent.mkdir(exist_ok=True) - def test_init_dataset(): ds = init_dataset() @@ -72,30 +60,31 @@ def test_init_dataset_args(): def test_open_dataset(): - with open_dataset(input_path) as ds: + with open_dataset(constants.DATASETS_PATH / constants.TAS_PATHS[0]) as ds: assert isinstance(ds, xr.Dataset) assert ds['time'].dtype.type == np.datetime64 def test_open_dataset_decode_cf_false(): - with open_dataset(input_path, decode_cf=False) as ds: + with open_dataset(constants.DATASETS_PATH / constants.TAS_PATHS[0], decode_cf=False) as ds: assert isinstance(ds, xr.Dataset) assert ds['time'].dtype.type == np.float64 def test_open_dataset_growing_seasons(): - with open_dataset(output_path) as ds: + with open_dataset(constants.DATASETS_PATH / constants.YIELD_PATH) as ds: assert isinstance(ds, xr.Dataset) assert isinstance(ds['time'].dtype, object) assert ds['time'].values[0].isoformat() == '1901-01-01T00:00:00' def test_load_dataset(): - with load_dataset(landseamask_path) as ds: + with load_dataset(constants.DATASETS_PATH / constants.LANDSEAMASK_PATH) as ds: assert isinstance(ds, xr.Dataset) def test_write_dataset(): + test_path = constants.OUTPUT_PATH / 'test.nc' test_path.unlink(missing_ok=True) ds = init_dataset() @@ -103,6 +92,7 @@ def test_write_dataset(): def test_order_variables(): + test_path = constants.OUTPUT_PATH / 'test.nc' test_path.unlink(missing_ok=True) ds = init_dataset( @@ -124,7 +114,7 @@ def test_order_variables(): def test_get_attrs(): - with open_dataset(input_path) as ds: + with open_dataset(constants.DATASETS_PATH / constants.TAS_PATHS[0]) as ds: attrs = get_attrs(ds) assert attrs['lon']['long_name'] == 'Longitude' assert attrs['lat']['long_name'] == 'Latitude' @@ -132,7 +122,7 @@ def test_get_attrs(): def test_set_attrs(): - with open_dataset(input_path) as ds: + with 
open_dataset(constants.DATASETS_PATH / constants.TAS_PATHS[0]) as ds: attrs = get_attrs(ds) attrs['tas']['egg'] = 'spam' set_attrs(ds, attrs) @@ -168,6 +158,7 @@ def test_set_fill_value_to_nan(): ds = set_fill_value_to_nan(ds) assert np.isnan(ds['var'].values[0]) + def test_set_nan_to_fill_value(): ds = xr.Dataset( coords={ diff --git a/testing/setup.py b/testing/setup.py index 731beb2..e6e7319 100755 --- a/testing/setup.py +++ b/testing/setup.py @@ -1,93 +1,225 @@ #!/usr/bin/env python3 -from pathlib import Path -from subprocess import check_call +from isimip_utils.tests import constants, helper -from isimip_utils.extractions import concat_extraction, select_bbox, select_point -from isimip_utils.xarray import open_dataset, write_dataset - -datasets_path = Path("testing/datasets") -extractions_path = Path("testing/extractions") -protocol_path = Path("testing/protocol/output") - -mask_paths = [ - "ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc" -] - -input_paths = [ - "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc", - "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2021_2030.nc", - "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2031_2040.nc" -] - -output_paths = [ - "ISIMIP3a/OutputData/agriculture/LPJmL/gswp3-w5e5/historical/lpjml_gswp3-w5e5_obsclim_2015soc_default_yield-mai-noirr_global_annual-gs_1901_2016.nc" -] - -protocol_paths = [ - "definitions/ISIMIP3a/OutputData/agriculture.json", - "pattern/ISIMIP3a/OutputData/agriculture.json", - "schema/ISIMIP3a/OutputData/agriculture.json", - "tree/ISIMIP3a/OutputData/agriculture.json" -] - -bbox = (0, 10, -5, 5) -bbox_path = 
"ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_bbox_daily.nc" # noqa: E501 - -point = (52.395833, 13.061389) -point_path = "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_point_daily.nc" # noqa: E501 def main(): download_datasets() download_protocol() - create_extractions() + run_gridfile() + run_select_time() + run_select_period() + run_select_point() + run_select_bbox() + run_select_bbox_mean() + run_select_bbox_map() + run_mask_bbox() + run_mask_mask() def download_datasets(): - datasets_path.mkdir(parents=True, exist_ok=True) + constants.DATASETS_PATH.mkdir(parents=True, exist_ok=True) - for path in mask_paths + input_paths + output_paths: - file_path = datasets_path / path + for path in [constants.LANDSEAMASK_PATH, *constants.TAS_PATHS, constants.YIELD_PATH]: + file_path = constants.DATASETS_PATH / path file_path.parent.mkdir(parents=True, exist_ok=True) url = f"https://files.isimip.org/{path}" - check_call(['wget', '-c', url, '-O', file_path]) + helper.call(f'wget -c {url} -O {file_path}') def download_protocol(): - protocol_path.mkdir(parents=True, exist_ok=True) + constants.PROTOCOL_PATH.mkdir(parents=True, exist_ok=True) - for path in protocol_paths: - file_path = protocol_path / path + for path in constants.PROTOCOL_PATHS: + file_path = constants.PROTOCOL_PATH / path file_path.parent.mkdir(parents=True, exist_ok=True) url = f"https://protocol.isimip.org/{path}" - check_call(['wget', '-c', url, '-O', file_path]) + helper.call(f'wget -c {url} -O {file_path}') + + +def run_gridfile(): + input_path = constants.DATASETS_PATH / constants.TAS_PATHS[0] + output_path = constants.SHARE_PATH / 'gridarea.nc' + output_path.parent.mkdir(parents=True, exist_ok=True) + + helper.call(f'cdo gridarea {input_path} {output_path}') + + +def run_select_time(): + date = constants.DATE + + path = constants.TAS_PATHS[0] + + input_path = 
constants.DATASETS_PATH / path + + output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-time-cdo_') \ + .replace('2015_2020', '20180101') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.unlink(missing_ok=True) + + helper.call(f'cdo -f nc4c -z zip_5 -L seldate,{date} {input_path} {output_path}') + + +def run_select_period(): + start_date, end_date = constants.PERIOD + + path = constants.TAS_PATHS[0] + + input_path = constants.DATASETS_PATH / path + + output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-period-cdo_') \ + .replace('2015_2020', '2017_2018') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.unlink(missing_ok=True) + + helper.call(f'cdo -f nc4c -z zip_5 -L seldate,{start_date},{end_date} {input_path} {output_path}') + + +def run_select_point(): + ix, iy = constants.POINT_INDEX + + # add one since cdo is counting from 1! + ix, iy = ix + 1, iy + 1 + + output_paths = [] + for path in constants.TAS_PATHS: + input_path = constants.DATASETS_PATH / path + + output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-point-cdo_') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.unlink(missing_ok=True) + + output_paths.append(str(output_path)) + + helper.call(f'cdo -f nc4c -z zip_5 -L -selindexbox,{ix},{ix},{iy},{iy} {input_path} {output_path}') + + input_paths = ' '.join(output_paths) + + output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-point-cdo_') \ + .replace('2031_2040', '2015_2040') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.unlink(missing_ok=True) + + helper.call(f'cdo -f nc4c -z zip_5 cat {input_paths} {output_path}') + + +def run_select_bbox(): + west, east, south, north = constants.BBOX + + output_paths = [] + for path in constants.TAS_PATHS: + input_path = constants.DATASETS_PATH / path + + output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', 
'_select-bbox-cdo_') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.unlink(missing_ok=True) + + output_paths.append(str(output_path)) + + helper.call(f'cdo -f nc4c -z zip_5 -L -sellonlatbox,{west},{east},{south},{north} {input_path} {output_path}') + + input_paths = ' '.join(output_paths) + + output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-cdo_') \ + .replace('2031_2040', '2015_2040') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.unlink(missing_ok=True) + + helper.call(f'cdo -f nc4c -z zip_5 cat {input_paths} {output_path}') + + +def run_select_bbox_mean(): + west, east, south, north = constants.BBOX + + output_paths = [] + for path in constants.TAS_PATHS: + input_path = constants.DATASETS_PATH / path + + output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-mean-cdo_') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.unlink(missing_ok=True) + + output_paths.append(str(output_path)) + + helper.call('cdo -f nc4c -z zip_5 -L -fldmean ' \ + f'-sellonlatbox,{west},{east},{south},{north} {input_path} {output_path}') + + input_paths = ' '.join(output_paths) + + output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-mean-cdo_') \ + .replace('2031_2040', '2015_2040') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.unlink(missing_ok=True) + + helper.call(f'cdo -f nc4c -z zip_5 cat {input_paths} {output_path}') + + +def run_select_bbox_map(): + west, east, south, north = constants.BBOX + + for path in constants.TAS_PATHS: + input_path = constants.DATASETS_PATH / path + + output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-map-cdo_') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.unlink(missing_ok=True) + + helper.call('cdo -f nc4c -z zip_5 -L timmean ' \ + f'-sellonlatbox,{west},{east},{south},{north} {input_path} {output_path}') + + +def 
run_mask_bbox(): + west, east, south, north = constants.BBOX + + output_paths = [] + for path in constants.TAS_PATHS: + input_path = constants.DATASETS_PATH / path + + output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-bbox-cdo_') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.unlink(missing_ok=True) + + output_paths.append(str(output_path)) + + helper.call(f'cdo -f nc4c -z zip_5 -L -masklonlatbox,{west},{east},{south},{north} {input_path} {output_path}') + + input_paths = ' '.join(output_paths) + + output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-bbox-cdo_') \ + .replace('2031_2040', '2015_2040') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.unlink(missing_ok=True) + + helper.call(f'cdo -f nc4c -z zip_5 cat {input_paths} {output_path}') + + +def run_mask_mask(): + mask_path = constants.DATASETS_PATH / constants.LANDSEAMASK_PATH + + output_paths = [] + for path in constants.TAS_PATHS: + input_path = constants.DATASETS_PATH / path + output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-mask-cdo_') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.unlink(missing_ok=True) -def create_extractions(): - west, east, south, north = bbox - lat, lon = point + output_paths.append(str(output_path)) - extraction_bbox_path = extractions_path / bbox_path - extraction_point_path = extractions_path / point_path + helper.call(f'cdo -f nc4c -z zip_5 -L -ifthen -selname,mask {mask_path} {input_path} {output_path}') - if not all([extraction_bbox_path.exists(), extraction_bbox_path.exists()]): - for path in input_paths: - file_path = datasets_path / path + input_paths = ' '.join(output_paths) - extraction_bbox = None - extraction_point = None - with open_dataset(file_path) as ds_file: - ds_bbox = select_bbox(ds_file, west, east, south, north) - extraction_bbox = concat_extraction(extraction_bbox, ds_bbox) + output_path = constants.EXTRACTIONS_PATH / 
path.replace('_global_', '_mask-mask-cdo_') \ + .replace('2031_2040', '2015_2040') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.unlink(missing_ok=True) - ds_point = select_point(ds_file, lat, lon) - extraction_point = concat_extraction(extraction_point, ds_point) + helper.call(f'cdo -f nc4c -z zip_5 cat {input_paths} {output_path}') - write_dataset(extraction_bbox, extraction_bbox_path) - write_dataset(extraction_point, extraction_point_path) if __name__ == "__main__": main() From fc545c5f5871c5d9e699a5e4b5df1a5a7deb6b27 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 21 Nov 2025 22:29:51 +0100 Subject: [PATCH 074/162] Add test_extractions.py --- isimip_utils/tests/test_extractions.py | 313 +++++++++++++++++++++++++ 1 file changed, 313 insertions(+) create mode 100644 isimip_utils/tests/test_extractions.py diff --git a/isimip_utils/tests/test_extractions.py b/isimip_utils/tests/test_extractions.py new file mode 100644 index 0000000..6de6dff --- /dev/null +++ b/isimip_utils/tests/test_extractions.py @@ -0,0 +1,313 @@ +from datetime import datetime + +import pytest + +from isimip_utils.extractions import ( + compute_spatial_average, + compute_temporal_average, + concat_extraction, + count_values, + mask_bbox, + mask_mask, + select_bbox, + select_period, + select_point, + select_time, +) +from isimip_utils.tests import constants, helper +from isimip_utils.xarray import get_attrs, open_dataset, set_attrs, write_dataset + + +@pytest.mark.parametrize('decode_cf', (True, False)) +def test_select_time(decode_cf): + date = constants.DATE + + path = constants.TAS_PATHS[0] + dataset_path = constants.DATASETS_PATH / path + extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-time_') \ + .replace('2015_2020', '20180101') + extraction_path.unlink(missing_ok=True) + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + ds = select_time(file_ds, datetime.strptime(date, "%Y-%m-%d")) + write_dataset(ds, 
extraction_path) + + cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-time-cdo_') \ + .replace('2015_2020', '20180101') + helper.call(f'cdo diff {extraction_path} {cdo_path}') + + +@pytest.mark.parametrize('decode_cf', (True, False)) +def test_select_period(decode_cf): + start_date, end_date = constants.PERIOD + + path = constants.TAS_PATHS[0] + dataset_path = constants.DATASETS_PATH / path + extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-period_') \ + .replace('2015_2020', '2017_2018') + extraction_path.unlink(missing_ok=True) + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + ds = select_period(file_ds, datetime.strptime(start_date, "%Y-%m-%d"), datetime.strptime(end_date, "%Y-%m-%d")) + write_dataset(ds, extraction_path) + + cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-period-cdo_') \ + .replace('2015_2020', '2017_2018') + helper.call(f'cdo diff {extraction_path} {cdo_path}') + + +@pytest.mark.parametrize('decode_cf', (True, False)) +@pytest.mark.parametrize('path', constants.TAS_PATHS) +def test_select_point(decode_cf, path): + lat, lon = constants.POINT + + dataset_path = constants.DATASETS_PATH / path + extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-point_') + extraction_path.unlink(missing_ok=True) + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + ds = select_point(file_ds, lat, lon) + write_dataset(ds, extraction_path) + + cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-point-cdo_') + helper.call(f'cdo diff {extraction_path} {cdo_path}') + + +@pytest.mark.parametrize('decode_cf', (True, False)) +def test_select_point_concat(decode_cf): + lat, lon = constants.POINT + + extraction_ds = None + for path in constants.TAS_PATHS: + dataset_path = constants.DATASETS_PATH / path + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + ds = select_point(file_ds, lat, 
lon) + extraction_ds = concat_extraction(extraction_ds, ds) + + extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-point_') \ + .replace('2031_2040', '2015_2040') + extraction_path.unlink(missing_ok=True) + + write_dataset(extraction_ds, extraction_path) + + cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-point-cdo_') \ + .replace('2031_2040', '2015_2040') + helper.call(f'cdo diff {extraction_path} {cdo_path}') + + +@pytest.mark.parametrize('decode_cf', (True, False)) +@pytest.mark.parametrize('path', constants.TAS_PATHS) +def test_select_bbox(decode_cf, path): + west, east, south, north = constants.BBOX + + dataset_path = constants.DATASETS_PATH / path + extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox_') + extraction_path.unlink(missing_ok=True) + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + ds = select_bbox(file_ds, west, east, south, north) + write_dataset(ds, extraction_path) + + cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-cdo_') + helper.call(f'cdo diff {extraction_path} {cdo_path}') + + +@pytest.mark.parametrize('decode_cf', (True, False)) +def test_select_bbox_concat(decode_cf): + west, east, south, north = constants.BBOX + + extraction_ds = None + for path in constants.TAS_PATHS: + dataset_path = constants.DATASETS_PATH / path + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + ds = select_bbox(file_ds, west, east, south, north) + extraction_ds = concat_extraction(extraction_ds, ds) + + extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox_') \ + .replace('2031_2040', '2015_2040') + extraction_path.unlink(missing_ok=True) + + write_dataset(extraction_ds, extraction_path) + + cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-cdo_') \ + .replace('2031_2040', '2015_2040') + helper.call(f'cdo diff {extraction_path} {cdo_path}') + + 
+@pytest.mark.parametrize('decode_cf', (True, False)) +@pytest.mark.parametrize('path', constants.TAS_PATHS) +def test_mask_bbox(decode_cf, path): + west, east, south, north = constants.BBOX + + dataset_path = constants.DATASETS_PATH / path + extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-bbox_') + extraction_path.unlink(missing_ok=True) + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + ds = mask_bbox(file_ds, west, east, south, north) + write_dataset(ds, extraction_path) + + cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-bbox-cdo_') + helper.call(f'cdo diff {extraction_path} {cdo_path}') + + +@pytest.mark.parametrize('decode_cf', (True, False)) +def test_mask_bbox_concat(decode_cf): + west, east, south, north = constants.BBOX + + extraction_ds = None + for path in constants.TAS_PATHS: + dataset_path = constants.DATASETS_PATH / path + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + ds = mask_bbox(file_ds, west, east, south, north) + extraction_ds = concat_extraction(extraction_ds, ds) + + extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-bbox_') \ + .replace('2031_2040', '2015_2040') + extraction_path.unlink(missing_ok=True) + + write_dataset(extraction_ds, extraction_path) + + cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-bbox-cdo_') \ + .replace('2031_2040', '2015_2040') + helper.call(f'cdo diff {extraction_path} {cdo_path}') + + +@pytest.mark.parametrize('decode_cf', (True, False)) +@pytest.mark.parametrize('path', constants.TAS_PATHS) +def test_mask_mask(decode_cf, path): + mask_path = constants.DATASETS_PATH / constants.LANDSEAMASK_PATH + mask_ds = open_dataset(mask_path) + + dataset_path = constants.DATASETS_PATH / path + extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-mask_') + extraction_path.unlink(missing_ok=True) + + with open_dataset(dataset_path, decode_cf=decode_cf) as 
file_ds: + ds = mask_mask(file_ds, mask_ds) + write_dataset(ds, extraction_path) + + cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-mask-cdo_') + helper.call(f'cdo diff {extraction_path} {cdo_path}') + + +@pytest.mark.parametrize('decode_cf', (True, False)) +def test_mask_mask_concat(decode_cf): + mask_path = constants.DATASETS_PATH / constants.LANDSEAMASK_PATH + mask_ds = open_dataset(mask_path) + + extraction_ds = None + for path in constants.TAS_PATHS: + dataset_path = constants.DATASETS_PATH / path + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + ds = mask_mask(file_ds, mask_ds) + extraction_ds = concat_extraction(extraction_ds, ds) + + extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-mask_') \ + .replace('2031_2040', '2015_2040') + extraction_path.unlink(missing_ok=True) + + write_dataset(extraction_ds, extraction_path) + + cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-mask-cdo_') \ + .replace('2031_2040', '2015_2040') + helper.call(f'cdo diff {extraction_path} {cdo_path}') + + +@pytest.mark.parametrize('decode_cf', (True, False)) +@pytest.mark.parametrize('path', constants.TAS_PATHS) +def test_compute_spatial_average(decode_cf, path): + gridarea_path = constants.SHARE_PATH / 'gridarea.nc' + gridarea_ds = open_dataset(gridarea_path) + + west, east, south, north = constants.BBOX + + dataset_path = constants.DATASETS_PATH / path + extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-mean_') + extraction_path.unlink(missing_ok=True) + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + attrs = get_attrs(file_ds) + ds = select_bbox(file_ds, west, east, south, north) + ds = compute_spatial_average(ds, weights=gridarea_ds["cell_area"]) + ds = set_attrs(ds, attrs) + write_dataset(ds, extraction_path) + + cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-mean-cdo_') + helper.call(f'cdo 
diff {extraction_path} {cdo_path}') + + +@pytest.mark.parametrize('decode_cf', (True, False)) +def test_compute_spatial_average_concat(decode_cf): + gridarea_path = constants.SHARE_PATH / 'gridarea.nc' + gridarea_ds = open_dataset(gridarea_path) + + west, east, south, north = constants.BBOX + + extraction_ds = None + for path in constants.TAS_PATHS: + dataset_path = constants.DATASETS_PATH / path + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + attrs = get_attrs(file_ds) + ds = select_bbox(file_ds, west, east, south, north) + ds = compute_spatial_average(ds, weights=gridarea_ds["cell_area"]) + ds = set_attrs(ds, attrs) + extraction_ds = concat_extraction(extraction_ds, ds) + + extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-mean_') \ + .replace('2031_2040', '2015_2040') + extraction_path.unlink(missing_ok=True) + + write_dataset(extraction_ds, extraction_path) + + cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-mean-cdo_') \ + .replace('2031_2040', '2015_2040') + helper.call(f'cdo diff {extraction_path} {cdo_path}') + + +@pytest.mark.parametrize('decode_cf', (True, False)) +@pytest.mark.parametrize('path', constants.TAS_PATHS) +def test_compute_temporal_average(decode_cf, path): + west, east, south, north = constants.BBOX + + dataset_path = constants.DATASETS_PATH / path + extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-map_') + extraction_path.unlink(missing_ok=True) + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + attrs = get_attrs(file_ds) + ds = select_bbox(file_ds, west, east, south, north) + ds = compute_temporal_average(ds) + ds = set_attrs(ds, attrs) + write_dataset(ds, extraction_path) + + cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-map-cdo_') + helper.call(f'cdo diff,abslim=0.001 {extraction_path} {cdo_path}') + + +@pytest.mark.parametrize('decode_cf', (True, False)) 
+@pytest.mark.parametrize('path', constants.TAS_PATHS) +def test_count_values(decode_cf, path): + dataset_path = constants.DATASETS_PATH / path + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + ds = count_values(file_ds) + assert (ds['tas'] == 720*360).all() + + +@pytest.mark.parametrize('decode_cf', (True, False)) +@pytest.mark.parametrize('path', constants.TAS_PATHS) +def test_count_values_mask(decode_cf, path): + west, east, south, north = constants.BBOX + + dataset_path = constants.DATASETS_PATH / path + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + ds = mask_bbox(file_ds, west, east, south, north) + ds = count_values(ds) + assert (ds['tas'] == 400).all() From 2c42c5619e0df17bbeba8a7591fdf18ff77b7e9a Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 24 Nov 2025 12:23:21 +0100 Subject: [PATCH 075/162] Add typos pre-commit hook, update pyproject.toml and fix issues --- .gitignore | 12 ++++++------ .pre-commit-config.yaml | 6 ++++++ LICENSE | 2 +- docs/prerequisites.md | 2 +- isimip_utils/netcdf.py | 14 ++++++++------ isimip_utils/patterns.py | 2 +- isimip_utils/xarray.py | 12 +++++++----- pyproject.toml | 6 +++--- 8 files changed, 33 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 9f2e42b..408f225 100644 --- a/.gitignore +++ b/.gitignore @@ -11,13 +11,13 @@ __pycache__/ /build /dist /*.egg-info -/.pytest_cache + /.aider* +/.pytest_cache +/.ruff_cache + /.coverage /htmlcov -/testing/datasets -/testing/extractions -/testing/output -/testing/protocol -/testing/share +/testing +!/testing/setup.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 259dd5d..2f909d1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,8 +10,14 @@ repos: - id: end-of-file-fixer - id: trailing-whitespace - id: debug-statements + - repo: https://github.com/charliermarsh/ruff-pre-commit rev: v0.13.3 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] + + - repo: 
https://github.com/crate-ci/typos + rev: v1 + hooks: + - id: typos diff --git a/LICENSE b/LICENSE index fe4013f..587e9da 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2022 Potsdam Institute for Climate Impact Research +Copyright (c) 2022-2025 Potsdam Institute for Climate Impact Research Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/docs/prerequisites.md b/docs/prerequisites.md index a739038..482f828 100644 --- a/docs/prerequisites.md +++ b/docs/prerequisites.md @@ -49,4 +49,4 @@ All further steps need to be performed using the windows shell `cmd.exe`. You ca #### Using the Windows Subsystem for Linux (WSL) -As an alternative for advanced users, you can use the Windows Subsystem for Linux (WSL) to install a Linux distribution whithin Windows 10. The installation is explained in the [Microsoft documentation](https://docs.microsoft.com/en-us/windows/wsl/install-win10). When using WSL, please install Python3 as explained in the Linux section. +As an alternative for advanced users, you can use the Windows Subsystem for Linux (WSL) to install a Linux distribution within Windows 10. The installation is explained in the [Microsoft documentation](https://docs.microsoft.com/en-us/windows/wsl/install-win10). When using WSL, please install Python3 as explained in the Linux section. 
diff --git a/isimip_utils/netcdf.py b/isimip_utils/netcdf.py index 2d996de..f8e2430 100644 --- a/isimip_utils/netcdf.py +++ b/isimip_utils/netcdf.py @@ -50,7 +50,7 @@ def open_dataset_write(file_path: str | Path) -> Dataset: def init_dataset(file_path: str | Path, diskless: bool = False, overwrite: bool = False, lon: int = 720, lat: int = 360, time: None | np.ndarray = None, time_unit: str = 'days since 1601-1-1 00:00:00', - time_calendar: str = 'proleptic_gregorian', attrs: dict = {}, **variables: Any) -> Dataset: + time_calendar: str = 'proleptic_gregorian', attrs: None | dict = None, **variables: Any) -> Dataset: """Initialize a new NetCDF4 dataset with standard dimensions and variables. Args: @@ -59,7 +59,7 @@ def init_dataset(file_path: str | Path, diskless: bool = False, overwrite: bool overwrite (bool): If True, overwrite existing dataset (default: False). lon (int): Number of longitude points (default: 720). lat (int): Number of latitude points (default: 360). - time (None | np.ndarray): Time dimension configuration (default: None). + time (np.ndarray): Time dimension configuration (default: None). time_unit (str): Units for the time dimension (default: 'days since 1601-1-1 00:00:00'). time_calendar (str): Calendar type for time dimension (default: 'proleptic_gregorian'). attrs (dict): Dictionary of attributes for variables and global attributes. 
@@ -123,8 +123,9 @@ def init_dataset(file_path: str | Path, diskless: bool = False, overwrite: bool fill_value=FILL_VALUE, compression='zlib') # set variable attributes - for key, value in attrs.get(variable_name, {}).items(): - setattr(var, key, value) + if attrs: + for key, value in attrs.get(variable_name, {}).items(): + setattr(var, key, value) # set missing value var.missing_value = np.float32(FILL_VALUE) @@ -133,8 +134,9 @@ def init_dataset(file_path: str | Path, diskless: bool = False, overwrite: bool var[:] = variable # set global attributes - for key, value in attrs.get('global', {}).items(): - setattr(ds, key, value) + if attrs: + for key, value in attrs.get('global', {}).items(): + setattr(ds, key, value) return ds diff --git a/isimip_utils/patterns.py b/isimip_utils/patterns.py index 3583f74..b70c372 100644 --- a/isimip_utils/patterns.py +++ b/isimip_utils/patterns.py @@ -71,7 +71,7 @@ def match_path(pattern: dict, path: Path, dirname_pattern_key: str = 'path', # assert that any value in dirname_specifiers at least starts with # its corresponding value (same key) in filename_specifiers # e.g. 'ewe' and 'ewe_north-sea' - for key, value in filename_specifiers.items(): + for key, _ in filename_specifiers.items(): if key in dirname_specifiers: f, d = filename_specifiers[key], dirname_specifiers[key] diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 9049efc..b270b79 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -14,13 +14,13 @@ def init_dataset(lon: int = 720, lat: int = 360, time: np.ndarray | None = None, time_units: str = 'days since 1601-1-1 00:00:00', time_calendar: str = 'proleptic_gregorian', - attrs: dict = {}, **variables: np.ndarray) -> xr.Dataset: + attrs: None | dict = None, **variables: np.ndarray) -> xr.Dataset: """Initialize a new xarray dataset with standard ISIMIP dimensions. Args: lon (int): Number of longitude points (default: 720). lat (int): Number of latitude points (default: 360). 
- time (np.ndarray | None): Time coordinate array, or None to omit time dimension (default: None). + time (np.ndarray): Time coordinate array, or None to omit time dimension (default: None). time_units (str): Units for the time coordinate (default: 'days since 1601-1-1 00:00:00'). time_calendar (str): Calendar type for time coordinate (default: 'proleptic_gregorian'). attrs (dict): Dictionary of attributes for variables and global attributes. @@ -83,13 +83,15 @@ def init_dataset(lon: int = 720, lat: int = 360, time: np.ndarray | None = None, # set variable attributes for data_var in ds.data_vars: - if data_var in attrs: - ds.data_vars[data_var].attrs.update(attrs[data_var]) + if attrs: + if data_var in attrs: + ds.data_vars[data_var].attrs.update(attrs[data_var]) ds.data_vars[data_var].attrs["_FillValue"] = 1.e+20 # set global attributes - ds.attrs = attrs.get('global', {}) + if attrs: + ds.attrs = attrs.get('global', {}) return ds diff --git a/pyproject.toml b/pyproject.toml index 8fa1cc4..0337d73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,9 +84,6 @@ select = [ "YTT", # flake8-2020 ] ignore = [ - "B006", # mutable-argument-default - "B007", # unused-loop-control-variable - "B018", # useless-expression "RUF012", # mutable-class-default ] @@ -105,3 +102,6 @@ section-order = [ [tool.ruff.lint.isort.sections] pytest = ["pytest"] + +[tool.typos.default.extend-words] +iy = "iy" From 8a8338cf7414b190bd18f78f97c06cf0e13d957e Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 24 Nov 2025 12:39:20 +0100 Subject: [PATCH 076/162] Refactor optional dependencies --- pyproject.toml | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0337d73..67a3172 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,23 +8,20 @@ maintainers = [ ] description = "This package contains common functionality for different ISIMIP tools." 
readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.10" license = { file = "LICENSE" } classifiers = [ 'Operating System :: OS Independent', 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: 3.13', ] dependencies = [ - "netCDF4", "python-dotenv", "requests", "rich", - "xarray" ] dynamic = ["version"] @@ -33,9 +30,15 @@ Repository = "https://github.com/ISI-MIP/isimip-utils" [project.optional-dependencies] all = [ - "isimip-utils[plot,shapes]" + "isimip-utils[netcdf,plots,shapes,xarray,dev,pytest,docs]" +] +recommended = [ + "isimip-utils[netcdf,plots,shapes,xarray]" +] +netcdf = [ + "netCDF4" ] -plot = [ +plots = [ "altair[all]", "palettable", ] @@ -43,8 +46,10 @@ shapes = [ "geopandas", "rioxarray", ] +xarray = [ + "xarray" +] dev = [ - "isimip-utils[pytest]", "build", "pre-commit", "ruff", From 503dd46ad9e320d3456f02def336170c5608579f Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 24 Nov 2025 12:39:29 +0100 Subject: [PATCH 077/162] Use setuptools_scm and importlib.metadata to determine version --- isimip_utils/__init__.py | 8 +++++++- pyproject.toml | 8 ++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/isimip_utils/__init__.py b/isimip_utils/__init__.py index a080f04..787bb6e 100644 --- a/isimip_utils/__init__.py +++ b/isimip_utils/__init__.py @@ -1 +1,7 @@ -VERSION = __version__ = '1.3.2' +from importlib.metadata import PackageNotFoundError +from importlib.metadata import version as _version + +try: + VERSION = __version__ = _version(__package__) +except PackageNotFoundError: + VERSION = __version__ = "0.0.0+unknown" diff --git a/pyproject.toml b/pyproject.toml index 67a3172..642a806 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,7 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = 
["setuptools", "setuptools_scm"] + [project] name = "isimip-utils" authors = [ @@ -68,8 +72,8 @@ docs = [ [tool.setuptools] packages = ["isimip_utils"] -[tool.setuptools.dynamic] -version = { attr = "isimip_utils.__version__" } +[tool.setuptools_scm] +version_scheme = "release-branch-semver" [tool.ruff] target-version = "py312" From daf4cb9803b4d58d37b497f8780322d01ebb794f Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 24 Nov 2025 16:07:41 +0100 Subject: [PATCH 078/162] Update pandas.py and plot.py --- isimip_utils/pandas.py | 3 ++- isimip_utils/plot.py | 46 +++++++++++++++++++++++------------------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/isimip_utils/pandas.py b/isimip_utils/pandas.py index 4d8ad13..1177df6 100644 --- a/isimip_utils/pandas.py +++ b/isimip_utils/pandas.py @@ -135,7 +135,7 @@ def get_first_data_var_label(df: pd.DataFrame) -> str: return next(iter(get_data_var_labels(df))) -def compute_average(df: pd.DataFrame, data_var: str, area: bool = True) -> pd.DataFrame: +def compute_average(df: pd.DataFrame, data_var: None | str = None, area: bool = True) -> pd.DataFrame: """Compute yearly average with optional standard deviation bounds. Args: @@ -145,6 +145,7 @@ def compute_average(df: pd.DataFrame, data_var: str, area: bool = True) -> pd.Da Returns: DataFrame with yearly aggregated data. 
""" + data_var = data_var or get_first_data_var(df) data_var_long_name = df.attrs['data_vars'][data_var].get('long_name') data_var_units = df.attrs['data_vars'][data_var].get('units') diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 375aacb..f7117c1 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -8,23 +8,28 @@ import numpy as np import pandas as pd -from isimip_utils.pandas import get_coord, get_coord_axis, get_coord_label, get_data_var, get_data_var_label +from isimip_utils.pandas import ( + get_first_coord, + get_first_coord_axis, + get_first_coord_label, + get_first_data_var, + get_first_data_var_label, +) from isimip_utils.utils import get_permutations logger = logging.getLogger(__name__) +alt.data_transformers.enable('vegafusion') -def default_color_theme() -> dict: - return { +@alt.theme.register('isimip_utils', enable=True) +def custom_theme(): + return alt.theme.ThemeConfig({ "config": { - "mark": {"color": "steelblue"} + "mark": { + "color": "steelblue" + } } - } - - -alt.data_transformers.enable('vegafusion') -alt.themes.register("default_color_theme", default_color_theme) -alt.themes.enable("default_color_theme") + }) def save_plot(chart: alt.Chart, path: str | Path, *args: Any, **kwargs: Any) -> None: @@ -52,7 +57,7 @@ def save_index(index_path: Path) -> None: index_path (Path): Path where the index.html file will be saved. """ index_json = json.dumps([ - str(p.name) for p in index_path.parent.iterdir() if p.suffix in ['.svg', '.png'] + str(p.name) for p in sorted(index_path.parent.iterdir()) if p.suffix in ['.svg', '.png'] ], indent=2).replace('\n', '\n ') logger.info(f'save {index_path.absolute()}') @@ -157,16 +162,16 @@ def plot_line(df: pd.DataFrame, x_field: str | None = None, x_label: str | None Altair Chart object with line plot (and optional area for lower/upper bounds). 
""" - x_field = x_field or get_coord(df) - x_label = x_label or get_coord_label(df) - x_type = x_type or ('T' if get_coord_axis(df) == 'T' else 'Q') + x_field = x_field or get_first_coord(df) + x_label = x_label or get_first_coord_label(df) + x_type = x_type or ('T' if get_first_coord_axis(df) == 'T' else 'Q') x = alt.X( f'{x_field}:{x_type}', title=x_label ) - y_field = y_field or get_data_var(df) - y_label = y_label or get_data_var_label(df) + y_field = y_field or get_first_data_var(df) + y_label = y_label or get_first_data_var_label(df) y_type = y_type or 'Q' y = alt.Y( f'{y_field}:{y_type}', @@ -175,12 +180,11 @@ def plot_line(df: pd.DataFrame, x_field: str | None = None, x_label: str | None scale=alt.Scale(zero=False, nice=False) ) - if empty: + color_field = color_field or 'label' + if empty or color_field not in df: color = alt.Color() else: - color_field = color_field or 'label' color_type = color_type or 'N' - color_scale_args = {} if color_domain: color_scale_args['domain'] = color_domain @@ -273,9 +277,9 @@ def plot_map(df: pd.DataFrame, color_field: str | None = None, color_type: str | if empty: color = alt.Color() else: - color_field = color_field or get_data_var(df) + color_field = color_field or get_first_data_var(df) color_type = color_type or 'Q' - color_label = color_label or get_data_var_label(df) + color_label = color_label or get_first_data_var_label(df) color_scale_args = {} if color_domain: From 73129ee620bccdd17f904d388b9adbb059c8a51d Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 24 Nov 2025 16:07:57 +0100 Subject: [PATCH 079/162] Add test_plot.py --- isimip_utils/tests/constants.py | 1 + isimip_utils/tests/test_plot.py | 274 ++++++++++++++++++++++++++++++++ 2 files changed, 275 insertions(+) create mode 100644 isimip_utils/tests/test_plot.py diff --git a/isimip_utils/tests/constants.py b/isimip_utils/tests/constants.py index b4cb70f..daef8cb 100644 --- a/isimip_utils/tests/constants.py +++ b/isimip_utils/tests/constants.py @@ -2,6 
+2,7 @@ DATASETS_PATH = Path("testing/datasets") EXTRACTIONS_PATH = Path("testing/extractions") +PLOTS_PATH = Path("testing/plots") OUTPUT_PATH = Path("testing/output") PROTOCOL_PATH = Path("testing/protocol/output") diff --git a/isimip_utils/tests/test_plot.py b/isimip_utils/tests/test_plot.py new file mode 100644 index 0000000..fd2a2d2 --- /dev/null +++ b/isimip_utils/tests/test_plot.py @@ -0,0 +1,274 @@ + +import numpy as np +import pandas as pd + +from isimip_utils.pandas import compute_average, create_label +from isimip_utils.plot import get_plot_title, plot_grid, plot_line, plot_map, save_index, save_plot +from isimip_utils.tests import constants +from isimip_utils.xarray import open_dataset, to_dataframe + + +def test_plot_line(): + dataset_path = constants.TAS_PATHS[0] + extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-point-cdo_') + + plot_path = constants.PLOTS_PATH / 'plot_line.png' + plot_path.unlink(missing_ok=True) + + with open_dataset(extraction_path) as ds: + df = to_dataframe(ds) + + chart = plot_line(df) + + assert chart.data.equals(df) + assert chart.encoding.x.shorthand == 'time:T' + assert chart.encoding.y.shorthand == 'tas:Q' + + save_plot(chart, plot_path) + + assert plot_path.is_file + + +def test_plot_line_nocf(): + dataset_path = constants.TAS_PATHS[0] + extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-point-cdo_') + + plot_path = constants.PLOTS_PATH / 'plot_line_nocf.png' + plot_path.unlink(missing_ok=True) + + with open_dataset(extraction_path, decode_cf=True) as ds: + df = to_dataframe(ds) + + chart = plot_line(df, x_type='Q') + + assert chart.data.equals(df) + assert chart.encoding.x.shorthand == 'time:Q' + assert chart.encoding.y.shorthand == 'tas:Q' + + save_plot(chart, plot_path) + + assert plot_path.is_file + + +def test_plot_line_empty(): + dataset_path = constants.TAS_PATHS[0] + extraction_path = constants.EXTRACTIONS_PATH / 
dataset_path.replace('_global_', '_select-point-cdo_') + + plot_path = constants.PLOTS_PATH / 'plot_line_empty.png' + plot_path.unlink(missing_ok=True) + + with open_dataset(extraction_path) as ds: + df = to_dataframe(ds) + df_empty = pd.DataFrame({ 'time': df['time'], 'tas': np.nan }) + + chart = plot_line(df, empty=True) + + assert chart.data.equals(df_empty) + assert chart.encoding.x.shorthand == 'time:T' + assert chart.encoding.y.shorthand == 'tas:Q' + + save_plot(chart, plot_path) + + assert plot_path.is_file + + +def test_plot_line_area(): + dataset_path = constants.TAS_PATHS[0] + extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-point-cdo_') + + plot_path = constants.PLOTS_PATH / 'plot_line_area.png' + plot_path.unlink(missing_ok=True) + + with open_dataset(extraction_path) as ds: + df = to_dataframe(ds) + df = compute_average(df, 'tas') + + chart = plot_line(df) + + assert chart.data.equals(df) + + mean, area = chart.layer + + assert mean.encoding.x.shorthand == 'year:T' + assert mean.encoding.y.shorthand == 'mean:Q' + + assert area.encoding.x.shorthand == 'year:T' + assert area.encoding.y.shorthand == 'lower:Q' + assert area.encoding.y2.shorthand == 'upper:Q' + + save_plot(chart, plot_path) + + assert plot_path.is_file + + +def test_plot_line_color(): + dataset_path = constants.TAS_PATHS[0] + extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-point-cdo_') + + plot_path = constants.PLOTS_PATH / 'plot_line_color.png' + plot_path.unlink(missing_ok=True) + + with open_dataset(extraction_path) as ds: + df = to_dataframe(ds) + df = compute_average(df, 'tas') + df = create_label(df, ('a', 'b', 'c')) + + chart = plot_line(df, color_scheme='viridis') + + assert chart.data.equals(df) + + mean, area = chart.layer + + assert mean.encoding.x.shorthand == 'year:T' + assert mean.encoding.y.shorthand == 'mean:Q' + + assert area.encoding.x.shorthand == 'year:T' + assert area.encoding.y.shorthand 
== 'lower:Q' + assert area.encoding.y2.shorthand == 'upper:Q' + + save_plot(chart, plot_path) + + assert plot_path.is_file + + +def test_plot_map(): + dataset_path = constants.TAS_PATHS[0] + extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-time-cdo_') \ + .replace('2015_2020', '20180101') + + plot_path = constants.PLOTS_PATH / 'plot_map.png' + plot_path.unlink(missing_ok=True) + + with open_dataset(extraction_path) as ds: + df = to_dataframe(ds) + chart = plot_map(df) + + assert chart.data.equals(df) + assert chart.encoding.x.shorthand == 'lon:Q' + assert chart.encoding.y.shorthand == 'lat:Q' + assert chart.encoding.color.shorthand == 'tas:Q' + + save_plot(chart, plot_path) + + assert plot_path.is_file + + +def test_plot_map_nocf(): + dataset_path = constants.TAS_PATHS[0] + extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-time-cdo_') \ + .replace('2015_2020', '20180101') + + plot_path = constants.PLOTS_PATH / 'plot_map_nocf.png' + plot_path.unlink(missing_ok=True) + + with open_dataset(extraction_path) as ds: + df = to_dataframe(ds) + chart = plot_map(df) + + assert chart.data.equals(df) + assert chart.encoding.x.shorthand == 'lon:Q' + assert chart.encoding.y.shorthand == 'lat:Q' + assert chart.encoding.color.shorthand == 'tas:Q' + + save_plot(chart, plot_path) + + assert plot_path.is_file + + +def test_plot_map_empty(): + dataset_path = constants.TAS_PATHS[0] + extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-time-cdo_') \ + .replace('2015_2020', '20180101') + + plot_path = constants.PLOTS_PATH / 'plot_map_empty.png' + plot_path.unlink(missing_ok=True) + + with open_dataset(extraction_path) as ds: + df = to_dataframe(ds) + df_empty = pd.DataFrame({ + 'lon': [], + 'lat': [] + }) + + chart = plot_map(df, empty=True) + + assert chart.data.equals(df_empty) + assert chart.encoding.x.shorthand == 'lon:Q' + assert chart.encoding.y.shorthand == 'lat:Q' + 
+ save_plot(chart, plot_path) + + assert plot_path.is_file + + +def test_plot_grid(): + dataset_paths = constants.TAS_PATHS + extraction_paths = [ + constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-point-cdo_') + for dataset_path in dataset_paths + ] + + plot_path = constants.PLOTS_PATH / 'plot_grid.png' + plot_path.unlink(missing_ok=True) + + dataframes = [] + for extraction_path in extraction_paths: + with open_dataset(extraction_path) as ds: + dataframes.append(to_dataframe(ds)) + + df_empty = pd.DataFrame({ 'time': dataframes[2]['time'], 'tas': np.nan }) + + parameters = { + 'ab': ('a', 'b'), + 'xy': ('x', 'y'), + } + + permutations = ( + ('a', 'x'), + ('a', 'y'), + ('b', 'x') + ) + + plots = {} + for permutation, df in zip(permutations, dataframes, strict=True): + plots[permutation] = plot_line(df) + + empty_plot = plot_line(df, empty=True) + + chart = plot_grid(parameters, plots, x='independent', empty_plot=empty_plot, layer=False) + + top, bottom = chart.vconcat + top_left, top_right = top.hconcat + bottom_left, bottom_right = bottom.hconcat + + assert top_left.data.equals(dataframes[0]) + assert top_right.data.equals(dataframes[1]) + assert bottom_left.data.equals(dataframes[2]) + assert bottom_right.data.equals(df_empty) + + for sub_chart in [top_left, top_right, bottom_left, bottom_right]: + assert sub_chart.resolve.scale.x == 'independent' + assert sub_chart.resolve.scale.y == 'shared' + + save_plot(chart, plot_path) + + assert plot_path.is_file + + +def test_save_index(): + index_path = constants.PLOTS_PATH / 'index.html' + index_path.unlink(missing_ok=True) + + save_index(index_path) + + assert index_path.is_file + + +def test_get_plot_title(): + permutation = ('a', 'b', 'c') + + assert get_plot_title(permutation) == { + "text": 'a Β· b Β· c', + "fontSize": 16, + "dy": -10 + } From 2cc1fbde22e9d60eb1182ba6e772452386b20e22 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 24 Nov 2025 18:25:10 +0100 Subject: [PATCH 080/162] 
Refactor tests --- .gitignore | 8 +- isimip_utils/tests/constants.py | 15 +-- isimip_utils/tests/test_extractions.py | 126 +++++++++++-------------- isimip_utils/tests/test_pandas.py | 4 +- isimip_utils/tests/test_patterns.py | 2 +- isimip_utils/tests/test_plot.py | 33 +++---- isimip_utils/tests/test_xarray.py | 8 +- testing/setup.py | 82 +++++----------- 8 files changed, 113 insertions(+), 165 deletions(-) diff --git a/.gitignore b/.gitignore index 408f225..553f72e 100644 --- a/.gitignore +++ b/.gitignore @@ -19,5 +19,9 @@ __pycache__/ /.coverage /htmlcov -/testing -!/testing/setup.py +/testing/datasets +/testing/extractions +/testing/output +/testing/plots +/testing/protocol +/testing/share diff --git a/isimip_utils/tests/constants.py b/isimip_utils/tests/constants.py index daef8cb..0d1fb53 100644 --- a/isimip_utils/tests/constants.py +++ b/isimip_utils/tests/constants.py @@ -10,13 +10,16 @@ LANDSEAMASK_PATH = "ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc" -TAS_PATHS = [ - "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc", - "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2021_2030.nc", - "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2031_2040.nc" +TAS_PATH = "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/" \ + "ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc" + +TAS_SPLIT_PATHS = [ + TAS_PATH.replace('2015_2020', specifiers) + for specifiers in ('2015_2016', '2017_2018', '2019_2020') ] -YIELD_PATH = "ISIMIP3a/OutputData/agriculture/LPJmL/gswp3-w5e5/historical/lpjml_gswp3-w5e5_obsclim_2015soc_default_yield-mai-noirr_global_annual-gs_1901_2016.nc" # noqa: E501 +YIELD_PATH = "ISIMIP3a/OutputData/agriculture/LPJmL/gswp3-w5e5/historical/" \ + 
"lpjml_gswp3-w5e5_obsclim_2015soc_default_yield-mai-noirr_global_annual-gs_1901_2016.nc" PROTOCOL_PATHS = [ "definitions/ISIMIP3a/OutputData/agriculture.json", @@ -29,7 +32,7 @@ PATTERN_PATH = 'ISIMIP3a/OutputData/agriculture.json' DATE = '2018-01-01' -PERIOD = ('2017-01-01', '2018-12-31') +PERIOD = ('2015-01-01', '2015-12-31') BBOX = (0, 10, -5, 5) diff --git a/isimip_utils/tests/test_extractions.py b/isimip_utils/tests/test_extractions.py index 6de6dff..8c7e7be 100644 --- a/isimip_utils/tests/test_extractions.py +++ b/isimip_utils/tests/test_extractions.py @@ -22,18 +22,17 @@ def test_select_time(decode_cf): date = constants.DATE - path = constants.TAS_PATHS[0] - dataset_path = constants.DATASETS_PATH / path - extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-time_') \ - .replace('2015_2020', '20180101') + dataset_path = constants.DATASETS_PATH / constants.TAS_PATH + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-time_') \ + .replace('2015_2020', '20180101') extraction_path.unlink(missing_ok=True) with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: ds = select_time(file_ds, datetime.strptime(date, "%Y-%m-%d")) write_dataset(ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-time-cdo_') \ - .replace('2015_2020', '20180101') + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-time-cdo_') \ + .replace('2015_2020', '20180101') helper.call(f'cdo diff {extraction_path} {cdo_path}') @@ -41,35 +40,33 @@ def test_select_time(decode_cf): def test_select_period(decode_cf): start_date, end_date = constants.PERIOD - path = constants.TAS_PATHS[0] - dataset_path = constants.DATASETS_PATH / path - extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-period_') \ - .replace('2015_2020', '2017_2018') + dataset_path = constants.DATASETS_PATH / constants.TAS_PATH + extraction_path = 
constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-period_') \ + .replace('2015_2020', '2015') extraction_path.unlink(missing_ok=True) with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: ds = select_period(file_ds, datetime.strptime(start_date, "%Y-%m-%d"), datetime.strptime(end_date, "%Y-%m-%d")) write_dataset(ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-period-cdo_') \ - .replace('2015_2020', '2017_2018') + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-period-cdo_') \ + .replace('2015_2020', '2015') helper.call(f'cdo diff {extraction_path} {cdo_path}') @pytest.mark.parametrize('decode_cf', (True, False)) -@pytest.mark.parametrize('path', constants.TAS_PATHS) -def test_select_point(decode_cf, path): +def test_select_point(decode_cf): lat, lon = constants.POINT - dataset_path = constants.DATASETS_PATH / path - extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-point_') + dataset_path = constants.DATASETS_PATH / constants.TAS_PATH + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-point_') extraction_path.unlink(missing_ok=True) with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: ds = select_point(file_ds, lat, lon) write_dataset(ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-point-cdo_') + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-point-cdo_') helper.call(f'cdo diff {extraction_path} {cdo_path}') @@ -78,38 +75,35 @@ def test_select_point_concat(decode_cf): lat, lon = constants.POINT extraction_ds = None - for path in constants.TAS_PATHS: + for path in constants.TAS_SPLIT_PATHS: dataset_path = constants.DATASETS_PATH / path with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: ds = select_point(file_ds, lat, lon) extraction_ds = 
concat_extraction(extraction_ds, ds) - extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-point_') \ - .replace('2031_2040', '2015_2040') + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-point_') extraction_path.unlink(missing_ok=True) write_dataset(extraction_ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-point-cdo_') \ - .replace('2031_2040', '2015_2040') + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-point-cdo_') helper.call(f'cdo diff {extraction_path} {cdo_path}') @pytest.mark.parametrize('decode_cf', (True, False)) -@pytest.mark.parametrize('path', constants.TAS_PATHS) -def test_select_bbox(decode_cf, path): +def test_select_bbox(decode_cf): west, east, south, north = constants.BBOX - dataset_path = constants.DATASETS_PATH / path - extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox_') + dataset_path = constants.DATASETS_PATH / constants.TAS_PATH + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox_') extraction_path.unlink(missing_ok=True) with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: ds = select_bbox(file_ds, west, east, south, north) write_dataset(ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-cdo_') + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox-cdo_') helper.call(f'cdo diff {extraction_path} {cdo_path}') @@ -118,38 +112,35 @@ def test_select_bbox_concat(decode_cf): west, east, south, north = constants.BBOX extraction_ds = None - for path in constants.TAS_PATHS: + for path in constants.TAS_SPLIT_PATHS: dataset_path = constants.DATASETS_PATH / path with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: ds = select_bbox(file_ds, west, east, south, north) extraction_ds = 
concat_extraction(extraction_ds, ds) - extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox_') \ - .replace('2031_2040', '2015_2040') + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox_') extraction_path.unlink(missing_ok=True) write_dataset(extraction_ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-cdo_') \ - .replace('2031_2040', '2015_2040') + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox-cdo_') helper.call(f'cdo diff {extraction_path} {cdo_path}') @pytest.mark.parametrize('decode_cf', (True, False)) -@pytest.mark.parametrize('path', constants.TAS_PATHS) -def test_mask_bbox(decode_cf, path): +def test_mask_bbox(decode_cf): west, east, south, north = constants.BBOX - dataset_path = constants.DATASETS_PATH / path - extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-bbox_') + dataset_path = constants.DATASETS_PATH / constants.TAS_PATH + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_mask-bbox_') extraction_path.unlink(missing_ok=True) with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: ds = mask_bbox(file_ds, west, east, south, north) write_dataset(ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-bbox-cdo_') + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_mask-bbox-cdo_') helper.call(f'cdo diff {extraction_path} {cdo_path}') @@ -158,39 +149,36 @@ def test_mask_bbox_concat(decode_cf): west, east, south, north = constants.BBOX extraction_ds = None - for path in constants.TAS_PATHS: + for path in constants.TAS_SPLIT_PATHS: dataset_path = constants.DATASETS_PATH / path with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: ds = mask_bbox(file_ds, west, east, south, north) extraction_ds = 
concat_extraction(extraction_ds, ds) - extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-bbox_') \ - .replace('2031_2040', '2015_2040') + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_mask-bbox_') extraction_path.unlink(missing_ok=True) write_dataset(extraction_ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-bbox-cdo_') \ - .replace('2031_2040', '2015_2040') + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_mask-bbox-cdo_') helper.call(f'cdo diff {extraction_path} {cdo_path}') @pytest.mark.parametrize('decode_cf', (True, False)) -@pytest.mark.parametrize('path', constants.TAS_PATHS) -def test_mask_mask(decode_cf, path): +def test_mask_mask(decode_cf): mask_path = constants.DATASETS_PATH / constants.LANDSEAMASK_PATH mask_ds = open_dataset(mask_path) - dataset_path = constants.DATASETS_PATH / path - extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-mask_') + dataset_path = constants.DATASETS_PATH / constants.TAS_PATH + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_mask-mask_') extraction_path.unlink(missing_ok=True) with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: ds = mask_mask(file_ds, mask_ds) write_dataset(ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-mask-cdo_') + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_mask-mask-cdo_') helper.call(f'cdo diff {extraction_path} {cdo_path}') @@ -200,34 +188,31 @@ def test_mask_mask_concat(decode_cf): mask_ds = open_dataset(mask_path) extraction_ds = None - for path in constants.TAS_PATHS: + for path in constants.TAS_SPLIT_PATHS: dataset_path = constants.DATASETS_PATH / path with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: ds = mask_mask(file_ds, mask_ds) extraction_ds = 
concat_extraction(extraction_ds, ds) - extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-mask_') \ - .replace('2031_2040', '2015_2040') + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_mask-mask_') extraction_path.unlink(missing_ok=True) write_dataset(extraction_ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-mask-cdo_') \ - .replace('2031_2040', '2015_2040') + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_mask-mask-cdo_') helper.call(f'cdo diff {extraction_path} {cdo_path}') @pytest.mark.parametrize('decode_cf', (True, False)) -@pytest.mark.parametrize('path', constants.TAS_PATHS) -def test_compute_spatial_average(decode_cf, path): +def test_compute_spatial_average(decode_cf): gridarea_path = constants.SHARE_PATH / 'gridarea.nc' gridarea_ds = open_dataset(gridarea_path) west, east, south, north = constants.BBOX - dataset_path = constants.DATASETS_PATH / path - extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-mean_') + dataset_path = constants.DATASETS_PATH / constants.TAS_PATH + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox-mean_') extraction_path.unlink(missing_ok=True) with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: @@ -237,7 +222,7 @@ def test_compute_spatial_average(decode_cf, path): ds = set_attrs(ds, attrs) write_dataset(ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-mean-cdo_') + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox-mean-cdo_') helper.call(f'cdo diff {extraction_path} {cdo_path}') @@ -249,7 +234,7 @@ def test_compute_spatial_average_concat(decode_cf): west, east, south, north = constants.BBOX extraction_ds = None - for path in constants.TAS_PATHS: + for path in constants.TAS_SPLIT_PATHS: 
dataset_path = constants.DATASETS_PATH / path with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: @@ -259,24 +244,21 @@ def test_compute_spatial_average_concat(decode_cf): ds = set_attrs(ds, attrs) extraction_ds = concat_extraction(extraction_ds, ds) - extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-mean_') \ - .replace('2031_2040', '2015_2040') + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox-mean_') extraction_path.unlink(missing_ok=True) write_dataset(extraction_ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-mean-cdo_') \ - .replace('2031_2040', '2015_2040') + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox-mean-cdo_') helper.call(f'cdo diff {extraction_path} {cdo_path}') @pytest.mark.parametrize('decode_cf', (True, False)) -@pytest.mark.parametrize('path', constants.TAS_PATHS) -def test_compute_temporal_average(decode_cf, path): +def test_compute_temporal_average(decode_cf): west, east, south, north = constants.BBOX - dataset_path = constants.DATASETS_PATH / path - extraction_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-map_') + dataset_path = constants.DATASETS_PATH / constants.TAS_PATH + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox-map_') extraction_path.unlink(missing_ok=True) with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: @@ -286,14 +268,13 @@ def test_compute_temporal_average(decode_cf, path): ds = set_attrs(ds, attrs) write_dataset(ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-map-cdo_') + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox-map-cdo_') helper.call(f'cdo diff,abslim=0.001 {extraction_path} {cdo_path}') @pytest.mark.parametrize('decode_cf', 
(True, False)) -@pytest.mark.parametrize('path', constants.TAS_PATHS) -def test_count_values(decode_cf, path): - dataset_path = constants.DATASETS_PATH / path +def test_count_values(decode_cf): + dataset_path = constants.DATASETS_PATH / constants.TAS_PATH with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: ds = count_values(file_ds) @@ -301,11 +282,10 @@ def test_count_values(decode_cf, path): @pytest.mark.parametrize('decode_cf', (True, False)) -@pytest.mark.parametrize('path', constants.TAS_PATHS) -def test_count_values_mask(decode_cf, path): +def test_count_values_mask(decode_cf): west, east, south, north = constants.BBOX - dataset_path = constants.DATASETS_PATH / path + dataset_path = constants.DATASETS_PATH / constants.TAS_PATH with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: ds = mask_bbox(file_ds, west, east, south, north) diff --git a/isimip_utils/tests/test_pandas.py b/isimip_utils/tests/test_pandas.py index 98df640..d69c880 100644 --- a/isimip_utils/tests/test_pandas.py +++ b/isimip_utils/tests/test_pandas.py @@ -21,8 +21,8 @@ from isimip_utils.xarray import open_dataset, to_dataframe extractions = { - 'bbox': constants.TAS_PATHS[0].replace('_global_', '_select-bbox-cdo_'), - 'point': constants.TAS_PATHS[0].replace('_global_', '_select-point-cdo_') + 'bbox': constants.TAS_PATH.replace('_global_', '_select-bbox-cdo_'), + 'point': constants.TAS_PATH.replace('_global_', '_select-point-cdo_') } @pytest.mark.parametrize('extraction,result', [ diff --git a/isimip_utils/tests/test_patterns.py b/isimip_utils/tests/test_patterns.py index 27c5b2f..d0df25f 100644 --- a/isimip_utils/tests/test_patterns.py +++ b/isimip_utils/tests/test_patterns.py @@ -100,7 +100,7 @@ def test_match_path_specifiers_map(): def test_find_files(): file_path = Path(constants.YIELD_PATH) - files = [file_path.name] + [Path(path).name for path in constants.TAS_PATHS] + files = [file_path.name] + [file_path.name.replace('_global_', s) for s in ('a', 'b', 'c')] 
pattern = fetch_pattern(pattern_path, protocol_locations) result = find_files(pattern['file'], files) diff --git a/isimip_utils/tests/test_plot.py b/isimip_utils/tests/test_plot.py index fd2a2d2..f0997d1 100644 --- a/isimip_utils/tests/test_plot.py +++ b/isimip_utils/tests/test_plot.py @@ -9,8 +9,7 @@ def test_plot_line(): - dataset_path = constants.TAS_PATHS[0] - extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-point-cdo_') + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-point-cdo_') plot_path = constants.PLOTS_PATH / 'plot_line.png' plot_path.unlink(missing_ok=True) @@ -30,8 +29,7 @@ def test_plot_line(): def test_plot_line_nocf(): - dataset_path = constants.TAS_PATHS[0] - extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-point-cdo_') + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-point-cdo_') plot_path = constants.PLOTS_PATH / 'plot_line_nocf.png' plot_path.unlink(missing_ok=True) @@ -51,8 +49,7 @@ def test_plot_line_nocf(): def test_plot_line_empty(): - dataset_path = constants.TAS_PATHS[0] - extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-point-cdo_') + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-point-cdo_') plot_path = constants.PLOTS_PATH / 'plot_line_empty.png' plot_path.unlink(missing_ok=True) @@ -73,8 +70,7 @@ def test_plot_line_empty(): def test_plot_line_area(): - dataset_path = constants.TAS_PATHS[0] - extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-point-cdo_') + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-point-cdo_') plot_path = constants.PLOTS_PATH / 'plot_line_area.png' plot_path.unlink(missing_ok=True) @@ -102,8 +98,7 @@ def test_plot_line_area(): def test_plot_line_color(): - dataset_path 
= constants.TAS_PATHS[0] - extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-point-cdo_') + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-point-cdo_') plot_path = constants.PLOTS_PATH / 'plot_line_color.png' plot_path.unlink(missing_ok=True) @@ -132,9 +127,8 @@ def test_plot_line_color(): def test_plot_map(): - dataset_path = constants.TAS_PATHS[0] - extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-time-cdo_') \ - .replace('2015_2020', '20180101') + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-time-cdo_') \ + .replace('2015_2020', '20180101') plot_path = constants.PLOTS_PATH / 'plot_map.png' plot_path.unlink(missing_ok=True) @@ -154,9 +148,8 @@ def test_plot_map(): def test_plot_map_nocf(): - dataset_path = constants.TAS_PATHS[0] - extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-time-cdo_') \ - .replace('2015_2020', '20180101') + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-time-cdo_') \ + .replace('2015_2020', '20180101') plot_path = constants.PLOTS_PATH / 'plot_map_nocf.png' plot_path.unlink(missing_ok=True) @@ -176,9 +169,8 @@ def test_plot_map_nocf(): def test_plot_map_empty(): - dataset_path = constants.TAS_PATHS[0] - extraction_path = constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-time-cdo_') \ - .replace('2015_2020', '20180101') + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-time-cdo_') \ + .replace('2015_2020', '20180101') plot_path = constants.PLOTS_PATH / 'plot_map_empty.png' plot_path.unlink(missing_ok=True) @@ -202,10 +194,9 @@ def test_plot_map_empty(): def test_plot_grid(): - dataset_paths = constants.TAS_PATHS extraction_paths = [ constants.EXTRACTIONS_PATH / dataset_path.replace('_global_', '_select-point-cdo_') 
- for dataset_path in dataset_paths + for dataset_path in constants.TAS_SPLIT_PATHS ] plot_path = constants.PLOTS_PATH / 'plot_grid.png' diff --git a/isimip_utils/tests/test_xarray.py b/isimip_utils/tests/test_xarray.py index 38ebfb1..031efce 100644 --- a/isimip_utils/tests/test_xarray.py +++ b/isimip_utils/tests/test_xarray.py @@ -60,13 +60,13 @@ def test_init_dataset_args(): def test_open_dataset(): - with open_dataset(constants.DATASETS_PATH / constants.TAS_PATHS[0]) as ds: + with open_dataset(constants.DATASETS_PATH / constants.TAS_PATH) as ds: assert isinstance(ds, xr.Dataset) assert ds['time'].dtype.type == np.datetime64 def test_open_dataset_decode_cf_false(): - with open_dataset(constants.DATASETS_PATH / constants.TAS_PATHS[0], decode_cf=False) as ds: + with open_dataset(constants.DATASETS_PATH / constants.TAS_PATH, decode_cf=False) as ds: assert isinstance(ds, xr.Dataset) assert ds['time'].dtype.type == np.float64 @@ -114,7 +114,7 @@ def test_order_variables(): def test_get_attrs(): - with open_dataset(constants.DATASETS_PATH / constants.TAS_PATHS[0]) as ds: + with open_dataset(constants.DATASETS_PATH / constants.TAS_PATH) as ds: attrs = get_attrs(ds) assert attrs['lon']['long_name'] == 'Longitude' assert attrs['lat']['long_name'] == 'Latitude' @@ -122,7 +122,7 @@ def test_get_attrs(): def test_set_attrs(): - with open_dataset(constants.DATASETS_PATH / constants.TAS_PATHS[0]) as ds: + with open_dataset(constants.DATASETS_PATH / constants.TAS_PATH) as ds: attrs = get_attrs(ds) attrs['tas']['egg'] = 'spam' set_attrs(ds, attrs) diff --git a/testing/setup.py b/testing/setup.py index e6e7319..00d3c2c 100755 --- a/testing/setup.py +++ b/testing/setup.py @@ -5,7 +5,10 @@ def main(): download_datasets() download_protocol() + run_gridfile() + run_seldate() + run_select_time() run_select_period() run_select_point() @@ -19,7 +22,7 @@ def main(): def download_datasets(): constants.DATASETS_PATH.mkdir(parents=True, exist_ok=True) - for path in 
[constants.LANDSEAMASK_PATH, *constants.TAS_PATHS, constants.YIELD_PATH]: + for path in [constants.LANDSEAMASK_PATH, constants.TAS_PATH, constants.YIELD_PATH]: file_path = constants.DATASETS_PATH / path file_path.parent.mkdir(parents=True, exist_ok=True) @@ -41,17 +44,29 @@ def download_protocol(): def run_gridfile(): - input_path = constants.DATASETS_PATH / constants.TAS_PATHS[0] + input_path = constants.DATASETS_PATH / constants.TAS_PATH output_path = constants.SHARE_PATH / 'gridarea.nc' output_path.parent.mkdir(parents=True, exist_ok=True) helper.call(f'cdo gridarea {input_path} {output_path}') +def run_seldate(): + input_path = constants.DATASETS_PATH / constants.TAS_PATH + + for start_date, end_date, specifiers in [ + ('2015-01-01', '2016-12-31', '2015_2016'), + ('2017-01-01', '2018-12-31', '2017_2018'), + ('2019-01-01', '2020-12-31', '2019_2020') + ]: + output_path = constants.DATASETS_PATH / constants.TAS_PATH.replace('2015_2020', specifiers) + helper.call(f'cdo -f nc4c -z zip_5 -L seldate,{start_date},{end_date} {input_path} {output_path}') + + def run_select_time(): date = constants.DATE - path = constants.TAS_PATHS[0] + path = constants.TAS_PATH input_path = constants.DATASETS_PATH / path @@ -66,12 +81,12 @@ def run_select_time(): def run_select_period(): start_date, end_date = constants.PERIOD - path = constants.TAS_PATHS[0] + path = constants.TAS_PATH input_path = constants.DATASETS_PATH / path output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-period-cdo_') \ - .replace('2015_2020', '2017_2018') + .replace('2015_2020', '2015') output_path.parent.mkdir(parents=True, exist_ok=True) output_path.unlink(missing_ok=True) @@ -85,7 +100,7 @@ def run_select_point(): ix, iy = ix + 1, iy + 1 output_paths = [] - for path in constants.TAS_PATHS: + for path in [constants.TAS_PATH, *constants.TAS_SPLIT_PATHS]: input_path = constants.DATASETS_PATH / path output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-point-cdo_') 
@@ -96,21 +111,12 @@ def run_select_point(): helper.call(f'cdo -f nc4c -z zip_5 -L -selindexbox,{ix},{ix},{iy},{iy} {input_path} {output_path}') - input_paths = ' '.join(output_paths) - - output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-point-cdo_') \ - .replace('2031_2040', '2015_2040') - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.unlink(missing_ok=True) - - helper.call(f'cdo -f nc4c -z zip_5 cat {input_paths} {output_path}') - def run_select_bbox(): west, east, south, north = constants.BBOX output_paths = [] - for path in constants.TAS_PATHS: + for path in [constants.TAS_PATH, *constants.TAS_SPLIT_PATHS]: input_path = constants.DATASETS_PATH / path output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-cdo_') @@ -121,21 +127,12 @@ def run_select_bbox(): helper.call(f'cdo -f nc4c -z zip_5 -L -sellonlatbox,{west},{east},{south},{north} {input_path} {output_path}') - input_paths = ' '.join(output_paths) - - output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-cdo_') \ - .replace('2031_2040', '2015_2040') - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.unlink(missing_ok=True) - - helper.call(f'cdo -f nc4c -z zip_5 cat {input_paths} {output_path}') - def run_select_bbox_mean(): west, east, south, north = constants.BBOX output_paths = [] - for path in constants.TAS_PATHS: + for path in [constants.TAS_PATH, *constants.TAS_SPLIT_PATHS]: input_path = constants.DATASETS_PATH / path output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-mean-cdo_') @@ -147,20 +144,11 @@ def run_select_bbox_mean(): helper.call('cdo -f nc4c -z zip_5 -L -fldmean ' \ f'-sellonlatbox,{west},{east},{south},{north} {input_path} {output_path}') - input_paths = ' '.join(output_paths) - - output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-mean-cdo_') \ - .replace('2031_2040', '2015_2040') - 
output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.unlink(missing_ok=True) - - helper.call(f'cdo -f nc4c -z zip_5 cat {input_paths} {output_path}') - def run_select_bbox_map(): west, east, south, north = constants.BBOX - for path in constants.TAS_PATHS: + for path in [constants.TAS_PATH, *constants.TAS_SPLIT_PATHS]: input_path = constants.DATASETS_PATH / path output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-map-cdo_') @@ -175,7 +163,7 @@ def run_mask_bbox(): west, east, south, north = constants.BBOX output_paths = [] - for path in constants.TAS_PATHS: + for path in [constants.TAS_PATH, *constants.TAS_SPLIT_PATHS]: input_path = constants.DATASETS_PATH / path output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-bbox-cdo_') @@ -186,21 +174,12 @@ def run_mask_bbox(): helper.call(f'cdo -f nc4c -z zip_5 -L -masklonlatbox,{west},{east},{south},{north} {input_path} {output_path}') - input_paths = ' '.join(output_paths) - - output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-bbox-cdo_') \ - .replace('2031_2040', '2015_2040') - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.unlink(missing_ok=True) - - helper.call(f'cdo -f nc4c -z zip_5 cat {input_paths} {output_path}') - def run_mask_mask(): mask_path = constants.DATASETS_PATH / constants.LANDSEAMASK_PATH output_paths = [] - for path in constants.TAS_PATHS: + for path in [constants.TAS_PATH, *constants.TAS_SPLIT_PATHS]: input_path = constants.DATASETS_PATH / path output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-mask-cdo_') @@ -211,15 +190,6 @@ def run_mask_mask(): helper.call(f'cdo -f nc4c -z zip_5 -L -ifthen -selname,mask {mask_path} {input_path} {output_path}') - input_paths = ' '.join(output_paths) - - output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-mask-cdo_') \ - .replace('2031_2040', '2015_2040') - output_path.parent.mkdir(parents=True, exist_ok=True) - 
output_path.unlink(missing_ok=True) - - helper.call(f'cdo -f nc4c -z zip_5 cat {input_paths} {output_path}') - if __name__ == "__main__": main() From 806fa0cee3a96d51a99f393ad0a3766e77048c6f Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 24 Nov 2025 19:13:45 +0100 Subject: [PATCH 081/162] Update .pre-commit-config.yaml --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2f909d1..a37beb0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,12 +12,12 @@ repos: - id: debug-statements - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.13.3 + rev: v0.14.6 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] - repo: https://github.com/crate-ci/typos - rev: v1 + rev: v1.39.2 hooks: - id: typos From 944123a01fd9ffc45ea6355d540cf3b7b5c4be1b Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 24 Nov 2025 19:13:59 +0100 Subject: [PATCH 082/162] Fix find_files --- isimip_utils/patterns.py | 8 ++++++-- isimip_utils/tests/test_patterns.py | 15 ++++++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/isimip_utils/patterns.py b/isimip_utils/patterns.py index b70c372..5654aef 100644 --- a/isimip_utils/patterns.py +++ b/isimip_utils/patterns.py @@ -63,7 +63,7 @@ def match_path(pattern: dict, path: Path, dirname_pattern_key: str = 'path', filename_pattern = pattern[filename_pattern_key] # match the dirname and the filename - dirname_path, dirname_specifiers = match_string(dirname_pattern, path.parent.as_posix()) + dirname_path, dirname_specifiers = match_string(dirname_pattern, str(path.parent)) filename_path, filename_specifiers = match_string(filename_pattern, path.name) path = dirname_path / filename_path @@ -171,7 +171,11 @@ def find_files(pattern: re.Pattern, file_iter: Iterable[Path]) -> list[dict]: files = [] for path in sorted(file_iter): try: - files.append(match_string(pattern, path)) + _, specifiers = 
match_string(pattern, str(path)) + files.append({ + 'path': path, + **specifiers + }) except DidNotMatch: pass diff --git a/isimip_utils/tests/test_patterns.py b/isimip_utils/tests/test_patterns.py index d0df25f..b0ae7d2 100644 --- a/isimip_utils/tests/test_patterns.py +++ b/isimip_utils/tests/test_patterns.py @@ -100,12 +100,17 @@ def test_match_path_specifiers_map(): def test_find_files(): file_path = Path(constants.YIELD_PATH) - files = [file_path.name] + [file_path.name.replace('_global_', s) for s in ('a', 'b', 'c')] + files = [ + file_path.name, + file_path.name.replace('_global_', 'a'), + file_path.name.replace('_global_', 'b'), + file_path.name.replace('_global_', 'c') + ] pattern = fetch_pattern(pattern_path, protocol_locations) result = find_files(pattern['file'], files) assert len(result) - assert result == [( - Path(file_path.name), - file_specifiers - )] + assert result == [{ + 'path': file_path.name, + **file_specifiers + }] From 6601ac5b8cdd6f2f701e58a4728bfa8805da5060 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 24 Nov 2025 19:14:24 +0100 Subject: [PATCH 083/162] User first data var by default in pandas.py --- isimip_utils/pandas.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/isimip_utils/pandas.py b/isimip_utils/pandas.py index 1177df6..245ab80 100644 --- a/isimip_utils/pandas.py +++ b/isimip_utils/pandas.py @@ -140,6 +140,7 @@ def compute_average(df: pd.DataFrame, data_var: None | str = None, area: bool = Args: df (pd.DataFrame): DataFrame with time column and data variable. + data_var (str): Name of the data variable (default: first data var). area (bool): Whether to include lower/upper bounds using std (default: True). 
Returns: @@ -178,15 +179,18 @@ def compute_average(df: pd.DataFrame, data_var: None | str = None, area: bool = return df -def group_by_day(df: pd.DataFrame, data_var: str) -> pd.DataFrame: +def group_by_day(df: pd.DataFrame, data_var: None | str = None) -> pd.DataFrame: """Group data by day of year and compute mean. Args: df (pd.DataFrame): DataFrame with time column and data variable. + data_var (str): Name of the data variable (default: first data var). Returns: DataFrame grouped by day of year (1-365/366). """ + data_var = data_var or get_first_data_var(df) + df['day'] = df['time'].dt.dayofyear df = df.groupby('day')[data_var].mean().reset_index() df.attrs['coords'] = {'day': { 'long_name': 'Day of the year'}} @@ -194,15 +198,18 @@ def group_by_day(df: pd.DataFrame, data_var: str) -> pd.DataFrame: return df -def group_by_month(df: pd.DataFrame, data_var: str) -> pd.DataFrame: +def group_by_month(df: pd.DataFrame, data_var: None | str = None) -> pd.DataFrame: """Group data by month and compute mean. Args: df (pd.DataFrame): DataFrame with time column and data variable. + data_var (str): Name of the data variable (default: first data var). Returns: DataFrame grouped by month (1-12). """ + data_var = data_var or get_first_data_var(df) + df['month'] = df['time'].dt.month df = df.groupby('month')[data_var].mean().reset_index() df.attrs['coords'] = {'month': {'long_name': 'Month of the year'}} @@ -210,15 +217,17 @@ def group_by_month(df: pd.DataFrame, data_var: str) -> pd.DataFrame: return df -def normalize(df: pd.DataFrame, data_var: str) -> pd.DataFrame: +def normalize(df: pd.DataFrame, data_var: None | str = None) -> pd.DataFrame: """Normalize data variable using z-score normalization. Args: df (pd.DataFrame): DataFrame with data variable to normalize. + data_var (str): Name of the data variable (default: first data var). Returns: DataFrame with normalized data variable (mean=0, std=1). 
""" + data_var = data_var or get_first_data_var(df) data_var_long_name = df.attrs['data_vars'][data_var].get('long_name') mean, std = df[data_var].mean(), df[data_var].std() From 2f75e2347f393ddcf33190b26fe5ac6ec9de621e Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 24 Nov 2025 19:39:53 +0100 Subject: [PATCH 084/162] Fix log messages --- isimip_utils/extractions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py index 16999dc..0032985 100644 --- a/isimip_utils/extractions.py +++ b/isimip_utils/extractions.py @@ -100,7 +100,7 @@ def select_bbox(ds: xr.Dataset, west: float, east: float, south: float, north: f ValidationError: If coordinates are out of valid range. ExtractionError: If no lat or lon axis remains after selection. """ - logger.info(f'cutout bbox west={west} east={east} south={south} east={north}') + logger.info(f'select bbox west={west} east={east} south={south} north={north}') validate_lat(south) validate_lat(north) @@ -136,7 +136,7 @@ def mask_bbox(ds: xr.Dataset, west: float, east: float, south: float, north: flo Raises: ValidationError: If coordinates are out of valid range. 
""" - logger.info(f'cutout bbox west={west} east={east} south={south} east={north}') + logger.info(f'mask bbox west={west} east={east} south={south} north={north}') validate_lat(south) validate_lat(north) From 1ff36769fea6f768f873cc0f3b208ff00258a9c4 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 25 Nov 2025 12:43:56 +0100 Subject: [PATCH 085/162] Split setup.py and download.py --- testing/download.py | 35 +++++++++++++++++++++++++++++++++++ testing/setup.py | 3 --- 2 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 testing/download.py diff --git a/testing/download.py b/testing/download.py new file mode 100644 index 0000000..cb1eb15 --- /dev/null +++ b/testing/download.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +from isimip_utils.tests import constants, helper + + +def main(): + download_datasets() + download_protocol() + + +def download_datasets(): + constants.DATASETS_PATH.mkdir(parents=True, exist_ok=True) + + for path in [constants.LANDSEAMASK_PATH, constants.TAS_PATH, constants.YIELD_PATH]: + file_path = constants.DATASETS_PATH / path + file_path.parent.mkdir(parents=True, exist_ok=True) + + url = f"https://files.isimip.org/{path}" + + helper.call(f'wget -c {url} -O {file_path}') + + +def download_protocol(): + constants.PROTOCOL_PATH.mkdir(parents=True, exist_ok=True) + + for path in constants.PROTOCOL_PATHS: + file_path = constants.PROTOCOL_PATH / path + file_path.parent.mkdir(parents=True, exist_ok=True) + + url = f"https://protocol.isimip.org/{path}" + + helper.call(f'wget -c {url} -O {file_path}') + + +if __name__ == "__main__": + main() diff --git a/testing/setup.py b/testing/setup.py index 00d3c2c..15498d8 100755 --- a/testing/setup.py +++ b/testing/setup.py @@ -3,9 +3,6 @@ def main(): - download_datasets() - download_protocol() - run_gridfile() run_seldate() From 0893954aa8c65b2142ae23e1adeedfd0f8c56f94 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 26 Nov 2025 14:57:02 +0100 Subject: [PATCH 086/162] Fix dependencies 
--- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 642a806..436accb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ shapes = [ "rioxarray", ] xarray = [ + "cftime", "xarray" ] dev = [ From 3218f30f6e32c440f4416b1f784810255379b9da Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 26 Nov 2025 19:28:06 +0100 Subject: [PATCH 087/162] Refactor xarray.init_dataset --- isimip_utils/tests/helper.py | 14 +- isimip_utils/tests/test_xarray.py | 312 ++++++++++++++++++++++++++++-- isimip_utils/xarray.py | 151 +++++++++------ 3 files changed, 394 insertions(+), 83 deletions(-) diff --git a/isimip_utils/tests/helper.py b/isimip_utils/tests/helper.py index e3886b6..5d78f99 100644 --- a/isimip_utils/tests/helper.py +++ b/isimip_utils/tests/helper.py @@ -1,6 +1,16 @@ -from subprocess import check_call +import re +import subprocess def call(cmd): print(cmd) - check_call(cmd, shell=True) + return subprocess.check_output(cmd, shell=True).decode() + + +def normalize_whitespace(string): + return re.sub(r'\s+', ' ', string).strip() + + +def assert_multiline_strings_equal(a, b): + for a_line, b_line in zip(a.strip().splitlines(), b.strip().splitlines(), strict=True): + assert normalize_whitespace(a_line) == normalize_whitespace(b_line) diff --git a/isimip_utils/tests/test_xarray.py b/isimip_utils/tests/test_xarray.py index 031efce..45ed1b6 100644 --- a/isimip_utils/tests/test_xarray.py +++ b/isimip_utils/tests/test_xarray.py @@ -1,18 +1,23 @@ +from datetime import timedelta + +import cftime import geopandas as gpd import numpy as np +import pandas as pd import xarray as xr from shapely.geometry import box from isimip_utils.netcdf import open_dataset_read -from isimip_utils.tests import constants +from isimip_utils.tests import constants, helper from isimip_utils.xarray import ( - add_fill_value_to_attrs, + add_fill_value_to_data_vars, create_mask, get_attrs, init_dataset, load_dataset, open_dataset, 
order_variables, + remove_fill_value_from_coords, set_attrs, set_fill_value_to_nan, set_nan_to_fill_value, @@ -28,6 +33,32 @@ def test_init_dataset(): assert ds.sizes['lon'] == 720 assert ds.sizes['lat'] == 360 + test_path = constants.OUTPUT_PATH / 'test.nc' + test_path.unlink(missing_ok=True) + + write_dataset(ds, test_path) + + output = helper.call(f'ncdump -h {test_path}') + + helper.assert_multiline_strings_equal(output, ''' +netcdf test { +dimensions: + lon = 720 ; + lat = 360 ; +variables: + double lon(lon) ; + lon:standard_name = "longitude" ; + lon:long_name = "Longitude" ; + lon:units = "degrees_east" ; + lon:axis = "X" ; + double lat(lat) ; + lat:standard_name = "latitude" ; + lat:long_name = "Latitude" ; + lat:units = "degrees_north" ; + lat:axis = "Y" ; +} +''') + def test_init_dataset_args(): lon_size, lat_size, time_size = 180, 90, 10 @@ -35,29 +66,264 @@ def test_init_dataset_args(): time = np.arange(time_size, dtype=np.float64) var = np.random.rand(time_size, lat_size, lon_size).astype(np.float64) - time_units = 'days since 2000-01-01 00:00:00' - time_calendar = '365_day' - attrs = { 'var': { 'long_name': 'Variable' + }, + 'time': { + 'calendar': '365_day', + 'units': 'days since 2000-01-01 00:00:00' } } - ds = init_dataset(lon=lon_size, lat=lat_size, time=time, - time_units=time_units, time_calendar=time_calendar, - attrs=attrs, var=var) + ds = init_dataset(lon=lon_size, lat=lat_size, time=time, attrs=attrs, var=var) assert isinstance(ds, xr.Dataset) assert ds.sizes['lon'] == lon_size assert ds.sizes['lat'] == lat_size - assert ds['time'].units == time_units - assert ds['time'].calendar == time_calendar + assert ds['time'].units == attrs['time']['units'] + assert ds['time'].calendar == attrs['time']['calendar'] assert np.array_equal(ds['var'].values, var) assert ds['var'].long_name == attrs['var']['long_name'] + test_path = constants.OUTPUT_PATH / 'test.nc' + test_path.unlink(missing_ok=True) + + write_dataset(ds, test_path) + + output = 
helper.call(f'ncdump -h {test_path}') + + helper.assert_multiline_strings_equal(output, ''' +netcdf test { +dimensions: + time = UNLIMITED ; // (10 currently) + lon = 180 ; + lat = 90 ; +variables: + double lon(lon) ; + lon:standard_name = "longitude" ; + lon:long_name = "Longitude" ; + lon:units = "degrees_east" ; + lon:axis = "X" ; + double lat(lat) ; + lat:standard_name = "latitude" ; + lat:long_name = "Latitude" ; + lat:units = "degrees_north" ; + lat:axis = "Y" ; + double time(time) ; + time:standard_name = "time" ; + time:long_name = "Time" ; + time:calendar = "365_day" ; + time:units = "days since 2000-01-01 00:00:00" ; + time:axis = "T" ; + double var(time, lat, lon) ; + var:_FillValue = 1.e+20 ; + var:long_name = "Variable" ; + var:missing_value = 1.e+20 ; +} +''') + + +def test_init_dataset_datetime(): + calendar = 'proleptic_gregorian' + units = 'days since 2000-01-01 00:00:00' + + start_day = cftime.datetime(2000, 1, 1, calendar=calendar) + end_day = cftime.datetime(2000, 12, 31, calendar=calendar) + + time = np.array([start_day + timedelta(days=i) for i in range((end_day - start_day).days + 1)], dtype=object) + var = np.random.rand(time.size, 360, 720).astype(np.float64) + + attrs = { + 'var': { + 'long_name': 'Variable' + }, + 'time': { + 'calendar': calendar, + 'units': units + } + } + + ds = init_dataset(time=time, attrs=attrs, var=var) + + assert isinstance(ds, xr.Dataset) + + assert np.array_equal(cftime.num2date(ds['time'], calendar=calendar, units=units), time) + assert ds['time'].units == attrs['time']['units'] + assert ds['time'].calendar == attrs['time']['calendar'] + + assert np.array_equal(ds['var'].values, var) + assert ds['var'].long_name == attrs['var']['long_name'] + + test_path = constants.OUTPUT_PATH / 'test.nc' + test_path.unlink(missing_ok=True) + + write_dataset(ds, test_path) + + output = helper.call(f'ncdump -h {test_path}') + + helper.assert_multiline_strings_equal(output, ''' +netcdf test { +dimensions: + time = UNLIMITED ; // 
(366 currently) + lon = 720 ; + lat = 360 ; +variables: + double lon(lon) ; + lon:standard_name = "longitude" ; + lon:long_name = "Longitude" ; + lon:units = "degrees_east" ; + lon:axis = "X" ; + double lat(lat) ; + lat:standard_name = "latitude" ; + lat:long_name = "Latitude" ; + lat:units = "degrees_north" ; + lat:axis = "Y" ; + double time(time) ; + time:standard_name = "time" ; + time:long_name = "Time" ; + time:calendar = "proleptic_gregorian" ; + time:units = "days since 2000-01-01 00:00:00" ; + time:axis = "T" ; + double var(time, lat, lon) ; + var:_FillValue = 1.e+20 ; + var:long_name = "Variable" ; + var:missing_value = 1.e+20 ; +} +''') + + +def test_init_dataset_datetime64(): + time = pd.date_range(start='2000-01-01', end='2000-12-31', freq='D') + var = np.random.rand(time.size, 360, 720).astype(np.float64) + + attrs = { + 'var': { + 'long_name': 'Variable' + }, + 'time': { + 'units': 'days since 2000-01-01 00:00:00' + } + } + + ds = init_dataset(time=time, attrs=attrs, var=var) + + assert isinstance(ds, xr.Dataset) + + # assert np.array_equal(ds['time'], time.astype('O')) + assert ds['time'].units == attrs['time']['units'] + assert ds['time'].calendar == "proleptic_gregorian" + + assert np.array_equal(ds['var'].values, var) + assert ds['var'].long_name == attrs['var']['long_name'] + + test_path = constants.OUTPUT_PATH / 'test.nc' + test_path.unlink(missing_ok=True) + + write_dataset(ds, test_path) + + output = helper.call(f'ncdump -h {test_path}') + + helper.assert_multiline_strings_equal(output, ''' +netcdf test { +dimensions: + time = UNLIMITED ; // (366 currently) + lon = 720 ; + lat = 360 ; +variables: + double lon(lon) ; + lon:standard_name = "longitude" ; + lon:long_name = "Longitude" ; + lon:units = "degrees_east" ; + lon:axis = "X" ; + double lat(lat) ; + lat:standard_name = "latitude" ; + lat:long_name = "Latitude" ; + lat:units = "degrees_north" ; + lat:axis = "Y" ; + double time(time) ; + time:standard_name = "time" ; + time:long_name = 
"Time" ; + time:calendar = "proleptic_gregorian" ; + time:units = "days since 2000-01-01 00:00:00" ; + time:axis = "T" ; + double var(time, lat, lon) ; + var:_FillValue = 1.e+20 ; + var:long_name = "Variable" ; + var:missing_value = 1.e+20 ; +} +''') + + +def test_init_dataset_extra_dims(): + a = np.arange(0, 10, dtype=np.float64) + b = np.arange(0, 10, dtype=np.float64) + var = np.random.rand(b.size, a.size, 360, 720).astype(np.float64) + + attrs = { + 'var': { + 'long_name': 'Variable' + }, + 'a': { + 'long_name': 'A Axis', + 'axis': 'A' + }, + 'b': { + 'long_name': 'B Axis', + 'axis': 'B' + } + } + + ds = init_dataset(extra_dims=('b', 'a'), attrs=attrs, a=a, b=b, var=var) + + assert isinstance(ds, xr.Dataset) + + assert ds['a'].long_name == attrs['a']['long_name'] + assert ds['b'].long_name == attrs['b']['long_name'] + + assert np.array_equal(ds['var'].values, var) + assert ds['var'].long_name == attrs['var']['long_name'] + + test_path = constants.OUTPUT_PATH / 'test.nc' + test_path.unlink(missing_ok=True) + + write_dataset(ds, test_path) + + output = helper.call(f'ncdump -h {test_path}') + + helper.assert_multiline_strings_equal(output, ''' +netcdf test { +dimensions: + lon = 720 ; + lat = 360 ; + b = 10 ; + a = 10 ; +variables: + double lon(lon) ; + lon:standard_name = "longitude" ; + lon:long_name = "Longitude" ; + lon:units = "degrees_east" ; + lon:axis = "X" ; + double lat(lat) ; + lat:standard_name = "latitude" ; + lat:long_name = "Latitude" ; + lat:units = "degrees_north" ; + lat:axis = "Y" ; + double b(b) ; + b:long_name = "B Axis" ; + b:axis = "B" ; + double a(a) ; + a:long_name = "A Axis" ; + a:axis = "A" ; + double var(b, a, lat, lon) ; + var:_FillValue = 1.e+20 ; + var:long_name = "Variable" ; + var:missing_value = 1.e+20 ; +} +''') + def test_open_dataset(): with open_dataset(constants.DATASETS_PATH / constants.TAS_PATH) as ds: @@ -83,14 +349,6 @@ def test_load_dataset(): assert isinstance(ds, xr.Dataset) -def test_write_dataset(): - test_path = 
constants.OUTPUT_PATH / 'test.nc' - test_path.unlink(missing_ok=True) - - ds = init_dataset() - write_dataset(ds, test_path) - - def test_order_variables(): test_path = constants.OUTPUT_PATH / 'test.nc' test_path.unlink(missing_ok=True) @@ -129,7 +387,20 @@ def test_set_attrs(): assert attrs['tas']['egg'] == 'spam' -def test_add_fill_value_to_attrs(): +def test_remove_fill_value_from_coords(): + ds = xr.Dataset( + coords={ + 'time': np.arange(10, dtype=np.float64) + }, + data_vars={ + 'var': (['time'], np.ones(10)) + } + ) + remove_fill_value_from_coords(ds) + assert '_FillValue' not in ds['time'] + + +def test_add_fill_value_to_data_vars(): ds = xr.Dataset( coords={ 'time': np.arange(10, dtype=np.float64) @@ -138,8 +409,7 @@ def test_add_fill_value_to_attrs(): 'var': (['time'], np.ones(10)) } ) - add_fill_value_to_attrs(ds) - assert ds['time'].attrs['_FillValue'] == 1e20 + add_fill_value_to_data_vars(ds) assert ds['var'].attrs['_FillValue'] == 1e20 assert ds['var'].attrs['missing_value'] == 1e20 diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index b270b79..ef636cf 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -10,87 +10,106 @@ logger = logging.getLogger(__name__) +default_attrs = { + 'lon': { + 'standard_name': 'longitude', + 'long_name': 'Longitude', + 'units': 'degrees_east', + 'axis': 'X' + }, + 'lat': { + 'standard_name': 'latitude', + 'long_name': 'Latitude', + 'units': 'degrees_north', + 'axis': 'Y' + }, + 'time': { + 'standard_name': 'time', + 'long_name': 'Time', + 'calendar': 'proleptic_gregorian', + 'units': 'days since 1601-1-1 00:00:00', + 'axis': 'T' + } +} -def init_dataset(lon: int = 720, lat: int = 360, time: np.ndarray | None = None, - time_units: str = 'days since 1601-1-1 00:00:00', - time_calendar: str = 'proleptic_gregorian', - attrs: None | dict = None, **variables: np.ndarray) -> xr.Dataset: + +def init_dataset(lon: None | int = 720, lat: None | int = 360, time: np.ndarray | None = None, + attrs: None | 
dict = None, extra_dims: None | list = None, **variables: np.ndarray) -> xr.Dataset: """Initialize a new xarray dataset with standard ISIMIP dimensions. Args: - lon (int): Number of longitude points (default: 720). - lat (int): Number of latitude points (default: 360). + lon (int): Number of longitude points, or None to omit (default: 720). + lat (int): Number of latitude points, or None to omit (default: 360). time (np.ndarray): Time coordinate array, or None to omit time dimension (default: None). - time_units (str): Units for the time coordinate (default: 'days since 1601-1-1 00:00:00'). - time_calendar (str): Calendar type for time coordinate (default: 'proleptic_gregorian'). attrs (dict): Dictionary of attributes for variables and global attributes. + extra_dims (list): List of extra dimensions (besides lat, lon, time). **variables (np.ndarray): Data variables to include in the dataset. Returns: Initialized xarray Dataset with coordinates and data variables. """ + # combine attrs + attrs = { + key: {**default_attrs.get(key, {}), **(attrs or {}).get(key, {})} + for key in {*default_attrs.keys(), *(attrs or {}).keys()} + } - # create coordinates - dims = ('lat', 'lon') - coords = {} + # create list of dimensions + dims = list(extra_dims) if extra_dims else [] if time is not None: - dims = ('time', 'lat', 'lon') - coords['time'] = time - - lon_delta = 360.0 / lon - lat_delta = 180.0 / lat + dims.append('time') + if lat: + dims.append('lat') + if lon: + dims.append('lon') - coords['lon'] = np.arange(-180 + 0.5 * lon_delta, 180, lon_delta) - coords['lat'] = np.arange(90 - 0.5 * lat_delta, -90, -lat_delta) + # create coords + coords = {} + if lon is not None: + delta = 360.0 / lon + coords['lon'] = np.arange(-180 + 0.5 * delta, 180, delta) + if lat is not None: + delta = 180.0 / lat + coords['lat'] = np.arange(90 - 0.5 * delta, -90, -delta) + if time is not None: + if time.dtype == object: + coords['time'] = cftime.date2num( + time, 
calendar=attrs['time']['calendar'], units=attrs['time']['units'] + ).astype(np.float64) + elif np.issubdtype(time.dtype, np.datetime64): + coords['time'] = cftime.date2num( + time.to_pydatetime(), calendar=attrs['time']['calendar'], units=attrs['time']['units'] + ).astype(np.float64) + else: + coords['time'] = time.astype(np.float64) + if extra_dims: + for extra_dim in extra_dims: + coords[extra_dim] = variables[extra_dim] # create data variables data_vars = { var_name: (dims, var) for var_name, var in variables.items() + if extra_dims is None or var_name not in extra_dims } # create dataset ds = xr.Dataset(coords=coords, data_vars=data_vars) - # set time attributes if time is set - if time is not None: - ds.coords['time'].attrs = { - 'standard_name': 'time', - 'long_name': 'Time', - 'units': time_units, - 'calendar': time_calendar, - 'axis': 'T', - '_FillValue': 1.e+20 - } - - # set lon attributes - ds.coords['lon'].attrs = { - 'standard_name': 'longitude', - 'long_name': 'Longitude', - 'units': 'degrees_east', - 'axis': 'X', - '_FillValue': 1.e+20 - } - - # set lon attributes - ds.coords['lat'].attrs = { - 'standard_name': 'latitude', - 'long_name': 'Latitude', - 'units': 'degrees_north', - 'axis': 'Y', - '_FillValue': 1.e+20 - } + # set attributes + if attrs: + for coord in ds.coords: + if coord in attrs: + ds.coords[coord].attrs.update(attrs[coord]) - # set variable attributes - for data_var in ds.data_vars: - if attrs: - if data_var in attrs: - ds.data_vars[data_var].attrs.update(attrs[data_var]) + for data_var in ds.data_vars: + if attrs: + if data_var in attrs: + ds.data_vars[data_var].attrs.update(attrs[data_var]) - ds.data_vars[data_var].attrs["_FillValue"] = 1.e+20 + ds.data_vars[data_var].attrs["_FillValue"] = 1.e+20 - # set global attributes - if attrs: + # set global attributes ds.attrs = attrs.get('global', {}) return ds @@ -167,7 +186,8 @@ def write_dataset(ds: xr.Dataset, path: str | Path): logger.info(f'write {path.absolute()}') - ds = 
add_fill_value_to_attrs(ds) + ds = remove_fill_value_from_coords(ds) + ds = add_fill_value_to_data_vars(ds) ds = set_nan_to_fill_value(ds) ds = order_variables(ds) @@ -225,19 +245,30 @@ def set_attrs(ds: xr.Dataset, attrs: dict) -> xr.Dataset: return ds -def add_fill_value_to_attrs(ds: xr.Dataset) -> xr.Dataset: - """Add _FillValue and missing_value attributes if not present. +def remove_fill_value_from_coords(ds: xr.Dataset) -> xr.Dataset: + """Remove _FillValue and missing_value attributes from the coords. Args: ds (xr.Dataset): Xarray Dataset to modify. Returns: - Dataset with fill value attributes added (default: 1.e+20). + Dataset with fill value removed for the coords. """ for coord in ds.coords: - if '_FillValue' not in ds.coords[coord].attrs: - ds.coords[coord].attrs['_FillValue'] = 1.e+20 + if '_FillValue' not in ds[coord].encoding: + ds[coord].encoding['_FillValue'] = None + return ds + + +def add_fill_value_to_data_vars(ds: xr.Dataset) -> xr.Dataset: + """Add _FillValue and missing_value attributes to data_vars if not present. + + Args: + ds (xr.Dataset): Xarray Dataset to modify. + Returns: + Dataset with fill value attributes added for the data_vars. 
+ """ for data_var in ds.data_vars: if '_FillValue' not in ds.data_vars[data_var].attrs: ds.data_vars[data_var].attrs['_FillValue'] = 1.e+20 From a04454b59157b720433ed56ab706d99359130a90 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 26 Nov 2025 19:46:44 +0100 Subject: [PATCH 088/162] Pin dependencies --- pyproject.toml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 436accb..0f55fd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,19 +40,23 @@ recommended = [ "isimip-utils[netcdf,plots,shapes,xarray]" ] netcdf = [ - "netCDF4" + "netCDF4~=1.7" ] plots = [ - "altair[all]", - "palettable", + "altair[all]~=6.0", + "palettable~=3.3", ] shapes = [ - "geopandas", - "rioxarray", + "geopandas~=1.1", + "rioxarray~=0.20", ] xarray = [ - "cftime", - "xarray" + "cftime~=1.6", + "xarray>=2025.11" +] +pytest = [ + "pytest~=9.0", + "pytest-cov~=7.0" ] dev = [ "build", @@ -60,10 +64,6 @@ dev = [ "ruff", "twine", ] -pytest = [ - "pytest", - "pytest-cov" -] docs = [ "mkdocs", "mkdocs-material", From bfde49d4fc78f84b29e2b1d9316d9162375ef5dc Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 27 Nov 2025 18:00:30 +0100 Subject: [PATCH 089/162] Add compression to xarray write_dataset --- isimip_utils/xarray.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index ef636cf..38e99a2 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -194,6 +194,10 @@ def write_dataset(ds: xr.Dataset, path: str | Path): # time should be an unlimited dimension unlimited_dims = ['time'] if 'time' in ds.dims else [] + # data variables should be compressed + for data_var in ds.data_vars: + ds[data_var].encoding.update({'zlib': True, 'complevel': 5}) + ds.to_netcdf(path, format='NETCDF4_CLASSIC', unlimited_dims=unlimited_dims) From 47a946c38f94233f0bda5bc321dec1be5b8fb5b2 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 27 Nov 2025 18:01:01 
+0100 Subject: [PATCH 090/162] Add convert_time method and support time as string array --- isimip_utils/tests/test_xarray.py | 62 +++++++++++++++++++++++++++++++ isimip_utils/xarray.py | 38 ++++++++++++++----- 2 files changed, 91 insertions(+), 9 deletions(-) diff --git a/isimip_utils/tests/test_xarray.py b/isimip_utils/tests/test_xarray.py index 45ed1b6..edb3dd9 100644 --- a/isimip_utils/tests/test_xarray.py +++ b/isimip_utils/tests/test_xarray.py @@ -257,6 +257,68 @@ def test_init_dataset_datetime64(): ''') +def test_init_dataset_datetime_str(): + time = pd.date_range(start='2000-01-01', end='2000-12-31', freq='D').astype(str) + var = np.random.rand(time.size, 360, 720).astype(np.float64) + + attrs = { + 'var': { + 'long_name': 'Variable' + }, + 'time': { + 'units': 'days since 2000-01-01 00:00:00' + } + } + + ds = init_dataset(time=time, attrs=attrs, var=var) + + assert isinstance(ds, xr.Dataset) + + # assert np.array_equal(ds['time'], time.astype('O')) + assert ds['time'].units == attrs['time']['units'] + assert ds['time'].calendar == "proleptic_gregorian" + + assert np.array_equal(ds['var'].values, var) + assert ds['var'].long_name == attrs['var']['long_name'] + + test_path = constants.OUTPUT_PATH / 'test.nc' + test_path.unlink(missing_ok=True) + + write_dataset(ds, test_path) + + output = helper.call(f'ncdump -h {test_path}') + + helper.assert_multiline_strings_equal(output, ''' +netcdf test { +dimensions: + time = UNLIMITED ; // (366 currently) + lon = 720 ; + lat = 360 ; +variables: + double lon(lon) ; + lon:standard_name = "longitude" ; + lon:long_name = "Longitude" ; + lon:units = "degrees_east" ; + lon:axis = "X" ; + double lat(lat) ; + lat:standard_name = "latitude" ; + lat:long_name = "Latitude" ; + lat:units = "degrees_north" ; + lat:axis = "Y" ; + double time(time) ; + time:standard_name = "time" ; + time:long_name = "Time" ; + time:calendar = "proleptic_gregorian" ; + time:units = "days since 2000-01-01 00:00:00" ; + time:axis = "T" ; + double 
var(time, lat, lon) ; + var:_FillValue = 1.e+20 ; + var:long_name = "Variable" ; + var:missing_value = 1.e+20 ; +} +''') + + def test_init_dataset_extra_dims(): a = np.arange(0, 10, dtype=np.float64) b = np.arange(0, 10, dtype=np.float64) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 38e99a2..e458f23 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -1,5 +1,6 @@ """Functions for working with xarray datasets for ISIMIP data.""" import logging +import warnings from datetime import datetime from pathlib import Path @@ -72,16 +73,11 @@ def init_dataset(lon: None | int = 720, lat: None | int = 360, time: np.ndarray delta = 180.0 / lat coords['lat'] = np.arange(90 - 0.5 * delta, -90, -delta) if time is not None: - if time.dtype == object: - coords['time'] = cftime.date2num( - time, calendar=attrs['time']['calendar'], units=attrs['time']['units'] - ).astype(np.float64) - elif np.issubdtype(time.dtype, np.datetime64): - coords['time'] = cftime.date2num( - time.to_pydatetime(), calendar=attrs['time']['calendar'], units=attrs['time']['units'] - ).astype(np.float64) - else: + if np.issubdtype(time.dtype, np.floating) or np.issubdtype(time.dtype, np.integer): coords['time'] = time.astype(np.float64) + else: + coords['time'] = convert_time(time, calendar=attrs['time']['calendar'], units=attrs['time']['units']) + if extra_dims: for extra_dim in extra_dims: coords[extra_dim] = variables[extra_dim] @@ -383,6 +379,30 @@ def create_mask(ds: xr.Dataset, df: pd.DataFrame, layer: int) -> xr.Dataset: return mask_ds +def convert_time(time: np.ndarray, units='days since 1601-1-1 00:00:00', calendar='proleptic_gregorian') -> np.ndarray: + """Convert an time coordinate array to np.float64 using cftime.date2num. + + Args: + time (np.ndarray): Time coordinate array. + units (str): Units for the time coordinate (default: 'days since 1601-1-1 00:00:00'). + calendar (str): Calendar type for time coordinate (default: 'proleptic_gregorian'). 
+ + Returns: + time (np.ndarray): Time coordinate array as np.float64. + """ + if np.issubdtype(time.dtype, np.datetime64): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + time = time.to_pydatetime() + + if time.dtype == 'object' and isinstance(time[0], str): + time = np.array([datetime.fromisoformat(t) for t in time], dtype=object) + + return cftime.date2num( + time, calendar=calendar, units=units + ).astype(np.float64) + + def to_dataframe(ds: xr.Dataset) -> pd.DataFrame: """Convert an xarray Dataset to a pandas DataFrame. From 0a63e3f63b63c03013c85c61600727f87fec1cde Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 28 Nov 2025 12:24:41 +0100 Subject: [PATCH 091/162] Fix convert_time --- isimip_utils/xarray.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index e458f23..226b705 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -393,7 +393,10 @@ def convert_time(time: np.ndarray, units='days since 1601-1-1 00:00:00', calenda if np.issubdtype(time.dtype, np.datetime64): with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) - time = time.to_pydatetime() + if isinstance(time, pd.core.indexes.datetimes.DatetimeIndex): + time = time.to_pydatetime() + else: + time = time.dt.to_pydatetime() if time.dtype == 'object' and isinstance(time[0], str): time = np.array([datetime.fromisoformat(t) for t in time], dtype=object) From 8fcf7f1b80cd83af4ddb3cfb8a52c67507cfc97f Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 3 Dec 2025 22:52:26 +0100 Subject: [PATCH 092/162] Refactor init_dataset --- isimip_utils/tests/helper.py | 2 +- isimip_utils/tests/test_xarray.py | 209 +++++++++--------------------- isimip_utils/xarray.py | 110 +++++++++------- 3 files changed, 124 insertions(+), 197 deletions(-) diff --git a/isimip_utils/tests/helper.py b/isimip_utils/tests/helper.py index 5d78f99..3ef86b8 100644 --- 
a/isimip_utils/tests/helper.py +++ b/isimip_utils/tests/helper.py @@ -13,4 +13,4 @@ def normalize_whitespace(string): def assert_multiline_strings_equal(a, b): for a_line, b_line in zip(a.strip().splitlines(), b.strip().splitlines(), strict=True): - assert normalize_whitespace(a_line) == normalize_whitespace(b_line) + assert normalize_whitespace(a_line) == normalize_whitespace(b_line), (a_line, b_line) diff --git a/isimip_utils/tests/test_xarray.py b/isimip_utils/tests/test_xarray.py index edb3dd9..1fa11b8 100644 --- a/isimip_utils/tests/test_xarray.py +++ b/isimip_utils/tests/test_xarray.py @@ -11,6 +11,7 @@ from isimip_utils.tests import constants, helper from isimip_utils.xarray import ( add_fill_value_to_data_vars, + convert_time, create_mask, get_attrs, init_dataset, @@ -126,157 +127,27 @@ def test_init_dataset_args(): ''') -def test_init_dataset_datetime(): - calendar = 'proleptic_gregorian' - units = 'days since 2000-01-01 00:00:00' - - start_day = cftime.datetime(2000, 1, 1, calendar=calendar) - end_day = cftime.datetime(2000, 12, 31, calendar=calendar) - - time = np.array([start_day + timedelta(days=i) for i in range((end_day - start_day).days + 1)], dtype=object) - var = np.random.rand(time.size, 360, 720).astype(np.float64) - - attrs = { - 'var': { - 'long_name': 'Variable' - }, - 'time': { - 'calendar': calendar, - 'units': units - } - } - - ds = init_dataset(time=time, attrs=attrs, var=var) - - assert isinstance(ds, xr.Dataset) - - assert np.array_equal(cftime.num2date(ds['time'], calendar=calendar, units=units), time) - assert ds['time'].units == attrs['time']['units'] - assert ds['time'].calendar == attrs['time']['calendar'] - - assert np.array_equal(ds['var'].values, var) - assert ds['var'].long_name == attrs['var']['long_name'] - - test_path = constants.OUTPUT_PATH / 'test.nc' - test_path.unlink(missing_ok=True) - - write_dataset(ds, test_path) - - output = helper.call(f'ncdump -h {test_path}') - - helper.assert_multiline_strings_equal(output, ''' 
-netcdf test { -dimensions: - time = UNLIMITED ; // (366 currently) - lon = 720 ; - lat = 360 ; -variables: - double lon(lon) ; - lon:standard_name = "longitude" ; - lon:long_name = "Longitude" ; - lon:units = "degrees_east" ; - lon:axis = "X" ; - double lat(lat) ; - lat:standard_name = "latitude" ; - lat:long_name = "Latitude" ; - lat:units = "degrees_north" ; - lat:axis = "Y" ; - double time(time) ; - time:standard_name = "time" ; - time:long_name = "Time" ; - time:calendar = "proleptic_gregorian" ; - time:units = "days since 2000-01-01 00:00:00" ; - time:axis = "T" ; - double var(time, lat, lon) ; - var:_FillValue = 1.e+20 ; - var:long_name = "Variable" ; - var:missing_value = 1.e+20 ; -} -''') - - -def test_init_dataset_datetime64(): - time = pd.date_range(start='2000-01-01', end='2000-12-31', freq='D') - var = np.random.rand(time.size, 360, 720).astype(np.float64) - - attrs = { - 'var': { - 'long_name': 'Variable' - }, - 'time': { - 'units': 'days since 2000-01-01 00:00:00' - } - } - - ds = init_dataset(time=time, attrs=attrs, var=var) - - assert isinstance(ds, xr.Dataset) - - # assert np.array_equal(ds['time'], time.astype('O')) - assert ds['time'].units == attrs['time']['units'] - assert ds['time'].calendar == "proleptic_gregorian" - - assert np.array_equal(ds['var'].values, var) - assert ds['var'].long_name == attrs['var']['long_name'] - - test_path = constants.OUTPUT_PATH / 'test.nc' - test_path.unlink(missing_ok=True) - - write_dataset(ds, test_path) - - output = helper.call(f'ncdump -h {test_path}') - - helper.assert_multiline_strings_equal(output, ''' -netcdf test { -dimensions: - time = UNLIMITED ; // (366 currently) - lon = 720 ; - lat = 360 ; -variables: - double lon(lon) ; - lon:standard_name = "longitude" ; - lon:long_name = "Longitude" ; - lon:units = "degrees_east" ; - lon:axis = "X" ; - double lat(lat) ; - lat:standard_name = "latitude" ; - lat:long_name = "Latitude" ; - lat:units = "degrees_north" ; - lat:axis = "Y" ; - double time(time) ; - 
time:standard_name = "time" ; - time:long_name = "Time" ; - time:calendar = "proleptic_gregorian" ; - time:units = "days since 2000-01-01 00:00:00" ; - time:axis = "T" ; - double var(time, lat, lon) ; - var:_FillValue = 1.e+20 ; - var:long_name = "Variable" ; - var:missing_value = 1.e+20 ; -} -''') - - -def test_init_dataset_datetime_str(): - time = pd.date_range(start='2000-01-01', end='2000-12-31', freq='D').astype(str) - var = np.random.rand(time.size, 360, 720).astype(np.float64) +def test_init_dataset_latlon(): + var = np.random.rand(10, 1, 1).astype(np.float64) attrs = { 'var': { 'long_name': 'Variable' - }, - 'time': { - 'units': 'days since 2000-01-01 00:00:00' } } - ds = init_dataset(time=time, attrs=attrs, var=var) + ds = init_dataset( + lon=np.array([10], dtype=np.float64), + lat=np.array([20], dtype=np.float64), + time=10, attrs=attrs, var=var + ) assert isinstance(ds, xr.Dataset) + assert ds.sizes['lon'] == 1 + assert ds.sizes['lat'] == 1 - # assert np.array_equal(ds['time'], time.astype('O')) - assert ds['time'].units == attrs['time']['units'] - assert ds['time'].calendar == "proleptic_gregorian" + assert ds['time'].units == 'days since 1601-1-1 00:00:00' + assert ds['time'].calendar == 'proleptic_gregorian' assert np.array_equal(ds['var'].values, var) assert ds['var'].long_name == attrs['var']['long_name'] @@ -291,9 +162,9 @@ def test_init_dataset_datetime_str(): helper.assert_multiline_strings_equal(output, ''' netcdf test { dimensions: - time = UNLIMITED ; // (366 currently) - lon = 720 ; - lat = 360 ; + time = UNLIMITED ; // (10 currently) + lon = 1 ; + lat = 1 ; variables: double lon(lon) ; lon:standard_name = "longitude" ; @@ -309,7 +180,7 @@ def test_init_dataset_datetime_str(): time:standard_name = "time" ; time:long_name = "Time" ; time:calendar = "proleptic_gregorian" ; - time:units = "days since 2000-01-01 00:00:00" ; + time:units = "days since 1601-1-1 00:00:00" ; time:axis = "T" ; double var(time, lat, lon) ; var:_FillValue = 1.e+20 ; @@ 
-319,7 +190,7 @@ def test_init_dataset_datetime_str(): ''') -def test_init_dataset_extra_dims(): +def test_init_dataset_dims(): a = np.arange(0, 10, dtype=np.float64) b = np.arange(0, 10, dtype=np.float64) var = np.random.rand(b.size, a.size, 360, 720).astype(np.float64) @@ -338,7 +209,7 @@ def test_init_dataset_extra_dims(): } } - ds = init_dataset(extra_dims=('b', 'a'), attrs=attrs, a=a, b=b, var=var) + ds = init_dataset(dims=('b', 'a', 'lat', 'lon'), attrs=attrs, a=a, b=b, var=var) assert isinstance(ds, xr.Dataset) @@ -539,6 +410,50 @@ def test_create_mask(): assert np.all(np.isnan(outside_region['mask'].values)) +def test_convert_time(): + time = np.arange(0, 100, dtype=np.int8) + time_converted = convert_time(time) + assert np.array_equal(time_converted, np.arange(0, 100, dtype=np.float64)) + + +def test_convert_time_datetime(): + calendar = 'proleptic_gregorian' + units = 'days since 2000-01-01 00:00:00' + + start_day = cftime.datetime(2000, 1, 1, calendar=calendar) + end_day = cftime.datetime(2000, 12, 31, calendar=calendar) + + time = np.array([start_day + timedelta(days=i) for i in range((end_day - start_day).days + 1)], dtype=object) + time_converted = convert_time(time, calendar=calendar, units=units) + + start = 0 + assert np.array_equal(time_converted, np.arange(start, start + 366, dtype=np.float64)) + + +def test_init_dataset_datetime64_index(): + time = pd.date_range(start='2000-01-01', end='2000-12-31', freq='D') + time_converted = convert_time(time) + + start = 145731 + assert np.array_equal(time_converted, np.arange(start, start + 366, dtype=np.float64)) + + +def test_init_dataset_datetime64_series(): + time = pd.Series(pd.date_range(start='2000-01-01', end='2000-12-31', freq='D')) + time_converted = convert_time(time) + + start = 145731 + assert np.array_equal(time_converted, np.arange(start, start + 366, dtype=np.float64)) + + +def test_convert_time_datetime_str(): + time = pd.date_range(start='2000-01-01', end='2000-12-31', 
freq='D').astype(str) + time_converted = convert_time(time) + + start = 145731 + assert np.array_equal(time_converted, np.arange(start, start + 366, dtype=np.float64)) + + def test_to_dataframe(): ds = xr.Dataset( coords={ diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 226b705..e9e9f8b 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -33,80 +33,89 @@ } } - -def init_dataset(lon: None | int = 720, lat: None | int = 360, time: np.ndarray | None = None, - attrs: None | dict = None, extra_dims: None | list = None, **variables: np.ndarray) -> xr.Dataset: +def init_dataset(lon: None | int | np.ndarray = 720, + lat: None | int | np.ndarray = 360, + time: None | int | np.ndarray = None, + dims: None | list = None, + attrs: None | dict = None, + **variables: np.ndarray) -> xr.Dataset: """Initialize a new xarray dataset with standard ISIMIP dimensions. Args: - lon (int): Number of longitude points, or None to omit (default: 720). - lat (int): Number of latitude points, or None to omit (default: 360). - time (np.ndarray): Time coordinate array, or None to omit time dimension (default: None). + lon (int | np.ndarray): Number of longitude points, or longitude array, or None to omit (default: 720). + lat (int | np.ndarray): Number of latitude points, or latitude array, or None to omit (default: 360). + time (int | np.ndarray): Number of time steps, or time array, or None to omit time dimension (default: None). attrs (dict): Dictionary of attributes for variables and global attributes. - extra_dims (list): List of extra dimensions (besides lat, lon, time). + dims (list): List of dimensions (default time, lat, lon). **variables (np.ndarray): Data variables to include in the dataset. Returns: Initialized xarray Dataset with coordinates and data variables. 
""" - # combine attrs - attrs = { - key: {**default_attrs.get(key, {}), **(attrs or {}).get(key, {})} - for key in {*default_attrs.keys(), *(attrs or {}).keys()} - } - - # create list of dimensions - dims = list(extra_dims) if extra_dims else [] - if time is not None: - dims.append('time') - if lat: - dims.append('lat') - if lon: - dims.append('lon') - # create coords + # create dimensions + if dims is None: + dims = [] + if time is not None: + dims.append('time') + if lat is not None: + dims.append('lat') + if lon is not None: + dims.append('lon') + + # create coordinates coords = {} - if lon is not None: - delta = 360.0 / lon - coords['lon'] = np.arange(-180 + 0.5 * delta, 180, delta) - if lat is not None: - delta = 180.0 / lat - coords['lat'] = np.arange(90 - 0.5 * delta, -90, -delta) - if time is not None: - if np.issubdtype(time.dtype, np.floating) or np.issubdtype(time.dtype, np.integer): - coords['time'] = time.astype(np.float64) - else: - coords['time'] = convert_time(time, calendar=attrs['time']['calendar'], units=attrs['time']['units']) - - if extra_dims: - for extra_dim in extra_dims: - coords[extra_dim] = variables[extra_dim] + if isinstance(lon, int): + lon_delta = 360.0 / lon + coords['lon'] = np.arange(-180 + 0.5 * lon_delta, 180, lon_delta) + elif isinstance(lon, np.ndarray): + coords['lon'] = lon + + if isinstance(lat, int): + lat_delta = 180.0 / lat + coords['lat'] = np.arange(90 - 0.5 * lat_delta, -90, -lat_delta) + elif isinstance(lat, np.ndarray): + coords['lat'] = lat + + if isinstance(time, int): + coords['time'] = np.arange(time, dtype=np.float64) + elif isinstance(time, np.ndarray): + coords['time'] = time + + for dim in dims: + if dim not in ['lon', 'lat', 'time']: + coords[dim] = variables[dim] # create data variables data_vars = { var_name: (dims, var) for var_name, var in variables.items() - if extra_dims is None or var_name not in extra_dims + if var_name not in dims } # create dataset ds = xr.Dataset(coords=coords, 
data_vars=data_vars) + # combine attrs + attrs = { + key: {**default_attrs.get(key, {}), **(attrs or {}).get(key, {})} + for key in {*default_attrs.keys(), *(attrs or {}).keys()} + } + # set attributes - if attrs: - for coord in ds.coords: - if coord in attrs: - ds.coords[coord].attrs.update(attrs[coord]) + for coord in ds.coords: + if coord in attrs: + ds.coords[coord].attrs.update(attrs[coord]) - for data_var in ds.data_vars: - if attrs: - if data_var in attrs: - ds.data_vars[data_var].attrs.update(attrs[data_var]) + for data_var in ds.data_vars: + if attrs: + if data_var in attrs: + ds.data_vars[data_var].attrs.update(attrs[data_var]) - ds.data_vars[data_var].attrs["_FillValue"] = 1.e+20 + ds.data_vars[data_var].attrs["_FillValue"] = 1.e+20 - # set global attributes - ds.attrs = attrs.get('global', {}) + # set global attributes + ds.attrs = attrs.get('global', {}) return ds @@ -390,6 +399,9 @@ def convert_time(time: np.ndarray, units='days since 1601-1-1 00:00:00', calenda Returns: time (np.ndarray): Time coordinate array as np.float64. 
""" + if np.issubdtype(time.dtype, np.floating) or np.issubdtype(time.dtype, np.integer): + return time.astype(np.float64) + if np.issubdtype(time.dtype, np.datetime64): with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) From 19d7a053e9e2a4bb6a6d383329b449996c54411b Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 9 Dec 2025 16:19:45 +0100 Subject: [PATCH 093/162] Use float32 for FILL_VALUE --- isimip_utils/xarray.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index e9e9f8b..afa30fd 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) -default_attrs = { +DEFAULT_ATTRS = { 'lon': { 'standard_name': 'longitude', 'long_name': 'Longitude', @@ -33,6 +33,8 @@ } } +FILL_VALUE = np.float32(1.e+20) + def init_dataset(lon: None | int | np.ndarray = 720, lat: None | int | np.ndarray = 360, time: None | int | np.ndarray = None, @@ -98,8 +100,8 @@ def init_dataset(lon: None | int | np.ndarray = 720, # combine attrs attrs = { - key: {**default_attrs.get(key, {}), **(attrs or {}).get(key, {})} - for key in {*default_attrs.keys(), *(attrs or {}).keys()} + key: {**DEFAULT_ATTRS.get(key, {}), **(attrs or {}).get(key, {})} + for key in {*DEFAULT_ATTRS.keys(), *(attrs or {}).keys()} } # set attributes @@ -280,9 +282,9 @@ def add_fill_value_to_data_vars(ds: xr.Dataset) -> xr.Dataset: """ for data_var in ds.data_vars: if '_FillValue' not in ds.data_vars[data_var].attrs: - ds.data_vars[data_var].attrs['_FillValue'] = 1.e+20 + ds.data_vars[data_var].attrs['_FillValue'] = FILL_VALUE if 'missing_value' not in ds.data_vars[data_var].attrs: - ds.data_vars[data_var].attrs['missing_value'] = 1.e+20 + ds.data_vars[data_var].attrs['missing_value'] = FILL_VALUE return ds @@ -296,7 +298,7 @@ def set_fill_value_to_nan(ds: xr.Dataset) -> xr.Dataset: Dataset with fill values replaced by NaN. 
""" for var in ds.data_vars: - fill_value = ds[var].attrs.get('_FillValue', 1e+20) + fill_value = ds[var].attrs.get('_FillValue', FILL_VALUE) ds[var] = ds[var].where(ds[var] != fill_value) return ds @@ -311,7 +313,7 @@ def set_nan_to_fill_value(ds: xr.Dataset) -> xr.Dataset: Dataset with NaN values replaced by fill values. """ for var in ds.data_vars: - fill_value = ds[var].attrs.get('_FillValue', 1e+20) + fill_value = ds[var].attrs.get('_FillValue', FILL_VALUE) ds[var] = ds[var].where(~np.isnan(ds[var]), fill_value) return ds From 8befeaa907b1eb8adef713eddde1ce2ba0280a1d Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 12 Dec 2025 17:26:30 +0100 Subject: [PATCH 094/162] Add parse_locations and parse_parameters --- isimip_utils/cli.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 8f90f27..30f71d0 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -5,6 +5,7 @@ import tomllib from datetime import datetime from pathlib import Path +from urllib.parse import urlparse from dotenv import load_dotenv from rich.logging import RichHandler @@ -121,6 +122,24 @@ def parse_path(value: str) -> Path: return Path(value).expanduser() +def parse_locations(value: str) -> Path: + """Parse and expand a location string as list of URL or Path objects. + + Args: + value (str): Location string to parse. + + Returns: + List of URL or Path objects. + """ + if value: + return [ + string if urlparse(string).scheme else Path(string).expanduser() + for string in value.split() + ] + else: + return [] + + def parse_filelist(filelist_file: str | Path | None) -> set[str] | None: """Parse a filelist file into a set of file paths. @@ -140,6 +159,20 @@ def parse_filelist(filelist_file: str | Path | None) -> set[str] | None: return filelist +def parse_parameters(value: str) -> Path: + """Parse and expand a parameters string (a=b). + + Args: + value (str): Parameter string to parse. 
+ + Returns: + Dict of the form {key: values} + """ + key, values_str = value.split('=') + values = values_str.split(',') + return {key : values} + + class ArgumentParser(argparse.ArgumentParser): """Extended ArgumentParser that reads defaults from config files and environment. From 9018cb75076d9a58da390ec7ca6bf45c116c4a72 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Sat, 13 Dec 2025 15:54:38 +0100 Subject: [PATCH 095/162] Add dim argument to extractions.count_values --- isimip_utils/extractions.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py index 0032985..477df5d 100644 --- a/isimip_utils/extractions.py +++ b/isimip_utils/extractions.py @@ -209,17 +209,19 @@ def compute_temporal_average(ds: xr.Dataset) -> xr.Dataset: return ds.mean(dim='time', skipna=True).astype(np.float32) -def count_values(ds: xr.Dataset) -> xr.Dataset: +def count_values(ds: xr.Dataset, dim: list | None = None) -> xr.Dataset: """Count non-NaN values over lat/lon dimensions. Args: ds (xr.Dataset): Dataset with lat/lon dimensions. + dim (list): Dimensions along which to count [default: ('lat', 'lon')] Returns: Dataset with count of non-NaN values per time step. 
""" logger.info('count values') - return ds.count(dim=('lat', 'lon')).astype(np.float32) + dim = dim or ('lat', 'lon') + return ds.count(dim=dim).astype(np.float32) def concat_extraction(ds1: xr.Dataset | None, ds2: xr.Dataset) -> xr.Dataset: From 64f055d0ee90abc442f7980f8c031178fd12a302 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Sun, 14 Dec 2025 11:56:23 +0100 Subject: [PATCH 096/162] Add protocol.py and refactor fetch.py --- isimip_utils/fetch.py | 220 ++++++++------------------------------- isimip_utils/protocol.py | 181 ++++++++++++++++++++++++++++++++ 2 files changed, 224 insertions(+), 177 deletions(-) create mode 100644 isimip_utils/protocol.py diff --git a/isimip_utils/fetch.py b/isimip_utils/fetch.py index 11bd475..077397d 100644 --- a/isimip_utils/fetch.py +++ b/isimip_utils/fetch.py @@ -1,220 +1,86 @@ -"""Functions to fetch files from machine-actionable ISIMIP protocols.""" +"""Functions to fetch files from urls or local paths.""" import json import logging -import os -import re -from collections.abc import Generator +import shutil from pathlib import Path from typing import Any -from urllib.parse import urlparse import requests -from isimip_utils.exceptions import NotFound - logger = logging.getLogger(__name__) -PROTOCOL_LOCATIONS = [ - 'https://protocol.isimip.org', - 'https://protocol2.isimip.org', -] - - -def fetch_definitions(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> dict[str, Any]: - """Fetch definitions from ISIMIP protocol locations. - - Args: - path (str | Path): Path to search for definitions. - protocol_locations (str | list[str]): List of protocol locations to search (default: https://protocol.isimip.org). - - Returns: - Dictionary of definitions with specifiers as keys. - - Raises: - NotFound: If no definitions are found for the given path. 
- """ - if isinstance(protocol_locations, str): - protocol_locations = [protocol_locations] - - for protocol_location in protocol_locations: - definitions_json = find_json(protocol_location, 'definitions', path) - if definitions_json: - definitions = {} - for definition_name, definition in definitions_json.items(): - # convert the definitions to dicts if they are lists - if isinstance(definition, list): - definitions[definition_name] = { - row['specifier']: row for row in definition - } - else: - definitions[definition_name] = definition - - logger.debug('definitions = %s', definitions) - return definitions - - raise NotFound(f'No definitions found for {path}.') - - -def fetch_pattern(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> dict[str, Any]: - """Fetch pattern definitions from ISIMIP protocol locations. - - Args: - path (str | Path): Path to search for patterns. - protocol_locations (str | list[str]): List of protocol locations to search (default: https://protocol.isimip.org). - - Returns: - Dictionary containing compiled regex patterns for 'path', 'file', 'dataset', - and lists of 'suffix', 'specifiers', and 'specifiers_map'. - - Raises: - NotFound: If no pattern is found for the given path. 
- """ - if isinstance(protocol_locations, str): - protocol_locations = [protocol_locations] - - for protocol_location in protocol_locations: - pattern_json = find_json(protocol_location, 'pattern', path) - if pattern_json: - if not all([ - isinstance(pattern_json['path'], str), - isinstance(pattern_json['file'], str), - isinstance(pattern_json['dataset'], str), - isinstance(pattern_json['suffix'], list) - ]): - break - - pattern = { - 'path': re.compile(pattern_json['path']), - 'file': re.compile(pattern_json['file']), - 'dataset': re.compile(pattern_json['dataset']), - 'suffix': pattern_json['suffix'], - 'specifiers': pattern_json.get('specifiers', []), - 'specifiers_map': pattern_json.get('specifiers_map', {}) - } - - logger.debug('pattern = %s', pattern) - - return pattern - - raise NotFound(f'No pattern found for {path}.') - - -def fetch_schema(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> Any: - """Fetch schema from ISIMIP protocol locations. - - Args: - path (str | Path): Path to search for schema. - protocol_locations (str | list[str]): List of protocol locations to search (default: https://protocol.isimip.org). - - Returns: - Schema JSON object. - - Raises: - NotFound: If no schema is found for the given path. - """ - if isinstance(protocol_locations, str): - protocol_locations = [protocol_locations] - - for protocol_location in protocol_locations: - schema_json = find_json(protocol_location, 'schema', path) - if schema_json: - return schema_json - - raise NotFound(f'No schema found for {path}.') - -def fetch_tree(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> Any: - """Fetch tree structure from ISIMIP protocol locations. +def fetch_json(url: str) -> Any | None: + """Fetch JSON content from a URL. Args: - path (str | Path): Path to search for tree structure. - protocol_locations (str | list[str]): List of protocol locations to search (default: https://protocol.isimip.org). 
+ location (str | Path): URL to fetch JSON from. Returns: - Tree JSON object. - - Raises: - NotFound: If no tree is found for the given path. + Parsed JSON object, or None if request fails. """ - if isinstance(protocol_locations, str): - protocol_locations = [protocol_locations] + logger.debug('url = %s', url) - for protocol_location in protocol_locations: - tree_json = find_json(protocol_location, 'tree', path) - if tree_json: - return tree_json - - raise NotFound(f'No tree found for {path}.') - - -def fetch_resource(resource_location: str | Path) -> dict: - if urlparse(resource_location).scheme: - return fetch_json(resource_location) - else: - return load_json(resource_location) + try: + response = requests.get(url) + except requests.exceptions.ConnectionError: + return None - raise NotFound(f'No resource found at {resource_location}.') + if response.status_code == 200: + return response.json() -def find_json(protocol_location: str, sub_location: str, path: str | Path) -> Generator[tuple[Path, Any], None, None]: - """Find JSON files in protocol locations by traversing path components. +def fetch_file(url: str, target: str | Path) -> bool: + """Download file from a URL. Args: - protocol_location (str): Base protocol location URL or path. - sub_location (str): Subdirectory within protocol location (e.g., 'definitions', 'pattern'). - path (str | Path): Path to search for JSON files. + location (str | Path): URL to download file from. + target (str | Path): Target path. Returns: - The JSON response from the first matching path. + True, or None if request fails. 
""" - path_components = Path(path).parts - for i in range(len(path_components), 0, -1): - current_path = Path(os.sep.join(path_components[:i+1])).with_suffix('.json') + logger.debug('url = %s', url) - if urlparse(protocol_location).scheme: - data = fetch_json(f'{protocol_location}/{sub_location}/{current_path}') - else: - data = load_json(Path(protocol_location) / 'output' / sub_location / current_path) - - logger.debug('path = %s', current_path) - logger.debug('data = %s', data) + try: + response = requests.get(url) + except requests.exceptions.ConnectionError: + return None - if data is not None: - return data + if response.status_code == 200: + with open(target, "wb") as fp: + fp.write(response.content) + return True -def fetch_json(location: str) -> Any | None: - """Fetch JSON content from a URL. +def load_json(path: str | Path) -> Any | None: + """Load JSON content from a local path. Args: - location (str): URL to fetch JSON from. + location (str | Path): URL to fetch JSON from. Returns: - Parsed JSON object, or None if request fails or status is not 200. + Parsed JSON object, or None if request fails. """ - logger.debug('location = %s', location) - - try: - response = requests.get(location) - except requests.exceptions.ConnectionError: - return None + logger.debug('path = %s', path) - if response.status_code == 200: - return response.json() + path = Path(path) + if path.exists(): + return json.loads(open(path).read()) -def load_json(path: str | Path) -> Any | None: - """Load JSON content from a local file. +def load_file(path: str | Path, target: str | Path) -> bool: + """Copy a file from a local path. Args: - path (str | Path): Path to the JSON file. + location (str | Path): URL to download file from. + target (str | Path): Target path. Returns: - Parsed JSON object, or None if file doesn't exist. + True, or None if request fails. 
""" - path = Path(path).expanduser() - logger.debug('path = %s', path) - if path.exists(): - return json.loads(open(path).read()) + path = Path(path) + if path.is_file(): + shutil.copy(path, target) diff --git a/isimip_utils/protocol.py b/isimip_utils/protocol.py new file mode 100644 index 0000000..e154add --- /dev/null +++ b/isimip_utils/protocol.py @@ -0,0 +1,181 @@ +"""Functions to fetch files from machine-actionable ISIMIP protocols.""" +import logging +import os +import re +from collections.abc import Generator +from pathlib import Path +from typing import Any +from urllib.parse import urlparse + +from .exceptions import NotFound +from .fetch import fetch_json, load_json + +logger = logging.getLogger(__name__) + +PROTOCOL_LOCATIONS = [ + 'https://protocol.isimip.org', + 'https://protocol2.isimip.org', +] + + +def fetch_definitions(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> dict[str, Any]: + """Fetch definitions from ISIMIP protocol locations. + + Args: + path (str | Path): Path to search for definitions. + protocol_locations (str | list[str]): List of protocol locations to search (default: https://protocol.isimip.org). + + Returns: + Dictionary of definitions with specifiers as keys. + + Raises: + NotFound: If no definitions are found for the given path. 
+ """ + if isinstance(protocol_locations, str): + protocol_locations = [protocol_locations] + + for protocol_location in protocol_locations: + definitions_json = find_json(protocol_location, 'definitions', path) + if definitions_json: + definitions = {} + for definition_name, definition in definitions_json.items(): + # convert the definitions to dicts if they are lists + if isinstance(definition, list): + definitions[definition_name] = { + row['specifier']: row for row in definition + } + else: + definitions[definition_name] = definition + + logger.debug('definitions = %s', definitions) + return definitions + + raise NotFound(f'No definitions found for {path}.') + + +def fetch_pattern(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> dict[str, Any]: + """Fetch pattern definitions from ISIMIP protocol locations. + + Args: + path (str | Path): Path to search for patterns. + protocol_locations (str | list[str]): List of protocol locations to search (default: https://protocol.isimip.org). + + Returns: + Dictionary containing compiled regex patterns for 'path', 'file', 'dataset', + and lists of 'suffix', 'specifiers', and 'specifiers_map'. + + Raises: + NotFound: If no pattern is found for the given path. 
+ """ + if isinstance(protocol_locations, str): + protocol_locations = [protocol_locations] + + for protocol_location in protocol_locations: + pattern_json = find_json(protocol_location, 'pattern', path) + if pattern_json: + if not all([ + isinstance(pattern_json['path'], str), + isinstance(pattern_json['file'], str), + isinstance(pattern_json['dataset'], str), + isinstance(pattern_json['suffix'], list) + ]): + break + + pattern = { + 'path': re.compile(pattern_json['path']), + 'file': re.compile(pattern_json['file']), + 'dataset': re.compile(pattern_json['dataset']), + 'suffix': pattern_json['suffix'], + 'specifiers': pattern_json.get('specifiers', []), + 'specifiers_map': pattern_json.get('specifiers_map', {}) + } + + logger.debug('pattern = %s', pattern) + + return pattern + + raise NotFound(f'No pattern found for {path}.') + + +def fetch_schema(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> Any: + """Fetch schema from ISIMIP protocol locations. + + Args: + path (str | Path): Path to search for schema. + protocol_locations (str | list[str]): List of protocol locations to search (default: https://protocol.isimip.org). + + Returns: + Schema JSON object. + + Raises: + NotFound: If no schema is found for the given path. + """ + if isinstance(protocol_locations, str): + protocol_locations = [protocol_locations] + + for protocol_location in protocol_locations: + schema_json = find_json(protocol_location, 'schema', path) + if schema_json: + return schema_json + + raise NotFound(f'No schema found for {path}.') + + +def fetch_tree(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_LOCATIONS) -> Any: + """Fetch tree structure from ISIMIP protocol locations. + + Args: + path (str | Path): Path to search for tree structure. + protocol_locations (str | list[str]): List of protocol locations to search (default: https://protocol.isimip.org). + + Returns: + Tree JSON object. 
+ + Raises: + NotFound: If no tree is found for the given path. + """ + if isinstance(protocol_locations, str): + protocol_locations = [protocol_locations] + + for protocol_location in protocol_locations: + tree_json = find_json(protocol_location, 'tree', path) + if tree_json: + return tree_json + + raise NotFound(f'No tree found for {path}.') + + +def fetch_resource(resource_location: str | Path) -> dict: + resource = fetch_json(resource_location) + + if resource is None: + return resource + + raise NotFound(f'No resource found at {resource_location}.') + + +def find_json(protocol_location: str, sub_location: str, path: str | Path) -> Generator[tuple[Path, Any], None, None]: + """Find JSON files in protocol locations by traversing path components. + + Args: + protocol_location (str): Base protocol location URL or path. + sub_location (str): Subdirectory within protocol location (e.g., 'definitions', 'pattern'). + path (str | Path): Path to search for JSON files. + + Returns: + The JSON response from the first matching path. 
+ """ + path_components = Path(path).parts + for i in range(len(path_components), 0, -1): + current_path = Path(os.sep.join(path_components[:i+1])).with_suffix('.json') + + if not isinstance(protocol_location, Path) and urlparse(protocol_location).scheme: + data = fetch_json(f'{protocol_location}/{sub_location}/{current_path}') + else: + data = load_json(Path(protocol_location) / 'output' / sub_location / current_path) + + logger.debug('path = %s', current_path) + logger.debug('data = %s', data) + + if data is not None: + return data From 36216e93f5c5879b708e38787558877ea1fdba87 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 15 Dec 2025 21:04:11 +0100 Subject: [PATCH 097/162] Add apply_placeholders --- isimip_utils/cli.py | 2 +- isimip_utils/config.py | 2 ++ isimip_utils/utils.py | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 30f71d0..f959b9e 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -170,7 +170,7 @@ def parse_parameters(value: str) -> Path: """ key, values_str = value.split('=') values = values_str.split(',') - return {key : values} + return key, values class ArgumentParser(argparse.ArgumentParser): diff --git a/isimip_utils/config.py b/isimip_utils/config.py index 6ae51ae..4e16a69 100644 --- a/isimip_utils/config.py +++ b/isimip_utils/config.py @@ -22,6 +22,8 @@ def __repr__(self) -> str: def __getattr__(self, name: str) -> Any: if name in self._settings.keys(): return self._settings[name] + else: + raise AttributeError(f"{self.__class__.__name__} object has no attribute '{name}'") def __setattr__(self, name: str, value: Any) -> None: if name.startswith('_'): diff --git a/isimip_utils/utils.py b/isimip_utils/utils.py index c0188b4..bf59f84 100644 --- a/isimip_utils/utils.py +++ b/isimip_utils/utils.py @@ -185,6 +185,25 @@ def copy_placeholders(*placeholder_args: dict, **kwargs: Any) -> dict: return placeholders +def apply_placeholders(path_template: 
str | Path, placeholders: dict) -> Path: + """Apply placeholders to a string or path, ensuring that the name of the path is lower case + + Args: + path_template (str | Path): Path template as string or path. + placeholders (dict): Placeholder dictionary. + + Returns: + Path with the applied placeholders. + """ + try: + path = str(path_template).format(**placeholders) + except KeyError as e: + raise RuntimeError('Some of the placeholders are missing.') from e + + path = Path(path) + return path.with_stem(path.stem.lower()) + + def update_year(placeholders: dict, key: str, year: int | str, operator: str) -> None: """Update a year placeholder based on comparison operator. From 9950386eca7d6396c87d5337225a0e05212a7b90 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 16 Dec 2025 16:54:01 +0100 Subject: [PATCH 098/162] Fix concat_extractions --- isimip_utils/extractions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py index 477df5d..9f1d52e 100644 --- a/isimip_utils/extractions.py +++ b/isimip_utils/extractions.py @@ -239,7 +239,7 @@ def concat_extraction(ds1: xr.Dataset | None, ds2: xr.Dataset) -> xr.Dataset: elif not ds2.sizes.get('time'): return ds1 else: - if not ds1.time.encoding: + if not ds1.time.encoding or not ds1.time.encoding.get('units'): # apply offset when time units or calendar diverges, but only if times where not decoded offset = compute_offset(ds1, ds2) if offset is not None: From d41fe90ff12a0111becc0e6e380c9543aafd4078 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 23 Dec 2025 12:48:35 +0100 Subject: [PATCH 099/162] Revert previous changes to parse_paramers --- isimip_utils/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index f959b9e..3c64f21 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -170,7 +170,7 @@ def parse_parameters(value: str) -> Path: """ key, values_str = 
value.split('=') values = values_str.split(',') - return key, values + return {key: values} class ArgumentParser(argparse.ArgumentParser): From 2b98b80ccf45a41a88a9480815abc806fb69574f Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 23 Dec 2025 12:48:59 +0100 Subject: [PATCH 100/162] Add update methods to Settings --- isimip_utils/config.py | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/isimip_utils/config.py b/isimip_utils/config.py index 4e16a69..98cbb48 100644 --- a/isimip_utils/config.py +++ b/isimip_utils/config.py @@ -1,5 +1,6 @@ """Configuration management for ISIMIP tools.""" import logging +import tomllib from typing import Any from .utils import Singleton @@ -16,6 +17,8 @@ class Settings(Singleton): """ _settings: dict[str, Any] = {} + ignore_keys = ('config', ) + def __repr__(self) -> str: return str(self._settings) @@ -40,6 +43,40 @@ def to_dict(self) -> dict[str, Any]: """ return self._settings + def update(self, values: dict[str, Any]) -> dict[str, Any]: + """Update the settings from a dictionary. + + Args: + values (dict[str, Any]): Dictionary of setting key-value pairs. + """ + for key, value in values.items(): + name = key.upper() + current_value = self._settings[name] + + if not hasattr(self, name): + raise ValueError(f'unknown key "{key}"') + + if isinstance(current_value, list): + current_value.extend(value if isinstance(value, list) else [value]) + elif isinstance(current_value, dict): + if not isinstance(value, dict): + raise ValueError(f'key "{key}" is not a dict') + self._settings[name].update(value) + else: + self._settings[name] = value + + + def update_from_toml(self, path): + """Update the settings from a toml file.. + + Args: + path (Path): Path to the toml file/. 
+ """ + if path and path.exists(): + config = tomllib.loads(path.read_text()) + self.update(config) + + @classmethod def from_dict(cls, values: dict[str, Any]) -> 'Settings': """Create a Settings instance from a dictionary. @@ -52,6 +89,6 @@ def from_dict(cls, values: dict[str, Any]) -> 'Settings': All keys are converted to uppercase. """ instance = cls() - instance._settings = {key.upper(): value for key, value in values.items()} + instance._settings = {key.upper(): value for key, value in values.items() if key not in cls.ignore_keys} logger.debug('settings = %s', instance) return instance From d2099780146fca30d48696ac75702d0be47d6b1a Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 23 Dec 2025 12:49:44 +0100 Subject: [PATCH 101/162] Add add separate parameters module --- isimip_utils/extractions.py | 6 +- isimip_utils/parameters.py | 112 ++++++++++++++++++++++++++++++++++++ isimip_utils/plot.py | 9 ++- isimip_utils/utils.py | 108 ---------------------------------- 4 files changed, 119 insertions(+), 116 deletions(-) create mode 100644 isimip_utils/parameters.py diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py index 9f1d52e..fa33297 100644 --- a/isimip_utils/extractions.py +++ b/isimip_utils/extractions.py @@ -5,9 +5,9 @@ import numpy as np import xarray as xr -from isimip_utils.exceptions import ExtractionError -from isimip_utils.utils import validate_lat, validate_lon -from isimip_utils.xarray import compute_offset, compute_time +from .exceptions import ExtractionError +from .utils import validate_lat, validate_lon +from .xarray import compute_offset, compute_time logger = logging.getLogger(__name__) diff --git a/isimip_utils/parameters.py b/isimip_utils/parameters.py new file mode 100644 index 0000000..b68a906 --- /dev/null +++ b/isimip_utils/parameters.py @@ -0,0 +1,112 @@ +"""Utility functions for the work with parameters and placeholders.""" +from itertools import product +from pathlib import Path +from typing import Any + + +def 
get_permutations(parameters: dict[str, list]) -> tuple[tuple]: + """Generate all permutations from parameter value lists. + + Args: + parameters (dict[str, list]): Dictionary mapping parameter names to lists of values. + + Returns: + Tuple of tuples representing all possible combinations of parameter values. + """ + return tuple(product(*parameters.values())) + + +def get_placeholders(parameters: dict[str, list], permutation: tuple) -> dict: + """Convert a permutation tuple into a dictionary of placeholders. + + Args: + parameters (dict[str, list]): Dictionary mapping parameter names to lists of values. + permutation (tuple): Tuple of values representing one permutation. + + Returns: + Dictionary mapping parameter names to their values in this permutation. + """ + return dict(zip(parameters.keys(), permutation, strict=True)) + + +def join_parameters(parameters: dict[str, list[str]], max_count: int = 5, + max_label: str = 'various') -> dict[str, str]: + """Join parameter values into strings, with fallback for large value sets. + + Args: + parameters (dict[str, list[str]]): Dictionary mapping parameter names to lists of values. + max_count (int): Maximum number of values to join (default: 5). + max_label (str): Label to use when value count exceeds max_count (default: 'various'). + + Returns: + Dictionary mapping parameter names to joined strings or max_label. + """ + return { + key: (max_label if len(values) > max_count else '+'.join(values)) + for key, values in parameters.items() + } + + +def copy_placeholders(*placeholder_args: dict, **kwargs: Any) -> dict: + """Merge multiple placeholder dictionaries and additional kwargs. + + Args: + *placeholder_args (dict): Variable number of placeholder dictionaries to merge. + **kwargs (Any): Additional key-value pairs to add to the result. + + Returns: + Dictionary containing all merged placeholders. 
+ """ + placeholders = { + key: value + for placeholder_arg in placeholder_args + for key, value in placeholder_arg.items() + } + placeholders.update(**kwargs) + return placeholders + + +def apply_placeholders(path_template: str | Path, placeholders: dict) -> Path: + """Apply placeholders to a string or path, ensuring that the name of the path is lower case + + Args: + path_template (str | Path): Path template as string or path. + placeholders (dict): Placeholder dictionary. + + Returns: + Path with the applied placeholders. + """ + try: + path = str(path_template).format(**placeholders) + except KeyError as e: + raise RuntimeError('Some of the placeholders are missing.') from e + + path = Path(path) + return path.with_stem(path.stem.lower()) + + +def update_year(placeholders: dict, key: str, year: int | str, operator: str) -> None: + """Update a year placeholder based on comparison operator. + + Args: + placeholders (dict): Dictionary of placeholders to update. + key (str): Key in placeholders dictionary to update. + year (int | str): Year value to compare/set. + operator (str): Comparison operator ('<' for minimum, '>' for maximum). + + Raises: + RuntimeError: If operator is not '<' or '>'. + + Note: + Updates placeholders[key] in-place if condition is met. 
+ """ + if operator not in ('<', '>'): + raise RuntimeError(f'operator "{operator}" not supported') + + current = placeholders.get(key) + if ( + (current is None) or + (operator == '>' and int(current) < int(year)) or + (operator == '<' and int(current) > int(year)) + ): + placeholders[key] = year diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index f7117c1..726bfd5 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -8,14 +8,13 @@ import numpy as np import pandas as pd -from isimip_utils.pandas import ( +from .pandas import ( get_first_coord, get_first_coord_axis, get_first_coord_label, get_first_data_var, get_first_data_var_label, ) -from isimip_utils.utils import get_permutations logger = logging.getLogger(__name__) @@ -312,12 +311,12 @@ def plot_map(df: pd.DataFrame, color_field: str | None = None, color_type: str | ) -def plot_grid(parameters: dict, plots: dict, empty_plot: alt.Chart, layer: bool = True, +def plot_grid(permutations: list[tuple], plots: dict, empty_plot: alt.Chart, layer: bool = True, x: str = 'shared', y: str = 'shared', color: str = 'shared') -> alt.Chart: """Create a grid of plots organized by parameter permutations. Args: - parameters (dict): Dictionary of parameters with lists of values. + permutations (list): List of permutations with tuples of parameters. plots (dict): Dictionary mapping permutation tuples to Chart objects. empty_plot (alt.Chart): Chart to use when a permutation has no data. layer (bool): Whether to layer plots or concatenate vertically (default: True). 
@@ -331,7 +330,7 @@ def plot_grid(parameters: dict, plots: dict, empty_plot: alt.Chart, layer: bool rows = [] prev_permutation = None - for permutation in get_permutations(parameters): + for permutation in permutations: row_title = permutation[0] if len(permutation) > 0 else '' column_title = permutation[1] if len(permutation) > 1 else '' diff --git a/isimip_utils/utils.py b/isimip_utils/utils.py index bf59f84..97888e7 100644 --- a/isimip_utils/utils.py +++ b/isimip_utils/utils.py @@ -1,6 +1,5 @@ """Additional utility functions for ISIMIP tools.""" from collections.abc import Callable -from itertools import product from pathlib import Path from typing import Any @@ -122,110 +121,3 @@ def validate_lon(lon: float) -> None: raise ValidationError(f'lon={lon} must be < 180') except TypeError as e: raise ValidationError(f'lon={lon} is a valid number') from e - -def get_permutations(parameters: dict[str, list]) -> tuple[tuple]: - """Generate all permutations from parameter value lists. - - Args: - parameters (dict[str, list]): Dictionary mapping parameter names to lists of values. - - Returns: - Tuple of tuples representing all possible combinations of parameter values. - """ - return tuple(product(*parameters.values())) - - -def get_placeholders(parameters: dict[str, list], permutation: tuple) -> dict: - """Convert a permutation tuple into a dictionary of placeholders. - - Args: - parameters (dict[str, list]): Dictionary mapping parameter names to lists of values. - permutation (tuple): Tuple of values representing one permutation. - - Returns: - Dictionary mapping parameter names to their values in this permutation. - """ - return dict(zip(parameters.keys(), permutation, strict=True)) - - -def join_parameters(parameters: dict[str, list[str]], max_count: int = 5, - max_label: str = 'various') -> dict[str, str]: - """Join parameter values into strings, with fallback for large value sets. 
- - Args: - parameters (dict[str, list[str]]): Dictionary mapping parameter names to lists of values. - max_count (int): Maximum number of values to join (default: 5). - max_label (str): Label to use when value count exceeds max_count (default: 'various'). - - Returns: - Dictionary mapping parameter names to joined strings or max_label. - """ - return { - key: (max_label if len(values) > max_count else '+'.join(values)) - for key, values in parameters.items() - } - - -def copy_placeholders(*placeholder_args: dict, **kwargs: Any) -> dict: - """Merge multiple placeholder dictionaries and additional kwargs. - - Args: - *placeholder_args (dict): Variable number of placeholder dictionaries to merge. - **kwargs (Any): Additional key-value pairs to add to the result. - - Returns: - Dictionary containing all merged placeholders. - """ - placeholders = { - key: value - for placeholder_arg in placeholder_args - for key, value in placeholder_arg.items() - } - placeholders.update(**kwargs) - return placeholders - - -def apply_placeholders(path_template: str | Path, placeholders: dict) -> Path: - """Apply placeholders to a string or path, ensuring that the name of the path is lower case - - Args: - path_template (str | Path): Path template as string or path. - placeholders (dict): Placeholder dictionary. - - Returns: - Path with the applied placeholders. - """ - try: - path = str(path_template).format(**placeholders) - except KeyError as e: - raise RuntimeError('Some of the placeholders are missing.') from e - - path = Path(path) - return path.with_stem(path.stem.lower()) - - -def update_year(placeholders: dict, key: str, year: int | str, operator: str) -> None: - """Update a year placeholder based on comparison operator. - - Args: - placeholders (dict): Dictionary of placeholders to update. - key (str): Key in placeholders dictionary to update. - year (int | str): Year value to compare/set. - operator (str): Comparison operator ('<' for minimum, '>' for maximum). 
- - Raises: - RuntimeError: If operator is not '<' or '>'. - - Note: - Updates placeholders[key] in-place if condition is met. - """ - if operator not in ('<', '>'): - raise RuntimeError(f'operator "{operator}" not supported') - - current = placeholders.get(key) - if ( - (current is None) or - (operator == '>' and int(current) < int(year)) or - (operator == '<' and int(current) > int(year)) - ): - placeholders[key] = year From 9ba214b87b2d39b8488b92b8938381f1329d93ed Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 23 Dec 2025 18:38:48 +0100 Subject: [PATCH 102/162] Refactor find_files, remove update_year and add get_min_value and get_max_value --- isimip_utils/files.py | 40 ++++++++++++++++++++++++++++++++++++++ isimip_utils/parameters.py | 27 ------------------------- isimip_utils/patterns.py | 25 ------------------------ isimip_utils/plot.py | 2 +- isimip_utils/utils.py | 24 +++++++++++++++++++++++ 5 files changed, 65 insertions(+), 53 deletions(-) create mode 100644 isimip_utils/files.py diff --git a/isimip_utils/files.py b/isimip_utils/files.py new file mode 100644 index 0000000..69b4cf9 --- /dev/null +++ b/isimip_utils/files.py @@ -0,0 +1,40 @@ +"""Functions to find files for specific datasets.""" +import logging +import re +from collections.abc import Iterable +from pathlib import Path + +logger = logging.getLogger(__name__) + + +def find_files(base_path: str | Path, file_iter: Iterable[Path], + pattern: str = r'_(?P<start_year>\d{4})_*(?P<end_year>\d{4})?\.nc\d*$') -> tuple[list[tuple], int, int]: + """Find files for a given (dataset) path, matching a regex pattern for start and end year. + + Args: + base_path (str | Path): Base path for file discovery. + file_iter (Iterable[Path]): Iterator over file paths to search through. + pattern (str): Regular expression for start and end year matching. + + Returns: + Tuple containing (a) the List of tuples containing the path and the start and end years for each file, + (b) the lowest start year, and (c) the highest end year. 
+ """ + files = [] + + for file_path in sorted(file_iter): + match = re.search(pattern, str(file_path), re.IGNORECASE) + if match: + try: + start_year = int(match.group('start_year')) + except TypeError: + start_year = None + + try: + end_year = int(match.group('end_year')) + except TypeError: + end_year = None + + files.append((file_path, start_year, end_year)) + + return files diff --git a/isimip_utils/parameters.py b/isimip_utils/parameters.py index b68a906..f96ca74 100644 --- a/isimip_utils/parameters.py +++ b/isimip_utils/parameters.py @@ -83,30 +83,3 @@ def apply_placeholders(path_template: str | Path, placeholders: dict) -> Path: path = Path(path) return path.with_stem(path.stem.lower()) - - -def update_year(placeholders: dict, key: str, year: int | str, operator: str) -> None: - """Update a year placeholder based on comparison operator. - - Args: - placeholders (dict): Dictionary of placeholders to update. - key (str): Key in placeholders dictionary to update. - year (int | str): Year value to compare/set. - operator (str): Comparison operator ('<' for minimum, '>' for maximum). - - Raises: - RuntimeError: If operator is not '<' or '>'. - - Note: - Updates placeholders[key] in-place if condition is met. 
- """ - if operator not in ('<', '>'): - raise RuntimeError(f'operator "{operator}" not supported') - - current = placeholders.get(key) - if ( - (current is None) or - (operator == '>' and int(current) < int(year)) or - (operator == '<' and int(current) > int(year)) - ): - placeholders[key] = year diff --git a/isimip_utils/patterns.py b/isimip_utils/patterns.py index 5654aef..7ee610a 100644 --- a/isimip_utils/patterns.py +++ b/isimip_utils/patterns.py @@ -1,7 +1,6 @@ """Functions to match file names and extract ISIMIP specifiers.""" import logging import re -from collections.abc import Iterable from pathlib import Path from .exceptions import DidNotMatch @@ -156,27 +155,3 @@ def match_string(pattern: re.Pattern, string: str) -> tuple[Path, dict]: return Path(match.group(0)), specifiers else: raise DidNotMatch(f'No match for {string} ("{pattern.pattern}")') - - -def find_files(pattern: re.Pattern, file_iter: Iterable[Path]) -> list[dict]: - """Find files matching a regex pattern from an iterator. - - Args: - pattern (re.Pattern): Compiled regular expression pattern to match against file paths. - file_iter (Iterable[Path]): Iterator over file paths to search through. - - Returns: - List of dictionaries containing 'path' and any named groups from the regex match. - """ - files = [] - for path in sorted(file_iter): - try: - _, specifiers = match_string(pattern, str(path)) - files.append({ - 'path': path, - **specifiers - }) - except DidNotMatch: - pass - - return files diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 726bfd5..3e0b380 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -114,7 +114,7 @@ def save_index(index_path: Path) -> None: '''.replace(r'{{ index_json }}', index_json).strip()) -def get_plot_title(permutation: tuple) -> dict: +def format_title(permutation: tuple) -> dict: """Create a plot title from a permutation tuple. 
Args: diff --git a/isimip_utils/utils.py b/isimip_utils/utils.py index 97888e7..af8fd44 100644 --- a/isimip_utils/utils.py +++ b/isimip_utils/utils.py @@ -121,3 +121,27 @@ def validate_lon(lon: float) -> None: raise ValidationError(f'lon={lon} must be < 180') except TypeError as e: raise ValidationError(f'lon={lon} is a valid number') from e + + +def get_min_value(values): + """Get the minimal value of the input values, excluding None and using None as default. + + Args: + values (list): Input values. + + Returns: + Minimal value + """ + return min([v for v in values if v is not None], default=None) + + +def get_max_value(values): + """Get the maximum value of the input values, excluding None and using None as default. + + Args: + values (list): Input values. + + Returns: + Maximum value + """ + return max([v for v in values if v is not None], default=None) From 88a086955972056c20fbc1ff6ecadf9dd72636cc Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 24 Dec 2025 10:34:22 +0100 Subject: [PATCH 103/162] Refactor and fix tests --- isimip_utils/files.py | 2 +- isimip_utils/tests/helper.py | 17 +++++ isimip_utils/tests/test_fetch.py | 94 +-------------------------- isimip_utils/tests/test_files.py | 40 ++++++++++++ isimip_utils/tests/test_parameters.py | 52 +++++++++++++++ isimip_utils/tests/test_patterns.py | 22 +------ isimip_utils/tests/test_plot.py | 19 +++--- isimip_utils/tests/test_protocol.py | 77 ++++++++++++++++++++++ isimip_utils/tests/test_utils.py | 69 -------------------- 9 files changed, 200 insertions(+), 192 deletions(-) create mode 100644 isimip_utils/tests/test_files.py create mode 100644 isimip_utils/tests/test_parameters.py create mode 100644 isimip_utils/tests/test_protocol.py diff --git a/isimip_utils/files.py b/isimip_utils/files.py index 69b4cf9..9e99349 100644 --- a/isimip_utils/files.py +++ b/isimip_utils/files.py @@ -8,7 +8,7 @@ def find_files(base_path: str | Path, file_iter: Iterable[Path], - pattern: str = 
r'_(?P<start_year>\d{4})_*(?P<end_year>\d{4})?\.nc\d*$') -> tuple[list[tuple], int, int]: + pattern: str = r'_(?P<start_year>\d{4})_(?P<end_year>\d{4})?\.nc\d*$') -> tuple[list[tuple], int, int]: """Find files for a given (dataset) path, matching a regex pattern for start and end year. Args: diff --git a/isimip_utils/tests/helper.py b/isimip_utils/tests/helper.py index 3ef86b8..db0440c 100644 --- a/isimip_utils/tests/helper.py +++ b/isimip_utils/tests/helper.py @@ -1,5 +1,8 @@ +import json import re import subprocess +from pathlib import Path +from unittest.mock import MagicMock def call(cmd): @@ -14,3 +17,17 @@ def normalize_whitespace(string): def assert_multiline_strings_equal(a, b): for a_line, b_line in zip(a.strip().splitlines(), b.strip().splitlines(), strict=True): assert normalize_whitespace(a_line) == normalize_whitespace(b_line), (a_line, b_line) + +def mock_side_effect(url, *args, **kwargs): + mock_response = MagicMock() + mock_path = Path(url.replace('https://protocol.isimip.org', 'testing/protocol/output')) + + if mock_path.exists(): + with mock_path.open() as fp: + mock_response.status_code = 200 + mock_response.json.return_value = json.load(fp) + else: + mock_response.status_code = 404 + mock_response.json.return_value = None + + return mock_response diff --git a/isimip_utils/tests/test_fetch.py b/isimip_utils/tests/test_fetch.py index b3cf868..5682c33 100644 --- a/isimip_utils/tests/test_fetch.py +++ b/isimip_utils/tests/test_fetch.py @@ -1,106 +1,18 @@ -import json -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest +from unittest.mock import patch from isimip_utils.fetch import ( - fetch_definitions, fetch_json, - fetch_pattern, - fetch_schema, - fetch_tree, - find_json, load_json, ) +from .helper import mock_side_effect + paths = [ 'ISIMIP3a/OutputData/agriculture/ACEA/gswp3-w5e5.json', 'ISIMIP3a/OutputData/agriculture/ACEA.json', 'ISIMIP3a/OutputData/agriculture.json' ] - -def mock_side_effect(url, *args, **kwargs): - mock_response = MagicMock() - 
mock_path = Path(url.replace('https://protocol.isimip.org', 'testing/protocol/output')) - - if mock_path.exists(): - with mock_path.open() as fp: - mock_response.status_code = 200 - mock_response.json.return_value = json.load(fp) - else: - mock_response.status_code = 404 - mock_response.json.return_value = None - - return mock_response - - -@pytest.mark.parametrize('path', paths) -def test_fetch_definitions(path): - with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): - data = fetch_definitions(path) - assert data and isinstance(data, dict) - - -@pytest.mark.parametrize('path', paths) -def test_fetch_definitions_local(path): - data = fetch_definitions(path, 'testing/protocol') - assert data and isinstance(data, dict) - - -@pytest.mark.parametrize('path', paths) -def test_fetch_pattern(path): - with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): - data = fetch_pattern(path) - assert data and isinstance(data, dict) - - -@pytest.mark.parametrize('path', paths) -def test_fetch_pattern_local(path): - data = fetch_pattern(path, 'testing/protocol') - assert data and isinstance(data, dict) - - -@pytest.mark.parametrize('path', paths) -def test_fetch_schema(path): - with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): - data = fetch_schema(path) - assert data and isinstance(data, dict) - - -@pytest.mark.parametrize('path', paths) -def test_fetch_schema_local(path): - data = fetch_schema(path, 'testing/protocol') - assert data and isinstance(data, dict) - - -@pytest.mark.parametrize('path', paths) -def test_fetch_tree(path): - with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): - data = fetch_tree(path) - assert data and isinstance(data, dict) - - -@pytest.mark.parametrize('path', paths) -def test_fetch_tree_local(path): - data = fetch_tree(path, 'testing/protocol') - assert data and isinstance(data, dict) - - -@pytest.mark.parametrize('path', paths) -def 
test_find_json_fetch(path): - with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): - data = find_json('https://protocol.isimip.org', 'definitions', path) - assert data is not None - - -@pytest.mark.parametrize('path', paths) -def test_find_json_load(path): - data = find_json('testing/protocol', 'definitions', path) - assert data is not None - - def test_fetch_json(): with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): data = fetch_json("https://protocol.isimip.org/definitions/ISIMIP3a/OutputData/agriculture.json") diff --git a/isimip_utils/tests/test_files.py b/isimip_utils/tests/test_files.py new file mode 100644 index 0000000..a8d3acb --- /dev/null +++ b/isimip_utils/tests/test_files.py @@ -0,0 +1,40 @@ +from pathlib import Path + +from isimip_utils.files import find_files +from isimip_utils.tests import constants + + +def test_find_files(): + file_path = Path(constants.YIELD_PATH) + fake_path = file_path.with_stem(file_path.stem + '_a') + files = [ + file_path.name, + fake_path.name + ] + + result = find_files(file_path.parent, files) + assert len(result) + assert result == [ + (file_path.name, 1901, 2016) + ] + + +def test_find_files_with_pattern(): + file_path = Path(constants.YIELD_PATH) + fake_path = file_path.with_stem(file_path.stem + '_a') + none_path = file_path.with_stem(file_path.stem.replace('_1901_2016', '')) + files = [ + file_path.name, + fake_path.name, + none_path.name, + ] + + pattern = r'(_(?P<start_year>\d{4}))?(_(?P<end_year>\d{4}))?(_\w+)?\.nc\d*$' + + result = find_files(file_path.parent, files, pattern=pattern) + assert len(result) + assert result == [ + (none_path.name, None, None), # result is sorted + (file_path.name, 1901, 2016), + (fake_path.name, 1901, 2016), + ] diff --git a/isimip_utils/tests/test_parameters.py b/isimip_utils/tests/test_parameters.py new file mode 100644 index 0000000..8314b1e --- /dev/null +++ b/isimip_utils/tests/test_parameters.py @@ -0,0 +1,52 @@ +from isimip_utils.parameters import 
copy_placeholders, get_permutations, get_placeholders, join_parameters + +parameters = { + 'model': ['model_a', 'model_b'], + 'variable': ['x', 'y', 'z'] +} + + +def test_get_permutations(): + assert get_permutations(parameters) == ( + ('model_a', 'x'), + ('model_a', 'y'), + ('model_a', 'z'), + ('model_b', 'x'), + ('model_b', 'y'), + ('model_b', 'z') + ) + + +def test_get_placeholders(): + assert get_placeholders(parameters, ('model_a', 'x')) == { + 'model': 'model_a', + 'variable': 'x' + } + + +def test_join_parameters(): + assert join_parameters(parameters) == { + 'model': 'model_a+model_b', + 'variable': 'x+y+z' + } + + +def test_join_parameters_max_count(): + assert join_parameters(parameters, 2) == { + 'model': 'model_a+model_b', + 'variable': 'various' + } + + +def test_join_parameters_max_count_label(): + assert join_parameters(parameters, 2, 'label') == { + 'model': 'model_a+model_b', + 'variable': 'label' + } + + +def test_copy_placeholders(): + assert copy_placeholders({'foo': 'bar'}, {'egg': 'spam'}) == { + 'foo': 'bar', + 'egg': 'spam' + } diff --git a/isimip_utils/tests/test_patterns.py b/isimip_utils/tests/test_patterns.py index b0ae7d2..38d9fcd 100644 --- a/isimip_utils/tests/test_patterns.py +++ b/isimip_utils/tests/test_patterns.py @@ -1,7 +1,7 @@ from pathlib import Path -from isimip_utils.fetch import fetch_pattern -from isimip_utils.patterns import find_files, match_dataset, match_dataset_path, match_file, match_file_path, match_path +from isimip_utils.patterns import match_dataset, match_dataset_path, match_file, match_file_path, match_path +from isimip_utils.protocol import fetch_pattern from isimip_utils.tests import constants protocol_locations = ['testing/protocol'] @@ -96,21 +96,3 @@ def test_match_path_specifiers_map(): assert str(path) == str(file_path) assert specifiers == {**path_specifiers, **file_specifiers, 'region': 'spam'} - - -def test_find_files(): - file_path = Path(constants.YIELD_PATH) - files = [ - file_path.name, - 
file_path.name.replace('_global_', 'a'), - file_path.name.replace('_global_', 'b'), - file_path.name.replace('_global_', 'c') - ] - - pattern = fetch_pattern(pattern_path, protocol_locations) - result = find_files(pattern['file'], files) - assert len(result) - assert result == [{ - 'path': file_path.name, - **file_specifiers - }] diff --git a/isimip_utils/tests/test_plot.py b/isimip_utils/tests/test_plot.py index f0997d1..bf61996 100644 --- a/isimip_utils/tests/test_plot.py +++ b/isimip_utils/tests/test_plot.py @@ -3,7 +3,7 @@ import pandas as pd from isimip_utils.pandas import compute_average, create_label -from isimip_utils.plot import get_plot_title, plot_grid, plot_line, plot_map, save_index, save_plot +from isimip_utils.plot import format_title, plot_grid, plot_line, plot_map, save_index, save_plot from isimip_utils.tests import constants from isimip_utils.xarray import open_dataset, to_dataframe @@ -209,16 +209,11 @@ def test_plot_grid(): df_empty = pd.DataFrame({ 'time': dataframes[2]['time'], 'tas': np.nan }) - parameters = { - 'ab': ('a', 'b'), - 'xy': ('x', 'y'), - } - - permutations = ( + permutations = [ ('a', 'x'), ('a', 'y'), ('b', 'x') - ) + ] plots = {} for permutation, df in zip(permutations, dataframes, strict=True): @@ -226,7 +221,9 @@ def test_plot_grid(): empty_plot = plot_line(df, empty=True) - chart = plot_grid(parameters, plots, x='independent', empty_plot=empty_plot, layer=False) + permutations.append(('b', 'y')) + + chart = plot_grid(permutations, plots, x='independent', empty_plot=empty_plot, layer=False) top, bottom = chart.vconcat top_left, top_right = top.hconcat @@ -255,10 +252,10 @@ def test_save_index(): assert index_path.is_file -def test_get_plot_title(): +def test_format_title(): permutation = ('a', 'b', 'c') - assert get_plot_title(permutation) == { + assert format_title(permutation) == { "text": 'a Β· b Β· c', "fontSize": 16, "dy": -10 diff --git a/isimip_utils/tests/test_protocol.py b/isimip_utils/tests/test_protocol.py new 
file mode 100644 index 0000000..7dfc8ea --- /dev/null +++ b/isimip_utils/tests/test_protocol.py @@ -0,0 +1,77 @@ +from unittest.mock import patch + +import pytest + +from isimip_utils.protocol import ( + fetch_definitions, + fetch_pattern, + fetch_schema, + fetch_tree, + find_json, +) + +from .helper import mock_side_effect + +paths = [ + 'ISIMIP3a/OutputData/agriculture/ACEA/gswp3-w5e5.json', + 'ISIMIP3a/OutputData/agriculture/ACEA.json', + 'ISIMIP3a/OutputData/agriculture.json' +] + + +@pytest.mark.parametrize('path', paths) +def test_fetch_definitions_local(path): + data = fetch_definitions(path, 'testing/protocol') + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_fetch_pattern(path): + with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + data = fetch_pattern(path) + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_fetch_pattern_local(path): + data = fetch_pattern(path, 'testing/protocol') + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_fetch_schema(path): + with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + data = fetch_schema(path) + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_fetch_schema_local(path): + data = fetch_schema(path, 'testing/protocol') + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_fetch_tree(path): + with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + data = fetch_tree(path) + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_fetch_tree_local(path): + data = fetch_tree(path, 'testing/protocol') + assert data and isinstance(data, dict) + + +@pytest.mark.parametrize('path', paths) +def test_find_json_fetch(path): + with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): 
+ data = find_json('https://protocol.isimip.org', 'definitions', path) + assert data is not None + + +@pytest.mark.parametrize('path', paths) +def test_find_json_load(path): + data = find_json('testing/protocol', 'definitions', path) + assert data is not None diff --git a/isimip_utils/tests/test_utils.py b/isimip_utils/tests/test_utils.py index 46c1d79..362f271 100644 --- a/isimip_utils/tests/test_utils.py +++ b/isimip_utils/tests/test_utils.py @@ -4,13 +4,8 @@ from isimip_utils.utils import ( Singleton, cached_property, - copy_placeholders, exclude_path, - get_permutations, - get_placeholders, include_path, - join_parameters, - update_year, validate_lat, validate_lon, ) @@ -21,11 +16,6 @@ 'a/b/e' ] -parameters = { - 'model': ['model_a', 'model_b'], - 'variable': ['x', 'y', 'z'] -} - def test_singleton(): a = Singleton() @@ -87,62 +77,3 @@ def test_include_path(): assert include_path(paths, 'a/b/c') is True assert include_path(paths, 'a/b/cc') is True assert include_path(paths, 'a/b/f') is False - - -def test_get_permutations(): - assert get_permutations(parameters) == ( - ('model_a', 'x'), - ('model_a', 'y'), - ('model_a', 'z'), - ('model_b', 'x'), - ('model_b', 'y'), - ('model_b', 'z') - ) - - -def test_get_placeholders(): - assert get_placeholders(parameters, ('model_a', 'x')) == { - 'model': 'model_a', - 'variable': 'x' - } - - -def test_join_parameters(): - assert join_parameters(parameters) == { - 'model': 'model_a+model_b', - 'variable': 'x+y+z' - } - - -def test_join_parameters_max_count(): - assert join_parameters(parameters, 2) == { - 'model': 'model_a+model_b', - 'variable': 'various' - } - - -def test_join_parameters_max_count_label(): - assert join_parameters(parameters, 2, 'label') == { - 'model': 'model_a+model_b', - 'variable': 'label' - } - - -def test_copy_placeholders(): - assert copy_placeholders({'foo': 'bar'}, {'egg': 'spam'}) == { - 'foo': 'bar', - 'egg': 'spam' - } - - -def test_update_year(): - placeholders = {'year': 2000} - - 
update_year(placeholders, 'year', 2001, '<') - assert placeholders == {'year': 2000} - - update_year(placeholders, 'year', 2001, '>') - assert placeholders == {'year': 2001} - - update_year(placeholders, 'year', 2000, '<') - assert placeholders == {'year': 2000} From 7f616d8b1601d01d84fededc0c58161c2b1c5fe1 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 25 Nov 2025 13:01:41 +0100 Subject: [PATCH 104/162] Add GitHub Actions workflow --- .github/workflows/ci.yaml | 123 ++++++++++++++++++++++++++++++++++++++ .pre-commit-config.yaml | 5 ++ pyproject.toml | 3 +- testing/download.py | 0 4 files changed, 129 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/ci.yaml mode change 100644 => 100755 testing/download.py diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..38464f1 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,123 @@ +name: isimip-utils CI + +on: + push: + branches: + - main + - test + pull_request: + +permissions: + contents: read + actions: read + +env: + PYTHONDONTWRITEBYTECODE: 1 + FORCE_COLOR: 1 + PYTHON_VERSION: "3.11" + +jobs: + setup: + name: Setup tests + runs-on: ubuntu-latest + permissions: + contents: write + + steps: + - name: Check out repository πŸ’Ύ + uses: actions/checkout@v5 + with: + persist-credentials: false + + - name: Restore testing cache πŸ“₯ + uses: actions/cache@v4 + with: + path: | + testing/datasets + testing/extractions + testing/protocol + testing/share + key: testing-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/*.py') }} + restore-keys: | + testing-${{ runner.os }}-${{ env.PYTHON_VERSION }}- + + - name: Install cdo 🌍 + run: | + sudo apt-get update + sudo apt-get install -y cdo --no-install-recommends + + - name: Set up Python 🐍 + uses: actions/setup-python@v6 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install package πŸ“¦ + run: pip install -e .[all] + + - name: Run setup scripts πŸ”§ + run: | + python 
testing/download.py + python testing/setup.py + + - name: Save testing cache πŸ“€ + if: always() + uses: actions/cache/save@v4 + with: + path: | + testing/datasets + testing/extractions + testing/protocol + testing/share + key: testing-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/*.py') }} + + test: + name: Run tests + needs: setup + runs-on: ubuntu-latest + permissions: + contents: write + + steps: + - name: Check out repository πŸ’Ύ + uses: actions/checkout@v5 + with: + persist-credentials: false + + - name: Restore testing cache πŸ“₯ + uses: actions/cache@v4 + with: + path: | + testing/datasets + testing/extractions + testing/protocol + testing/share + key: testing-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/*.py') }} + restore-keys: | + testing-${{ runner.os }}-${{ env.PYTHON_VERSION }}- + + - name: Install cdo 🌍 + run: | + sudo apt-get update + sudo apt-get install -y cdo --no-install-recommends + + - name: Set up Python 🐍 + uses: actions/setup-python@v6 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install package πŸ“¦ + run: pip install -e .[all] + + - name: Run pytest πŸ§ͺ + run: pytest --cov + + - name: Upload testing directory πŸ“€ + uses: actions/upload-artifact@v5 + with: + name: testing + path: | + testing/datasets + testing/extractions + testing/output + testing/protocol + testing/share diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a37beb0..031203a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,3 +21,8 @@ repos: rev: v1.39.2 hooks: - id: typos + + - repo: https://github.com/zizmorcore/zizmor-pre-commit + rev: v1.16.3 + hooks: + - id: zizmor diff --git a/pyproject.toml b/pyproject.toml index 0f55fd5..402b073 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,12 +12,11 @@ maintainers = [ ] description = "This package contains common functionality for different ISIMIP tools." 
readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.11" license = { file = "LICENSE" } classifiers = [ 'Operating System :: OS Independent', 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: 3.13', diff --git a/testing/download.py b/testing/download.py old mode 100644 new mode 100755 From 2239cba86eec36ad8d66bb35a0e96fa9adc8670a Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 10 Dec 2025 14:24:57 +0100 Subject: [PATCH 105/162] Use float64 for _FillValue and float32 for missing_value --- isimip_utils/tests/test_xarray.py | 14 +++++++------- isimip_utils/xarray.py | 7 ++++--- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/isimip_utils/tests/test_xarray.py b/isimip_utils/tests/test_xarray.py index 1fa11b8..076def4 100644 --- a/isimip_utils/tests/test_xarray.py +++ b/isimip_utils/tests/test_xarray.py @@ -122,7 +122,7 @@ def test_init_dataset_args(): double var(time, lat, lon) ; var:_FillValue = 1.e+20 ; var:long_name = "Variable" ; - var:missing_value = 1.e+20 ; + var:missing_value = 1.e+20f ; } ''') @@ -185,14 +185,14 @@ def test_init_dataset_latlon(): double var(time, lat, lon) ; var:_FillValue = 1.e+20 ; var:long_name = "Variable" ; - var:missing_value = 1.e+20 ; + var:missing_value = 1.e+20f ; } ''') def test_init_dataset_dims(): - a = np.arange(0, 10, dtype=np.float64) - b = np.arange(0, 10, dtype=np.float64) + a = np.arange(0, 2, dtype=np.float64) + b = np.arange(0, 3, dtype=np.float64) var = np.random.rand(b.size, a.size, 360, 720).astype(np.float64) attrs = { @@ -231,8 +231,8 @@ def test_init_dataset_dims(): dimensions: lon = 720 ; lat = 360 ; - b = 10 ; - a = 10 ; + b = 3 ; + a = 2 ; variables: double lon(lon) ; lon:standard_name = "longitude" ; @@ -253,7 +253,7 @@ def test_init_dataset_dims(): double var(b, a, lat, lon) ; var:_FillValue = 1.e+20 ; var:long_name = 
"Variable" ; - var:missing_value = 1.e+20 ; + var:missing_value = 1.e+20f ; } ''') diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index afa30fd..09a95f0 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -33,7 +33,8 @@ } } -FILL_VALUE = np.float32(1.e+20) +FILL_VALUE = np.float64(1e20) +MISSING_VALUE = np.float32(1e20) def init_dataset(lon: None | int | np.ndarray = 720, lat: None | int | np.ndarray = 360, @@ -114,7 +115,7 @@ def init_dataset(lon: None | int | np.ndarray = 720, if data_var in attrs: ds.data_vars[data_var].attrs.update(attrs[data_var]) - ds.data_vars[data_var].attrs["_FillValue"] = 1.e+20 + # ds.data_vars[data_var].attrs["_FillValue"] = FILL_VALUE # set global attributes ds.attrs = attrs.get('global', {}) @@ -284,7 +285,7 @@ def add_fill_value_to_data_vars(ds: xr.Dataset) -> xr.Dataset: if '_FillValue' not in ds.data_vars[data_var].attrs: ds.data_vars[data_var].attrs['_FillValue'] = FILL_VALUE if 'missing_value' not in ds.data_vars[data_var].attrs: - ds.data_vars[data_var].attrs['missing_value'] = FILL_VALUE + ds.data_vars[data_var].attrs['missing_value'] = MISSING_VALUE return ds From bdc51c6a31fc89ddc1defe502896f71750d04eec Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 10 Dec 2025 14:25:16 +0100 Subject: [PATCH 106/162] Update GitHub actions --- .github/workflows/ci.yaml | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 38464f1..fb8d8c5 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -3,8 +3,8 @@ name: isimip-utils CI on: push: branches: - - main - - test + - main + - test pull_request: permissions: @@ -109,15 +109,4 @@ jobs: run: pip install -e .[all] - name: Run pytest πŸ§ͺ - run: pytest --cov - - - name: Upload testing directory πŸ“€ - uses: actions/upload-artifact@v5 - with: - name: testing - path: | - testing/datasets - testing/extractions - testing/output - testing/protocol - 
testing/share + run: pytest --cov=isimip_utils --cov-fail-under --cov-report=term-missing From b1cc4355ec6a50785c022c07852f5e9d1c88c77b Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 10 Dec 2025 14:59:20 +0100 Subject: [PATCH 107/162] Adjust rioxarray version for Python 3.11 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 402b073..ae8a299 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ plots = [ ] shapes = [ "geopandas~=1.1", - "rioxarray~=0.20", + "rioxarray>=0.19", ] xarray = [ "cftime~=1.6", From 33e5c93f26d3c198b82cc0e2631e73106e090ddd Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 10 Dec 2025 15:21:23 +0100 Subject: [PATCH 108/162] Update GitHub actions --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index fb8d8c5..8d8e11a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -109,4 +109,4 @@ jobs: run: pip install -e .[all] - name: Run pytest πŸ§ͺ - run: pytest --cov=isimip_utils --cov-fail-under --cov-report=term-missing + run: pytest --cov=isimip_utils --cov-fail-under=90 --cov-report=term-missing From 935b0af3a90227323a7151865d40b02d3192521b Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 10 Dec 2025 18:13:30 +0100 Subject: [PATCH 109/162] Check for existing files in testing/setup.py --- .github/workflows/ci.yaml | 2 +- testing/setup.py | 60 ++++++++++++++++----------------------- 2 files changed, 26 insertions(+), 36 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8d8e11a..cf6c787 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -98,7 +98,7 @@ jobs: - name: Install cdo 🌍 run: | sudo apt-get update - sudo apt-get install -y cdo --no-install-recommends + sudo apt-get install -y cdo netcdf-bin --no-install-recommends - name: Set up Python 🐍 uses: 
actions/setup-python@v6 diff --git a/testing/setup.py b/testing/setup.py index 15498d8..797f88c 100755 --- a/testing/setup.py +++ b/testing/setup.py @@ -45,7 +45,8 @@ def run_gridfile(): output_path = constants.SHARE_PATH / 'gridarea.nc' output_path.parent.mkdir(parents=True, exist_ok=True) - helper.call(f'cdo gridarea {input_path} {output_path}') + if not output_path.exists(): + helper.call(f'cdo gridarea {input_path} {output_path}') def run_seldate(): @@ -57,7 +58,9 @@ def run_seldate(): ('2019-01-01', '2020-12-31', '2019_2020') ]: output_path = constants.DATASETS_PATH / constants.TAS_PATH.replace('2015_2020', specifiers) - helper.call(f'cdo -f nc4c -z zip_5 -L seldate,{start_date},{end_date} {input_path} {output_path}') + + if not output_path.exists(): + helper.call(f'cdo -f nc4c -z zip_5 -L seldate,{start_date},{end_date} {input_path} {output_path}') def run_select_time(): @@ -70,9 +73,9 @@ def run_select_time(): output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-time-cdo_') \ .replace('2015_2020', '20180101') output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.unlink(missing_ok=True) - helper.call(f'cdo -f nc4c -z zip_5 -L seldate,{date} {input_path} {output_path}') + if not output_path.exists(): + helper.call(f'cdo -f nc4c -z zip_5 -L seldate,{date} {input_path} {output_path}') def run_select_period(): @@ -85,9 +88,9 @@ def run_select_period(): output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-period-cdo_') \ .replace('2015_2020', '2015') output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.unlink(missing_ok=True) - helper.call(f'cdo -f nc4c -z zip_5 -L seldate,{start_date},{end_date} {input_path} {output_path}') + if not output_path.exists(): + helper.call(f'cdo -f nc4c -z zip_5 -L seldate,{start_date},{end_date} {input_path} {output_path}') def run_select_point(): @@ -96,50 +99,42 @@ def run_select_point(): # add one since cdo is counting from 1! 
ix, iy = ix + 1, iy + 1 - output_paths = [] for path in [constants.TAS_PATH, *constants.TAS_SPLIT_PATHS]: input_path = constants.DATASETS_PATH / path output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-point-cdo_') output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.unlink(missing_ok=True) - - output_paths.append(str(output_path)) - helper.call(f'cdo -f nc4c -z zip_5 -L -selindexbox,{ix},{ix},{iy},{iy} {input_path} {output_path}') + if not output_path.exists(): + helper.call(f'cdo -f nc4c -z zip_5 -L -selindexbox,{ix},{ix},{iy},{iy} {input_path} {output_path}') def run_select_bbox(): west, east, south, north = constants.BBOX - output_paths = [] for path in [constants.TAS_PATH, *constants.TAS_SPLIT_PATHS]: input_path = constants.DATASETS_PATH / path output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-cdo_') output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.unlink(missing_ok=True) - output_paths.append(str(output_path)) - - helper.call(f'cdo -f nc4c -z zip_5 -L -sellonlatbox,{west},{east},{south},{north} {input_path} {output_path}') + if not output_path.exists(): + helper.call(f'cdo -f nc4c -z zip_5 -L ' \ + f'-sellonlatbox,{west},{east},{south},{north} {input_path} {output_path}') def run_select_bbox_mean(): west, east, south, north = constants.BBOX - output_paths = [] for path in [constants.TAS_PATH, *constants.TAS_SPLIT_PATHS]: input_path = constants.DATASETS_PATH / path output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-mean-cdo_') output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.unlink(missing_ok=True) - - output_paths.append(str(output_path)) - helper.call('cdo -f nc4c -z zip_5 -L -fldmean ' \ - f'-sellonlatbox,{west},{east},{south},{north} {input_path} {output_path}') + if not output_path.exists(): + helper.call('cdo -f nc4c -z zip_5 -L -fldmean ' + f'-sellonlatbox,{west},{east},{south},{north} {input_path} {output_path}') 
def run_select_bbox_map(): @@ -150,42 +145,37 @@ def run_select_bbox_map(): output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-map-cdo_') output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.unlink(missing_ok=True) - helper.call('cdo -f nc4c -z zip_5 -L timmean ' \ - f'-sellonlatbox,{west},{east},{south},{north} {input_path} {output_path}') + if not output_path.exists(): + helper.call('cdo -f nc4c -z zip_5 -L timmean ' + f'-sellonlatbox,{west},{east},{south},{north} {input_path} {output_path}') def run_mask_bbox(): west, east, south, north = constants.BBOX - output_paths = [] for path in [constants.TAS_PATH, *constants.TAS_SPLIT_PATHS]: input_path = constants.DATASETS_PATH / path output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-bbox-cdo_') output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.unlink(missing_ok=True) - output_paths.append(str(output_path)) - - helper.call(f'cdo -f nc4c -z zip_5 -L -masklonlatbox,{west},{east},{south},{north} {input_path} {output_path}') + if not output_path.exists(): + helper.call(f'cdo -f nc4c -z zip_5 -L ' + f'-masklonlatbox,{west},{east},{south},{north} {input_path} {output_path}') def run_mask_mask(): mask_path = constants.DATASETS_PATH / constants.LANDSEAMASK_PATH - output_paths = [] for path in [constants.TAS_PATH, *constants.TAS_SPLIT_PATHS]: input_path = constants.DATASETS_PATH / path output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_mask-mask-cdo_') output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.unlink(missing_ok=True) - - output_paths.append(str(output_path)) - helper.call(f'cdo -f nc4c -z zip_5 -L -ifthen -selname,mask {mask_path} {input_path} {output_path}') + if not output_path.exists(): + helper.call(f'cdo -f nc4c -z zip_5 -L -ifthen -selname,mask {mask_path} {input_path} {output_path}') if __name__ == "__main__": From 00ae6e1c1e4d7cedf31304d6283617a7ccece19c Mon Sep 17 00:00:00 2001 From: 
Jochen Klar Date: Wed, 24 Dec 2025 14:11:51 +0100 Subject: [PATCH 110/162] Refactor GitHub action --- .github/workflows/ci.yaml | 58 ++++++++------------------------------- 1 file changed, 11 insertions(+), 47 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index cf6c787..fdd09d3 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -17,8 +17,8 @@ env: PYTHON_VERSION: "3.11" jobs: - setup: - name: Setup tests + test: + name: Run tests runs-on: ubuntu-latest permissions: contents: write @@ -37,27 +37,29 @@ jobs: testing/extractions testing/protocol testing/share - key: testing-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/*.py') }} + key: testing-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('isimip_utils/tests/constants.py') }} restore-keys: | testing-${{ runner.os }}-${{ env.PYTHON_VERSION }}- - name: Install cdo 🌍 run: | sudo apt-get update - sudo apt-get install -y cdo --no-install-recommends + sudo apt-get install -y cdo netcdf-bin --no-install-recommends - name: Set up Python 🐍 uses: actions/setup-python@v6 with: python-version: ${{ env.PYTHON_VERSION }} + cache: pip - name: Install package πŸ“¦ run: pip install -e .[all] - - name: Run setup scripts πŸ”§ - run: | - python testing/download.py - python testing/setup.py + - name: Run download script 🌐 + run: python testing/download.py + + - name: Run setup script πŸ”§ + run: python testing/setup.py - name: Save testing cache πŸ“€ if: always() @@ -68,45 +70,7 @@ jobs: testing/extractions testing/protocol testing/share - key: testing-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/*.py') }} - - test: - name: Run tests - needs: setup - runs-on: ubuntu-latest - permissions: - contents: write - - steps: - - name: Check out repository πŸ’Ύ - uses: actions/checkout@v5 - with: - persist-credentials: false - - - name: Restore testing cache πŸ“₯ - uses: actions/cache@v4 - with: - path: | - testing/datasets - testing/extractions - 
testing/protocol - testing/share - key: testing-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/*.py') }} - restore-keys: | - testing-${{ runner.os }}-${{ env.PYTHON_VERSION }}- - - - name: Install cdo 🌍 - run: | - sudo apt-get update - sudo apt-get install -y cdo netcdf-bin --no-install-recommends - - - name: Set up Python 🐍 - uses: actions/setup-python@v6 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Install package πŸ“¦ - run: pip install -e .[all] + key: testing-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('isimip_utils/tests/constants.py') }} - name: Run pytest πŸ§ͺ run: pytest --cov=isimip_utils --cov-fail-under=90 --cov-report=term-missing From 52cf7bae824d3ae0de14fa03e965a243f2b0a778 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 25 Dec 2025 13:47:21 +0100 Subject: [PATCH 111/162] Add conversion of date type in compute_time --- isimip_utils/xarray.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 09a95f0..bb408da 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -1,7 +1,7 @@ """Functions for working with xarray datasets for ISIMIP data.""" import logging import warnings -from datetime import datetime +from datetime import date, datetime from pathlib import Path import cftime @@ -324,11 +324,14 @@ def compute_time(ds: xr.Dataset, timestamp: datetime | None) -> float | None: Args: ds (xr.Dataset): Dataset with time coordinate containing units and calendar. - timestamp (datetime | None): Timestamp to convert, or None. + timestamp (datetime | date | None): Timestamp to convert, or None. Returns: Numeric time value in dataset's units, or None if timestamp is None. 
""" + if type(timestamp) is date: + timestamp = datetime.combine(timestamp, datetime.min.time()) + units = ds.time.encoding.get('units') or ds.coords['time'].attrs.get('units') calendar = ds.time.encoding.get('calendar') or ds.coords['time'].attrs.get('calendar') return cftime.date2num(timestamp, units=units, calendar=calendar) if timestamp else None From 26945fbe098378d837a35176f513485c710e4c14 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 25 Dec 2025 13:47:48 +0100 Subject: [PATCH 112/162] Add type argument to compute_average --- isimip_utils/pandas.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/isimip_utils/pandas.py b/isimip_utils/pandas.py index 245ab80..3db5a8c 100644 --- a/isimip_utils/pandas.py +++ b/isimip_utils/pandas.py @@ -1,4 +1,6 @@ """Pandas DataFrame utilities for ISIMIP data.""" +from typing import Literal + import pandas as pd @@ -135,14 +137,15 @@ def get_first_data_var_label(df: pd.DataFrame) -> str: return next(iter(get_data_var_labels(df))) -def compute_average(df: pd.DataFrame, data_var: None | str = None, area: bool = True) -> pd.DataFrame: - """Compute yearly average with optional standard deviation bounds. +def compute_average(df: pd.DataFrame, data_var: None | str = None, area: bool = True, + type: Literal['annual', 'monthly'] = 'annual') -> pd.DataFrame: + """Compute yearly or monthly average with optional standard deviation bounds. Args: df (pd.DataFrame): DataFrame with time column and data variable. data_var (str): Name of the data variable (default: first data var). area (bool): Whether to include lower/upper bounds using std (default: True). - + type ('annual' | 'monthly'): Compute annual or monthly averages Returns: DataFrame with yearly aggregated data. 
""" @@ -152,14 +155,21 @@ def compute_average(df: pd.DataFrame, data_var: None | str = None, area: bool = attrs = df.attrs - df['year'] = df['time'].dt.year + if type == 'annual': + column_name = 'year' + df[column_name] = df['time'].dt.year + elif type == 'monthly': + column_name = 'month' + df[column_name] = df['time'].values.astype('datetime64[M]') + else: + raise RuntimeError(f'unknown type "{type}" must be "annual" or "monthly"') kwargs = {'mean': (data_var, 'mean')} if area: kwargs['lower'] = (data_var, lambda y: y.mean() - y.std()) kwargs['upper'] = (data_var, lambda y: y.mean() + y.std()) - df = df.groupby('year').agg(**kwargs).reset_index() + df = df.groupby(column_name).agg(**kwargs).reset_index() # cast to double df['mean'] = df['mean'].astype('float64') @@ -169,10 +179,10 @@ def compute_average(df: pd.DataFrame, data_var: None | str = None, area: bool = # update attrs df.attrs = attrs - df.attrs['coords'] = {'year': {'long_name': 'Year', 'axis': 'T'}} + df.attrs['coords'] = {column_name: {'long_name': column_name.capitalize(), 'axis': 'T'}} df.attrs['data_vars'] = { 'mean': {} } if data_var_long_name: - df.attrs['data_vars']['mean']['long_name'] = f'Average {data_var_long_name.lower()}' + df.attrs['data_vars']['mean']['long_name'] = f'Average {type} {data_var_long_name.lower()}' if data_var_units: df.attrs['data_vars']['mean']['units'] = data_var_units From 14ca4cc4896bc4496f16b7254e401951acf8a5f1 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 25 Dec 2025 13:47:59 +0100 Subject: [PATCH 113/162] Refactor tests --- isimip_utils/tests/constants.py | 20 +++++++++----- isimip_utils/tests/test_extractions.py | 31 ++++++++++++++-------- isimip_utils/tests/test_pandas.py | 10 +++---- isimip_utils/tests/test_plot.py | 36 +++++++++++++++++--------- testing/setup.py | 27 ++++++++++--------- 5 files changed, 78 insertions(+), 46 deletions(-) diff --git a/isimip_utils/tests/constants.py b/isimip_utils/tests/constants.py index 0d1fb53..86f5c4a 100644 --- 
a/isimip_utils/tests/constants.py +++ b/isimip_utils/tests/constants.py @@ -1,3 +1,4 @@ +from datetime import date from pathlib import Path DATASETS_PATH = Path("testing/datasets") @@ -10,12 +11,19 @@ LANDSEAMASK_PATH = "ISIMIP3a/InputData/geo_conditions/landseamask/landseamask.nc" -TAS_PATH = "ISIMIP3b/InputData/climate/atmosphere/bias-adjusted/global/daily/" \ - "ssp585/GFDL-ESM4/gfdl-esm4_r1i1p1f1_w5e5_ssp585_tas_global_daily_2015_2020.nc" +TAS_PATH = "ISIMIP3a/InputData/climate/atmosphere/obsclim/global/daily/" \ + "historical/20CRv3-ERA5/20crv3-era5_obsclim_tas_global_daily_2021_2021.nc" +TAS_DATE_SPECIFIERS = '2021_2021' + +TAS_SPLIT_PERIOD = ( + (date(2021, 1, 1), date(2021, 4, 30)), + (date(2021, 5, 1), date(2021, 8, 31)), + (date(2021, 9, 1), date(2021, 12, 31)) +) TAS_SPLIT_PATHS = [ - TAS_PATH.replace('2015_2020', specifiers) - for specifiers in ('2015_2016', '2017_2018', '2019_2020') + TAS_PATH.replace(TAS_DATE_SPECIFIERS, f'{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}') + for start_date, end_date in TAS_SPLIT_PERIOD ] YIELD_PATH = "ISIMIP3a/OutputData/agriculture/LPJmL/gswp3-w5e5/historical/" \ @@ -31,8 +39,8 @@ PROTOCOL_LOCATIONS = ['testing/protocol'] PATTERN_PATH = 'ISIMIP3a/OutputData/agriculture.json' -DATE = '2018-01-01' -PERIOD = ('2015-01-01', '2015-12-31') +DATE = date(2021, 1, 1) +PERIOD = date(2021, 4, 1), date(2021, 9, 30) BBOX = (0, 10, -5, 5) diff --git a/isimip_utils/tests/test_extractions.py b/isimip_utils/tests/test_extractions.py index 8c7e7be..48c004e 100644 --- a/isimip_utils/tests/test_extractions.py +++ b/isimip_utils/tests/test_extractions.py @@ -1,4 +1,3 @@ -from datetime import datetime import pytest @@ -21,36 +20,46 @@ @pytest.mark.parametrize('decode_cf', (True, False)) def test_select_time(decode_cf): date = constants.DATE + date_specifiers = date.strftime('%Y%m%d') dataset_path = constants.DATASETS_PATH / constants.TAS_PATH - extraction_path = constants.EXTRACTIONS_PATH / 
constants.TAS_PATH.replace('_global_', '_select-time_') \ - .replace('2015_2020', '20180101') + extraction_path = ( + constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-time_') + .replace(constants.TAS_DATE_SPECIFIERS, date_specifiers) + ) extraction_path.unlink(missing_ok=True) with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: - ds = select_time(file_ds, datetime.strptime(date, "%Y-%m-%d")) + ds = select_time(file_ds, date) write_dataset(ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-time-cdo_') \ - .replace('2015_2020', '20180101') + cdo_path = ( + constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-time-cdo_') + .replace(constants.TAS_DATE_SPECIFIERS, date_specifiers) + ) helper.call(f'cdo diff {extraction_path} {cdo_path}') @pytest.mark.parametrize('decode_cf', (True, False)) def test_select_period(decode_cf): start_date, end_date = constants.PERIOD + date_specifiers = f'{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}' dataset_path = constants.DATASETS_PATH / constants.TAS_PATH - extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-period_') \ - .replace('2015_2020', '2015') + extraction_path = ( + constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-period_') + .replace(constants.TAS_DATE_SPECIFIERS, date_specifiers) + ) extraction_path.unlink(missing_ok=True) with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: - ds = select_period(file_ds, datetime.strptime(start_date, "%Y-%m-%d"), datetime.strptime(end_date, "%Y-%m-%d")) + ds = select_period(file_ds, start_date, end_date) write_dataset(ds, extraction_path) - cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-period-cdo_') \ - .replace('2015_2020', '2015') + cdo_path = ( + constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', 
'_select-period-cdo_') + .replace(constants.TAS_DATE_SPECIFIERS, date_specifiers) + ) helper.call(f'cdo diff {extraction_path} {cdo_path}') diff --git a/isimip_utils/tests/test_pandas.py b/isimip_utils/tests/test_pandas.py index d69c880..47f639e 100644 --- a/isimip_utils/tests/test_pandas.py +++ b/isimip_utils/tests/test_pandas.py @@ -45,8 +45,8 @@ def test_get_first_coord(extraction, result): @pytest.mark.parametrize('extraction,result', [ - ('bbox', ('Longitude [degrees_east]', 'Latitude [degrees_north]', 'time')), - ('point', ('time', )) + ('bbox', ('Longitude [degrees_east]', 'Latitude [degrees_north]', 'Time')), + ('point', ('Time', )) ]) def test_get_coord_labels(extraction, result): with open_dataset(constants.EXTRACTIONS_PATH / extractions[extraction]) as ds: @@ -55,7 +55,7 @@ def test_get_coord_labels(extraction, result): @pytest.mark.parametrize('extraction,result', [ - ('point', 'time') + ('point', 'Time') ]) def test_get_first_coord_label(extraction, result): with open_dataset(constants.EXTRACTIONS_PATH / extractions[extraction]) as ds: @@ -135,8 +135,8 @@ def test_group_by_day(): df = to_dataframe(ds) df = group_by_day(df, 'tas') - assert len(df) == 366 - assert df['tas'].between(260, 300).all() + assert len(df) == 365 + assert df['tas'].between(260, 305).all() def test_group_by_month(): diff --git a/isimip_utils/tests/test_plot.py b/isimip_utils/tests/test_plot.py index bf61996..5ea473c 100644 --- a/isimip_utils/tests/test_plot.py +++ b/isimip_utils/tests/test_plot.py @@ -77,7 +77,7 @@ def test_plot_line_area(): with open_dataset(extraction_path) as ds: df = to_dataframe(ds) - df = compute_average(df, 'tas') + df = compute_average(df, 'tas', type='monthly') chart = plot_line(df) @@ -85,10 +85,10 @@ def test_plot_line_area(): mean, area = chart.layer - assert mean.encoding.x.shorthand == 'year:T' + assert mean.encoding.x.shorthand == 'month:T' assert mean.encoding.y.shorthand == 'mean:Q' - assert area.encoding.x.shorthand == 'year:T' + assert 
area.encoding.x.shorthand == 'month:T' assert area.encoding.y.shorthand == 'lower:Q' assert area.encoding.y2.shorthand == 'upper:Q' @@ -105,7 +105,7 @@ def test_plot_line_color(): with open_dataset(extraction_path) as ds: df = to_dataframe(ds) - df = compute_average(df, 'tas') + df = compute_average(df, 'tas', type='monthly') df = create_label(df, ('a', 'b', 'c')) chart = plot_line(df, color_scheme='viridis') @@ -114,10 +114,10 @@ def test_plot_line_color(): mean, area = chart.layer - assert mean.encoding.x.shorthand == 'year:T' + assert mean.encoding.x.shorthand == 'month:T' assert mean.encoding.y.shorthand == 'mean:Q' - assert area.encoding.x.shorthand == 'year:T' + assert area.encoding.x.shorthand == 'month:T' assert area.encoding.y.shorthand == 'lower:Q' assert area.encoding.y2.shorthand == 'upper:Q' @@ -127,8 +127,12 @@ def test_plot_line_color(): def test_plot_map(): - extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-time-cdo_') \ - .replace('2015_2020', '20180101') + date = constants.DATE + date_specifiers = date.strftime('%Y%m%d') + extraction_path = ( + constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-time-cdo_') + .replace(constants.TAS_DATE_SPECIFIERS, date_specifiers) + ) plot_path = constants.PLOTS_PATH / 'plot_map.png' plot_path.unlink(missing_ok=True) @@ -148,8 +152,12 @@ def test_plot_map(): def test_plot_map_nocf(): - extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-time-cdo_') \ - .replace('2015_2020', '20180101') + date = constants.DATE + date_specifiers = date.strftime('%Y%m%d') + extraction_path = ( + constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-time-cdo_') + .replace(constants.TAS_DATE_SPECIFIERS, date_specifiers) + ) plot_path = constants.PLOTS_PATH / 'plot_map_nocf.png' plot_path.unlink(missing_ok=True) @@ -169,8 +177,12 @@ def test_plot_map_nocf(): def test_plot_map_empty(): - 
extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-time-cdo_') \ - .replace('2015_2020', '20180101') + date = constants.DATE + date_specifiers = date.strftime('%Y%m%d') + extraction_path = ( + constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-time-cdo_') + .replace(constants.TAS_DATE_SPECIFIERS, date_specifiers) + ) plot_path = constants.PLOTS_PATH / 'plot_map_empty.png' plot_path.unlink(missing_ok=True) diff --git a/testing/setup.py b/testing/setup.py index 797f88c..a8ea630 100755 --- a/testing/setup.py +++ b/testing/setup.py @@ -5,7 +5,6 @@ def main(): run_gridfile() run_seldate() - run_select_time() run_select_period() run_select_point() @@ -52,12 +51,10 @@ def run_gridfile(): def run_seldate(): input_path = constants.DATASETS_PATH / constants.TAS_PATH - for start_date, end_date, specifiers in [ - ('2015-01-01', '2016-12-31', '2015_2016'), - ('2017-01-01', '2018-12-31', '2017_2018'), - ('2019-01-01', '2020-12-31', '2019_2020') - ]: - output_path = constants.DATASETS_PATH / constants.TAS_PATH.replace('2015_2020', specifiers) + for period, path in zip(constants.TAS_SPLIT_PERIOD, constants.TAS_SPLIT_PATHS, strict=True): + start_date, end_date = period + start_date, end_date = start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d') + output_path = constants.DATASETS_PATH / path if not output_path.exists(): helper.call(f'cdo -f nc4c -z zip_5 -L seldate,{start_date},{end_date} {input_path} {output_path}') @@ -65,28 +62,34 @@ def run_seldate(): def run_select_time(): date = constants.DATE + date_specifiers = date.strftime('%Y%m%d') path = constants.TAS_PATH input_path = constants.DATASETS_PATH / path - output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-time-cdo_') \ - .replace('2015_2020', '20180101') + output_path = ( + constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-time-cdo_') + .replace(constants.TAS_DATE_SPECIFIERS, date_specifiers) + ) 
output_path.parent.mkdir(parents=True, exist_ok=True) if not output_path.exists(): - helper.call(f'cdo -f nc4c -z zip_5 -L seldate,{date} {input_path} {output_path}') + helper.call(f'cdo -f nc4c -z zip_5 -L seldate,{date.strftime('%Y-%m-%d')} {input_path} {output_path}') def run_select_period(): start_date, end_date = constants.PERIOD + date_specifiers = f'{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}' path = constants.TAS_PATH input_path = constants.DATASETS_PATH / path - output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-period-cdo_') \ - .replace('2015_2020', '2015') + output_path = ( + constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-period-cdo_') \ + .replace(constants.TAS_DATE_SPECIFIERS, date_specifiers) + ) output_path.parent.mkdir(parents=True, exist_ok=True) if not output_path.exists(): From 797d1854c7ab4fabc80628ecfa4e32987ab4fc33 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Thu, 25 Dec 2025 13:52:54 +0100 Subject: [PATCH 114/162] Fix constants.py --- isimip_utils/tests/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isimip_utils/tests/constants.py b/isimip_utils/tests/constants.py index 86f5c4a..6d02fbf 100644 --- a/isimip_utils/tests/constants.py +++ b/isimip_utils/tests/constants.py @@ -22,7 +22,7 @@ (date(2021, 9, 1), date(2021, 12, 31)) ) TAS_SPLIT_PATHS = [ - TAS_PATH.replace(TAS_DATE_SPECIFIERS, f'{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}') + TAS_PATH.replace(TAS_DATE_SPECIFIERS, f"{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}") for start_date, end_date in TAS_SPLIT_PERIOD ] From 1782d410b3abd1509545de62d408e886affab252 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 26 Dec 2025 11:30:59 +0100 Subject: [PATCH 115/162] Fix f-strings --- isimip_utils/tests/test_extractions.py | 2 +- pyproject.toml | 2 +- testing/setup.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/isimip_utils/tests/test_extractions.py b/isimip_utils/tests/test_extractions.py index 48c004e..cd3d67c 100644 --- a/isimip_utils/tests/test_extractions.py +++ b/isimip_utils/tests/test_extractions.py @@ -43,7 +43,7 @@ def test_select_time(decode_cf): @pytest.mark.parametrize('decode_cf', (True, False)) def test_select_period(decode_cf): start_date, end_date = constants.PERIOD - date_specifiers = f'{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}' + date_specifiers = f"{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}" dataset_path = constants.DATASETS_PATH / constants.TAS_PATH extraction_path = ( diff --git a/pyproject.toml b/pyproject.toml index ae8a299..b01db3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ packages = ["isimip_utils"] version_scheme = "release-branch-semver" [tool.ruff] -target-version = "py312" +target-version = "py311" line-length = 120 [tool.ruff.lint] diff --git a/testing/setup.py b/testing/setup.py index a8ea630..ff72485 100755 --- a/testing/setup.py +++ b/testing/setup.py @@ -75,12 +75,12 @@ def run_select_time(): output_path.parent.mkdir(parents=True, exist_ok=True) if not output_path.exists(): - helper.call(f'cdo -f nc4c -z zip_5 -L seldate,{date.strftime('%Y-%m-%d')} {input_path} {output_path}') + helper.call(f"cdo -f nc4c -z zip_5 -L seldate,{date.strftime('%Y-%m-%d')} {input_path} {output_path}") def run_select_period(): start_date, end_date = constants.PERIOD - date_specifiers = f'{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}' + date_specifiers = f"{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}" path = constants.TAS_PATH From a2baadeafbf3b8f683402f336c9127242d9e04a3 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 26 Dec 2025 12:45:03 +0100 Subject: [PATCH 116/162] Add more tests --- isimip_utils/cli.py | 11 ++++++---- isimip_utils/tests/helper.py | 25 ++++++++++++++++++++- isimip_utils/tests/test_cli.py | 31 ++++++++++++++++++++++++++- 
isimip_utils/tests/test_fetch.py | 30 +++++++++++++++++++------- isimip_utils/tests/test_netcdf.py | 7 +++++- isimip_utils/tests/test_pandas.py | 10 +++++++++ isimip_utils/tests/test_parameters.py | 21 +++++++++++++++++- isimip_utils/tests/test_protocol.py | 11 +++++----- isimip_utils/tests/test_utils.py | 22 +++++++++++++++++++ pyproject.toml | 10 +++++++++ 10 files changed, 156 insertions(+), 22 deletions(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 3c64f21..b6a9ab6 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -137,7 +137,7 @@ def parse_locations(value: str) -> Path: for string in value.split() ] else: - return [] + return None def parse_filelist(filelist_file: str | Path | None) -> set[str] | None: @@ -168,9 +168,12 @@ def parse_parameters(value: str) -> Path: Returns: Dict of the form {key: values} """ - key, values_str = value.split('=') - values = values_str.split(',') - return {key: values} + if value: + key, values_str = value.split('=') + values = values_str.split(',') + return {key: values} + else: + return None class ArgumentParser(argparse.ArgumentParser): diff --git a/isimip_utils/tests/helper.py b/isimip_utils/tests/helper.py index db0440c..7248492 100644 --- a/isimip_utils/tests/helper.py +++ b/isimip_utils/tests/helper.py @@ -1,6 +1,7 @@ import json import re import subprocess +from io import BytesIO from pathlib import Path from unittest.mock import MagicMock @@ -18,7 +19,8 @@ def assert_multiline_strings_equal(a, b): for a_line, b_line in zip(a.strip().splitlines(), b.strip().splitlines(), strict=True): assert normalize_whitespace(a_line) == normalize_whitespace(b_line), (a_line, b_line) -def mock_side_effect(url, *args, **kwargs): + +def mock_json(url, *args, **kwargs): mock_response = MagicMock() mock_path = Path(url.replace('https://protocol.isimip.org', 'testing/protocol/output')) @@ -31,3 +33,24 @@ def mock_side_effect(url, *args, **kwargs): mock_response.json.return_value = None return mock_response + + 
+def mock_content(url, *args, **kwargs): + mock_response = MagicMock() + mock_path = Path(url.replace('https://protocol.isimip.org', 'testing/protocol/output')) + + if mock_path.exists(): + data = mock_path.read_bytes() + + mock_response.status_code = 200 + mock_response.raw = BytesIO(data) + mock_response.content = data + mock_response.iter_content.return_value = [data] + + else: + mock_response.status_code = 404 + mock_response.raw = BytesIO() + mock_response.content = b"" + mock_response.iter_content.return_value = [] + + return mock_response diff --git a/isimip_utils/tests/test_cli.py b/isimip_utils/tests/test_cli.py index d217a96..3436d4e 100644 --- a/isimip_utils/tests/test_cli.py +++ b/isimip_utils/tests/test_cli.py @@ -5,7 +5,16 @@ import pytest -from isimip_utils.cli import ArgumentParser, parse_dict, parse_filelist, parse_list, parse_path, parse_version +from isimip_utils.cli import ( + ArgumentParser, + parse_dict, + parse_filelist, + parse_list, + parse_locations, + parse_parameters, + parse_path, + parse_version, +) def test_parse_dict(): @@ -33,6 +42,16 @@ def test_parse_path(): assert isinstance(result, Path) +def test_parse_locations(): + result = parse_locations('https://example.com /opt/test ~/test') + assert result == ['https://example.com', Path('/opt/test'), Path('~/test').expanduser()] + + +def test_parse_locations_none(): + result = parse_locations('') + assert result is None + + def test_parse_filelist(): with tempfile.NamedTemporaryFile(mode="w", delete=False) as f: f.write("/path/to/file1\n") @@ -54,6 +73,16 @@ def test_parse_filelist_none(): assert result is None +def test_parse_parameters(): + result = parse_parameters('egg=spam,foo,bar') + assert result == {'egg': ['spam', 'foo', 'bar']} + + +def test_parse_parameters_none(): + result = parse_parameters('') + assert result is None + + def test_argument_parser(): parser = ArgumentParser() parser.add_argument("--test", default="default") diff --git a/isimip_utils/tests/test_fetch.py 
b/isimip_utils/tests/test_fetch.py index 5682c33..2629ff0 100644 --- a/isimip_utils/tests/test_fetch.py +++ b/isimip_utils/tests/test_fetch.py @@ -1,11 +1,7 @@ from unittest.mock import patch -from isimip_utils.fetch import ( - fetch_json, - load_json, -) - -from .helper import mock_side_effect +from isimip_utils.fetch import fetch_file, fetch_json, load_file, load_json +from isimip_utils.tests import constants, helper paths = [ 'ISIMIP3a/OutputData/agriculture/ACEA/gswp3-w5e5.json', @@ -13,18 +9,28 @@ 'ISIMIP3a/OutputData/agriculture.json' ] + def test_fetch_json(): - with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + with patch('isimip_utils.fetch.requests.get', side_effect=helper.mock_json): data = fetch_json("https://protocol.isimip.org/definitions/ISIMIP3a/OutputData/agriculture.json") assert data is not None def test_fetch_json_not_found(): - with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + with patch('isimip_utils.fetch.requests.get', side_effect=helper.mock_json): data = fetch_json("https://protocol.isimip.org/definitions/ISIMIP3a/OutputData/agriculture/ACEA.json") assert data is None +def test_fetch_file(): + with patch('isimip_utils.fetch.requests.get', side_effect=helper.mock_content): + output_path = constants.OUTPUT_PATH / 'test.json' + output_path.unlink(missing_ok=True) + + fetch_file("https://protocol.isimip.org/definitions/ISIMIP3a/OutputData/agriculture.json", output_path) + assert output_path.is_file() + + def test_load_json(): data = load_json('testing/protocol/output/definitions/ISIMIP3a/OutputData/agriculture.json') assert data is not None @@ -33,3 +39,11 @@ def test_load_json(): def test_load_json_not_found(): data = load_json('testing/protocol/output/definitions/ISIMIP3a/OutputData/agriculture/ACEA.json') assert data is None + + +def test_load_file(): + output_path = constants.OUTPUT_PATH / 'test.json' + output_path.unlink(missing_ok=True) + + 
load_file('testing/protocol/output/definitions/ISIMIP3a/OutputData/agriculture.json', output_path) + assert output_path.is_file() diff --git a/isimip_utils/tests/test_netcdf.py b/isimip_utils/tests/test_netcdf.py index b7f9cb3..e1d2c68 100644 --- a/isimip_utils/tests/test_netcdf.py +++ b/isimip_utils/tests/test_netcdf.py @@ -47,7 +47,12 @@ def test_init_dataset(): test_path.parent.mkdir(exist_ok=True) test_path.unlink(missing_ok=True) - dataset = init_dataset(test_path) + dataset = init_dataset( + test_path, + time=np.arange(0, 10, dtype=np.float64), + var=np.random.rand(10, 360, 720).astype(np.float64), + attrs={'var': {'long_name': 'Variable'}} + ) assert isinstance(dataset, Dataset) diff --git a/isimip_utils/tests/test_pandas.py b/isimip_utils/tests/test_pandas.py index 47f639e..68a29dc 100644 --- a/isimip_utils/tests/test_pandas.py +++ b/isimip_utils/tests/test_pandas.py @@ -130,6 +130,16 @@ def test_compute_average(): assert df['upper'].between(290, 300).all() +def test_compute_average_monthly(): + with open_dataset(constants.EXTRACTIONS_PATH / extractions['point']) as ds: + df = to_dataframe(ds) + df = compute_average(df, 'tas', type='monthly') + + assert df['lower'].between(260, 300).all() + assert df['mean'].between(270, 300).all() + assert df['upper'].between(270, 305).all() + + def test_group_by_day(): with open_dataset(constants.EXTRACTIONS_PATH / extractions['point']) as ds: df = to_dataframe(ds) diff --git a/isimip_utils/tests/test_parameters.py b/isimip_utils/tests/test_parameters.py index 8314b1e..cf35aa5 100644 --- a/isimip_utils/tests/test_parameters.py +++ b/isimip_utils/tests/test_parameters.py @@ -1,4 +1,14 @@ -from isimip_utils.parameters import copy_placeholders, get_permutations, get_placeholders, join_parameters +from pathlib import Path + +import pytest + +from isimip_utils.parameters import ( + apply_placeholders, + copy_placeholders, + get_permutations, + get_placeholders, + join_parameters, +) parameters = { 'model': ['model_a', 
'model_b'], @@ -50,3 +60,12 @@ def test_copy_placeholders(): 'foo': 'bar', 'egg': 'spam' } + + +def test_apply_placeholders(): + assert apply_placeholders('{foo}_{egg}', {'foo': 'bar', 'egg': 'spam'}) == Path('bar_spam') + + +def test_apply_placeholders_error(): + with pytest.raises(RuntimeError): + apply_placeholders('{foo}_{egg}', {'foo': 'bar'}) diff --git a/isimip_utils/tests/test_protocol.py b/isimip_utils/tests/test_protocol.py index 7dfc8ea..a34fbb1 100644 --- a/isimip_utils/tests/test_protocol.py +++ b/isimip_utils/tests/test_protocol.py @@ -9,8 +9,7 @@ fetch_tree, find_json, ) - -from .helper import mock_side_effect +from isimip_utils.tests import helper paths = [ 'ISIMIP3a/OutputData/agriculture/ACEA/gswp3-w5e5.json', @@ -27,7 +26,7 @@ def test_fetch_definitions_local(path): @pytest.mark.parametrize('path', paths) def test_fetch_pattern(path): - with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + with patch('isimip_utils.fetch.requests.get', side_effect=helper.mock_json): data = fetch_pattern(path) assert data and isinstance(data, dict) @@ -40,7 +39,7 @@ def test_fetch_pattern_local(path): @pytest.mark.parametrize('path', paths) def test_fetch_schema(path): - with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + with patch('isimip_utils.fetch.requests.get', side_effect=helper.mock_json): data = fetch_schema(path) assert data and isinstance(data, dict) @@ -53,7 +52,7 @@ def test_fetch_schema_local(path): @pytest.mark.parametrize('path', paths) def test_fetch_tree(path): - with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + with patch('isimip_utils.fetch.requests.get', side_effect=helper.mock_json): data = fetch_tree(path) assert data and isinstance(data, dict) @@ -66,7 +65,7 @@ def test_fetch_tree_local(path): @pytest.mark.parametrize('path', paths) def test_find_json_fetch(path): - with patch('isimip_utils.fetch.requests.get', side_effect=mock_side_effect): + with 
patch('isimip_utils.fetch.requests.get', side_effect=helper.mock_json): data = find_json('https://protocol.isimip.org', 'definitions', path) assert data is not None diff --git a/isimip_utils/tests/test_utils.py b/isimip_utils/tests/test_utils.py index 362f271..f262aeb 100644 --- a/isimip_utils/tests/test_utils.py +++ b/isimip_utils/tests/test_utils.py @@ -5,6 +5,8 @@ Singleton, cached_property, exclude_path, + get_max_value, + get_min_value, include_path, validate_lat, validate_lon, @@ -77,3 +79,23 @@ def test_include_path(): assert include_path(paths, 'a/b/c') is True assert include_path(paths, 'a/b/cc') is True assert include_path(paths, 'a/b/f') is False + + +@pytest.mark.parametrize('values,result', [ + ([1, 2, 3], 1), + ([None, 2, 3], 2), + ([None, None, None], None), + ([], None) +]) +def test_get_min_value(values, result): + assert get_min_value(values) == result + + +@pytest.mark.parametrize('values,result', [ + ([1, 2, 3], 3), + ([1, 2, None], 2), + ([None, None, None], None), + ([], None) +]) +def test_get_max_value(values, result): + assert get_max_value(values) == result diff --git a/pyproject.toml b/pyproject.toml index b01db3c..fb2dd8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,3 +114,13 @@ pytest = ["pytest"] [tool.typos.default.extend-words] iy = "iy" + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "except PackageNotFoundError:", + "except requests.exceptions.ConnectionError", + "raise AssertionError", + "raise NotImplementedError", + "raise RuntimeError" +] From 9430b6aff49badb7ce7bd57bb3e79368fab03ef9 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 26 Dec 2025 13:02:12 +0100 Subject: [PATCH 117/162] Fix tests --- isimip_utils/tests/test_fetch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/isimip_utils/tests/test_fetch.py b/isimip_utils/tests/test_fetch.py index 2629ff0..ea7131f 100644 --- a/isimip_utils/tests/test_fetch.py +++ b/isimip_utils/tests/test_fetch.py @@ -25,6 +25,7 @@ def 
test_fetch_json_not_found(): def test_fetch_file(): with patch('isimip_utils.fetch.requests.get', side_effect=helper.mock_content): output_path = constants.OUTPUT_PATH / 'test.json' + output_path.parent.mkdir(exist_ok=True, parents=True) output_path.unlink(missing_ok=True) fetch_file("https://protocol.isimip.org/definitions/ISIMIP3a/OutputData/agriculture.json", output_path) @@ -43,6 +44,7 @@ def test_load_json_not_found(): def test_load_file(): output_path = constants.OUTPUT_PATH / 'test.json' + output_path.parent.mkdir(exist_ok=True, parents=True) output_path.unlink(missing_ok=True) load_file('testing/protocol/output/definitions/ISIMIP3a/OutputData/agriculture.json', output_path) From dc31a2abdf42408804e88d40005ef58b87430c3c Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 7 Jan 2026 17:37:07 +0100 Subject: [PATCH 118/162] Try to find matching fragment in pattern.match_string --- isimip_utils/patterns.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/isimip_utils/patterns.py b/isimip_utils/patterns.py index 7ee610a..a66eb86 100644 --- a/isimip_utils/patterns.py +++ b/isimip_utils/patterns.py @@ -138,8 +138,8 @@ def match_string(pattern: re.Pattern, string: str) -> tuple[Path, dict]: Raises: DidNotMatch: If the string doesn't match the pattern. 
""" - logger.debug(pattern.pattern) - logger.debug(string) + logger.debug('pattern = "%s"', pattern.pattern) + logger.debug('string = "%s"', string) # try to match the string match = pattern.search(string) @@ -154,4 +154,17 @@ def match_string(pattern: re.Pattern, string: str) -> tuple[Path, dict]: return Path(match.group(0)), specifiers else: + # try to find a matching fragment + split_pattern = pattern.pattern.split('_') + for i in range(len(split_pattern), 0, -1): + try: + sub_pattern = re.compile('_'.join(split_pattern[:i])) + sub_match = sub_pattern.search(string) + if sub_match: + fragment = sub_match.group(0) + raise DidNotMatch(f'No match for "{string}", last matching fragment was "{fragment}"') + except re.error: + pass + + # just raise the exception if no fragment was found raise DidNotMatch(f'No match for {string} ("{pattern.pattern}")') From 488ece9c001dd40539c098d71e7e0a268d7f5a6c Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Sat, 10 Jan 2026 11:19:00 +0100 Subject: [PATCH 119/162] Make target optional for fetch_file and load_file --- isimip_utils/fetch.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/isimip_utils/fetch.py b/isimip_utils/fetch.py index 077397d..9392d21 100644 --- a/isimip_utils/fetch.py +++ b/isimip_utils/fetch.py @@ -30,15 +30,15 @@ def fetch_json(url: str) -> Any | None: return response.json() -def fetch_file(url: str, target: str | Path) -> bool: +def fetch_file(url: str, target: None | str | Path = None) -> bool: """Download file from a URL. Args: location (str | Path): URL to download file from. - target (str | Path): Target path. + target (str | Path): Target path, or None if the content should be returned. Returns: - True, or None if request fails. + Target path if it was provided, the content otherwise, or None if the request fails. 
""" logger.debug('url = %s', url) @@ -47,10 +47,14 @@ def fetch_file(url: str, target: str | Path) -> bool: except requests.exceptions.ConnectionError: return None - if response.status_code == 200: - with open(target, "wb") as fp: - fp.write(response.content) - return True + if target is None: + return response.content.decode() + else: + target.parent.mkdir(exist_ok=True, parents=True) + if response.status_code == 200: + with open(target, "wb") as fp: + fp.write(response.content) + return target def load_json(path: str | Path) -> Any | None: @@ -69,18 +73,23 @@ def load_json(path: str | Path) -> Any | None: return json.loads(open(path).read()) -def load_file(path: str | Path, target: str | Path) -> bool: +def load_file(path: str | Path, target: None | str | Path = None) -> bool: """Copy a file from a local path. Args: location (str | Path): URL to download file from. - target (str | Path): Target path. + target (str | Path): Target path, or None if the content should be returned. Returns: - True, or None if request fails. + Target path if it was provided, the content otherwise, or None if the request fails. 
""" logger.debug('path = %s', path) path = Path(path) if path.is_file(): - shutil.copy(path, target) + if target is None: + return path.read_text() + else: + target.parent.mkdir(exist_ok=True, parents=True) + shutil.copy(path, target) + return target From 14980c8dd80e1fe163e6715ace47426b078e79fe Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Sat, 10 Jan 2026 11:19:15 +0100 Subject: [PATCH 120/162] Remove base_path from find_files --- isimip_utils/files.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/isimip_utils/files.py b/isimip_utils/files.py index 9e99349..f974471 100644 --- a/isimip_utils/files.py +++ b/isimip_utils/files.py @@ -7,12 +7,11 @@ logger = logging.getLogger(__name__) -def find_files(base_path: str | Path, file_iter: Iterable[Path], +def find_files(file_iter: Iterable[Path], pattern: str = r'_(?P\d{4})_(?P\d{4})?\.nc\d*$') -> tuple[list[tuple], int, int]: """Find files for a given (dataset) path, matching a regex pattern for start and end year. Args: - base_path (str | Path): Base path for file discovery. file_iter (Iterable[Path]): Iterator over file paths to search through. pattern (str): Regular expression for start and end year matching. 
From be2e553e68cc70a3e6f98537bbf0bee39c0939cf Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Sat, 10 Jan 2026 11:19:35 +0100 Subject: [PATCH 121/162] Use short name in plots --- isimip_utils/pandas.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/isimip_utils/pandas.py b/isimip_utils/pandas.py index 3db5a8c..de3f286 100644 --- a/isimip_utils/pandas.py +++ b/isimip_utils/pandas.py @@ -119,7 +119,7 @@ def get_data_var_labels(df: pd.DataFrame) -> str: """ labels = [] for data_var in get_data_vars(df): - data_var_name = df.attrs['data_vars'][data_var].get('long_name', data_var) + data_var_name = df.attrs['data_vars'][data_var].get('name', data_var) data_var_units = df.attrs['data_vars'][data_var].get('units') labels.append(f'{data_var_name} [{data_var_units}]' if data_var_units else data_var_name) return tuple(labels) @@ -180,7 +180,11 @@ def compute_average(df: pd.DataFrame, data_var: None | str = None, area: bool = # update attrs df.attrs = attrs df.attrs['coords'] = {column_name: {'long_name': column_name.capitalize(), 'axis': 'T'}} - df.attrs['data_vars'] = { 'mean': {} } + df.attrs['data_vars'] = { + 'mean': { + 'name': f'avg {type} {data_var}' + } + } if data_var_long_name: df.attrs['data_vars']['mean']['long_name'] = f'Average {type} {data_var_long_name.lower()}' if data_var_units: From 5e4df3af7317a79e2e16547a4fd8461022351a9a Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Sat, 10 Jan 2026 11:19:57 +0100 Subject: [PATCH 122/162] Strip trailing slash in parse_locations --- isimip_utils/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index b6a9ab6..0de576d 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -133,7 +133,7 @@ def parse_locations(value: str) -> Path: """ if value: return [ - string if urlparse(string).scheme else Path(string).expanduser() + string.rstrip('/') if urlparse(string).scheme else Path(string).expanduser() for string in value.split() 
] else: From 771fdae058d7bd5383516b1c7fa2decf3a726602 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Sat, 10 Jan 2026 12:34:05 +0100 Subject: [PATCH 123/162] Refactor parsing functions in cli.py --- isimip_utils/cli.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 0de576d..1256865 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -88,7 +88,10 @@ def parse_list(string: str) -> list[str]: Returns: List of stripped values. """ - return [value.strip() for value in string.split(',')] + if string: + return [value.strip() for value in string.split(',')] + else: + return [] def parse_version(value: str) -> str: @@ -122,7 +125,7 @@ def parse_path(value: str) -> Path: return Path(value).expanduser() -def parse_locations(value: str) -> Path: +def parse_locations(value: str) -> list[str | Path]: """Parse and expand a location string as list of URL or Path objects. Args: @@ -137,10 +140,10 @@ def parse_locations(value: str) -> Path: for string in value.split() ] else: - return None + return [] -def parse_filelist(filelist_file: str | Path | None) -> set[str] | None: +def parse_filelist(filelist_file: str | Path | None) -> set[str]: """Parse a filelist file into a set of file paths. Args: @@ -148,18 +151,18 @@ def parse_filelist(filelist_file: str | Path | None) -> set[str] | None: Lines starting with '#' are treated as comments. Returns: - Set of file paths, or None if filelist_file is None/empty. + Set of file paths. """ if filelist_file: with open(filelist_file) as f: filelist = {line for line in f.read().splitlines() if (line and not line.startswith('#'))} else: - filelist = None + filelist = {} return filelist -def parse_parameters(value: str) -> Path: +def parse_parameters(value: str) -> Path | None: """Parse and expand a parameters string (a=b). 
Args: From 91b33b32b109d872e870e93edb6da799ca746f9e Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Sat, 10 Jan 2026 15:48:04 +0100 Subject: [PATCH 124/162] Refactor plot_grid --- isimip_utils/plot.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 3e0b380..9ea22aa 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -311,12 +311,13 @@ def plot_map(df: pd.DataFrame, color_field: str | None = None, color_type: str | ) -def plot_grid(permutations: list[tuple], plots: dict, empty_plot: alt.Chart, layer: bool = True, - x: str = 'shared', y: str = 'shared', color: str = 'shared') -> alt.Chart: +def plot_grid(grid_permutations: list[tuple], plot_permutations: list[tuple], plots: dict, empty_plot: alt.Chart, + layer: bool = True, x: str = 'shared', y: str = 'shared', color: str = 'shared') -> alt.Chart: """Create a grid of plots organized by parameter permutations. Args: - permutations (list): List of permutations with tuples of parameters. + grid_permutations (list): List the permutations (with tuples of parameters) which span the grid. + plot_permutations (list): List the permutations (with tuples of parameters) for each plot. plots (dict): Dictionary mapping permutation tuples to Chart objects. empty_plot (alt.Chart): Chart to use when a permutation has no data. layer (bool): Whether to layer plots or concatenate vertically (default: True). @@ -328,29 +329,30 @@ def plot_grid(permutations: list[tuple], plots: dict, empty_plot: alt.Chart, lay Altair Chart object with grid layout. 
""" rows = [] - prev_permutation = None + prev = None - for permutation in permutations: - row_title = permutation[0] if len(permutation) > 0 else '' - column_title = permutation[1] if len(permutation) > 1 else '' + for grid_permutation in grid_permutations: + row_title = grid_permutation[0] if len(grid_permutation) > 0 else '' + column_title = grid_permutation[1] if len(grid_permutation) > 1 else '' - if prev_permutation is None or (len(permutation) > 0 and permutation[0] != prev_permutation[0]): + if prev is None or (len(grid_permutation) > 0 and grid_permutation[0] != prev[0]): # start a new row column = [] row = [(column_title, column)] rows.append((row_title, row)) - elif prev_permutation is None or (len(permutation) > 1 and permutation[1] != prev_permutation[1]): + elif prev is None or (len(grid_permutation) > 1 and grid_permutation[1] != prev[1]): # start a new column column = [] row.append((column_title, column)) - plot = plots.get(permutation, empty_plot) - if not layer: - plot = plot.properties(title=' '.join(permutation[2:])) + for plot_permutation in plot_permutations: + plot = plots.get(grid_permutation + plot_permutation, empty_plot) + if not layer: + plot = plot.properties(title=' '.join(plot_permutation)) - column.append(plot) + column.append(plot) - prev_permutation = permutation + prev = grid_permutation chart = alt.vconcat(*[ alt.hconcat(*[ From 54ce2d1f90137506f21b40409e4b2671fb9b2e69 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Sat, 10 Jan 2026 15:48:26 +0100 Subject: [PATCH 125/162] Improve error message in apply_placeholders --- isimip_utils/parameters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isimip_utils/parameters.py b/isimip_utils/parameters.py index f96ca74..e898813 100644 --- a/isimip_utils/parameters.py +++ b/isimip_utils/parameters.py @@ -79,7 +79,7 @@ def apply_placeholders(path_template: str | Path, placeholders: dict) -> Path: try: path = str(path_template).format(**placeholders) except KeyError as e: 
- raise RuntimeError('Some of the placeholders are missing.') from e + raise RuntimeError(f'Some of the placeholders are missing ({e}).') from e path = Path(path) return path.with_stem(path.stem.lower()) From a35d6a7c3c4a5676a1a4ade2f8b6590760c19dfa Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Sat, 10 Jan 2026 15:49:05 +0100 Subject: [PATCH 126/162] Rename optional dependencies --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fb2dd8b..64fd614 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,19 +33,19 @@ Repository = "https://github.com/ISI-MIP/isimip-utils" [project.optional-dependencies] all = [ - "isimip-utils[netcdf,plots,shapes,xarray,dev,pytest,docs]" + "isimip-utils[netcdf,altair,geopandas,xarray,dev,pytest,docs]" ] recommended = [ - "isimip-utils[netcdf,plots,shapes,xarray]" + "isimip-utils[netcdf,altair,geopandas,xarray]" ] netcdf = [ "netCDF4~=1.7" ] -plots = [ +altair = [ "altair[all]~=6.0", "palettable~=3.3", ] -shapes = [ +geopandas = [ "geopandas~=1.1", "rioxarray>=0.19", ] From eca50963703427cdfcbeb7bf9c084a63859748d4 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Sat, 10 Jan 2026 16:15:52 +0100 Subject: [PATCH 127/162] Fix tests and update cli.py --- isimip_utils/cli.py | 29 ++++++++++------------------- isimip_utils/tests/test_files.py | 4 ++-- isimip_utils/tests/test_pandas.py | 6 +++--- isimip_utils/tests/test_plot.py | 11 ++++++----- 4 files changed, 21 insertions(+), 29 deletions(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 1256865..fa1f5a1 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -64,7 +64,7 @@ def setup_logs(log_level: str = 'WARN', log_file: str | None = None, root_logger.addHandler(file_handler) -def parse_dict(string: str) -> dict[str, list[str]]: +def parse_dict(string: str) -> dict[str, list[str]] | None: """Parse a string in format 'key=value1,value2' into a dictionary. 
Args: @@ -73,13 +73,14 @@ def parse_dict(string: str) -> dict[str, list[str]]: Returns: Dictionary with single key mapping to list of values. """ - key, values = string.split('=') - return { - key.strip(): [value.strip() for value in values.split(',')] - } + if string: + key, values = string.split('=') + return { + key.strip(): [value.strip() for value in values.split(',')] + } -def parse_list(string: str) -> list[str]: +def parse_list(string: str) -> list[str] | None: """Parse a comma-separated string into a list. Args: @@ -90,8 +91,6 @@ def parse_list(string: str) -> list[str]: """ if string: return [value.strip() for value in string.split(',')] - else: - return [] def parse_version(value: str) -> str: @@ -125,7 +124,7 @@ def parse_path(value: str) -> Path: return Path(value).expanduser() -def parse_locations(value: str) -> list[str | Path]: +def parse_locations(value: str) -> list[str | Path] | None: """Parse and expand a location string as list of URL or Path objects. Args: @@ -139,11 +138,9 @@ def parse_locations(value: str) -> list[str | Path]: string.rstrip('/') if urlparse(string).scheme else Path(string).expanduser() for string in value.split() ] - else: - return [] -def parse_filelist(filelist_file: str | Path | None) -> set[str]: +def parse_filelist(filelist_file: str | Path | None) -> set[str] | None: """Parse a filelist file into a set of file paths. 
Args: @@ -155,11 +152,7 @@ def parse_filelist(filelist_file: str | Path | None) -> set[str]: """ if filelist_file: with open(filelist_file) as f: - filelist = {line for line in f.read().splitlines() if (line and not line.startswith('#'))} - else: - filelist = {} - - return filelist + return {line for line in f.read().splitlines() if (line and not line.startswith('#'))} def parse_parameters(value: str) -> Path | None: @@ -175,8 +168,6 @@ def parse_parameters(value: str) -> Path | None: key, values_str = value.split('=') values = values_str.split(',') return {key: values} - else: - return None class ArgumentParser(argparse.ArgumentParser): diff --git a/isimip_utils/tests/test_files.py b/isimip_utils/tests/test_files.py index a8d3acb..cbe04eb 100644 --- a/isimip_utils/tests/test_files.py +++ b/isimip_utils/tests/test_files.py @@ -12,7 +12,7 @@ def test_find_files(): fake_path.name ] - result = find_files(file_path.parent, files) + result = find_files(files) assert len(result) assert result == [ (file_path.name, 1901, 2016) @@ -31,7 +31,7 @@ def test_find_files_with_pattern(): pattern = r'(_(?P\d{4}))?(_(?P\d{4}))?(_\w+)?\.nc\d*$' - result = find_files(file_path.parent, files, pattern=pattern) + result = find_files(files, pattern=pattern) assert len(result) assert result == [ (none_path.name, None, None), # result is sorted diff --git a/isimip_utils/tests/test_pandas.py b/isimip_utils/tests/test_pandas.py index 68a29dc..0c2906e 100644 --- a/isimip_utils/tests/test_pandas.py +++ b/isimip_utils/tests/test_pandas.py @@ -102,8 +102,8 @@ def test_get_first_data_var(extraction, result): @pytest.mark.parametrize('extraction,result', [ - ('bbox', ('Near-Surface Air Temperature [K]', )), - ('point', ('Near-Surface Air Temperature [K]', )) + ('bbox', ('tas [K]', )), + ('point', ('tas [K]', )) ]) def test_get_data_var_labels(extraction, result): with open_dataset(constants.EXTRACTIONS_PATH / extractions[extraction]) as ds: @@ -112,7 +112,7 @@ def 
test_get_data_var_labels(extraction, result): @pytest.mark.parametrize('extraction,result', [ - ('point', 'Near-Surface Air Temperature [K]') + ('point', 'tas [K]') ]) def test_get_first_data_var_label(extraction, result): with open_dataset(constants.EXTRACTIONS_PATH / extractions[extraction]) as ds: diff --git a/isimip_utils/tests/test_plot.py b/isimip_utils/tests/test_plot.py index 5ea473c..a785ea9 100644 --- a/isimip_utils/tests/test_plot.py +++ b/isimip_utils/tests/test_plot.py @@ -221,21 +221,22 @@ def test_plot_grid(): df_empty = pd.DataFrame({ 'time': dataframes[2]['time'], 'tas': np.nan }) - permutations = [ + grid_permutations = [ ('a', 'x'), ('a', 'y'), - ('b', 'x') + ('b', 'x'), ] + plot_permutations = [()] plots = {} - for permutation, df in zip(permutations, dataframes, strict=True): + for permutation, df in zip(grid_permutations, dataframes, strict=True): plots[permutation] = plot_line(df) empty_plot = plot_line(df, empty=True) - permutations.append(('b', 'y')) + grid_permutations.append(('b', 'y')) - chart = plot_grid(permutations, plots, x='independent', empty_plot=empty_plot, layer=False) + chart = plot_grid(grid_permutations, plot_permutations, plots, x='independent', empty_plot=empty_plot, layer=False) top, bottom = chart.vconcat top_left, top_right = top.hconcat From 3d9c5508ba78bff46c3c4a31d8e503f8efa91f9c Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Sun, 11 Jan 2026 15:31:43 +0100 Subject: [PATCH 128/162] Revert changes to cli.py --- isimip_utils/cli.py | 20 ++++++++++++++------ isimip_utils/tests/test_cli.py | 6 +++--- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index fa1f5a1..952d31d 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -80,7 +80,7 @@ def parse_dict(string: str) -> dict[str, list[str]] | None: } -def parse_list(string: str) -> list[str] | None: +def parse_list(string: str) -> list[str]: """Parse a comma-separated string into a list. 
Args: @@ -91,6 +91,8 @@ def parse_list(string: str) -> list[str] | None: """ if string: return [value.strip() for value in string.split(',')] + else: + return [] def parse_version(value: str) -> str: @@ -124,7 +126,7 @@ def parse_path(value: str) -> Path: return Path(value).expanduser() -def parse_locations(value: str) -> list[str | Path] | None: +def parse_locations(value: str) -> list[str | Path]: """Parse and expand a location string as list of URL or Path objects. Args: @@ -138,9 +140,11 @@ def parse_locations(value: str) -> list[str | Path] | None: string.rstrip('/') if urlparse(string).scheme else Path(string).expanduser() for string in value.split() ] + else: + return [] -def parse_filelist(filelist_file: str | Path | None) -> set[str] | None: +def parse_filelist(filelist_file: str | Path | None) -> set[str]: """Parse a filelist file into a set of file paths. Args: @@ -148,14 +152,16 @@ def parse_filelist(filelist_file: str | Path | None) -> set[str] | None: Lines starting with '#' are treated as comments. Returns: - Set of file paths. + List of file paths. """ if filelist_file: with open(filelist_file) as f: - return {line for line in f.read().splitlines() if (line and not line.startswith('#'))} + return list({line for line in f.read().splitlines() if (line and not line.startswith('#'))}) + else: + return [] -def parse_parameters(value: str) -> Path | None: +def parse_parameters(value: str) -> Path: """Parse and expand a parameters string (a=b). 
Args: @@ -168,6 +174,8 @@ def parse_parameters(value: str) -> Path | None: key, values_str = value.split('=') values = values_str.split(',') return {key: values} + else: + return {} class ArgumentParser(argparse.ArgumentParser): diff --git a/isimip_utils/tests/test_cli.py b/isimip_utils/tests/test_cli.py index 3436d4e..16a171b 100644 --- a/isimip_utils/tests/test_cli.py +++ b/isimip_utils/tests/test_cli.py @@ -49,7 +49,7 @@ def test_parse_locations(): def test_parse_locations_none(): result = parse_locations('') - assert result is None + assert result == [] def test_parse_filelist(): @@ -70,7 +70,7 @@ def test_parse_filelist(): def test_parse_filelist_none(): result = parse_filelist(None) - assert result is None + assert result == [] def test_parse_parameters(): @@ -80,7 +80,7 @@ def test_parse_parameters(): def test_parse_parameters_none(): result = parse_parameters('') - assert result is None + assert result == {} def test_argument_parser(): From 405433331dc6d08f05daa7183faf07334fe0583e Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 12 Jan 2026 11:57:51 +0100 Subject: [PATCH 129/162] Use env_prefix with ArgumentParser --- isimip_utils/cli.py | 7 +++++-- isimip_utils/tests/test_cli.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 952d31d..6fa7f4b 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -196,6 +196,8 @@ class ArgumentParser(argparse.ArgumentParser): '/etc/isimip.toml', ] + env_prefix = 'ISIMIP_' + def parse_args(self, *args) -> argparse.Namespace: return super().parse_args(*args, namespace=self.build_default_args()) @@ -229,12 +231,13 @@ def build_default_args(self) -> argparse.Namespace: if not action.required and action.dest != 'help': key = action.dest key_upper = key.upper() + key_env = self.env_prefix + key_upper value = None - if os.getenv(key_upper): + if os.getenv(key_env): # if the attribute is in the environment, take the value - value = 
os.getenv(key_upper) + value = os.getenv(key_env) if value.lower() == 'true': value = True elif value.lower() == 'false': diff --git a/isimip_utils/tests/test_cli.py b/isimip_utils/tests/test_cli.py index 16a171b..b6617e2 100644 --- a/isimip_utils/tests/test_cli.py +++ b/isimip_utils/tests/test_cli.py @@ -110,7 +110,7 @@ def test_argument_parser_with_config(tmp_path): def test_argument_parser_with_env(): - os.environ["TEST"] = "env_value" + os.environ["ISIMIP_TEST"] = "env_value" try: parser = ArgumentParser() @@ -119,4 +119,4 @@ def test_argument_parser_with_env(): args = parser.parse_args([]) assert args.test == "env_value" finally: - del os.environ["TEST"] + del os.environ["ISIMIP_TEST"] From 1eb3abba8033673d61f4cac041c1d321e9a2280b Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 12 Jan 2026 12:43:28 +0100 Subject: [PATCH 130/162] Refactor exclude_path and include_path --- isimip_utils/tests/test_utils.py | 49 ++++++++++++++++++++------------ isimip_utils/utils.py | 33 ++++++++++++--------- 2 files changed, 51 insertions(+), 31 deletions(-) diff --git a/isimip_utils/tests/test_utils.py b/isimip_utils/tests/test_utils.py index f262aeb..6efbd03 100644 --- a/isimip_utils/tests/test_utils.py +++ b/isimip_utils/tests/test_utils.py @@ -12,12 +12,6 @@ validate_lon, ) -paths = [ - 'a/b/c', - 'a/b/d', - 'a/b/e' -] - def test_singleton(): a = Singleton() @@ -67,18 +61,37 @@ def test_validate_lon_error(lon): validate_lon(lon) -def test_exclude_path(): - assert exclude_path([], 'a/b/c') is False - assert exclude_path(paths, 'a/b/c') is True - assert exclude_path(paths, 'a/b/cc') is True - assert exclude_path(paths, 'a/b/f') is False - - -def test_include_path(): - assert include_path([], 'a/b/c') is True - assert include_path(paths, 'a/b/c') is True - assert include_path(paths, 'a/b/cc') is True - assert include_path(paths, 'a/b/f') is False +@pytest.mark.parametrize('exclude,path,match,result', ( + ([], 'a/b/c', 'any', False), + (['a/b/c', 'a/b/d', 'a/b/e'], 'a/b/c', 
'any', True), + (['a/b/c', 'a/b/d', 'a/b/e'], 'a/b/cc', 'any', True), + (['a/b/c', 'a/b/d', 'a/b/e'], 'a/b/f', 'any', False), + (['a_b', 'c_d'], 'a_b_c_d', 'any', True), + (['a_b', 'c_d'], 'a_b_c_d', 'all', True), + (['a_b', 'c_e'], 'a_b_c_d', 'any', True), + (['a_b', 'c_e'], 'a_b_c_d', 'all', False), + (['a_e', 'c_d'], 'a_b_c_d', 'any', True), + (['a_e', 'c_d'], 'a_b_c_e', 'all', False), +)) +def test_exclude_path(exclude, path, match, result): + assert exclude_path(exclude, path, match) is result + + + +@pytest.mark.parametrize('include,path,match,result', ( + ([], 'a/b/c', 'any', True), + (['a/b/c', 'a/b/d', 'a/b/e'], 'a/b/c', 'any', True), + (['a/b/c', 'a/b/d', 'a/b/e'], 'a/b/cc', 'any', True), + (['a/b/c', 'a/b/d', 'a/b/e'], 'a/b/f', 'any', False), + (['a_b', 'c_d'], 'a_b_c_d', 'any', True), + (['a_b', 'c_d'], 'a_b_c_d', 'all', True), + (['a_b', 'c_e'], 'a_b_c_d', 'any', True), + (['a_b', 'c_e'], 'a_b_c_d', 'all', False), + (['a_e', 'c_d'], 'a_b_c_d', 'any', True), + (['a_e', 'c_d'], 'a_b_c_e', 'all', False), +)) +def test_include_path(include, path, match, result): + assert include_path(include, path, match) is result @pytest.mark.parametrize('values,result', [ diff --git a/isimip_utils/utils.py b/isimip_utils/utils.py index af8fd44..f455e74 100644 --- a/isimip_utils/utils.py +++ b/isimip_utils/utils.py @@ -1,7 +1,7 @@ """Additional utility functions for ISIMIP tools.""" from collections.abc import Callable from pathlib import Path -from typing import Any +from typing import Any, Literal from .exceptions import ValidationError @@ -49,40 +49,47 @@ def __get__(self, instance: Any, cls: type | None = None) -> Any: return value -def exclude_path(exclude: list[str] | None, path: Path | str) -> bool: +def exclude_path(exclude: list[str] | None, path: Path | str, match: Literal['any', 'all'] = 'any') -> bool: """Check if a path should be excluded based on exclude patterns. Args: - exclude (list[str] | None): List of exclude patterns (strings). 
Path is excluded if it - starts with any pattern. + exclude (list[str] | None): List of include patterns (strings). Path is excluded if it + contains any or all patterns, depending on the match argument or if include list is None/empty. path (Path | str): Path to check for exclusion. + match ('any', 'all'): Match all or any of the lines in exclude. Returns: True if path should be excluded, False otherwise. """ if exclude: - for exclude_string in exclude: - if str(path).startswith(exclude_string): - return True + if match == 'any': + return any(string in str(path) for string in exclude) + elif match == 'all': + return all(string in str(path) for string in exclude) + else: + raise ValidationError(f'match={match} needs to be "any" or "all"') return False -def include_path(include: list[str] | None, path: Path | str) -> bool: +def include_path(include: list[str] | None, path: Path | str, match: Literal['any', 'all'] = 'any') -> bool: """Check if a path should be included based on include patterns. Args: include (list[str] | None): List of include patterns (strings). Path is included if it - starts with any pattern, or if include list is None/empty. + contains any or all patterns, depending on the match argument or if include list is None/empty. path (Path | str): Path to check for inclusion. + match ('any', 'all'): Match all or any of the lines in exclude. Returns: True if path should be included, False otherwise. 
""" if include: - for include_string in include: - if str(path).startswith(include_string): - return True - return False + if match == 'any': + return any(string in str(path) for string in include) + elif match == 'all': + return all(string in str(path) for string in include) + else: + raise ValidationError(f'match={match} needs to be "any" or "all"') else: return True From c74553e072fd6bbe37a7d849ee4644be863b993a Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 12 Jan 2026 15:39:02 +0100 Subject: [PATCH 131/162] Omit title for empty plots --- isimip_utils/plot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 9ea22aa..b028134 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -166,7 +166,7 @@ def plot_line(df: pd.DataFrame, x_field: str | None = None, x_label: str | None x_type = x_type or ('T' if get_first_coord_axis(df) == 'T' else 'Q') x = alt.X( f'{x_field}:{x_type}', - title=x_label + title=x_label if not empty else "" ) y_field = y_field or get_first_data_var(df) @@ -174,7 +174,7 @@ def plot_line(df: pd.DataFrame, x_field: str | None = None, x_label: str | None y_type = y_type or 'Q' y = alt.Y( f'{y_field}:{y_type}', - title=y_label, + title=y_label if not empty else "", axis=alt.Axis(format=y_format) if y_format else alt.Axis(), scale=alt.Scale(zero=False, nice=False) ) From b7e0d362d74d5985b7806893414a950ac3a118c4 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 13 Jan 2026 13:50:17 +0100 Subject: [PATCH 132/162] Fix Settings.update --- isimip_utils/config.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/isimip_utils/config.py b/isimip_utils/config.py index 98cbb48..b47d220 100644 --- a/isimip_utils/config.py +++ b/isimip_utils/config.py @@ -56,15 +56,13 @@ def update(self, values: dict[str, Any]) -> dict[str, Any]: if not hasattr(self, name): raise ValueError(f'unknown key "{key}"') - if isinstance(current_value, list): - 
current_value.extend(value if isinstance(value, list) else [value]) - elif isinstance(current_value, dict): - if not isinstance(value, dict): - raise ValueError(f'key "{key}" is not a dict') - self._settings[name].update(value) - else: - self._settings[name] = value + if isinstance(current_value, list) and not isinstance(value, list): + raise ValueError(f'key "{key}" is not a list') + if isinstance(current_value, dict) and not isinstance(value, dict): + raise ValueError(f'key "{key}" is not a dict') + + self._settings[name] = value def update_from_toml(self, path): """Update the settings from a toml file.. From b272fb8adefe137c89b4b50ed068fbb6e20efd8e Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 13 Jan 2026 14:33:20 +0100 Subject: [PATCH 133/162] Fix growing season handling --- isimip_utils/xarray.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index bb408da..b2ad4a2 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -150,8 +150,20 @@ def open_dataset(path: str | Path, decode_cf: bool = True, load: bool = False) - if ds['time'].units.startswith('growing seasons'): units = ds['time'].units.replace('growing seasons', 'common_years') - times = cftime.num2date(ds['time'], units, calendar='365_day') - ds['time'] = times + + ds['time'].attrs['long_name'] = 'Growing season' + ds['time'].attrs['units'] = '' + + time_array = cftime.num2date(ds['time'].values, units=units, calendar='365_day') + time = xr.DataArray( + time_array, + dims=['time'], + coords={'time': time_array}, + name='time', + attrs=ds['time'].attrs + ) + + ds = ds.assign_coords(time=time) if load: ds.load() From ca2c38fe7c46f1e0ca94e68b43eb434b5fa8973c Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 13 Jan 2026 17:54:41 +0100 Subject: [PATCH 134/162] Refactor config file handling in cli.py --- isimip_utils/cli.py | 48 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 
insertions(+), 11 deletions(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 6fa7f4b..dea9188 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -126,7 +126,7 @@ def parse_path(value: str) -> Path: return Path(value).expanduser() -def parse_locations(value: str) -> list[str | Path]: +def parse_locations(value: str | list) -> list[str | Path]: """Parse and expand a location string as list of URL or Path objects. Args: @@ -138,7 +138,7 @@ def parse_locations(value: str) -> list[str | Path]: if value: return [ string.rstrip('/') if urlparse(string).scheme else Path(string).expanduser() - for string in value.split() + for string in (value.split() if isinstance(value, str) else value) ] else: return [] @@ -198,8 +198,8 @@ class ArgumentParser(argparse.ArgumentParser): env_prefix = 'ISIMIP_' - def parse_args(self, *args) -> argparse.Namespace: - return super().parse_args(*args, namespace=self.build_default_args()) + def parse_args(self, *args, config_path=None) -> argparse.Namespace: + return super().parse_args(*args, namespace=self.build_default_args(config_path)) def get_defaults(self) -> dict: defaults = {} @@ -210,7 +210,7 @@ def get_defaults(self) -> dict: defaults.update(vars(self.build_default_args())) return defaults - def read_config(self) -> dict: + def read_global_config(self) -> dict: for config_file in self.config_files: config_path = Path(config_file).expanduser() if config_path.is_file(): @@ -220,9 +220,18 @@ def read_config(self) -> dict: return data[self.prog] return {} - def build_default_args(self) -> argparse.Namespace: - # read config file - config = self.read_config() + def read_local_config(self, config_path) -> dict: + if config_path and config_path.is_file(): + with open(config_path, 'rb') as fp: + return tomllib.load(fp) + return {} + + def build_default_args(self, config_path=None) -> argparse.Namespace: + # read config file(s) + config = dict( + **self.read_global_config(), + **self.read_local_config(config_path) + ) 
# init the default namespace default_args = argparse.Namespace() @@ -245,18 +254,25 @@ def build_default_args(self) -> argparse.Namespace: elif value.lower() == 'none': value = None + # apply action type + if value and action.type is not None: + try: + value = action.type(value) + except argparse.ArgumentTypeError as e: + raise ConfigError(f'argument "{key}": {e}') from e + elif config and key in config: # if the attribute is in the config file, take it from there value = config.get(key) - if value is not None: - # apply action.type - if action.type is not None: + # apply certain action types + if value and action.type in [parse_filelist, parse_locations, parse_path, parse_version]: try: value = action.type(value) except argparse.ArgumentTypeError as e: raise ConfigError(f'argument "{key}": {e}') from e + if value is not None: # check action.action if action.const and value not in [True, False]: raise ConfigError(f'argument "{key}": invalid choice "{value}" (choose true or false)') @@ -265,6 +281,16 @@ def build_default_args(self) -> argparse.Namespace: if action.choices and value not in action.choices: raise ConfigError(f'argument "{key}": invalid choice "{value}" (choose from {action.choices})') + # check list + if action.type in (list, parse_list, parse_locations): + if not isinstance(value, list): + raise ConfigError(f'argument "{key}": needs to be a list') + + # check dict + if action.type in (dict, parse_dict): + if not isinstance(value, list): + raise ConfigError(f'argument "{key}": needs to be a dict') + # add the key and value to the default_args setattr(default_args, key, value) From a70259c9c77adb001b8d05f25da550a3e47f0288 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 13 Jan 2026 17:56:08 +0100 Subject: [PATCH 135/162] Remove update methods from Settings --- isimip_utils/config.py | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/isimip_utils/config.py b/isimip_utils/config.py index b47d220..6dd4c77 100644 --- 
a/isimip_utils/config.py +++ b/isimip_utils/config.py @@ -1,6 +1,5 @@ """Configuration management for ISIMIP tools.""" import logging -import tomllib from typing import Any from .utils import Singleton @@ -43,38 +42,6 @@ def to_dict(self) -> dict[str, Any]: """ return self._settings - def update(self, values: dict[str, Any]) -> dict[str, Any]: - """Update the settings from a dictionary. - - Args: - values (dict[str, Any]): Dictionary of setting key-value pairs. - """ - for key, value in values.items(): - name = key.upper() - current_value = self._settings[name] - - if not hasattr(self, name): - raise ValueError(f'unknown key "{key}"') - - if isinstance(current_value, list) and not isinstance(value, list): - raise ValueError(f'key "{key}" is not a list') - - if isinstance(current_value, dict) and not isinstance(value, dict): - raise ValueError(f'key "{key}" is not a dict') - - self._settings[name] = value - - def update_from_toml(self, path): - """Update the settings from a toml file.. - - Args: - path (Path): Path to the toml file/. - """ - if path and path.exists(): - config = tomllib.loads(path.read_text()) - self.update(config) - - @classmethod def from_dict(cls, values: dict[str, Any]) -> 'Settings': """Create a Settings instance from a dictionary. 
From 9ecfc1e59b18765ee00c93e044905d0020f5ed21 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 13 Jan 2026 18:27:29 +0100 Subject: [PATCH 136/162] Fix build_default_args --- isimip_utils/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index dea9188..f148460 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -237,7 +237,7 @@ def build_default_args(self, config_path=None) -> argparse.Namespace: default_args = argparse.Namespace() for action in self._actions: - if not action.required and action.dest != 'help': + if action.dest not in ['config', 'help']: key = action.dest key_upper = key.upper() key_env = self.env_prefix + key_upper From 34bc2333b23054b7fda1ac01131552e5bed012d2 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 19 Jan 2026 18:25:17 +0100 Subject: [PATCH 137/162] Fix set_nan_to_fill_value --- isimip_utils/tests/test_xarray.py | 112 +++++++++++++++++++++++++++--- isimip_utils/xarray.py | 22 +++--- 2 files changed, 114 insertions(+), 20 deletions(-) diff --git a/isimip_utils/tests/test_xarray.py b/isimip_utils/tests/test_xarray.py index 076def4..d2ef689 100644 --- a/isimip_utils/tests/test_xarray.py +++ b/isimip_utils/tests/test_xarray.py @@ -61,11 +61,105 @@ def test_init_dataset(): ''') +def test_init_dataset_float(): + lon_size, lat_size = 18, 9 + + var = np.random.rand(lat_size, lon_size).astype(np.float32) + + attrs = { + 'var': { + 'long_name': 'Variable' + } + } + + ds = init_dataset(lon=lon_size, lat=lat_size, var=var, attrs=attrs) + + assert isinstance(ds, xr.Dataset) + assert ds.sizes['lon'] == lon_size + assert ds.sizes['lat'] == lat_size + + test_path = constants.OUTPUT_PATH / 'test.nc' + test_path.unlink(missing_ok=True) + + write_dataset(ds, test_path) + + output = helper.call(f'ncdump -h {test_path}') + + helper.assert_multiline_strings_equal(output, ''' +netcdf test { +dimensions: + lon = 18 ; + lat = 9 ; +variables: + double lon(lon) ; + 
lon:standard_name = "longitude" ; + lon:long_name = "Longitude" ; + lon:units = "degrees_east" ; + lon:axis = "X" ; + double lat(lat) ; + lat:standard_name = "latitude" ; + lat:long_name = "Latitude" ; + lat:units = "degrees_north" ; + lat:axis = "Y" ; + float var(lat, lon) ; + var:_FillValue = 1.e+20f ; + var:long_name = "Variable" ; + var:missing_value = 1.e+20f ; +} +''') + +def test_init_dataset_double(): + lon_size, lat_size = 18, 9 + + var = np.random.rand(lat_size, lon_size).astype(np.float64) + + attrs = { + 'var': { + 'long_name': 'Variable' + } + } + + ds = init_dataset(lon=lon_size, lat=lat_size, var=var, attrs=attrs) + + assert isinstance(ds, xr.Dataset) + assert ds.sizes['lon'] == lon_size + assert ds.sizes['lat'] == lat_size + + test_path = constants.OUTPUT_PATH / 'test.nc' + test_path.unlink(missing_ok=True) + + write_dataset(ds, test_path) + + output = helper.call(f'ncdump -h {test_path}') + + helper.assert_multiline_strings_equal(output, ''' +netcdf test { +dimensions: + lon = 18 ; + lat = 9 ; +variables: + double lon(lon) ; + lon:standard_name = "longitude" ; + lon:long_name = "Longitude" ; + lon:units = "degrees_east" ; + lon:axis = "X" ; + double lat(lat) ; + lat:standard_name = "latitude" ; + lat:long_name = "Latitude" ; + lat:units = "degrees_north" ; + lat:axis = "Y" ; + double var(lat, lon) ; + var:_FillValue = 1.e+20 ; + var:long_name = "Variable" ; + var:missing_value = 1.e+20 ; +} +''') + def test_init_dataset_args(): lon_size, lat_size, time_size = 180, 90, 10 time = np.arange(time_size, dtype=np.float64) - var = np.random.rand(time_size, lat_size, lon_size).astype(np.float64) + var = np.random.rand(time_size, lat_size, lon_size).astype(np.float32) attrs = { 'var': { @@ -119,8 +213,8 @@ def test_init_dataset_args(): time:calendar = "365_day" ; time:units = "days since 2000-01-01 00:00:00" ; time:axis = "T" ; - double var(time, lat, lon) ; - var:_FillValue = 1.e+20 ; + float var(time, lat, lon) ; + var:_FillValue = 1.e+20f ; var:long_name 
= "Variable" ; var:missing_value = 1.e+20f ; } @@ -128,7 +222,7 @@ def test_init_dataset_args(): def test_init_dataset_latlon(): - var = np.random.rand(10, 1, 1).astype(np.float64) + var = np.random.rand(10, 1, 1).astype(np.float32) attrs = { 'var': { @@ -182,8 +276,8 @@ def test_init_dataset_latlon(): time:calendar = "proleptic_gregorian" ; time:units = "days since 1601-1-1 00:00:00" ; time:axis = "T" ; - double var(time, lat, lon) ; - var:_FillValue = 1.e+20 ; + float var(time, lat, lon) ; + var:_FillValue = 1.e+20f ; var:long_name = "Variable" ; var:missing_value = 1.e+20f ; } @@ -193,7 +287,7 @@ def test_init_dataset_latlon(): def test_init_dataset_dims(): a = np.arange(0, 2, dtype=np.float64) b = np.arange(0, 3, dtype=np.float64) - var = np.random.rand(b.size, a.size, 360, 720).astype(np.float64) + var = np.random.rand(b.size, a.size, 360, 720).astype(np.float32) attrs = { 'var': { @@ -250,8 +344,8 @@ def test_init_dataset_dims(): double a(a) ; a:long_name = "A Axis" ; a:axis = "A" ; - double var(b, a, lat, lon) ; - var:_FillValue = 1.e+20 ; + float var(b, a, lat, lon) ; + var:_FillValue = 1.e+20f ; var:long_name = "Variable" ; var:missing_value = 1.e+20f ; } diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index b2ad4a2..cf21d17 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -33,8 +33,7 @@ } } -FILL_VALUE = np.float64(1e20) -MISSING_VALUE = np.float32(1e20) +FILL_VALUE = 1e20 def init_dataset(lon: None | int | np.ndarray = 720, lat: None | int | np.ndarray = 360, @@ -115,8 +114,6 @@ def init_dataset(lon: None | int | np.ndarray = 720, if data_var in attrs: ds.data_vars[data_var].attrs.update(attrs[data_var]) - # ds.data_vars[data_var].attrs["_FillValue"] = FILL_VALUE - # set global attributes ds.attrs = attrs.get('global', {}) @@ -297,7 +294,8 @@ def add_fill_value_to_data_vars(ds: xr.Dataset) -> xr.Dataset: if '_FillValue' not in ds.data_vars[data_var].attrs: ds.data_vars[data_var].attrs['_FillValue'] = FILL_VALUE if 
'missing_value' not in ds.data_vars[data_var].attrs: - ds.data_vars[data_var].attrs['missing_value'] = MISSING_VALUE + missing_value = np.array(FILL_VALUE, dtype=ds[data_var].dtype) + ds.data_vars[data_var].attrs['missing_value'] = missing_value return ds @@ -310,9 +308,10 @@ def set_fill_value_to_nan(ds: xr.Dataset) -> xr.Dataset: Returns: Dataset with fill values replaced by NaN. """ - for var in ds.data_vars: - fill_value = ds[var].attrs.get('_FillValue', FILL_VALUE) - ds[var] = ds[var].where(ds[var] != fill_value) + for data_var in ds.data_vars: + fill_value = ds[data_var].attrs.get('_FillValue', FILL_VALUE) + missing_value = np.array(fill_value, dtype=ds[data_var].dtype) + ds[data_var] = ds[data_var].where(ds[data_var] != missing_value) return ds @@ -325,9 +324,10 @@ def set_nan_to_fill_value(ds: xr.Dataset) -> xr.Dataset: Returns: Dataset with NaN values replaced by fill values. """ - for var in ds.data_vars: - fill_value = ds[var].attrs.get('_FillValue', FILL_VALUE) - ds[var] = ds[var].where(~np.isnan(ds[var]), fill_value) + for data_var in ds.data_vars: + fill_value = ds[data_var].attrs.get('_FillValue', FILL_VALUE) + missing_value = np.array(fill_value, dtype=ds[data_var].dtype) + ds[data_var] = ds[data_var].fillna(missing_value) return ds From be6f1d5adde209e87f31f3bcf51ebf477edf2cba Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 20 Jan 2026 12:10:01 +0100 Subject: [PATCH 138/162] Add Settings.from_toml --- isimip_utils/config.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/isimip_utils/config.py b/isimip_utils/config.py index 6dd4c77..835feed 100644 --- a/isimip_utils/config.py +++ b/isimip_utils/config.py @@ -1,5 +1,7 @@ """Configuration management for ISIMIP tools.""" import logging +import tomllib +from pathlib import Path from typing import Any from .utils import Singleton @@ -57,3 +59,26 @@ def from_dict(cls, values: dict[str, Any]) -> 'Settings': instance._settings = {key.upper(): value for key, value in 
values.items() if key not in cls.ignore_keys} logger.debug('settings = %s', instance) return instance + + @classmethod + def from_toml(cls, path: Path, section: str | None = None) -> 'Settings': + """Create a Settings instance from a TOML file. + + Args: + path (Path): Path to the TOML file. + section (str): Section to use. + + Returns: + A Settings instance populated with the content of the TOML file. + All keys are converted to uppercase. + """ + values = {} + with open(path, 'rb') as fp: + data = tomllib.load(fp) + if section: + if section in data: + values = data[section] + else: + values = data + + return cls.from_dict(values) From 88bae2d131d5a10a2ceeade1ffb55b108efe1fad Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 26 Jan 2026 16:07:49 +0100 Subject: [PATCH 139/162] Refactor extraction functions --- isimip_utils/extractions.py | 82 ++++++++++++++++++++++++++++++++++--- 1 file changed, 77 insertions(+), 5 deletions(-) diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py index fa33297..2f34023 100644 --- a/isimip_utils/extractions.py +++ b/isimip_utils/extractions.py @@ -7,7 +7,7 @@ from .exceptions import ExtractionError from .utils import validate_lat, validate_lon -from .xarray import compute_offset, compute_time +from .xarray import compute_offset, compute_time, get_attrs, set_attrs, set_fill_value_to_nan logger = logging.getLogger(__name__) @@ -178,7 +178,7 @@ def mask_mask(ds: xr.Dataset, mask_ds: xr.Dataset, mask_var: str = 'mask', def compute_spatial_average(ds: xr.Dataset, weights: xr.DataArray | None = None) -> xr.Dataset: - """Compute spatial average over lat/lon dimensions. + """Compute the spatial average over lat/lon dimensions for each timestep. Args: ds (xr.Dataset): Dataset with lat/lon dimensions. 
@@ -193,11 +193,17 @@ def compute_spatial_average(ds: xr.Dataset, weights: xr.DataArray | None = None) logger.warn('no weights provided, using latitude-dependent weights') weights = np.sin(np.deg2rad(ds.lat + 0.25)) - np.sin(np.deg2rad(ds.lat - 0.25)) - return ds.weighted(weights).mean(dim=('lat', 'lon'), skipna=True).astype(np.float32) + attrs = get_attrs(ds) + + ds = set_fill_value_to_nan(ds) + ds = ds.weighted(weights).mean(dim=('lat', 'lon'), skipna=True).astype(np.float32) + ds = set_attrs(ds, attrs) + + return ds def compute_temporal_average(ds: xr.Dataset) -> xr.Dataset: - """Compute temporal average over time dimension. + """Compute the temporal average over time dimension. Args: ds (xr.Dataset): Dataset with time dimension. @@ -206,7 +212,73 @@ def compute_temporal_average(ds: xr.Dataset) -> xr.Dataset: Dataset with time dimension averaged out. """ logger.info('compute temporal average') - return ds.mean(dim='time', skipna=True).astype(np.float32) + + attrs = get_attrs(ds) + + ds = set_fill_value_to_nan(ds) + ds = ds.mean(dim='time', skipna=True).astype(np.float32) + ds = set_attrs(ds, attrs) + + return ds + + +def compute_min(ds: xr.Dataset): + """Compute the minimum value for each timestep. + + Args: + ds (xr.Dataset): Dataset with (time, lat, lon) dimensions. + + Returns: + Dataset with the minimum value for each timestep. + """ + logger.info('compute min') + + attrs = get_attrs(ds) + + ds = set_fill_value_to_nan(ds) + ds = ds.min(dim=('lat', 'lon'), skipna=True).astype(np.float32) + ds = set_attrs(ds, attrs) + + return ds + +def compute_max(ds: xr.Dataset): + """Compute the minimum value for each timestep. + + Args: + ds (xr.Dataset): Dataset with (time, lat, lon) dimensions. + + Returns: + Dataset with the minimum value for each timestep. 
+ """ + logger.info('compute max') + + attrs = get_attrs(ds) + + ds = set_fill_value_to_nan(ds) + ds = ds.max(dim=('lat', 'lon'), skipna=True).astype(np.float32) + ds = set_attrs(ds, attrs) + + return ds + + +def compute_sum(ds: xr.Dataset): + """Compute the sum over lat/lon for each timestep. + + Args: + ds (xr.Dataset): Dataset with (time, lat, lon) dimensions. + + Returns: + Dataset with the sum over lat/lon for each timestep. + """ + logger.info('compute sum') + + attrs = get_attrs(ds) + + ds = set_fill_value_to_nan(ds) + ds = ds.sum(dim=('lat', 'lon'), skipna=True).astype(np.float32) + ds = set_attrs(ds, attrs) + + return ds def count_values(ds: xr.Dataset, dim: list | None = None) -> xr.Dataset: From a9cf2a7618bb3f3c96c3c98e8946ab80dd763315 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 27 Jan 2026 14:41:09 +0100 Subject: [PATCH 140/162] Add compute_aggregation and refactor extractions again --- isimip_utils/extractions.py | 151 +++++++++++++++---------- isimip_utils/tests/constants.py | 2 +- isimip_utils/tests/test_extractions.py | 76 ++++++------- testing/setup.py | 22 ++-- 4 files changed, 141 insertions(+), 110 deletions(-) diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py index 2f34023..a1e50d5 100644 --- a/isimip_utils/extractions.py +++ b/isimip_utils/extractions.py @@ -1,6 +1,8 @@ """Data extraction and manipulation utilities for xarray datasets.""" import logging +from collections.abc import Iterable from datetime import datetime +from typing import Literal import numpy as np import xarray as xr @@ -177,123 +179,148 @@ def mask_mask(ds: xr.Dataset, mask_ds: xr.Dataset, mask_var: str = 'mask', return ds.where(np.isclose(mask_ds[mask_var], 0 if inverse else 1)) -def compute_spatial_average(ds: xr.Dataset, weights: xr.DataArray | None = None) -> xr.Dataset: - """Compute the spatial average over lat/lon dimensions for each timestep. 
+def compute_aggregation(ds: xr.Dataset, type: Literal['mean', 'min', 'max', 'sum', 'std'], + dim: str | Iterable | None = None, weights: xr.DataArray | None = None) -> xr.Dataset: + """Compute aggregated values over selected dimensions and add dummy dimensions like CDO. Args: - ds (xr.Dataset): Dataset with lat/lon dimensions. - weights (xr.DataArray | None): Weights for averaging. If None, uses latitude-dependent weights. + ds (xr.Dataset): Dataset to process. + type (str): Type of aggregation. + dim (str|Iterable): Dimensions along which apply mean [default: ('lat', 'lon')] + weights (xr.DataArray | None): Weights for averaging over lat/lon. If None, uses latitude-dependent weights. Returns: - Dataset with lat/lon dimensions averaged out. + Dataset with aggregated values over selected dimensions. """ - logger.info('compute spatial average') + dim = dim or ('lat', 'lon') + dim_expand = {d: [0] for d in ([dim] if isinstance(dim, str) else dim)} + dim_transpose = list(ds.dims) - if weights is None: - logger.warn('no weights provided, using latitude-dependent weights') - weights = np.sin(np.deg2rad(ds.lat + 0.25)) - np.sin(np.deg2rad(ds.lat - 0.25)) + logger.info('compute %s %s', type, dim) attrs = get_attrs(ds) ds = set_fill_value_to_nan(ds) - ds = ds.weighted(weights).mean(dim=('lat', 'lon'), skipna=True).astype(np.float32) + + if type in ('mean', 'std', 'sum') and dim == ('lat', 'lon'): + if weights is None: + logger.warn('no weights provided, using latitude-dependent weights') + weights = np.sin(np.deg2rad(ds.lat + 0.25)) - np.sin(np.deg2rad(ds.lat - 0.25)) + + ds = ds.weighted(weights) + + if type == 'mean': + ds = ds.mean(dim=dim, skipna=True) + elif type == 'std': + ds = ds.std(dim=dim, skipna=True) + elif type == 'sum': + ds = ds.sum(dim=dim, skipna=True) + elif type == 'min': + ds = ds.min(dim=dim, skipna=True) + elif type == 'max': + ds = ds.max(dim=dim, skipna=True) + else: + raise RuntimeError(f'unknown type "{type}" in compute_aggregation') + + ds = 
ds.expand_dims(**dim_expand).transpose(*dim_transpose).astype(np.float32) ds = set_attrs(ds, attrs) return ds -def compute_temporal_average(ds: xr.Dataset) -> xr.Dataset: - """Compute the temporal average over time dimension. +def compute_mean(ds: xr.Dataset, dim: str | Iterable | None = None, weights: xr.DataArray | None = None) -> xr.Dataset: + """ + Compute mean values over selected dimensions and add dummy dimensions like CDO. Wrapper for compute_aggregation. Args: - ds (xr.Dataset): Dataset with time dimension. + ds (xr.Dataset): Dataset to process. + dim (str|Iterable): Dimensions along which apply mean [default: ('lat', 'lon')] + weights (xr.DataArray | None): Weights for averaging over lat/lon. If None, uses latitude-dependent weights. Returns: - Dataset with time dimension averaged out. + Dataset with mean values over selected dimensions. """ - logger.info('compute temporal average') - - attrs = get_attrs(ds) - - ds = set_fill_value_to_nan(ds) - ds = ds.mean(dim='time', skipna=True).astype(np.float32) - ds = set_attrs(ds, attrs) - - return ds + return compute_aggregation(ds, 'mean', dim, weights) -def compute_min(ds: xr.Dataset): - """Compute the minimum value for each timestep. +def compute_std(ds: xr.Dataset, dim: str | Iterable | None = None, weights: xr.DataArray | None = None) -> xr.Dataset: + """ + Compute the standard deviation over selected dimensions and add dummy dimensions like CDO. + Wrapper for compute_aggregation. Args: - ds (xr.Dataset): Dataset with (time, lat, lon) dimensions. + ds (xr.Dataset): Dataset to process. + dim (str|Iterable): Dimensions along which apply mean [default: ('lat', 'lon')] + weights (xr.DataArray | None): Weights for averaging over lat/lon. If None, uses latitude-dependent weights. Returns: - Dataset with the minimum value for each timestep. + Dataset with the standard deviation over selected dimensions. 
""" - logger.info('compute min') + return compute_aggregation(ds, 'std', dim, weights) - attrs = get_attrs(ds) - ds = set_fill_value_to_nan(ds) - ds = ds.min(dim=('lat', 'lon'), skipna=True).astype(np.float32) - ds = set_attrs(ds, attrs) - - return ds - -def compute_max(ds: xr.Dataset): - """Compute the minimum value for each timestep. +def compute_sum(ds: xr.Dataset, dim: str | Iterable | None = None, weights: xr.DataArray | None = None) -> xr.Dataset: + """ + Compute the sum over selected dimensions and add dummy dimensions like CDO. Wrapper for compute_aggregation. Args: - ds (xr.Dataset): Dataset with (time, lat, lon) dimensions. + ds (xr.Dataset): Dataset to process. + dim (str|Iterable): Dimensions along which apply mean [default: ('lat', 'lon')] + weights (xr.DataArray | None): Weights for averaging over lat/lon. If None, uses latitude-dependent weights. Returns: - Dataset with the minimum value for each timestep. + Dataset with the sum over selected dimensions. """ - logger.info('compute max') + return compute_aggregation(ds, 'sum', dim, weights) - attrs = get_attrs(ds) - - ds = set_fill_value_to_nan(ds) - ds = ds.max(dim=('lat', 'lon'), skipna=True).astype(np.float32) - ds = set_attrs(ds, attrs) - - return ds - -def compute_sum(ds: xr.Dataset): - """Compute the sum over lat/lon for each timestep. +def compute_min(ds: xr.Dataset, dim: str | Iterable | None = None) -> xr.Dataset: + """ + Compute minimum values over selected dimensions and add dummy dimensions like CDO. Wrapper for compute_aggregation. Args: - ds (xr.Dataset): Dataset with (time, lat, lon) dimensions. + ds (xr.Dataset): Dataset to process. + dim (str|Iterable): Dimensions along which apply mean [default: ('lat', 'lon')] + weights (xr.DataArray | None): Weights for averaging over lat/lon. If None, uses latitude-dependent weights. Returns: - Dataset with the sum over lat/lon for each timestep. + Dataset with minimum values over selected dimensions. 
""" - logger.info('compute sum') + return compute_aggregation(ds, 'min', dim) - attrs = get_attrs(ds) - ds = set_fill_value_to_nan(ds) - ds = ds.sum(dim=('lat', 'lon'), skipna=True).astype(np.float32) - ds = set_attrs(ds, attrs) +def compute_max(ds: xr.Dataset, dim: str | Iterable | None = None) -> xr.Dataset: + """ + Compute maximum values over selected dimensions and add dummy dimensions like CDO. Wrapper for compute_aggregation. - return ds + Args: + ds (xr.Dataset): Dataset to process. + dim (str|Iterable): Dimensions along which apply mean [default: ('lat', 'lon')] + weights (xr.DataArray | None): Weights for averaging over lat/lon. If None, uses latitude-dependent weights. + + Returns: + Dataset with maximum values over selected dimensions. + """ + return compute_aggregation(ds, 'max', dim) -def count_values(ds: xr.Dataset, dim: list | None = None) -> xr.Dataset: +def count_values(ds: xr.Dataset, dim: str | Iterable | None = None) -> xr.Dataset: """Count non-NaN values over lat/lon dimensions. Args: ds (xr.Dataset): Dataset with lat/lon dimensions. - dim (list): Dimensions along which to count [default: ('lat', 'lon')] + dim (str|Iterable): Dimensions along which to count [default: ('lat', 'lon')] Returns: Dataset with count of non-NaN values per time step. 
""" - logger.info('count values') dim = dim or ('lat', 'lon') - return ds.count(dim=dim).astype(np.float32) + logger.info('count values over %s', dim) + + ds = set_fill_value_to_nan(ds) + ds = ds.count(dim=dim).astype(np.float32) + + return ds def concat_extraction(ds1: xr.Dataset | None, ds2: xr.Dataset) -> xr.Dataset: diff --git a/isimip_utils/tests/constants.py b/isimip_utils/tests/constants.py index 6d02fbf..a3b977f 100644 --- a/isimip_utils/tests/constants.py +++ b/isimip_utils/tests/constants.py @@ -42,7 +42,7 @@ DATE = date(2021, 1, 1) PERIOD = date(2021, 4, 1), date(2021, 9, 30) -BBOX = (0, 10, -5, 5) +BBOX = (70, 80, -5, 5) POINT = (52.395833, 13.061389) POINT_INDEX = (386, 75) diff --git a/isimip_utils/tests/test_extractions.py b/isimip_utils/tests/test_extractions.py index cd3d67c..f972f4f 100644 --- a/isimip_utils/tests/test_extractions.py +++ b/isimip_utils/tests/test_extractions.py @@ -2,8 +2,12 @@ import pytest from isimip_utils.extractions import ( - compute_spatial_average, - compute_temporal_average, + compute_aggregation, + compute_max, + compute_mean, + compute_min, + compute_std, + compute_sum, concat_extraction, count_values, mask_bbox, @@ -14,7 +18,7 @@ select_time, ) from isimip_utils.tests import constants, helper -from isimip_utils.xarray import get_attrs, open_dataset, set_attrs, write_dataset +from isimip_utils.xarray import open_dataset, write_dataset @pytest.mark.parametrize('decode_cf', (True, False)) @@ -213,57 +217,48 @@ def test_mask_mask_concat(decode_cf): helper.call(f'cdo diff {extraction_path} {cdo_path}') +@pytest.mark.parametrize('type', ('mean', 'min', 'max', 'sum', 'std')) @pytest.mark.parametrize('decode_cf', (True, False)) -def test_compute_spatial_average(decode_cf): +def test_compute_aggregation(type, decode_cf): gridarea_path = constants.SHARE_PATH / 'gridarea.nc' gridarea_ds = open_dataset(gridarea_path) west, east, south, north = constants.BBOX dataset_path = constants.DATASETS_PATH / constants.TAS_PATH - 
extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox-mean_') + extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', f'_select-bbox-{type}_') extraction_path.unlink(missing_ok=True) with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: - attrs = get_attrs(file_ds) ds = select_bbox(file_ds, west, east, south, north) - ds = compute_spatial_average(ds, weights=gridarea_ds["cell_area"]) - ds = set_attrs(ds, attrs) - write_dataset(ds, extraction_path) - - cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox-mean-cdo_') - helper.call(f'cdo diff {extraction_path} {cdo_path}') - - -@pytest.mark.parametrize('decode_cf', (True, False)) -def test_compute_spatial_average_concat(decode_cf): - gridarea_path = constants.SHARE_PATH / 'gridarea.nc' - gridarea_ds = open_dataset(gridarea_path) - - west, east, south, north = constants.BBOX - - extraction_ds = None - for path in constants.TAS_SPLIT_PATHS: - dataset_path = constants.DATASETS_PATH / path - with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: - attrs = get_attrs(file_ds) - ds = select_bbox(file_ds, west, east, south, north) - ds = compute_spatial_average(ds, weights=gridarea_ds["cell_area"]) - ds = set_attrs(ds, attrs) - extraction_ds = concat_extraction(extraction_ds, ds) + if type == 'mean': + ds = compute_mean(ds, weights=gridarea_ds['cell_area']) + elif type == 'std': + ds = compute_std(ds, weights=gridarea_ds['cell_area']) + elif type == 'sum': + ds = compute_sum(ds, weights=gridarea_ds['cell_area']) + elif type == 'max': + ds = compute_max(ds) + elif type == 'min': + ds = compute_min(ds) - extraction_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox-mean_') - extraction_path.unlink(missing_ok=True) + write_dataset(ds, extraction_path) - write_dataset(extraction_ds, extraction_path) + # allow for a small relative difference, translated 
into an absolute difference + if type == 'sum': + abslim = 3.36e+07 + elif type == 'std': + abslim = 1e-7 + else: + abslim = 0.0 - cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox-mean-cdo_') - helper.call(f'cdo diff {extraction_path} {cdo_path}') + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', f'_select-bbox-{type}-cdo_') + helper.call(f'cdo diff,abslim={abslim} {extraction_path} {cdo_path}') @pytest.mark.parametrize('decode_cf', (True, False)) -def test_compute_temporal_average(decode_cf): +def test_compute_mean_time(decode_cf): west, east, south, north = constants.BBOX dataset_path = constants.DATASETS_PATH / constants.TAS_PATH @@ -271,14 +266,15 @@ def test_compute_temporal_average(decode_cf): extraction_path.unlink(missing_ok=True) with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: - attrs = get_attrs(file_ds) ds = select_bbox(file_ds, west, east, south, north) - ds = compute_temporal_average(ds) - ds = set_attrs(ds, attrs) + ds = compute_aggregation(ds, 'mean', dim='time') write_dataset(ds, extraction_path) + # allow for a small relative difference, translated into an absolute difference + abslim = 2.45e-4 + cdo_path = constants.EXTRACTIONS_PATH / constants.TAS_PATH.replace('_global_', '_select-bbox-map-cdo_') - helper.call(f'cdo diff,abslim=0.001 {extraction_path} {cdo_path}') + helper.call(f'cdo diff,abslim={abslim} {extraction_path} {cdo_path}') @pytest.mark.parametrize('decode_cf', (True, False)) diff --git a/testing/setup.py b/testing/setup.py index ff72485..208e19a 100755 --- a/testing/setup.py +++ b/testing/setup.py @@ -9,7 +9,7 @@ def main(): run_select_period() run_select_point() run_select_bbox() - run_select_bbox_mean() + run_select_bbox_aggregations() run_select_bbox_map() run_mask_bbox() run_mask_mask() @@ -126,18 +126,26 @@ def run_select_bbox(): f'-sellonlatbox,{west},{east},{south},{north} {input_path} {output_path}') -def run_select_bbox_mean(): +def 
run_select_bbox_aggregations(): west, east, south, north = constants.BBOX for path in [constants.TAS_PATH, *constants.TAS_SPLIT_PATHS]: input_path = constants.DATASETS_PATH / path - output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', '_select-bbox-mean-cdo_') - output_path.parent.mkdir(parents=True, exist_ok=True) + for aggregation in ['mean', 'min', 'max', 'sum', 'std']: + output_path = constants.EXTRACTIONS_PATH / path.replace('_global_', f'_select-bbox-{aggregation}-cdo_') + output_path.parent.mkdir(parents=True, exist_ok=True) - if not output_path.exists(): - helper.call('cdo -f nc4c -z zip_5 -L -fldmean ' - f'-sellonlatbox,{west},{east},{south},{north} {input_path} {output_path}') + gridarea_path = constants.SHARE_PATH / 'gridarea.nc' + + if not output_path.exists(): + if aggregation == 'sum': + helper.call(f'cdo -f nc4c -z zip_5 -L -fld{aggregation} ' + f'-sellonlatbox,{west},{east},{south},{north} ' + f'-mul {input_path} {gridarea_path} {output_path}') + else: + helper.call(f'cdo -f nc4c -z zip_5 -L -fld{aggregation} ' + f'-sellonlatbox,{west},{east},{south},{north} {input_path} {output_path}') def run_select_bbox_map(): From a5e0a318c70ef2fbe1562c9fa2a15413b77ff58a Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 27 Jan 2026 15:04:20 +0100 Subject: [PATCH 141/162] Fix convert_time --- isimip_utils/xarray.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index cf21d17..47ca8d8 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -417,19 +417,18 @@ def convert_time(time: np.ndarray, units='days since 1601-1-1 00:00:00', calenda Returns: time (np.ndarray): Time coordinate array as np.float64. 
""" - if np.issubdtype(time.dtype, np.floating) or np.issubdtype(time.dtype, np.integer): - return time.astype(np.float64) - - if np.issubdtype(time.dtype, np.datetime64): - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - if isinstance(time, pd.core.indexes.datetimes.DatetimeIndex): - time = time.to_pydatetime() - else: - time = time.dt.to_pydatetime() - - if time.dtype == 'object' and isinstance(time[0], str): + if isinstance(time.dtype, pd.StringDtype): time = np.array([datetime.fromisoformat(t) for t in time], dtype=object) + else: + if np.issubdtype(time.dtype, np.floating) or np.issubdtype(time.dtype, np.integer): + return time.astype(np.float64) + elif np.issubdtype(time.dtype, np.datetime64): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + if isinstance(time, pd.core.indexes.datetimes.DatetimeIndex): + time = time.to_pydatetime() + else: + time = time.dt.to_pydatetime() return cftime.date2num( time, calendar=calendar, units=units From 6902aae834cdbfc74fb8fae8c2a4a010b67503ba Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 27 Jan 2026 16:40:47 +0100 Subject: [PATCH 142/162] Remove layer option from plot_grid --- isimip_utils/plot.py | 8 +------- isimip_utils/tests/test_plot.py | 8 ++++---- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index b028134..3edfd90 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -312,7 +312,7 @@ def plot_map(df: pd.DataFrame, color_field: str | None = None, color_type: str | def plot_grid(grid_permutations: list[tuple], plot_permutations: list[tuple], plots: dict, empty_plot: alt.Chart, - layer: bool = True, x: str = 'shared', y: str = 'shared', color: str = 'shared') -> alt.Chart: + x: str = 'shared', y: str = 'shared', color: str = 'shared') -> alt.Chart: """Create a grid of plots organized by parameter permutations. 
Args: @@ -320,7 +320,6 @@ def plot_grid(grid_permutations: list[tuple], plot_permutations: list[tuple], pl plot_permutations (list): List the permutations (with tuples of parameters) for each plot. plots (dict): Dictionary mapping permutation tuples to Chart objects. empty_plot (alt.Chart): Chart to use when a permutation has no data. - layer (bool): Whether to layer plots or concatenate vertically (default: True). x (str): Scale resolution for x-axis ('shared', 'independent', default: 'shared'). y (str): Scale resolution for y-axis ('shared', 'independent', default: 'shared'). color (str): Scale resolution for color ('shared', 'independent', default: 'shared'). @@ -347,9 +346,6 @@ def plot_grid(grid_permutations: list[tuple], plot_permutations: list[tuple], pl for plot_permutation in plot_permutations: plot = plots.get(grid_permutation + plot_permutation, empty_plot) - if not layer: - plot = plot.properties(title=' '.join(plot_permutation)) - column.append(plot) prev = grid_permutation @@ -357,8 +353,6 @@ def plot_grid(grid_permutations: list[tuple], plot_permutations: list[tuple], pl chart = alt.vconcat(*[ alt.hconcat(*[ alt.layer(*column, title=column_title) - if layer else - alt.vconcat(*column, title=column_title).resolve_scale(x=x, y=y, color=color) for column_title, column in row ], title=row_title).resolve_scale(x=x, y=y) for row_title, row in rows diff --git a/isimip_utils/tests/test_plot.py b/isimip_utils/tests/test_plot.py index a785ea9..a732821 100644 --- a/isimip_utils/tests/test_plot.py +++ b/isimip_utils/tests/test_plot.py @@ -236,7 +236,7 @@ def test_plot_grid(): grid_permutations.append(('b', 'y')) - chart = plot_grid(grid_permutations, plot_permutations, plots, x='independent', empty_plot=empty_plot, layer=False) + chart = plot_grid(grid_permutations, plot_permutations, plots, x='independent', empty_plot=empty_plot) top, bottom = chart.vconcat top_left, top_right = top.hconcat @@ -247,9 +247,9 @@ def test_plot_grid(): assert 
bottom_left.data.equals(dataframes[2]) assert bottom_right.data.equals(df_empty) - for sub_chart in [top_left, top_right, bottom_left, bottom_right]: - assert sub_chart.resolve.scale.x == 'independent' - assert sub_chart.resolve.scale.y == 'shared' + for compound_chart in [chart, top, bottom]: + assert compound_chart.resolve.scale.x == 'independent' + assert compound_chart.resolve.scale.y == 'shared' save_plot(chart, plot_path) From 853a98aa6dd80e95ca04c92088b43e98ba1ecf85 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 27 Jan 2026 16:41:07 +0100 Subject: [PATCH 143/162] Add title for empty plots again --- isimip_utils/plot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 3edfd90..ece38d6 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -166,7 +166,7 @@ def plot_line(df: pd.DataFrame, x_field: str | None = None, x_label: str | None x_type = x_type or ('T' if get_first_coord_axis(df) == 'T' else 'Q') x = alt.X( f'{x_field}:{x_type}', - title=x_label if not empty else "" + title=x_label ) y_field = y_field or get_first_data_var(df) @@ -174,7 +174,7 @@ def plot_line(df: pd.DataFrame, x_field: str | None = None, x_label: str | None y_type = y_type or 'Q' y = alt.Y( f'{y_field}:{y_type}', - title=y_label if not empty else "", + title=y_label, axis=alt.Axis(format=y_format) if y_format else alt.Axis(), scale=alt.Scale(zero=False, nice=False) ) From d23fbabda0d5b9c53312f9ba54ed71e0f8c70fba Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 27 Jan 2026 17:53:56 +0100 Subject: [PATCH 144/162] Fix build_default_args --- isimip_utils/cli.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index f148460..711382f 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -228,10 +228,8 @@ def read_local_config(self, config_path) -> dict: def build_default_args(self, config_path=None) -> argparse.Namespace: # read 
config file(s) - config = dict( - **self.read_global_config(), - **self.read_local_config(config_path) - ) + config = self.read_global_config() + config.update(self.read_local_config(config_path)) # init the default namespace default_args = argparse.Namespace() From 56097aa0cbc56f08596a6475c3c54c955d57ff19 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 28 Jan 2026 16:41:36 +0100 Subject: [PATCH 145/162] Update docs and README.md --- .github/workflows/ci.yaml | 2 +- .gitignore | 2 + README.md | 102 ++++++++++++++++++++++++-------------- docs/api/decorators.md | 3 -- docs/api/parameters.md | 3 ++ docs/api/protocol.md | 3 ++ docs/examples.md | 2 + docs/index.md | 100 ++++++++++++++++++++++++++++--------- docs/prerequisites.md | 18 ++++--- docs/releases.md | 73 --------------------------- isimip_utils/protocol.py | 2 +- mkdocs.yml | 8 +-- pyproject.toml | 1 + 13 files changed, 170 insertions(+), 149 deletions(-) delete mode 100644 docs/api/decorators.md create mode 100644 docs/api/parameters.md create mode 100644 docs/api/protocol.md create mode 100644 docs/examples.md delete mode 100644 docs/releases.md diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index fdd09d3..5c9fd7f 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1,4 +1,4 @@ -name: isimip-utils CI +name: CI on: push: diff --git a/.gitignore b/.gitignore index 553f72e..26c23b3 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,8 @@ __pycache__/ /.coverage /htmlcov +/site + /testing/datasets /testing/extractions /testing/output diff --git a/README.md b/README.md index 77d5cb6..ac1a29d 100644 --- a/README.md +++ b/README.md @@ -1,55 +1,85 @@ ISIMIP utils ============ -[![Latest release](https://shields.io/github/v/release/ISI-MIP/isimip-utils)](https://github.com/ISI-MIP/isimip-utils/releases) -[![PyPI Release](https://img.shields.io/pypi/v/isimip-utils)](https://pypi.org/project/isimip-utils/) -[![Python 
Version](https://img.shields.io/badge/python->=3.8-blue)](https://www.python.org/) -[![License](https://img.shields.io/badge/license-MIT-green)](https://github.com/ISI-MIP/django-datacite/blob/master/LICENSE) - -This package contains common functionality for different ISIMIP tools, namely: - -* https://github.com/ISI-MIP/isimip-publisher -* https://github.com/ISI-MIP/isimip-qa -* https://github.com/ISI-MIP/isimip-qc - -It comprises of: - -* `isimip_utils.checksum`: Functions to compute the SHA-512 checksum of a file. -* `isimip_utils.cli`: Command-line interface utilities for argument parsing and configuration. -* `isimip_utils.config`: A settings class to combine input from `argparse`, the environment (via `python-dotenv`) and config files. -* `isimip_utils.decorators`: Decorators including a cached property implementation. -* `isimip_utils.exceptions`: Custom exceptions for ISIMIP tools. -* `isimip_utils.extractions`: Data extraction and manipulation utilities for xarray datasets. -* `isimip_utils.fetch`: Functions to fetch files from the machine-actionable ISIMIP protocols. -* `isimip_utils.files`: File search utilities with regex pattern matching. -* `isimip_utils.netcdf`: Functions to open and read NetCDF files using netCDF4. -* `isimip_utils.pandas`: DataFrame utilities for ISIMIP data processing. -* `isimip_utils.patterns`: Functions to match file names and extract ISIMIP specifiers. -* `isimip_utils.plot`: Plotting utilities using Altair for data visualization. -* `isimip_utils.utils`: Additional utility functions. -* `isimip_utils.xarray`: Functions for working with xarray datasets. 
+[![Python Version](https://img.shields.io/badge/python->=3.11-blue)](https://www.python.org/) +[![License](https://img.shields.io/github/license/ISI-MIP/isimip-utils?style=flat)](https://github.com/ISI-MIP/isimip-utils/blob/main/LICENSE) +[![CI status](https://github.com/ISI-MIP/isimip-utils/actions/workflows/ci.yaml/badge.svg)](https://github.com/ISI-MIP/isimip-utils/actions/workflows/ci.yaml) +[![Coverage status](https://coveralls.io/repos/ISI-MIP/isimip-utils/badge.svg?branch=main&service=github)](https://coveralls.io/github/ISI-MIP/isimip-utils?branch=main) +[![Latest release](https://img.shields.io/pypi/v/isimip-utils.svg?style=flat)](https://pypi.python.org/pypi/isimip-utils/) + + +[ISIMIP](https://isimip.org) offers a framework for consistently projecting the impacts +of climate change across affected sectors and spatial scales. An international network +of climate-impact modellers contribute to a comprehensive and consistent picture of the +world under different climate-change scenarios. + +This package contains various utility methods for use in custom scripts as well +as in different ISIMIP tools: + +* [ISIMIP quality control](https://github.com/ISI-MIP/isimip-qc) +* [ISIMIP quality assurance](https://github.com/ISI-MIP/isimip-qa) +* [ISIMIP publisher](https://github.com/ISI-MIP/isimip-publisher) + + +The different methods are [documented here](docs/index.md). Setup -===== +----- -Working on the package requires a running Python3 on your system. Installing those prerequisites is covered [here](https://github.com/ISI-MIP/isimip-utils/blob/master/docs/releases.md). +Using the package requires a running Python 3 on your system. The installation for different systems is covered +[here](docs/prerequisites.md). -The package itself can be installed via pip: +Unless you already use an environment manager (e.g.
`conda` or `uv`), it is highly recommended to use a +[virtual environment](https://docs.python.org/3/library/venv.html), which can be created using: -``` -pip install isimip-utils +```bash +python3 -m venv env +source env/bin/activate # needs to be invoked in every new terminal session ``` -The package can also be installed directly from GitHub: +The package itself can be installed via `pip`: -``` -pip install git+https://github.com/ISI-MIP/isimip-utils +```bash +pip install isimip-utils ``` For a development setup, the repo should be cloned and installed in *editable* mode: -``` +```bash git clone git@github.com:ISI-MIP/isimip-utils pip install -e isimip-utils ``` + + +Usage +----- + +Once installed, the modules can be used like any other Python library, e.g. in order to create an ISIMIP +compliant NetCDF file, you can use: + +```python +from isimip_utils.xarray import init_dataset, write_dataset + +time = np.arange(0, 365, dtype=np.float64) +var = np.ones((365, 360, 720), dtype=np.float32) + +attrs={ + 'global': { + 'contact': 'mail@example.com' + }, + 'var': { + 'standard_name': 'var', + 'long_name': 'Variable', + 'units': '1', + } +} + +# create an xarray.Dataset +ds = init_dataset(time=time, var=var, attrs=attrs) + +# write the dataset as NetCDF file +write_dataset(ds, 'output.nc') +``` + +Please also note our [examples page](examples.md) and the [API reference](api.md). 
diff --git a/docs/api/decorators.md b/docs/api/decorators.md deleted file mode 100644 index 0747bff..0000000 --- a/docs/api/decorators.md +++ /dev/null @@ -1,3 +0,0 @@ -# isimip_utils.decorators - -::: isimip_utils.decorators diff --git a/docs/api/parameters.md b/docs/api/parameters.md new file mode 100644 index 0000000..ac1a072 --- /dev/null +++ b/docs/api/parameters.md @@ -0,0 +1,3 @@ +# isimip_utils.parameters + +::: isimip_utils.parameters diff --git a/docs/api/protocol.md b/docs/api/protocol.md new file mode 100644 index 0000000..7f986e0 --- /dev/null +++ b/docs/api/protocol.md @@ -0,0 +1,3 @@ +# isimip_utils.protocol + +::: isimip_utils.protocol diff --git a/docs/examples.md b/docs/examples.md new file mode 100644 index 0000000..bac945d --- /dev/null +++ b/docs/examples.md @@ -0,0 +1,2 @@ +Examples +======== diff --git a/docs/index.md b/docs/index.md index 89da52e..60c38a2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,53 +1,107 @@ ISIMIP utils ============ +[ISIMIP](https://isimip.org) offers a framework for consistently projecting the impacts +of climate change across affected sectors and spatial scales. An international network +of climate-impact modellers contribute to a comprehensive and consistent picture of the +world under different climate-change scenarios. + Overview -------- -This package contains common functionality for different ISIMIP tools, namely: +This package contains various utility methods for use in custom scripts as well +as in different ISIMIP tools: + +* [ISIMIP quality control](https://github.com/ISI-MIP/isimip-qc) +* [ISIMIP quality assurance](https://github.com/ISI-MIP/isimip-qa) +* [ISIMIP publisher](https://github.com/ISI-MIP/isimip-publisher) + +The following modules contain high-level method to extract data (e.g. 
aggregated time series of points, areas, shapes) +from global ISIMIP data sets and create gridded plots visualizing the data: + +* [`isimip_utils.extractions`](api/extractions.md): Create extractions using [Xarray](https://docs.xarray.dev). +* [`isimip_utils.plot`](api/plot.md): Plotting utilities using [Vega-Altair](https://altair-viz.github.io). + +Lower-level functions are provided to interact with the data sets and customize `xarray`, `pandas`, and `netcdf` +for ISIMIP conventions. + +* [`isimip_utils.xarray`](api/xarray.md): Functions for working with `xarray` datasets. +* [`isimip_utils.netcdf`](api/netcdf.md): Functions to open and read NetCDF files using netCDF4. +* [`isimip_utils.pandas`](api/pandas.md): Pandas utilities for ISIMIP data processing. -* https://github.com/ISI-MIP/isimip-publisher -* https://github.com/ISI-MIP/isimip-qa -* https://github.com/ISI-MIP/isimip-qc +Two modules focus on the interface to the [machine-readable ISIMIP protocol](https://protocol.isimip.org): -It comprises of: +* [`isimip_utils.patterns`](api/patterns.md): Functions to match file names and extract ISIMIP specifiers. +* [`isimip_utils.protocol`](api/protocol.md): Functions to fetch information from machine-actionable ISIMIP protocols. -* [`isimip_utils.checksum`](api/checksum.md): Functions to compute the SHA-512 checksum of a file. +The remaining modules contain utility functions which are used by the other modules or by the ISIMIP tools mentioned above: + +* [`isimip_utils.checksum`](api/checksum.md): Checksum computation utilities for file integrity verification. * [`isimip_utils.cli`](api/cli.md): Command-line interface utilities for argument parsing and configuration. -* [`isimip_utils.config`](api/config.md): A settings class to combine input from `argparse`, the environment (via `python-dotenv`) and config files. -* [`isimip_utils.decorators`](api/decorators.md): Decorators including a cached property implementation. 
+* [`isimip_utils.config`](api/config.md): A `Settings` class for command-line interface utilities. * [`isimip_utils.exceptions`](api/exceptions.md): Custom exceptions for ISIMIP tools. -* [`isimip_utils.extractions`](api/extractions.md): Data extraction and manipulation utilities for xarray datasets. -* [`isimip_utils.fetch`](api/fetch.md): Functions to fetch files from the machine-actionable ISIMIP protocols. +* [`isimip_utils.fetch`](api/fetch.md): Functions to fetch files from urls or local paths. * [`isimip_utils.files`](api/files.md): File search utilities with regex pattern matching. -* [`isimip_utils.netcdf`](api/netcdf.md): Functions to open and read NetCDF files using netCDF4. -* [`isimip_utils.pandas`](api/pandas.md): DataFrame utilities for ISIMIP data processing. -* [`isimip_utils.patterns`](api/patterns.md): Functions to match file names and extract ISIMIP specifiers. -* [`isimip_utils.plot`](api/plot.md): Plotting utilities using Altair for data visualization. +* [`isimip_utils.parameters`](api/parameters.md): Utility functions for the work with parameters and placeholders. * [`isimip_utils.utils`](api/utils.md): Additional utility functions. -* [`isimip_utils.xarray`](api/xarray.md): Functions for working with xarray datasets. Setup ----- -Working on the package requires a running Python3 on your system. Installing those prerequisites is covered [here](https://github.com/ISI-MIP/isimip-utils/blob/master/docs/releases.md). +Using the package requires a running Python 3 on your system. The installation for different systems is covered +[here](https://github.com/ISI-MIP/isimip-utils/blob/master/docs/releases.md). -The package itself can be installed via pip: +Unless you already use an environment manager (e.g. 
`conda` or `uv`), it is highly recommended to use a +[virtual environment](https://docs.python.org/3/library/venv.html), which can be created using: -``` -pip install isimip-utils +```bash +python3 -m venv env +source env/bin/activate # needs to be invoked in every new terminal session ``` -The package can also be installed directly from GitHub: +The package itself can be installed via `pip`: -``` -pip install git+https://github.com/ISI-MIP/isimip-utils +```bash +pip install isimip-utils ``` For a development setup, the repo should be cloned and installed in *editable* mode: -``` +```bash git clone git@github.com:ISI-MIP/isimip-utils pip install -e isimip-utils ``` + + +Usage +----- + +Once installed, the modules can be used like any other Python library, e.g. in order to create an ISIMIP +compliant NetCDF file, you can use: + +```python +from isimip_utils.xarray import init_dataset, write_dataset + +time = np.arange(0, 365, dtype=np.float64) +var = np.ones((365, 360, 720), dtype=np.float32) + +attrs={ + 'global': { + 'contact': 'mail@example.com' + }, + 'var': { + 'standard_name': 'var', + 'long_name': 'Variable', + 'units': '1', + } +} + +# create an xarray.Dataset +ds = init_dataset(time=time, var=var, attrs=attrs) + +# write the dataset as NetCDF file +write_dataset(ds, 'output.nc') +``` + +Please also note our page with additional [examples](examples.md) and the [API reference](api.md). diff --git a/docs/prerequisites.md b/docs/prerequisites.md index 482f828..fc5a6e7 100644 --- a/docs/prerequisites.md +++ b/docs/prerequisites.md @@ -1,9 +1,11 @@ -Prerequisites -------------- +Python installation +------------------- -The installation of Python (and its developing packages) differs from operating system to operating system. Optional Git is needed if a package is installed directly from GitHub. +Using the package requires a running Python 3 on your system. The installation of Python (and its development +packages) differs from operating system to operating system. 
Optionally, Git is needed if a package is installed +directly from GitHub. -### Linux +## Linux On Linux, Python3 is probably already installed, but the development packages are usually not. Optionally, Git can be installed as well. You should be able to install all prerequisites using: @@ -21,7 +23,7 @@ sudo zypper install python3 python3-devel sudo zypper install git ``` -### macOS +## macOS While we recommend using [Homebrew](https://brew.sh) to install Python3 on a Mac, other means of obtaining Python like [Anaconda](https://www.anaconda.com/products/individual), [MacPorts](https://www.macports.org/), or [Fink](https://www.finkproject.org/) should work just as fine: @@ -30,9 +32,9 @@ brew install python brew install git ``` -### Windows +## Windows -#### Regular installation +### Regular installation The software prerequisites need to be downloaded and installed from their particular web sites. @@ -47,6 +49,6 @@ For git: All further steps need to be performed using the windows shell `cmd.exe`. You can open it from the Start-Menu. -#### Using the Windows Subsystem for Linux (WSL) +### Using the Windows Subsystem for Linux (WSL) As an alternative for advanced users, you can use the Windows Subsystem for Linux (WSL) to install a Linux distribution within Windows 10. The installation is explained in the [Microsoft documentation](https://docs.microsoft.com/en-us/windows/wsl/install-win10). When using WSL, please install Python3 as explained in the Linux section. diff --git a/docs/releases.md b/docs/releases.md deleted file mode 100644 index 9cab732..0000000 --- a/docs/releases.md +++ /dev/null @@ -1,73 +0,0 @@ -Releases -======== - -[PyPI](https://pypi.org/) releases of this repository, are done using the following steps: - -### Install `build` and `twine` - -``` -pip install build twine -``` - -Create `~/.pypirc` - -``` -[pypi] -username: ... -password: ... - -[testpypi] -repository: https://test.pypi.org/legacy/ -username: ... -password: ... 
-``` - - -### Prepare repo - -1) Ensure tests are passing. - -2) Update version in `isimip_utils/__init__.py`. - -3) Build `sdist` and `bdist_wheel`: - - ``` - python -m build - ``` - -4) Check: - - ``` - twine check dist/* - ``` - - -### Release on Test PyPI - -1) Upload with `twine` to Test PyPI: - - ``` - twine upload -r testpypi dist/* - ``` - -2) Check at . - - -### Release on PyPI - -1) Upload with `twine` to PyPI: - - ``` - twine upload dist/* - ``` - -2) Check at . - - -### Create release on GitHub - -1) Commit local changes. - -2) Push changes. - -3) Create release on . diff --git a/isimip_utils/protocol.py b/isimip_utils/protocol.py index e154add..8f1baca 100644 --- a/isimip_utils/protocol.py +++ b/isimip_utils/protocol.py @@ -1,4 +1,4 @@ -"""Functions to fetch files from machine-actionable ISIMIP protocols.""" +"""Functions to fetch information from machine-actionable ISIMIP protocols.""" import logging import os import re diff --git a/mkdocs.yml b/mkdocs.yml index 2bf4e07..f4dd447 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -13,14 +13,12 @@ plugins: members_order: source nav: - - README: index.md - - Prerequisites: prerequisites.md - - Releases: releases.md + - Getting started: index.md + - Examples: examples.md - API reference: - isimip_utils.checksum: api/checksum.md - isimip_utils.cli: api/cli.md - isimip_utils.config: api/config.md - - isimip_utils.decorators: api/decorators.md - isimip_utils.exceptions: api/exceptions.md - isimip_utils.extractions: api/extractions.md - isimip_utils.fetch: api/fetch.md @@ -28,6 +26,8 @@ nav: - isimip_utils.netcdf: api/netcdf.md - isimip_utils.pandas: api/pandas.md - isimip_utils.patterns: api/patterns.md + - isimip_utils.parameters: api/parameters.md - isimip_utils.plot: api/plot.md - isimip_utils.utils: api/utils.md - isimip_utils.xarray: api/xarray.md + - Python installation: prerequisites.md diff --git a/pyproject.toml b/pyproject.toml index 64fd614..f4e5014 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ 
-114,6 +114,7 @@ pytest = ["pytest"] [tool.typos.default.extend-words] iy = "iy" +arange = "arange" [tool.coverage.report] exclude_lines = [ From 71bdf1ecbe50fa4201dc839387549d9a85829775 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 28 Jan 2026 16:41:55 +0100 Subject: [PATCH 146/162] Add CITATION.cff file --- CITATION.cff | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 CITATION.cff diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..41ca832 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,16 @@ +cff-version: 1.2.0 +message: If you use this software, please cite it as below. + +title: ISIMIP utils +authors: +- family-names: Klar + given-names: Jochen + orcid: https://orcid.org/0000-0002-5883-4273 +- family-names: Büchner + given-names: Matthias + orcid: https://orcid.org/0000-0002-1382-7424 +- family-names: Inga + given-names: Sauer + orcid: https://orcid.org/0000-0002-9302-2131 +license: MIT +repository-code: https://github.com/ISI-MIP/isimip-utils From 254618b3ba07bafea2690ab688cba02988b250f4 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 30 Jan 2026 19:05:58 +0100 Subject: [PATCH 147/162] Fix empty plot handling --- isimip_utils/plot.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index ece38d6..4f1e1a5 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -345,14 +345,15 @@ def plot_grid(grid_permutations: list[tuple], plot_permutations: list[tuple], pl row.append((column_title, column)) for plot_permutation in plot_permutations: - plot = plots.get(grid_permutation + plot_permutation, empty_plot) - column.append(plot) + plot = plots.get(grid_permutation + plot_permutation) + if plot: + column.append(plot) prev = grid_permutation chart = alt.vconcat(*[ alt.hconcat(*[ - alt.layer(*column, title=column_title) + alt.layer(*column, title=column_title) if column else empty_plot for column_title, column in row ], 
title=row_title).resolve_scale(x=x, y=y) for row_title, row in rows From 49fb451b509db49d135707c4772413c5489c2af7 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 30 Jan 2026 19:43:58 +0100 Subject: [PATCH 148/162] Fix plot_line and plot_map for empty arguments --- isimip_utils/plot.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/isimip_utils/plot.py b/isimip_utils/plot.py index 4f1e1a5..7c84158 100644 --- a/isimip_utils/plot.py +++ b/isimip_utils/plot.py @@ -161,17 +161,17 @@ def plot_line(df: pd.DataFrame, x_field: str | None = None, x_label: str | None Altair Chart object with line plot (and optional area for lower/upper bounds). """ - x_field = x_field or get_first_coord(df) - x_label = x_label or get_first_coord_label(df) - x_type = x_type or ('T' if get_first_coord_axis(df) == 'T' else 'Q') + x_field = get_first_coord(df) if x_field is None else x_field + x_label = get_first_coord_label(df) if x_label is None else x_label + x_type = ('T' if get_first_coord_axis(df) == 'T' else 'Q') if x_type is None else x_type x = alt.X( f'{x_field}:{x_type}', title=x_label ) - y_field = y_field or get_first_data_var(df) - y_label = y_label or get_first_data_var_label(df) - y_type = y_type or 'Q' + y_field = get_first_data_var(df) if y_field is None else y_field + y_label = get_first_data_var_label(df) if y_label is None else y_label + y_type = 'Q' if y_type is None else y_type y = alt.Y( f'{y_field}:{y_type}', title=y_label, @@ -179,11 +179,11 @@ def plot_line(df: pd.DataFrame, x_field: str | None = None, x_label: str | None scale=alt.Scale(zero=False, nice=False) ) - color_field = color_field or 'label' + color_field = 'label' if color_field is None else color_field if empty or color_field not in df: color = alt.Color() else: - color_type = color_type or 'N' + color_type = 'N' if color_type is None else color_type color_scale_args = {} if color_domain: color_scale_args['domain'] = color_domain @@ -276,9 +276,9 @@ def 
plot_map(df: pd.DataFrame, color_field: str | None = None, color_type: str | if empty: color = alt.Color() else: - color_field = color_field or get_first_data_var(df) - color_type = color_type or 'Q' - color_label = color_label or get_first_data_var_label(df) + color_field = get_first_data_var(df) if color_field is None else color_field + color_type = 'Q' if color_type is None else color_type + color_label = get_first_data_var_label(df) if color_label is None else color_label color_scale_args = {} if color_domain: From 2b9ec335f20f2e28f002896101f055d64f40a622 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 6 Feb 2026 19:41:40 +0100 Subject: [PATCH 149/162] Remove fetch_resource --- isimip_utils/protocol.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/isimip_utils/protocol.py b/isimip_utils/protocol.py index 8f1baca..3326397 100644 --- a/isimip_utils/protocol.py +++ b/isimip_utils/protocol.py @@ -145,15 +145,6 @@ def fetch_tree(path: str | Path, protocol_locations: str | list[str] = PROTOCOL_ raise NotFound(f'No tree found for {path}.') -def fetch_resource(resource_location: str | Path) -> dict: - resource = fetch_json(resource_location) - - if resource is None: - return resource - - raise NotFound(f'No resource found at {resource_location}.') - - def find_json(protocol_location: str, sub_location: str, path: str | Path) -> Generator[tuple[Path, Any], None, None]: """Find JSON files in protocol locations by traversing path components. 
From b189ec1d3f5ececd08148ffb22ecc63a0aa19b46 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 23 Feb 2026 20:19:32 +0100 Subject: [PATCH 150/162] Fix convert_time --- isimip_utils/tests/test_xarray.py | 18 ++++++++++-------- isimip_utils/xarray.py | 20 ++++++++++---------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/isimip_utils/tests/test_xarray.py b/isimip_utils/tests/test_xarray.py index d2ef689..31d3b7b 100644 --- a/isimip_utils/tests/test_xarray.py +++ b/isimip_utils/tests/test_xarray.py @@ -504,12 +504,6 @@ def test_create_mask(): assert np.all(np.isnan(outside_region['mask'].values)) -def test_convert_time(): - time = np.arange(0, 100, dtype=np.int8) - time_converted = convert_time(time) - assert np.array_equal(time_converted, np.arange(0, 100, dtype=np.float64)) - - def test_convert_time_datetime(): calendar = 'proleptic_gregorian' units = 'days since 2000-01-01 00:00:00' @@ -524,7 +518,15 @@ def test_convert_time_datetime(): assert np.array_equal(time_converted, np.arange(start, start + 366, dtype=np.float64)) -def test_init_dataset_datetime64_index(): +def test_convert_time_datetime64(): + time = np.array(pd.date_range(start='2000-01-01', end='2000-12-31', freq='D')) + time_converted = convert_time(time) + + start = 145731 + assert np.array_equal(time_converted, np.arange(start, start + 366, dtype=np.float64)) + + +def test_convert_time_datetime64_index(): time = pd.date_range(start='2000-01-01', end='2000-12-31', freq='D') time_converted = convert_time(time) @@ -532,7 +534,7 @@ def test_init_dataset_datetime64_index(): assert np.array_equal(time_converted, np.arange(start, start + 366, dtype=np.float64)) -def test_init_dataset_datetime64_series(): +def test_convert_time_datetime64_series(): time = pd.Series(pd.date_range(start='2000-01-01', end='2000-12-31', freq='D')) time_converted = convert_time(time) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 47ca8d8..b3f7120 100644 --- a/isimip_utils/xarray.py +++ 
b/isimip_utils/xarray.py @@ -419,16 +419,16 @@ def convert_time(time: np.ndarray, units='days since 1601-1-1 00:00:00', calenda """ if isinstance(time.dtype, pd.StringDtype): time = np.array([datetime.fromisoformat(t) for t in time], dtype=object) - else: - if np.issubdtype(time.dtype, np.floating) or np.issubdtype(time.dtype, np.integer): - return time.astype(np.float64) - elif np.issubdtype(time.dtype, np.datetime64): - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - if isinstance(time, pd.core.indexes.datetimes.DatetimeIndex): - time = time.to_pydatetime() - else: - time = time.dt.to_pydatetime() + elif np.issubdtype(time.dtype, np.datetime64): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + + if isinstance(time, pd.DatetimeIndex): + time = time.to_pydatetime() + elif isinstance(time, pd.Series): + time = time.dt.to_pydatetime() + else: + time = pd.to_datetime(time).to_pydatetime() return cftime.date2num( time, calendar=calendar, units=units From a3cca878798146d9e5a573318f3ca6a864126d15 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 2 Mar 2026 18:21:56 +0100 Subject: [PATCH 151/162] Add PyPI publication to workflow --- .github/workflows/ci.yaml | 66 +++++++++++++++++++++++++++++++++++++-- .pre-commit-config.yaml | 5 --- 2 files changed, 63 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 5c9fd7f..4217944 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -5,7 +5,11 @@ on: branches: - main - test + tags: + - "*" pull_request: + branches: + - main permissions: contents: read @@ -18,17 +22,22 @@ env: jobs: test: - name: Run tests + name: Run tests 🧪 runs-on: ubuntu-latest permissions: contents: write steps: - - name: Check out repository 💾 - uses: actions/checkout@v5 + - name: Checkout repo 🛎️ + uses: actions/checkout@v6 with: persist-credentials: false + - name: Set up Python 🐍 + uses: 
actions/setup-python@v6 + with: + python-version: "3.12" + - name: Restore testing cache 📥 uses: actions/cache@v4 with: @@ -74,3 +83,54 @@ jobs: - name: Run pytest 🧪 run: pytest --cov=isimip_utils --cov-fail-under=90 --cov-report=term-missing + + build: + name: Build distribution 👷 + needs: test + runs-on: ubuntu-latest + + steps: + - name: Checkout repo 🛎️ + uses: actions/checkout@v6 + with: + persist-credentials: false + + - name: Set up Python 🐍 + uses: actions/setup-python@v6 + with: + python-version: "3.x" + + - name: Install build 🧱 + run: python3 -m pip install build --user + + - name: Build a binary wheel and a source tarball 🛠️ + run: python3 -m build + + - name: Store the distribution packages 📤 + uses: actions/upload-artifact@v5 + with: + name: python-package-distributions + path: dist/ + + pypi: + name: Publish distribution to PyPI 📦 + if: startsWith(github.ref, 'refs/tags/') + needs: build + runs-on: ubuntu-latest + + environment: + name: pypi + url: https://pypi.org/p/isimip-utils + + permissions: + id-token: write + + steps: + - name: Download the distribution packages 📥 + uses: actions/download-artifact@v6 + with: + name: python-package-distributions + path: dist/ + + - name: Publish to PyPI 🚀 + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 031203a..a37beb0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,8 +21,3 @@ repos: rev: v1.39.2 hooks: - id: typos - - - repo: https://github.com/zizmorcore/zizmor-pre-commit - rev: v1.16.3 - hooks: - - id: zizmor From 88cdef33980eb148c3ac6cb4c6b7994dd2269fcd Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 2 Mar 2026 18:37:45 +0100 Subject: [PATCH 152/162] Update CITATION.cff, LICENSE and pyproject.toml --- CITATION.cff | 4 +++- LICENSE | 2 +- pyproject.toml | 18 +++++++++++------- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 
41ca832..8644a85 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,7 +1,8 @@ cff-version: 1.2.0 -message: If you use this software, please cite it as below. title: ISIMIP utils +abstract: Common functionality for different ISIMIP tools. + authors: - family-names: Klar given-names: Jochen @@ -12,5 +13,6 @@ authors: - family-names: Inga given-names: Sauer orcid: https://orcid.org/0000-0002-9302-2131 + license: MIT repository-code: https://github.com/ISI-MIP/isimip-utils diff --git a/LICENSE b/LICENSE index 587e9da..9d51b46 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2022-2025 Potsdam Institute for Climate Impact Research +Copyright (c) 2022-2026 Potsdam Institute for Climate Impact Research Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/pyproject.toml b/pyproject.toml index f4e5014..571f772 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,13 +13,14 @@ maintainers = [ description = "This package contains common functionality for different ISIMIP tools." 
readme = "README.md" requires-python = ">=3.11" -license = { file = "LICENSE" } +license = "MIT" +license-files = ["LICENSE"] classifiers = [ - 'Operating System :: OS Independent', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: 3.12', - 'Programming Language :: Python :: 3.13', + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", ] dependencies = [ "python-dotenv", @@ -70,7 +71,10 @@ docs = [ ] [tool.setuptools] -packages = ["isimip_utils"] +packages = [ + "isimip_utils", + "isimip_utils.tests" +] [tool.setuptools_scm] version_scheme = "release-branch-semver" From 53573de44bf84b856df6f761bd6fc9f7c9cf354a Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 3 Mar 2026 15:44:32 +0100 Subject: [PATCH 153/162] Fix write_dataset and refactor missing value handling --- isimip_utils/tests/test_extractions.py | 28 +++++++- isimip_utils/tests/test_xarray.py | 35 ++++------ isimip_utils/xarray.py | 91 ++++++++++++++++---------- 3 files changed, 98 insertions(+), 56 deletions(-) diff --git a/isimip_utils/tests/test_extractions.py b/isimip_utils/tests/test_extractions.py index f972f4f..cfc5f60 100644 --- a/isimip_utils/tests/test_extractions.py +++ b/isimip_utils/tests/test_extractions.py @@ -1,6 +1,7 @@ - import pytest +import numpy as np + from isimip_utils.extractions import ( compute_aggregation, compute_max, @@ -257,6 +258,17 @@ def test_compute_aggregation(type, decode_cf): helper.call(f'cdo diff,abslim={abslim} {extraction_path} {cdo_path}') +@pytest.mark.parametrize('decode_cf', (True, False)) +def test_compute_aggregation_nan(decode_cf): + dataset_path = constants.DATASETS_PATH / constants.YIELD_PATH + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + ds = compute_max(file_ds) + + # check that the max is not 
FILL_VALUE + assert (ds['yield-mai-noirr'] < 20).all() + + @pytest.mark.parametrize('decode_cf', (True, False)) def test_compute_mean_time(decode_cf): west, east, south, north = constants.BBOX @@ -286,6 +298,20 @@ def test_count_values(decode_cf): assert (ds['tas'] == 720*360).all() +@pytest.mark.parametrize('decode_cf', (True, False)) +def test_count_values_nan(decode_cf): + dataset_path = constants.DATASETS_PATH / constants.YIELD_PATH + + cdo_counts = np.array([ + int(line.split()[5]) - int(line.split()[6]) + for line in helper.call(f'cdo info {dataset_path}').splitlines()[1:-1] + ]) + + with open_dataset(dataset_path, decode_cf=decode_cf) as file_ds: + ds = count_values(file_ds) + assert (ds['yield-mai-noirr'].values == cdo_counts).all() + + @pytest.mark.parametrize('decode_cf', (True, False)) def test_count_values_mask(decode_cf): west, east, south, north = constants.BBOX diff --git a/isimip_utils/tests/test_xarray.py b/isimip_utils/tests/test_xarray.py index 31d3b7b..33461bd 100644 --- a/isimip_utils/tests/test_xarray.py +++ b/isimip_utils/tests/test_xarray.py @@ -10,6 +10,7 @@ from isimip_utils.netcdf import open_dataset_read from isimip_utils.tests import constants, helper from isimip_utils.xarray import ( + add_compression_to_data_vars, add_fill_value_to_data_vars, convert_time, create_mask, @@ -20,8 +21,6 @@ order_variables, remove_fill_value_from_coords, set_attrs, - set_fill_value_to_nan, - set_nan_to_fill_value, to_dataframe, write_dataset, ) @@ -436,12 +435,16 @@ def test_add_fill_value_to_data_vars(): 'var': (['time'], np.ones(10)) } ) + + assert not ds['var'].encoding + add_fill_value_to_data_vars(ds) - assert ds['var'].attrs['_FillValue'] == 1e20 - assert ds['var'].attrs['missing_value'] == 1e20 + + assert ds['var'].encoding.get('_FillValue') == 1e20 + assert ds['var'].encoding.get('missing_value') == 1e20 -def test_set_fill_value_to_nan(): +def test_add_compression_to_data_vars(): ds = xr.Dataset( coords={ 'time': np.arange(10, dtype=np.float64) @@ 
-450,25 +453,13 @@ def test_set_fill_value_to_nan(): 'var': (['time'], np.ones(10)) } ) - ds['var'].values[0] = 1e20 - ds['var'].attrs['_FillValue'] = 1e20 - ds = set_fill_value_to_nan(ds) - assert np.isnan(ds['var'].values[0]) + assert not ds['var'].encoding -def test_set_nan_to_fill_value(): - ds = xr.Dataset( - coords={ - 'time': np.arange(10, dtype=np.float64) - }, - data_vars={ - 'var': (['time'], np.ones(10)) - } - ) - ds['var'].values[0] = np.nan - ds['var'].attrs['_FillValue'] = 1e20 - ds = set_nan_to_fill_value(ds) - assert ds['var'].values[0] == 1e20 + add_compression_to_data_vars(ds, 9) + + assert ds['var'].encoding.get('zlib') is True + assert ds['var'].encoding.get('complevel') == 9 def test_create_mask(): diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index b3f7120..61c63e5 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -205,16 +205,13 @@ def write_dataset(ds: xr.Dataset, path: str | Path): ds = remove_fill_value_from_coords(ds) ds = add_fill_value_to_data_vars(ds) - ds = set_nan_to_fill_value(ds) + ds = add_compression_to_data_vars(ds) ds = order_variables(ds) # time should be an unlimited dimension unlimited_dims = ['time'] if 'time' in ds.dims else [] - # data variables should be compressed - for data_var in ds.data_vars: - ds[data_var].encoding.update({'zlib': True, 'complevel': 5}) - + # write dataset as netcdf ds.to_netcdf(path, format='NETCDF4_CLASSIC', unlimited_dims=unlimited_dims) @@ -227,7 +224,12 @@ def order_variables(ds: xr.Dataset) -> xr.Dataset: Returns: Dataset with reordered variables. 
""" - return ds[[*ds.coords, *ds.data_vars]] + preferred_coords = ['lon', 'lat', 'time'] + + ordered_coords = [coord for coord in preferred_coords if coord in ds.coords] + remaining_coords = [coord for coord in ds.coords if coord not in preferred_coords] + + return ds[[*ordered_coords, *remaining_coords, *ds.data_vars]] def get_attrs(ds: xr.Dataset) -> dict: @@ -266,68 +268,91 @@ def set_attrs(ds: xr.Dataset, attrs: dict) -> xr.Dataset: return ds -def remove_fill_value_from_coords(ds: xr.Dataset) -> xr.Dataset: - """Remove _FillValue and missing_value attributes from the coords. +def set_fill_value_to_nan(ds: xr.Dataset) -> xr.Dataset: + """Replace fill values with NaN in data variables. This is only needed for datasets + which are read with decode_cf=False and _FillValue is not in encoding. Args: ds (xr.Dataset): Xarray Dataset to modify. Returns: - Dataset with fill value removed for the coords. + Dataset with fill values replaced by NaN. """ - for coord in ds.coords: - if '_FillValue' not in ds[coord].encoding: - ds[coord].encoding['_FillValue'] = None + for data_var in ds.data_vars: + if '_FillValue' not in ds[data_var].encoding: + ds[data_var] = ds[data_var].where(ds[data_var] != FILL_VALUE) return ds -def add_fill_value_to_data_vars(ds: xr.Dataset) -> xr.Dataset: - """Add _FillValue and missing_value attributes to data_vars if not present. +def set_nan_to_fill_value(ds: xr.Dataset) -> xr.Dataset: + """Replace NaN values with fill values in data variables. This is only needed for datasets + which are read with decode_cf=False and _FillValue is not in encoding. Args: ds (xr.Dataset): Xarray Dataset to modify. Returns: - Dataset with fill value attributes added for the data_vars. + Dataset with NaN values replaced by fill values. 
""" for data_var in ds.data_vars: - if '_FillValue' not in ds.data_vars[data_var].attrs: - ds.data_vars[data_var].attrs['_FillValue'] = FILL_VALUE - if 'missing_value' not in ds.data_vars[data_var].attrs: - missing_value = np.array(FILL_VALUE, dtype=ds[data_var].dtype) - ds.data_vars[data_var].attrs['missing_value'] = missing_value + if '_FillValue' not in ds[data_var].encoding: + ds[data_var] = ds[data_var].fillna(FILL_VALUE) return ds -def set_fill_value_to_nan(ds: xr.Dataset) -> xr.Dataset: - """Replace fill values with NaN in data variables. +def remove_fill_value_from_coords(ds: xr.Dataset) -> xr.Dataset: + """Remove _FillValue and missing_value attributes from the coords. Args: ds (xr.Dataset): Xarray Dataset to modify. Returns: - Dataset with fill values replaced by NaN. + Dataset with fill value removed for the coords. """ - for data_var in ds.data_vars: - fill_value = ds[data_var].attrs.get('_FillValue', FILL_VALUE) - missing_value = np.array(fill_value, dtype=ds[data_var].dtype) - ds[data_var] = ds[data_var].where(ds[data_var] != missing_value) + for coord in ds.coords: + if '_FillValue' not in ds[coord].encoding: + ds[coord].encoding['_FillValue'] = None return ds -def set_nan_to_fill_value(ds: xr.Dataset) -> xr.Dataset: - """Replace NaN values with fill values in data variables. +def add_fill_value_to_data_vars(ds: xr.Dataset) -> xr.Dataset: + """Add _FillValue and missing_value to data_vars if no encoding is present. This + is the case for a newly created Dataset. Args: ds (xr.Dataset): Xarray Dataset to modify. Returns: - Dataset with NaN values replaced by fill values. + Dataset with encoding added for the data_vars. 
+ """ + for data_var in ds.data_vars: + encoding = ds[data_var].encoding + if not encoding: + ds[data_var].attrs.pop('_FillValue', None) + ds[data_var].attrs.pop('missing_value', None) + ds[data_var].encoding.update({ + '_FillValue': FILL_VALUE, + 'missing_value': ds[data_var].dtype.type(FILL_VALUE) + }) + + return ds + + +def add_compression_to_data_vars(ds, complevel=5) -> xr.Dataset: + """Add compression to data variables. + + Args: + ds (xr.Dataset): Xarray Dataset to reorder. + complevel (int): Compression level + + Returns: + Dataset with updated encoding. """ for data_var in ds.data_vars: - fill_value = ds[data_var].attrs.get('_FillValue', FILL_VALUE) - missing_value = np.array(fill_value, dtype=ds[data_var].dtype) - ds[data_var] = ds[data_var].fillna(missing_value) + ds[data_var].encoding.update({ + 'zlib': True, + 'complevel': complevel + }) return ds From 1f2140d81d551e666205b3be8e04e95be87c3cd4 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 3 Mar 2026 18:09:35 +0100 Subject: [PATCH 154/162] Refactor decode_times=False fallback --- isimip_utils/xarray.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 61c63e5..22a2a66 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -141,26 +141,20 @@ def open_dataset(path: str | Path, decode_cf: bool = True, load: bool = False) - try: ds = xr.open_dataset(path, decode_cf=decode_cf) - except ValueError: + except ValueError as e: # workaround for non standard times (e.g. 
growing seasons) ds = xr.open_dataset(path, decode_cf=decode_cf, decode_times=False) - if ds['time'].units.startswith('growing seasons'): - units = ds['time'].units.replace('growing seasons', 'common_years') + units = ds['time'].units + calendar = ds['time'].calendar - ds['time'].attrs['long_name'] = 'Growing season' - ds['time'].attrs['units'] = '' - - time_array = cftime.num2date(ds['time'].values, units=units, calendar='365_day') - time = xr.DataArray( - time_array, - dims=['time'], - coords={'time': time_array}, - name='time', - attrs=ds['time'].attrs - ) - - ds = ds.assign_coords(time=time) + if units.startswith('months'): + ds['time'] = cftime.num2date(ds['time'].values, units=units, calendar='360_day') + elif units.startswith('growing seasons'): + units = units.replace('growing seasons', 'common_years') + ds['time'] = cftime.num2date(ds['time'].values, units=units, calendar='365_day') + else: + raise ValueError(f'unable to decode time units "{units}" with calendar "{calendar}"') from e if load: ds.load() From 22c790e0d5be5e56112be052434e616199e4af9d Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Wed, 4 Mar 2026 15:30:14 +0100 Subject: [PATCH 155/162] Use warning instead of warn as log level --- isimip_utils/cli.py | 4 ++-- isimip_utils/extractions.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 711382f..9cd62de 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -18,13 +18,13 @@ def setup_env() -> None: load_dotenv(Path().cwd() / '.env') -def setup_logs(log_level: str = 'WARN', log_file: str | None = None, +def setup_logs(log_level: str = 'WARNING', log_file: str | None = None, log_console: bool = True, log_rich: bool = True, show_time: bool = False, show_path: bool = False) -> None: """Configure logging with console and/or file handlers. Args: - log_level (str): Logging level (default: 'WARN'). + log_level (str): Logging level (default: 'WARNING'). 
log_file (str | None): Path to log file, or None for no file logging (default: None). log_console (bool): Whether to log to console (default: True). log_rich (bool): Whether to use RichHandler for console logging (default: True). diff --git a/isimip_utils/extractions.py b/isimip_utils/extractions.py index a1e50d5..12d9829 100644 --- a/isimip_utils/extractions.py +++ b/isimip_utils/extractions.py @@ -31,7 +31,7 @@ def select_time(ds: xr.Dataset, timestamp: datetime) -> xr.Dataset | None: time = compute_time(ds, timestamp) if time < ds['time'].min() or time > ds['time'].max(): - logger.warn(f'Selected time={time} is outside the dataset.') + logger.warning(f'Selected time={time} is outside the dataset.') return None return ds.sel(time=time, method='nearest') @@ -204,7 +204,7 @@ def compute_aggregation(ds: xr.Dataset, type: Literal['mean', 'min', 'max', 'sum if type in ('mean', 'std', 'sum') and dim == ('lat', 'lon'): if weights is None: - logger.warn('no weights provided, using latitude-dependent weights') + logger.warning('no weights provided, using latitude-dependent weights') weights = np.sin(np.deg2rad(ds.lat + 0.25)) - np.sin(np.deg2rad(ds.lat - 0.25)) ds = ds.weighted(weights) From 42bb2a42d9f42317e17554faf46c6f38c556e6ad Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Tue, 3 Mar 2026 18:26:22 +0100 Subject: [PATCH 156/162] Fix xarray DEFAULT_ATTRS --- isimip_utils/xarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 22a2a66..99ec0cd 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -28,7 +28,7 @@ 'standard_name': 'time', 'long_name': 'Time', 'calendar': 'proleptic_gregorian', - 'units': 'days since 1601-1-1 00:00:00', + 'units': 'days since 1661-01-01 00:00:00', 'axis': 'T' } } From 3a93261ab847a1a92b892d0507f3944dd4ab397c Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 6 Mar 2026 15:06:23 +0100 Subject: [PATCH 157/162] Add "years since " to open_dataset 
fallback --- isimip_utils/xarray.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 99ec0cd..04e2333 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -150,6 +150,9 @@ def open_dataset(path: str | Path, decode_cf: bool = True, load: bool = False) - if units.startswith('months'): ds['time'] = cftime.num2date(ds['time'].values, units=units, calendar='360_day') + elif units.startswith('years'): + units = units.replace('years', 'common_years') + ds['time'] = cftime.num2date(ds['time'].values, units=units, calendar='365_day') elif units.startswith('growing seasons'): units = units.replace('growing seasons', 'common_years') ds['time'] = cftime.num2date(ds['time'].values, units=units, calendar='365_day') From d731b6a0da32df6b8fad4e4d953a44b0163b83c1 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 6 Mar 2026 21:00:45 +0100 Subject: [PATCH 158/162] Fix GitHub workflow --- .github/workflows/ci.yaml | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4217944..adf79b0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -18,7 +18,7 @@ permissions: env: PYTHONDONTWRITEBYTECODE: 1 FORCE_COLOR: 1 - PYTHON_VERSION: "3.11" + PYTHON_VERSION: "3.12" jobs: test: @@ -36,7 +36,8 @@ jobs: - name: Set up Python 🐍 uses: actions/setup-python@v6 with: - python-version: "3.12" + python-version: ${{ env.PYTHON_VERSION }} + cache: pip - name: Restore testing cache 📥 uses: actions/cache@v4 @@ -55,12 +56,6 @@ jobs: sudo apt-get update sudo apt-get install -y cdo netcdf-bin --no-install-recommends - - name: Set up Python 🐍 - uses: actions/setup-python@v6 - with: - python-version: ${{ env.PYTHON_VERSION }} - cache: pip - - name: Install package 📦 run: pip install -e .[all] @@ -98,7 +93,8 @@ jobs: - name: Set up Python 🐍 uses: actions/setup-python@v6 with: - python-version: "3.x" + python-version: ${{
env.PYTHON_VERSION }} + cache: pip - name: Install build 🧱 run: python3 -m pip install build --user From 78a332a5007f75dc67b3fef2915419e2de0481e3 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Fri, 6 Mar 2026 21:47:35 +0100 Subject: [PATCH 159/162] Fix tests --- isimip_utils/tests/test_xarray.py | 4 ++-- isimip_utils/xarray.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/isimip_utils/tests/test_xarray.py b/isimip_utils/tests/test_xarray.py index 33461bd..57933e2 100644 --- a/isimip_utils/tests/test_xarray.py +++ b/isimip_utils/tests/test_xarray.py @@ -239,7 +239,7 @@ def test_init_dataset_latlon(): assert ds.sizes['lon'] == 1 assert ds.sizes['lat'] == 1 - assert ds['time'].units == 'days since 1601-1-1 00:00:00' + assert ds['time'].units == 'days since 1601-01-01 00:00:00' assert ds['time'].calendar == 'proleptic_gregorian' assert np.array_equal(ds['var'].values, var) @@ -273,7 +273,7 @@ def test_init_dataset_latlon(): time:standard_name = "time" ; time:long_name = "Time" ; time:calendar = "proleptic_gregorian" ; - time:units = "days since 1601-1-1 00:00:00" ; + time:units = "days since 1601-01-01 00:00:00" ; time:axis = "T" ; float var(time, lat, lon) ; var:_FillValue = 1.e+20f ; diff --git a/isimip_utils/xarray.py b/isimip_utils/xarray.py index 04e2333..66759f9 100644 --- a/isimip_utils/xarray.py +++ b/isimip_utils/xarray.py @@ -28,7 +28,7 @@ 'standard_name': 'time', 'long_name': 'Time', 'calendar': 'proleptic_gregorian', - 'units': 'days since 1661-01-01 00:00:00', + 'units': 'days since 1601-01-01 00:00:00', 'axis': 'T' } } From b048d8f6ee7e4edf9541f0ef1feec231241e7d11 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Sat, 7 Mar 2026 17:43:48 +0100 Subject: [PATCH 160/162] Use hatchling as build-system --- pyproject.toml | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 571f772..dce2760 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] 
-build-backend = "setuptools.build_meta" -requires = ["setuptools", "setuptools_scm"] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" [project] name = "isimip-utils" @@ -70,14 +70,12 @@ docs = [ "mkdocstrings-python", ] -[tool.setuptools] -packages = [ - "isimip_utils", - "isimip_utils.tests" -] +[tool.hatch.version] +source = "vcs" -[tool.setuptools_scm] -version_scheme = "release-branch-semver" +[tool.hatch.build.targets.wheel] +packages = ["isimip_utils"] +exclude = ["isimip_utils/tests"] [tool.ruff] target-version = "py311" From 82562d3909a9aece5447c459f011e56a24a49858 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 9 Mar 2026 14:58:35 +0100 Subject: [PATCH 161/162] Fix build_default_args for arguments with const --- isimip_utils/cli.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/isimip_utils/cli.py b/isimip_utils/cli.py index 9cd62de..6c517df 100644 --- a/isimip_utils/cli.py +++ b/isimip_utils/cli.py @@ -253,7 +253,7 @@ def build_default_args(self, config_path=None) -> argparse.Namespace: value = None # apply action type - if value and action.type is not None: + if value and action.type is not None and value not in [True, False]: try: value = action.type(value) except argparse.ArgumentTypeError as e: @@ -272,8 +272,11 @@ def build_default_args(self, config_path=None) -> argparse.Namespace: if value is not None: # check action.action - if action.const and value not in [True, False]: - raise ConfigError(f'argument "{key}": invalid choice "{value}" (choose true or false)') + if action.const: + if value is True: + value = action.const + elif value is False: + value = None # check action.choices if action.choices and value not in action.choices: From fda3f35df80eb2f4e1f7544800c593a67012d290 Mon Sep 17 00:00:00 2001 From: Jochen Klar Date: Mon, 9 Mar 2026 16:20:21 +0100 Subject: [PATCH 162/162] Update CITATION.cff --- CITATION.cff | 1 + 1 file changed, 1 insertion(+) diff --git a/CITATION.cff 
b/CITATION.cff index 8644a85..4725156 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,4 +1,5 @@ cff-version: 1.2.0 +message: If you use this software in your research, please cite it using the provided Digital Object Identifier (DOI). title: ISIMIP utils abstract: Common functionality for different ISIMIP tools.