Add waterdata.get_ratings for USGS stage-discharge ratings via STAC

thodson-usgs · claude · thodson-usgs · commit e3047631f924 · 2026-05-06T11:18:49.000-05:00
Wraps the new Water Data STAC catalog endpoint
(api.waterdata.usgs.gov/stac/v0/search) for stage-discharge rating
curves. Lives in its own module (waterdata/ratings.py) because the
transport layer — STAC search + per-feature RDB download — differs
from the OGC collections used by the rest of the package.

The function:
- Composes a CQL filter from monitoring_location_id and (single-value)
  file_type, mirroring R's logic. Multi-type requests fetch all
  matches and filter URLs client-side.
- Optionally downloads each matching .rdb asset to a user-supplied
  file_path (default: a fresh tempfile.mkdtemp), and parses with the
  existing nwis._read_rdb helper.
- Returns a dict[id -> DataFrame] when download_and_parse=True, or the
  raw list of STAC features when False (cheap "what's available?"
  inspection).

Surfaces a clear ValueError for invalid file_type values and for ISO
8601 durations in `datetime` (the rating-curve service rejects them).

Mirrors R's read_waterdata_ratings in DOI-USGS/dataRetrieval; example
sites and idioms come straight from the R doc.

Tests: 9 unit tests covering filter composition, the two error paths,
and end-to-end search-and-download via requests_mock (single-site,
download_and_parse=False, and multi-type URL filtering).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,5 @@
+**05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`.
+
 **05/05/2026:** Added `waterdata.get_combined_metadata(...)` — wraps the Water Data API's `combined-metadata` collection, which joins the monitoring-locations catalog with the time-series-metadata catalog and returns one row per (location, parameter, statistic) inventory entry. This is the most flexible "what data is available" endpoint in the API: any location attribute (state, HUC, site type, drainage area, well-construction depth, …) can be combined with any time-series attribute (parameter code, statistic, data type, period of record, …) in a single query. Mirrors R's `read_waterdata_combined_meta`.
 
 **05/05/2026:** Added `waterdata.get_samples_summary(monitoringLocationIdentifier=...)` — wraps the Samples database `/summary/{id}` endpoint, returning per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location. Useful for taking inventory of available discrete-sample data before pulling observations with `get_samples`.
diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py
@@ -29,6 +29,7 @@
 )
 from .filters import FILTER_LANG
 from .nearest import get_nearest_continuous
+from .ratings import get_ratings
 from .types import (
     CODE_SERVICES,
     PROFILE_LOOKUP,
@@ -52,6 +53,7 @@
     "get_latest_daily",
     "get_monitoring_locations",
     "get_nearest_continuous",
+    "get_ratings",
     "get_reference_table",
     "get_samples",
     "get_samples_summary",
diff --git a/dataretrieval/waterdata/ratings.py b/dataretrieval/waterdata/ratings.py
@@ -0,0 +1,260 @@
+"""USGS rating-curve retrieval via the Water Data STAC catalog.
+
+Wraps ``https://api.waterdata.usgs.gov/stac/v0/search`` and the per-feature
+RDB downloads that follow. The STAC endpoint hosts standard NWIS rating
+files (``exsa``, ``base``, ``corr``) for active streamgages — see the
+service overview at https://api.waterdata.usgs.gov/docs/stac/ and the
+WDFN announcement at https://waterdata.usgs.gov/blog/wdfn-rating-curves/.
+
+This is the discrete analogue to the OGC waterdata getters; it lives in
+its own module because the transport layer (STAC search + RDB download)
+differs from the OGC collections used by the rest of the package.
+
+The R analogue is ``read_waterdata_ratings`` in
+https://github.com/DOI-USGS/dataRetrieval/.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import tempfile
+from typing import Any
+
+import pandas as pd
+import requests
+
+from dataretrieval.nwis import _read_rdb
+
+from .utils import BASE_URL, _default_headers, _format_api_dates
+
+# Module-level logger; get_ratings uses it to warn when a rating file fails
+# to download or parse.
+logger = logging.getLogger(__name__)
+
+# Root of the STAC API; the search helper appends "/search" to it.
+STAC_URL = f"{BASE_URL}/stac/v0"
+# Rating file types accepted by get_ratings; any other value raises ValueError.
+_VALID_FILE_TYPES: tuple[str, ...] = ("exsa", "base", "corr")
+
+
+def _cql_quote(value: object) -> str:
+    """Render ``value`` as a single-quoted CQL text literal.
+
+    Embedded single quotes are doubled so the value cannot terminate the
+    literal early and change the meaning of the surrounding filter.
+    """
+    return "'" + str(value).replace("'", "''") + "'"
+
+
+def _build_filter(
+    monitoring_location_id: str | list[str] | None,
+    file_type: str | None,
+) -> str | None:
+    """Compose the CQL filter sent to STAC ``/search``.
+
+    Mirrors R's logic: only pin ``file_type`` when a single value was given,
+    so a multi-type request returns every matching site and the file-type
+    filtering happens client-side from the per-feature URLs.
+
+    Parameters
+    ----------
+    monitoring_location_id : string, list of strings, or None
+        One identifier or a sequence of identifiers. Each is converted with
+        ``str`` and emitted as an escaped CQL literal inside an ``IN``
+        clause. ``None`` omits the clause.
+    file_type : string or None
+        Single file type to pin server-side, or ``None`` to omit the clause.
+
+    Returns
+    -------
+    str or None
+        The clauses joined with ``" AND "``, or ``None`` when neither
+        argument contributes a clause.
+    """
+    clauses: list[str] = []
+    if monitoring_location_id is not None:
+        ids = (
+            [monitoring_location_id]
+            if isinstance(monitoring_location_id, str)
+            else list(monitoring_location_id)
+        )
+        # An empty id list keeps the historical ``IN ('')`` clause rather than
+        # emitting ``IN ()`` or silently dropping the site restriction.
+        quoted = ", ".join(_cql_quote(i) for i in ids) or "''"
+        clauses.append(f"monitoring_location_id IN ({quoted})")
+    if file_type is not None:
+        clauses.append(f"file_type = {_cql_quote(file_type)}")
+    return " AND ".join(clauses) if clauses else None
+
+
+def _search(
+    filter_str: str | None,
+    datetime_str: str | None,
+    bbox: list[float] | None,
+    limit: int,
+    ssl_check: bool,
+) -> list[dict[str, Any]]:
+    """Issue one STAC ``/search`` GET and hand back its ``features`` array.
+
+    ``limit`` is always sent; ``filter``, ``datetime`` and ``bbox`` are only
+    added to the query string when the corresponding argument is not
+    ``None``. ``bbox`` is serialised as comma-separated values. A non-2xx
+    response raises via ``raise_for_status``; a payload without a
+    ``features`` key yields an empty list.
+    """
+    optional = {
+        "filter": filter_str,
+        "datetime": datetime_str,
+        "bbox": None if bbox is None else ",".join(map(str, bbox)),
+    }
+    query: dict[str, Any] = {"limit": limit}
+    query.update({key: val for key, val in optional.items() if val is not None})
+
+    reply = requests.get(
+        STAC_URL + "/search",
+        params=query,
+        headers=_default_headers(),
+        verify=ssl_check,
+    )
+    reply.raise_for_status()
+    payload = reply.json()
+    return payload.get("features", [])
+
+
+def _download_and_parse(
+    feature: dict[str, Any],
+    file_path: str,
+    ssl_check: bool,
+) -> pd.DataFrame:
+    """Fetch the feature's data asset, save it under ``file_path``, parse RDB.
+
+    Parameters
+    ----------
+    feature : dict
+        STAC feature; ``feature["id"]`` names the output file and
+        ``feature["assets"]["data"]["href"]`` is the download URL.
+    file_path : string
+        Existing directory the downloaded file is written into.
+    ssl_check : bool
+        Passed to ``requests`` as ``verify``.
+
+    Returns
+    -------
+    pandas.DataFrame
+        The response text parsed by ``_read_rdb``.
+
+    Raises
+    ------
+    KeyError
+        If the feature lacks ``id`` or ``assets.data.href``.
+    ValueError
+        If the feature id does not reduce to a usable file name.
+    requests.HTTPError
+        If the download returns a non-2xx status.
+    """
+    url = feature["assets"]["data"]["href"]
+    fid = feature["id"]
+
+    # The id comes from the server response, so keep only its final path
+    # component: a value such as "../../x" must not escape ``file_path``.
+    filename = os.path.basename(str(fid))
+    if filename in ("", ".", ".."):
+        raise ValueError(f"Feature id {fid!r} is not a usable file name.")
+    target = os.path.join(file_path, filename)
+
+    response = requests.get(url, headers=_default_headers(), verify=ssl_check)
+    response.raise_for_status()
+
+    # Save the raw bytes so the file on disk matches what the server sent;
+    # text mode would re-encode with the locale's default encoding and
+    # translate newlines on Windows.
+    with open(target, "wb") as f:
+        f.write(response.content)
+
+    return _read_rdb(response.text)
+
+
+def get_ratings(
+    monitoring_location_id: str | list[str] | None = None,
+    file_type: str | list[str] = "exsa",
+    file_path: str | None = None,
+    datetime: str | list[str] | None = None,
+    bbox: list[float] | None = None,
+    limit: int = 10000,
+    download_and_parse: bool = True,
+    ssl_check: bool = True,
+) -> dict[str, pd.DataFrame] | list[dict[str, Any]]:
+    """Get USGS stage-discharge rating curves from the Water Data STAC catalog.
+
+    Returns the current rating tables for one or more active USGS streamgages.
+    The catalog hosts three file types:
+
+    - ``"exsa"`` — expanded shift-adjusted rating (default). Adds a ``SHIFT``
+      column to ``"base"`` indicating the current shift for each ``INDEP``.
+    - ``"base"`` — three columns: ``INDEP`` (typically gage height, ft);
+      ``DEP`` (typically discharge, ft^3/s); ``STOR`` ("``*``" marks fixed
+      points of the rating).
+    - ``"corr"`` — three columns: ``INDEP``; ``CORR`` (correction for that
+      value); ``CORRINDEP`` (corrected INDEP).
+
+    See https://api.waterdata.usgs.gov/docs/stac/ for the upstream service
+    docs and https://waterdata.usgs.gov/blog/wdfn-rating-curves/ for the
+    background announcement. The R analogue is ``read_waterdata_ratings``
+    in https://github.com/DOI-USGS/dataRetrieval/.
+
+    Parameters
+    ----------
+    monitoring_location_id : string or list of strings, optional
+        One or more identifiers in ``AGENCY-ID`` form (e.g.
+        ``"USGS-01104475"``). If omitted, the spatial / temporal filters
+        determine the result set.
+    file_type : string or list of strings, default ``"exsa"``
+        Which rating file(s) to request. One or more of ``"exsa"``,
+        ``"base"``, ``"corr"``. An empty list is rejected.
+    file_path : string, optional
+        Directory the downloaded RDB files are written to. Defaults to a
+        per-call temporary directory created via :func:`tempfile.mkdtemp`;
+        its path is logged at INFO level and it is not removed afterwards.
+    datetime : string or list of strings, optional
+        STAC ``datetime`` filter — a single date / datetime, or an
+        interval (``"start/end"``, optionally half-bounded with ``..``).
+        ISO 8601 *durations* (``"P1M"``, ``"PT36H"``, …) are **not**
+        supported by the rating-curve service; passing one raises
+        ``ValueError``.
+    bbox : list of numbers, optional
+        Only features whose geometry intersects the bounding box are
+        selected. Format: ``[xmin, ymin, xmax, ymax]`` in CRS 4326
+        (longitude / latitude, west-south-east-north).
+    limit : int, default 10000
+        Page size for the STAC ``/search`` request (capped at 10000).
+        Only a single page is fetched; when the response fills ``limit`` a
+        warning is logged because further matches may exist.
+    download_and_parse : bool, default ``True``
+        If ``True``, download every matching RDB file and parse it into a
+        ``DataFrame``. If ``False``, return the raw list of STAC feature
+        dicts so the caller can inspect what's available before pulling
+        bytes.
+    ssl_check : bool, default ``True``
+        Verify the server's SSL certificate.
+
+    Returns
+    -------
+    dict[str, pandas.DataFrame] or list[dict]
+        When ``download_and_parse=True`` (the default), a dict keyed by
+        feature ID (e.g. ``"USGS-01104475.exsa.rdb"``) mapping to a parsed
+        ``DataFrame``. Features that fail to download or parse are logged
+        as warnings and left out of the dict. When
+        ``download_and_parse=False``, the raw list of STAC feature dicts as
+        returned by the search endpoint.
+
+    Raises
+    ------
+    ValueError
+        For an empty or unrecognized ``file_type`` value or an ISO 8601
+        duration in ``datetime``.
+
+    Examples
+    --------
+    .. code::
+
+        >>> # Default exsa ratings for two sites
+        >>> ratings = dataretrieval.waterdata.get_ratings(
+        ...     monitoring_location_id=["USGS-01104475", "USGS-01104460"],
+        ...     file_type="exsa",
+        ... )
+        >>> ratings["USGS-01104475.exsa.rdb"].head()
+
+        >>> # Both exsa and corr files for the same two sites
+        >>> ratings = dataretrieval.waterdata.get_ratings(
+        ...     monitoring_location_id=["USGS-01104475", "USGS-01104460"],
+        ...     file_type=["exsa", "corr"],
+        ... )
+
+        >>> # Bounding-box query, listing what's available without downloading
+        >>> features = dataretrieval.waterdata.get_ratings(
+        ...     bbox=[-95.0, 40.0, -92.0, 42.0],
+        ...     download_and_parse=False,
+        ... )
+
+        >>> # Restrict to features modified since seven days ago (no durations)
+        >>> features = dataretrieval.waterdata.get_ratings(
+        ...     bbox=[-95.0, 40.0, -92.0, 42.0],
+        ...     datetime=["2026-04-29", ".."],
+        ...     download_and_parse=False,
+        ... )
+
+    """
+    file_types = [file_type] if isinstance(file_type, str) else list(file_type)
+    if not file_types:
+        # Without this guard an empty list would hit the network and then
+        # discard every feature in the client-side filter below.
+        raise ValueError(
+            f"file_type must name at least one of {list(_VALID_FILE_TYPES)}."
+        )
+    invalid = [ft for ft in file_types if ft not in _VALID_FILE_TYPES]
+    if invalid:
+        raise ValueError(
+            f"Invalid file_type {invalid!r}. Valid options: {list(_VALID_FILE_TYPES)}."
+        )
+    # Drop repeats (order kept) so ["exsa", "exsa"] still counts as a single
+    # type and is filtered server-side.
+    file_types = list(dict.fromkeys(file_types))
+
+    if datetime is not None:
+        # The rating-curve STAC service rejects ISO 8601 durations; surface a
+        # clear error rather than letting the server return a confusing 4xx.
+        dt_values = datetime if isinstance(datetime, list) else [datetime]
+        if any(v is not None and "P" in str(v).upper() for v in dt_values):
+            raise ValueError(
+                "ISO 8601 durations (e.g. 'P7D') are not supported in "
+                "`datetime` for the rating-curve service. Provide a date or "
+                "interval instead."
+            )
+        datetime_str = _format_api_dates(datetime, date=False)
+    else:
+        datetime_str = None
+
+    # Mirror R: only pin file_type in the server-side filter when one type
+    # is requested. With multiple types, fetch all and filter URLs locally.
+    server_file_type = file_types[0] if len(file_types) == 1 else None
+    filter_str = _build_filter(monitoring_location_id, server_file_type)
+
+    features = _search(filter_str, datetime_str, bbox, limit, ssl_check)
+
+    # Only one page is requested, so a full page may hide further matches.
+    if isinstance(limit, int) and 0 < limit <= len(features):
+        logger.warning(
+            "STAC search returned %d features, filling limit=%d; additional "
+            "matches may exist and are not fetched.",
+            len(features),
+            limit,
+        )
+
+    if not download_and_parse:
+        return features
+
+    if file_path is None:
+        file_path = tempfile.mkdtemp(prefix="dataretrieval-ratings-")
+        # The directory is not part of the return value; log it so the
+        # downloaded files can be located.
+        logger.info("Writing rating files to temporary directory %s", file_path)
+    os.makedirs(file_path, exist_ok=True)
+
+    out: dict[str, pd.DataFrame] = {}
+    for feature in features:
+        url = feature.get("assets", {}).get("data", {}).get("href") or ""
+        # Skip features whose file type wasn't requested (only relevant when
+        # `file_type` is a list — single-type requests are already filtered
+        # server-side).
+        # NOTE(review): this is a substring match on the whole URL, so a type
+        # name appearing elsewhere in the href would also match — confirm the
+        # URL layout before tightening.
+        if not any(ft in url for ft in file_types):
+            continue
+        fid = feature.get("id")
+        if fid is None:
+            logger.warning("Skipping STAC feature without an id (href %s)", url)
+            continue
+        try:
+            out[fid] = _download_and_parse(feature, file_path, ssl_check)
+        except Exception as e:
+            # Deliberately best-effort: one bad file must not abort the rest.
+            logger.warning("Failed to download / parse %s: %s", fid, e)
+
+    return out
diff --git a/tests/waterdata_ratings_test.py b/tests/waterdata_ratings_test.py

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	+05/06/2026: Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`.
	`2`	`+`
`1`	`3`	05/05/2026: Added `waterdata.get_combined_metadata(...)` — wraps the Water Data API's `combined-metadata` collection, which joins the monitoring-locations catalog with the time-series-metadata catalog and returns one row per (location, parameter, statistic) inventory entry. This is the most flexible "what data is available" endpoint in the API: any location attribute (state, HUC, site type, drainage area, well-construction depth, …) can be combined with any time-series attribute (parameter code, statistic, data type, period of record, …) in a single query. Mirrors R's `read_waterdata_combined_meta`.
`2`	`4`
`3`	`5`	05/05/2026: Added `waterdata.get_samples_summary(monitoringLocationIdentifier=...)` — wraps the Samples database `/summary/{id}` endpoint, returning per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location. Useful for taking inventory of available discrete-sample data before pulling observations with `get_samples`.