Add waterdata.get_peaks for the annual peak-streamflow OGC collection (#267)

thodson-usgs · claude · web-flow · commit fca3d6c87006 · 2026-05-06T19:23:42.000-05:00
Wraps the new /ogcapi/v0/collections/peaks collection. Returns the
annual peak record for a monitoring location — one row per (location,
parameter, water year) — which is the standard input to flood-
frequency analysis (log-Pearson Type III etc).

The collection covers stage (parameter 00065) and discharge (00060);
typical streamgages have a series for each.

Implementation reuses the existing get_ogc_data infrastructure:
- service = "peaks"
- output_id = "peak_id" (the API's `id` field is renamed for users,
  matching the project's other get_* functions)

R has no equivalent yet; the docstring was written from scratch
following the project's existing get_* style.

Two live tests cover the happy path (single-site, both parameters
present) and a water-year filter.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,8 @@
 
 **05/06/2026:** Added `waterdata.get_field_measurements_metadata(...)` — wraps the OGC `field-measurements-metadata` collection. Returns one row per (location, parameter) field-measurement series describing its period of record, units, etc., without the underlying observations. Discrete-measurement analogue to `get_time_series_metadata`. Mirrors R's `read_waterdata_field_meta`.
 
+**05/06/2026:** Added `waterdata.get_peaks(...)` — wraps the new OGC `peaks` collection, returning the annual peak streamflow / stage record for a monitoring location (one row per water year, per parameter). Standard input to flood-frequency analysis. Supports calendar/water-year filters and the usual location/parameter/CQL knobs shared with the other OGC getters.
+
 **05/05/2026:** Added `waterdata.get_combined_metadata(...)` — wraps the Water Data API's `combined-metadata` collection, which joins the monitoring-locations catalog with the time-series-metadata catalog and returns one row per (location, parameter, statistic) inventory entry. This is the most flexible "what data is available" endpoint in the API: any location attribute (state, HUC, site type, drainage area, well-construction depth, …) can be combined with any time-series attribute (parameter code, statistic, data type, period of record, …) in a single query. Mirrors R's `read_waterdata_combined_meta`.
 
 **05/05/2026:** Added `waterdata.get_samples_summary(monitoringLocationIdentifier=...)` — wraps the Samples database `/summary/{id}` endpoint, returning per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location. Useful for taking inventory of available discrete-sample data before pulling observations with `get_samples`.
diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py
@@ -21,6 +21,7 @@
     get_latest_continuous,
     get_latest_daily,
     get_monitoring_locations,
+    get_peaks,
     get_reference_table,
     get_samples,
     get_samples_summary,
@@ -55,6 +56,7 @@
     "get_latest_daily",
     "get_monitoring_locations",
     "get_nearest_continuous",
+    "get_peaks",
     "get_ratings",
     "get_reference_table",
     "get_samples",
diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
@@ -1878,6 +1878,126 @@ def get_field_measurements_metadata(
     return get_ogc_data(args, output_id, service)
 
 
+def get_peaks(
+    monitoring_location_id: str | list[str] | None = None,
+    parameter_code: str | list[str] | None = None,
+    time_series_id: str | list[str] | None = None,
+    unit_of_measure: str | list[str] | None = None,
+    time: str | list[str] | None = None,
+    last_modified: str | list[str] | None = None,
+    water_year: int | list[int] | None = None,
+    year: int | list[int] | None = None,
+    month: int | list[int] | None = None,
+    day: int | list[int] | None = None,
+    peak_since: int | list[int] | None = None,
+    properties: str | list[str] | None = None,
+    skip_geometry: bool | None = None,
+    bbox: list[float] | None = None,
+    limit: int | None = None,
+    filter: str | None = None,
+    filter_lang: FILTER_LANG | None = None,
+    convert_type: bool = True,
+) -> tuple[pd.DataFrame, BaseMetadata]:
+    """Get the annual peak streamflow / stage record for one or more locations.
+
+    Peaks are the largest values observed at a site each water year and are
+    the standard input to flood-frequency analysis (e.g. log-Pearson Type III
+    fits). The endpoint returns one row per (monitoring location, parameter,
+    water year), with the peak ``value`` and the ``time`` it occurred.
+
+    The collection covers both stage (parameter ``"00065"``, ``ft``) and
+    discharge (parameter ``"00060"``, ``ft^3/s``); a typical streamgage has a
+    series for each. Reference docs:
+    https://api.waterdata.usgs.gov/ogcapi/v0/openapi?f=html#/peaks
+
+    Parameters
+    ----------
+    monitoring_location_id : string or list of strings, optional
+        A unique identifier representing a single monitoring location, in
+        ``AGENCY-ID`` form (e.g. ``"USGS-02238500"``).
+    parameter_code : string or list of strings, optional
+        5-digit parameter code. Most peaks records are ``"00060"`` (discharge)
+        or ``"00065"`` (stage / gage height). Full list at
+        https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
+    time_series_id : string or list of strings, optional
+        ID of the time series the peak belongs to.
+    unit_of_measure : string or list of strings, optional
+        Human-readable units (e.g. ``"ft^3/s"``, ``"ft"``).
+    time : string or list of strings, optional
+        Datetime, interval, or duration filter on the peak's date.
+        See :func:`get_time_series_metadata` for the full grammar.
+    last_modified : string or list of strings, optional
+        Same datetime grammar as ``time``; filters on the database
+        last-modified timestamp (useful for incremental ETL polling).
+    water_year, year, month, day : int or list of ints, optional
+        Calendar / water-year filters on the peak event. The water year ends
+        September 30 (e.g. WY2024 = Oct 1, 2023 – Sep 30, 2024).
+    peak_since : int or list of ints, optional
+        Filter on the year since which the peak value has stood as the
+        record (the API serves this field as an integer; many rows are
+        ``null``).
+    properties : string or list of strings, optional
+        Subset of columns to return. Defaults to every available property.
+    skip_geometry : boolean, optional
+        Skip per-feature geometries; the returned object will be a plain
+        ``DataFrame`` with no spatial information.
+    bbox : list of numbers, optional
+        Only features whose geometry intersects the bounding box are
+        selected. Format: ``[xmin, ymin, xmax, ymax]`` in CRS 4326
+        (longitude / latitude, west-south-east-north).
+    limit : int, optional
+        Page size; the maximum allowable value is 50000. Default
+        (``None``) requests the maximum allowable limit.
+    filter, filter_lang : optional
+        Server-side CQL filter passed through as the OGC ``filter`` /
+        ``filter-lang`` query parameters. See
+        :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking,
+        and the lexicographic-comparison pitfall.
+    convert_type : boolean, optional
+        If True, converts columns to appropriate types.
+
+    Returns
+    -------
+    df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame``
+        Formatted data returned from the API query.
+    md : :obj:`dataretrieval.utils.BaseMetadata`
+        A custom metadata object pertaining to the query.
+
+    Examples
+    --------
+    .. code::
+
+        >>> # Full annual peak record at one site (both stage and discharge)
+        >>> df, md = dataretrieval.waterdata.get_peaks(
+        ...     monitoring_location_id="USGS-02238500"
+        ... )
+
+        >>> # Discharge peaks only
+        >>> df, md = dataretrieval.waterdata.get_peaks(
+        ...     monitoring_location_id="USGS-02238500",
+        ...     parameter_code="00060",
+        ... )
+
+        >>> # Multi-site peaks for a parameter, narrowed to a water-year range
+        >>> df, md = dataretrieval.waterdata.get_peaks(
+        ...     monitoring_location_id=[
+        ...         "USGS-07069000",
+        ...         "USGS-07064000",
+        ...         "USGS-07068000",
+        ...     ],
+        ...     parameter_code="00060",
+        ...     water_year=[2020, 2021, 2022, 2023],
+        ... )
+
+    """
+    service = "peaks"
+    output_id = "peak_id"
+    # locals() captures every keyword argument plus service/output_id at this
+    # point; presumably _get_args drops the non-query names -- TODO confirm.
+    args = _get_args(locals())
+    return get_ogc_data(args, output_id, service)
+
+
 def get_reference_table(
     collection: str,
     limit: int | None = None,
diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
@@ -17,6 +17,7 @@
     get_latest_continuous,
     get_latest_daily,
     get_monitoring_locations,
+    get_peaks,
     get_reference_table,
     get_samples,
     get_samples_summary,
@@ -399,6 +400,28 @@ def test_get_field_measurements_metadata_multi_site():
     }
 
 
+def test_get_peaks():
+    df, md = get_peaks(monitoring_location_id="USGS-02238500", skip_geometry=True)
+    assert "peak_id" in df.columns  # output_id that get_peaks passes along
+    assert "value" in df.columns
+    assert "water_year" in df.columns
+    assert (df["monitoring_location_id"] == "USGS-02238500").all()
+    assert set(df["parameter_code"].unique()).issubset({"00060", "00065"})
+    assert hasattr(md, "url")
+    assert hasattr(md, "query_time")
+
+
+def test_get_peaks_water_year_filter():
+    df, _ = get_peaks(
+        monitoring_location_id="USGS-02238500",
+        parameter_code="00060",
+        water_year=[2020, 2021, 2022],
+        skip_geometry=True,
+    )
+    assert (df["parameter_code"] == "00060").all()  # NOTE: vacuous if df is empty
+    assert set(df["water_year"].unique()).issubset({2020, 2021, 2022})
+
+
 def test_get_reference_table():
     df, md = get_reference_table("agency-codes")
     assert "agency_code" in df.columns