Add waterdata.get_samples_summary for per-location sample inventory (#262)

thodson-usgs · claude · web-flow · commit 6df40f58859b · 2026-05-05T14:37:40.000-05:00
Wraps the Samples database /summary/{monitoringLocationIdentifier} endpoint, mirroring the R package's summarize_waterdata_samples. Returns per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location — useful for taking inventory of what discrete-sample data exists at a site before pulling observations with get_samples.

The Samples summary endpoint accepts only a single monitoring location per request, so the function takes a string (not a list).

Closes #261.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,5 @@
+**05/05/2026:** Added `waterdata.get_samples_summary(monitoringLocationIdentifier=...)` — wraps the Samples database `/summary/{id}` endpoint, returning per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location. Useful for taking inventory of available discrete-sample data before pulling observations with `get_samples`.
+
 **05/01/2026:** The `nadp` module is now deprecated. Calling any of `get_annual_MDN_map`, `get_annual_NTN_map`, or `get_zip` will emit a `DeprecationWarning`. The module is scheduled for removal on or after **2026-11-01**. NADP is not a USGS data source; users should retrieve NADP data directly from https://nadp.slh.wisc.edu/.
 
 **04/23/2026:** Added `waterdata.get_nearest_continuous(targets, ...)` — for each of N target timestamps, fetches the single continuous observation closest to that timestamp in one HTTP round-trip (auto-chunked when the resulting CQL filter is long, via the facility added in #238). The helper is designed for workflows that pair many discrete-measurement timestamps with surrounding instantaneous data, which the OGC `time` parameter can't express since it only accepts one instant or one interval per request. Ties at window midpoints are resolved per a configurable `on_tie` ∈ {`"first"`, `"last"`, `"mean"`}; the default `window="PT7M30S"` matches a 15-minute continuous gauge.
diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py
@@ -21,6 +21,7 @@
     get_monitoring_locations,
     get_reference_table,
     get_samples,
+    get_samples_summary,
     get_stats_date_range,
     get_stats_por,
     get_time_series_metadata,
@@ -51,6 +52,7 @@
     "get_nearest_continuous",
     "get_reference_table",
     "get_samples",
+    "get_samples_summary",
     "get_stats_date_range",
     "get_stats_por",
     "get_time_series_metadata",
diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
@@ -10,6 +10,7 @@
 import logging
 from io import StringIO
 from typing import get_args
+from urllib.parse import quote
 
 import pandas as pd
 import requests
@@ -1800,6 +1801,77 @@ def get_samples(
     return df, BaseMetadata(response)
 
 
+def get_samples_summary(
+    monitoringLocationIdentifier: str,
+    ssl_check: bool = True,
+) -> tuple[pd.DataFrame, BaseMetadata]:
+    """Get a summary of discrete water-quality samples at a single monitoring location.
+
+    Wraps the Samples database summary service described at
+    https://api.waterdata.usgs.gov/samples-data/docs. The service returns one
+    row per (characteristic group, characteristic, user-supplied characteristic)
+    combination with result and activity counts and the first / most recent
+    activity dates — useful for taking inventory of what discrete-sample data
+    exists at a site before pulling the underlying observations with
+    :func:`get_samples`.
+
+    The summary service is single-site only: it accepts exactly one monitoring
+    location per request.
+
+    Parameters
+    ----------
+    monitoringLocationIdentifier : string
+        A monitoring location identifier has two parts, separated by a dash
+        (``-``): the agency code and the location number. Examples:
+        ``"USGS-040851385"``, ``"AZ014-320821110580701"``,
+        ``"CAX01-15304600"``. Bare location numbers without an agency prefix
+        are accepted by the service but return an empty result, so a prefix
+        is effectively required.
+    ssl_check : bool, optional
+        Check the SSL certificate. Default is True.
+
+    Returns
+    -------
+    df : ``pandas.DataFrame``
+        Formatted data returned from the API query.
+    md : ``BaseMetadata``
+        Custom ``dataretrieval`` metadata object pertaining to the query.
+
+    Raises
+    ------
+    TypeError
+        If ``monitoringLocationIdentifier`` is not a string (for example a
+        list of identifiers).
+    ValueError
+        If ``monitoringLocationIdentifier`` is empty or only whitespace.
+    requests.HTTPError
+        If the service responds with an error status.
+
+    Examples
+    --------
+    .. code::
+
+        >>> # What discrete-sample data is available at this site?
+        >>> df, md = dataretrieval.waterdata.get_samples_summary(
+        ...     monitoringLocationIdentifier="USGS-04074950"
+        ... )
+
+    """
+    if not isinstance(monitoringLocationIdentifier, str):
+        raise TypeError(
+            "monitoringLocationIdentifier must be a string; the Samples "
+            "summary service accepts exactly one monitoring location per "
+            f"request, got {type(monitoringLocationIdentifier).__name__}."
+        )
+    # A blank identifier would yield the bare ".../summary/" URL, which names no
+    # site; fail locally instead of spending a network round-trip on it.
+    if not monitoringLocationIdentifier.strip():
+        raise ValueError("monitoringLocationIdentifier must not be empty.")
+
+    # safe="" percent-encodes "/" too, so the identifier stays one path segment.
+    url = f"{SAMPLES_URL}/summary/{quote(monitoringLocationIdentifier, safe='')}"
+    params = {"mimeType": "text/csv"}
+
+    # Prepared only so the full URL (with query string) can be logged;
+    # requests.get below builds the actual request from url + params.
+    req = PreparedRequest()
+    req.prepare_url(url, params=params)
+    logger.info("Request: %s", req.url)
+
+    response = requests.get(
+        url, params=params, verify=ssl_check, headers=_default_headers()
+    )
+
+    response.raise_for_status()
+
+    df = pd.read_csv(StringIO(response.text), delimiter=",")
+
+    return df, BaseMetadata(response)
+
+
 def get_stats_por(
     approval_status: str | None = None,
     computation_type: str | list[str] | None = None,
diff --git a/tests/data/samples_summary.txt b/tests/data/samples_summary.txt
@@ -0,0 +1,6 @@
+monitoringLocationIdentifier,characteristicGroup,characteristic,characteristicUserSupplied,resultCount,activityCount,firstActivity,mostRecentActivity
+USGS-04183500,Information,Bottle or bag sampler material (construction),Bottle or bag sampler material (construction),893,893,2017-01-02,2026-04-28
+USGS-04183500,Information,NWIS lot number,"NWIS lot number, sulfuric acid, 4.5 normal (1:7), 1 milliliter, National Field Supply Service (NFSS) stock number Q438FLD",893,893,2017-01-02,2026-04-28
+USGS-04183500,Information,NWIS lot number,"NWIS lot number, vacuum tube, 10.5 milliliters, FCCVT (filtered, chilled, vacuum tube)",877,877,2017-01-02,2026-04-28
+USGS-04183500,Information,Number of sampling points,Number of sampling points,136,136,2013-10-23,2026-04-28
+USGS-04183500,Information,Sampler nozzle diameter,Sampler nozzle diameter,97,97,2017-01-24,2026-04-28
diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
@@ -17,6 +17,7 @@
     get_monitoring_locations,
     get_reference_table,
     get_samples,
+    get_samples_summary,
     get_stats_date_range,
     get_stats_por,
     get_time_series_metadata,
@@ -57,6 +58,40 @@ def test_mock_get_samples(requests_mock):
     assert md.comment is None
 
 
+def test_mock_get_samples_summary(requests_mock):
+    """Mocked round-trip through the USGS Samples summary endpoint."""
+    request_url = (
+        "https://api.waterdata.usgs.gov/samples-data/summary/USGS-04183500"
+        "?mimeType=text%2Fcsv"
+    )
+    mock_request(requests_mock, request_url, "tests/data/samples_summary.txt")
+
+    summary, meta = get_samples_summary(monitoringLocationIdentifier="USGS-04183500")
+
+    # The CSV payload must parse into a plain DataFrame with every summary column.
+    assert type(summary) is DataFrame
+    required = {
+        "monitoringLocationIdentifier",
+        "characteristicGroup",
+        "characteristic",
+        "characteristicUserSupplied",
+        "resultCount",
+        "activityCount",
+        "firstActivity",
+        "mostRecentActivity",
+    }
+    assert set(summary.columns) >= required
+    # Every row of the fixture belongs to the requested site.
+    assert summary["monitoringLocationIdentifier"].eq("USGS-04183500").all()
+
+    # Metadata checks against the mocked response.
+    assert meta.url == request_url
+    assert isinstance(meta.query_time, datetime.timedelta)
+    assert meta.header == {"mock_header": "value"}
+    assert meta.comment is None
+
+
+def test_get_samples_summary_rejects_list():
+    """A list of sites is refused with TypeError: the endpoint is single-site."""
+    site_list = ["USGS-04183500"]
+    expected_message = "exactly one monitoring location"
+    with pytest.raises(TypeError, match=expected_message):
+        get_samples_summary(monitoringLocationIdentifier=site_list)
+
+
 def test_check_profiles():
     """Tests that correct errors are raised for invalid profiles."""
     with pytest.raises(ValueError):

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	+05/05/2026: Added `waterdata.get_samples_summary(monitoringLocationIdentifier=...)` — wraps the Samples database `/summary/{id}` endpoint, returning per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location. Useful for taking inventory of available discrete-sample data before pulling observations with `get_samples`.
	`2`	`+`
`1`	`3`	05/01/2026: The `nadp` module is now deprecated. Calling any of `get_annual_MDN_map`, `get_annual_NTN_map`, or `get_zip` will emit a `DeprecationWarning`. The module is scheduled for removal on or after 2026-11-01. NADP is not a USGS data source; users should retrieve NADP data directly from https://nadp.slh.wisc.edu/.
`2`	`4`
`3`	`5`	04/23/2026: Added `waterdata.get_nearest_continuous(targets, ...)` — for each of N target timestamps, fetches the single continuous observation closest to that timestamp in one HTTP round-trip (auto-chunked when the resulting CQL filter is long, via the facility added in #238). The helper is designed for workflows that pair many discrete-measurement timestamps with surrounding instantaneous data, which the OGC `time` parameter can't express since it only accepts one instant or one interval per request. Ties at window midpoints are resolved per a configurable `on_tie` ∈ {`"first"`, `"last"`, `"mean"`}; the default `window="PT7M30S"` matches a 15-minute continuous gauge.