Skip to content

Commit 6df40f5

Browse files
thodson-usgs and claude authored
Add waterdata.get_samples_summary for per-location sample inventory (#262)
Wraps the Samples database /summary/{monitoringLocationIdentifier} endpoint, mirroring the R package's summarize_waterdata_samples. Returns per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location — useful for taking inventory of what discrete-sample data exists at a site before pulling observations with get_samples. The Samples summary endpoint accepts only a single monitoring location per request, so the function takes a string (not a list). Closes #261. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent dd70cfa commit 6df40f5

5 files changed

Lines changed: 117 additions & 0 deletions

File tree

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
**05/05/2026:** Added `waterdata.get_samples_summary(monitoringLocationIdentifier=...)` — wraps the Samples database `/summary/{id}` endpoint, returning per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location. Useful for taking inventory of available discrete-sample data before pulling observations with `get_samples`.
2+
13
**05/01/2026:** The `nadp` module is now deprecated. Calling any of `get_annual_MDN_map`, `get_annual_NTN_map`, or `get_zip` will emit a `DeprecationWarning`. The module is scheduled for removal on or after **2026-11-01**. NADP is not a USGS data source; users should retrieve NADP data directly from https://nadp.slh.wisc.edu/.
24

35
**04/23/2026:** Added `waterdata.get_nearest_continuous(targets, ...)` — for each of N target timestamps, fetches the single continuous observation closest to that timestamp in one HTTP round-trip (auto-chunked when the resulting CQL filter is long, via the facility added in #238). The helper is designed for workflows that pair many discrete-measurement timestamps with surrounding instantaneous data, which the OGC `time` parameter can't express since it only accepts one instant or one interval per request. Ties at window midpoints are resolved per a configurable `on_tie` ∈ {`"first"`, `"last"`, `"mean"`}; the default `window="PT7M30S"` matches a 15-minute continuous gauge.

dataretrieval/waterdata/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
get_monitoring_locations,
2222
get_reference_table,
2323
get_samples,
24+
get_samples_summary,
2425
get_stats_date_range,
2526
get_stats_por,
2627
get_time_series_metadata,
@@ -51,6 +52,7 @@
5152
"get_nearest_continuous",
5253
"get_reference_table",
5354
"get_samples",
55+
"get_samples_summary",
5456
"get_stats_date_range",
5557
"get_stats_por",
5658
"get_time_series_metadata",

dataretrieval/waterdata/api.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import logging
1111
from io import StringIO
1212
from typing import get_args
13+
from urllib.parse import quote
1314

1415
import pandas as pd
1516
import requests
@@ -1800,6 +1801,77 @@ def get_samples(
18001801
return df, BaseMetadata(response)
18011802

18021803

1804+
def get_samples_summary(
    monitoringLocationIdentifier: str,
    ssl_check: bool = True,
) -> tuple[pd.DataFrame, BaseMetadata]:
    """Get a summary of discrete water-quality samples at a single monitoring location.

    Wraps the Samples database summary service described at
    https://api.waterdata.usgs.gov/samples-data/docs. The service returns one
    row per (characteristic group, characteristic, user-supplied characteristic)
    combination with result and activity counts and the first / most recent
    activity dates — useful for taking inventory of what discrete-sample data
    exists at a site before pulling the underlying observations with
    :func:`get_samples`.

    The summary service is single-site only: it accepts exactly one monitoring
    location per request.

    Parameters
    ----------
    monitoringLocationIdentifier : string
        A monitoring location identifier has two parts, separated by a dash
        (``-``): the agency code and the location number. Examples:
        ``"USGS-040851385"``, ``"AZ014-320821110580701"``,
        ``"CAX01-15304600"``. Bare location numbers without an agency prefix
        are accepted by the service but return an empty result, so a prefix
        is effectively required.
    ssl_check : bool, optional
        Check the SSL certificate. Default is True.

    Returns
    -------
    df : ``pandas.DataFrame``
        Formatted data returned from the API query. If the service responds
        with an empty body, an empty DataFrame is returned.
    md : ``BaseMetadata``
        Custom ``dataretrieval`` metadata object pertaining to the query.

    Raises
    ------
    TypeError
        If ``monitoringLocationIdentifier`` is not a string (for example a
        list of identifiers).
    ValueError
        If ``monitoringLocationIdentifier`` is empty or only whitespace.
    requests.HTTPError
        If the service answers with an HTTP error status.

    Examples
    --------
    .. code::

        >>> # What discrete-sample data is available at this site?
        >>> df, md = dataretrieval.waterdata.get_samples_summary(
        ...     monitoringLocationIdentifier="USGS-04074950"
        ... )

    """
    if not isinstance(monitoringLocationIdentifier, str):
        raise TypeError(
            "monitoringLocationIdentifier must be a string; the Samples "
            "summary service accepts exactly one monitoring location per "
            f"request, got {type(monitoringLocationIdentifier).__name__}."
        )

    # An empty identifier would collapse the path to ".../summary/", which is
    # not the per-location endpoint; fail fast instead of sending that request.
    if not monitoringLocationIdentifier.strip():
        raise ValueError(
            "monitoringLocationIdentifier must be a non-empty string such as "
            '"USGS-04074950".'
        )

    # safe="" percent-encodes every reserved character (including "/") so the
    # identifier always stays a single path segment.
    url = f"{SAMPLES_URL}/summary/{quote(monitoringLocationIdentifier, safe='')}"
    params = {"mimeType": "text/csv"}

    # Prepare the URL separately only so the fully-encoded request can be logged.
    req = PreparedRequest()
    req.prepare_url(url, params=params)
    logger.info("Request: %s", req.url)

    response = requests.get(
        url, params=params, verify=ssl_check, headers=_default_headers()
    )

    response.raise_for_status()

    # pandas raises EmptyDataError on a body with no header row; treat a blank
    # body as "no data" rather than an error.
    if response.text.strip():
        df = pd.read_csv(StringIO(response.text), delimiter=",")
    else:
        df = pd.DataFrame()

    return df, BaseMetadata(response)
1873+
1874+
18031875
def get_stats_por(
18041876
approval_status: str | None = None,
18051877
computation_type: str | list[str] | None = None,

tests/data/samples_summary.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
monitoringLocationIdentifier,characteristicGroup,characteristic,characteristicUserSupplied,resultCount,activityCount,firstActivity,mostRecentActivity
2+
USGS-04183500,Information,Bottle or bag sampler material (construction),Bottle or bag sampler material (construction),893,893,2017-01-02,2026-04-28
3+
USGS-04183500,Information,NWIS lot number,"NWIS lot number, sulfuric acid, 4.5 normal (1:7), 1 milliliter, National Field Supply Service (NFSS) stock number Q438FLD",893,893,2017-01-02,2026-04-28
4+
USGS-04183500,Information,NWIS lot number,"NWIS lot number, vacuum tube, 10.5 milliliters, FCCVT (filtered, chilled, vacuum tube)",877,877,2017-01-02,2026-04-28
5+
USGS-04183500,Information,Number of sampling points,Number of sampling points,136,136,2013-10-23,2026-04-28
6+
USGS-04183500,Information,Sampler nozzle diameter,Sampler nozzle diameter,97,97,2017-01-24,2026-04-28

tests/waterdata_test.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
get_monitoring_locations,
1818
get_reference_table,
1919
get_samples,
20+
get_samples_summary,
2021
get_stats_date_range,
2122
get_stats_por,
2223
get_time_series_metadata,
@@ -57,6 +58,40 @@ def test_mock_get_samples(requests_mock):
5758
assert md.comment is None
5859

5960

61+
def test_mock_get_samples_summary(requests_mock):
    """Check a mocked Samples summary request: frame contents and metadata."""
    response_file_path = "tests/data/samples_summary.txt"
    request_url = (
        "https://api.waterdata.usgs.gov/samples-data/summary/USGS-04183500"
        "?mimeType=text%2Fcsv"
    )
    mock_request(requests_mock, request_url, response_file_path)

    result = get_samples_summary(monitoringLocationIdentifier="USGS-04183500")
    df, md = result

    # The parsed CSV must come back as a plain DataFrame with every summary
    # column present.
    assert type(df) is DataFrame
    expected_columns = {
        "monitoringLocationIdentifier",
        "characteristicGroup",
        "characteristic",
        "characteristicUserSupplied",
        "resultCount",
        "activityCount",
        "firstActivity",
        "mostRecentActivity",
    }
    assert expected_columns.issubset(df.columns)

    # Every row in the fixture belongs to the requested site.
    site_column = df["monitoringLocationIdentifier"]
    assert (site_column == "USGS-04183500").all()

    # Metadata mirrors the mocked response.
    assert md.url == request_url
    assert isinstance(md.query_time, datetime.timedelta)
    assert md.header == {"mock_header": "value"}
    assert md.comment is None
87+
88+
89+
def test_get_samples_summary_rejects_list():
    """A list of sites must raise TypeError: the endpoint is single-site only."""
    site_list = ["USGS-04183500"]
    with pytest.raises(TypeError, match="exactly one monitoring location"):
        get_samples_summary(monitoringLocationIdentifier=site_list)
93+
94+
6095
def test_check_profiles():
6196
"""Tests that correct errors are raised for invalid profiles."""
6297
with pytest.raises(ValueError):

0 commit comments

Comments
 (0)