Skip to content

Commit ba11d6f

Browse files
thodson-usgs and claude authored
Add waterdata.get_ratings for USGS stage-discharge ratings via STAC (#269)
Wraps the new Water Data STAC catalog endpoint (api.waterdata.usgs.gov/stac/v0/search) for stage-discharge rating curves. Lives in its own module (waterdata/ratings.py) because the transport layer — STAC search + per-feature RDB download — differs from the OGC collections used by the rest of the package.

The function:

- Composes a CQL filter from monitoring_location_id and (single-value) file_type, mirroring R's logic. Multi-type requests fetch all matches and filter URLs client-side.
- Optionally downloads each matching .rdb asset to a user-supplied file_path (default: a fresh tempfile.mkdtemp), and parses with the existing nwis._read_rdb helper.
- Returns a dict[id -> DataFrame] when download_and_parse=True, or the raw list of STAC features when False (cheap "what's available?" inspection).

Surfaces a clear ValueError for invalid file_type values and for ISO 8601 durations in `datetime` (the rating-curve service rejects them).

Mirrors R's read_waterdata_ratings in DOI-USGS/dataRetrieval; example sites and idioms come straight from the R doc.

Refactors rdb utilities out of the deprecated nwis module.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 0d079ca commit ba11d6f

8 files changed

Lines changed: 635 additions & 91 deletions

File tree

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
**05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`.
2+
13
**05/06/2026:** Added `waterdata.get_field_measurements_metadata(...)` — wraps the OGC `field-measurements-metadata` collection. Returns one row per (location, parameter) field-measurement series describing its period of record, units, etc., without the underlying observations. Discrete-measurement analogue to `get_time_series_metadata`. Mirrors R's `read_waterdata_field_meta`.
24

35
**05/05/2026:** Added `waterdata.get_combined_metadata(...)` — wraps the Water Data API's `combined-metadata` collection, which joins the monitoring-locations catalog with the time-series-metadata catalog and returns one row per (location, parameter, statistic) inventory entry. This is the most flexible "what data is available" endpoint in the API: any location attribute (state, HUC, site type, drainage area, well-construction depth, …) can be combined with any time-series attribute (parameter code, statistic, data type, period of record, …) in a single query. Mirrors R's `read_waterdata_combined_meta`.

dataretrieval/nwis.py

Lines changed: 14 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@
77
from __future__ import annotations
88

99
import warnings
10-
from io import StringIO
1110
from json import JSONDecodeError
1211

1312
import pandas as pd
1413
import requests
1514

15+
from dataretrieval.rdb import read_rdb
1616
from dataretrieval.utils import BaseMetadata
1717

1818
from .utils import query
@@ -44,6 +44,14 @@
4444
# NAD83
4545
_CRS = "EPSG:4269"
4646

47+
_NWIS_RDB_DTYPES = {
48+
"site_no": str,
49+
"dec_long_va": float,
50+
"dec_lat_va": float,
51+
"parm_cd": str,
52+
"parameter_cd": str,
53+
}
54+
4755

4856
def _parse_json_or_raise(response: requests.Response) -> pd.DataFrame:
4957
"""Parse a JSON NWIS response, raising a helpful error on HTML responses."""
@@ -1018,64 +1026,13 @@ def _read_json(json):
10181026

10191027

10201028
def _read_rdb(rdb):
1021-
"""
1022-
Convert NWIS rdb table into a ``pandas.dataframe``.
1023-
1024-
Parameters
1025-
----------
1026-
rdb: string
1027-
A string representation of an rdb table
1028-
1029-
Returns
1030-
-------
1031-
df: ``pandas.dataframe``
1032-
A formatted pandas data frame
1029+
"""Parse an NWIS RDB response and apply NWIS-specific post-processing.
10331030
1031+
Thin wrapper around :func:`dataretrieval.rdb.read_rdb` that adds the
1032+
NWIS column-dtype hints and runs :func:`format_response` (datetime
1033+
index, multi-site MultiIndex, optional GeoDataFrame).
10341034
"""
1035-
if "<html>" in rdb.lower() or "<!doctype html>" in rdb.lower():
1036-
raise ValueError(
1037-
"Received HTML response instead of RDB. This often indicates "
1038-
"that the service has been moved or is currently unavailable."
1039-
)
1040-
1041-
count = 0
1042-
lines = rdb.splitlines()
1043-
1044-
for line in lines:
1045-
# ignore comment lines
1046-
if line.startswith("#"):
1047-
count = count + 1
1048-
1049-
else:
1050-
break
1051-
1052-
if count >= len(lines):
1053-
# All lines are comments — the service returned no data rows (e.g.
1054-
# "No sites found matching all criteria"). This is a legitimate empty
1055-
# result, so return an empty DataFrame rather than raising.
1056-
return pd.DataFrame()
1057-
1058-
fields = lines[count].split("\t")
1059-
fields = [field.replace(",", "").strip() for field in fields if field.strip()]
1060-
dtypes = {
1061-
"site_no": str,
1062-
"dec_long_va": float,
1063-
"dec_lat_va": float,
1064-
"parm_cd": str,
1065-
"parameter_cd": str,
1066-
}
1067-
1068-
df = pd.read_csv(
1069-
StringIO(rdb),
1070-
delimiter="\t",
1071-
skiprows=count + 2,
1072-
names=fields,
1073-
na_values="NaN",
1074-
dtype=dtypes,
1075-
)
1076-
1077-
df = format_response(df)
1078-
return df
1035+
return format_response(read_rdb(rdb, dtypes=_NWIS_RDB_DTYPES))
10791036

10801037

10811038
def _check_sites_value_types(sites):

dataretrieval/rdb.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
"""Parser for the USGS RDB tab-separated text format.
2+
3+
RDB (Relational DataBase) is the text format used by NWIS web services
4+
and by the Water Data STAC catalog's rating-curve assets. Every RDB
5+
file has the same shape:
6+
7+
- One or more ``#``-prefixed comment lines carrying provenance metadata
8+
(data source, retrieval timestamp, station name, parameter codes, etc.).
9+
- A tab-separated header row naming each column.
10+
- A second tab-separated row giving column format specs (e.g. ``5s 15s``);
11+
it is informational only and skipped during parsing.
12+
- Tab-separated data rows.
13+
14+
This module exposes the parsing primitives that both ``dataretrieval.nwis``
15+
and ``dataretrieval.waterdata.ratings`` use. Callers layer their own
16+
post-processing (NWIS-specific datetime indexing, ratings-specific
17+
``df.attrs`` provenance, etc.) on top of the raw frame.
18+
"""
19+
20+
from __future__ import annotations
21+
22+
from io import StringIO
23+
24+
import pandas as pd
25+
26+
27+
def read_rdb(text: str, dtypes: dict[str, type] | None = None) -> pd.DataFrame:
    """Parse an RDB text response into a ``pandas.DataFrame``.

    The leading ``#``-prefixed comment lines are skipped, the first
    remaining non-blank line is taken as the tab-separated header, the
    line after it (the column format-spec row) is discarded, and every
    following line is parsed as tab-separated data.

    Parameters
    ----------
    text : str
        The RDB text response from a USGS web service.
    dtypes : dict[str, type] or None, optional
        Optional column-name to dtype hints, forwarded to
        ``pandas.read_csv`` as ``dtype``. Unknown column names are
        silently ignored, so callers may safely pass a dict of all
        columns they might be interested in.

    Returns
    -------
    pandas.DataFrame
        The parsed data. A response with no header row at all — empty,
        whitespace-only, or consisting only of comment lines (e.g. a
        "no sites found" response), with or without trailing blank
        lines — returns an empty DataFrame rather than raising.

    Raises
    ------
    ValueError
        If the response body looks like HTML, which usually means the
        service has been moved, is degraded, or returned an error page.
    """
    # Lower-case once and match on the tag *prefix* so that tags carrying
    # attributes (``<html lang="en">``) or long-form doctypes
    # (``<!DOCTYPE html PUBLIC ...>``) are recognised too.
    lowered = text.lower()
    if "<html" in lowered or "<!doctype html" in lowered:
        raise ValueError(
            "Received HTML response instead of RDB. This often indicates "
            "that the service has been moved or is currently unavailable."
        )

    lines = text.splitlines()
    # The header is the first line that is neither a comment nor blank.
    # Blank lines must be skipped here: otherwise a comments-only response
    # followed by an empty line would be treated as having an (empty)
    # header row and handed to pandas with no column names.
    header_idx = next(
        (
            idx
            for idx, line in enumerate(lines)
            if line.strip() and not line.startswith("#")
        ),
        None,
    )
    if header_idx is None:
        # Nothing but comments/blank lines — a legitimate empty result.
        return pd.DataFrame()

    # Column names: drop embedded commas, trim whitespace, and discard
    # empty cells (e.g. produced by a trailing tab on the header row).
    fields = [
        name
        for name in (
            cell.replace(",", "").strip()
            for cell in lines[header_idx].split("\t")
        )
        if name
    ]

    return pd.read_csv(
        StringIO(text),
        delimiter="\t",
        skiprows=header_idx + 2,  # +1 for header, +1 for the format-spec row
        names=fields,
        na_values="NaN",
        dtype=dtypes,
    )
79+
80+
81+
def extract_rdb_comment(text: str) -> list[str]:
    """Collect the ``#``-prefixed comment lines of an RDB text, unmodified.

    The text is split into lines and every line whose first character is
    ``#`` is kept, in its original order and with the leading ``#`` and
    any surrounding whitespace intact — matching what R's
    ``dataRetrieval`` returns from ``comment(df)``. These lines carry
    provenance metadata that is otherwise lost during parsing (data
    source, retrieval timestamp, parameter codes, rating id and
    last-shifted timestamp for ratings, etc.).

    Parameters
    ----------
    text : str
        The RDB text to scan.

    Returns
    -------
    list[str]
        The comment lines; empty when the text contains none.
    """
    comment_lines = []
    for raw_line in text.splitlines():
        if raw_line.startswith("#"):
            comment_lines.append(raw_line)
    return comment_lines

dataretrieval/waterdata/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
)
3131
from .filters import FILTER_LANG
3232
from .nearest import get_nearest_continuous
33+
from .ratings import get_ratings
3334
from .types import (
3435
CODE_SERVICES,
3536
PROFILE_LOOKUP,
@@ -54,6 +55,7 @@
5455
"get_latest_daily",
5556
"get_monitoring_locations",
5657
"get_nearest_continuous",
58+
"get_ratings",
5759
"get_reference_table",
5860
"get_samples",
5961
"get_samples_summary",

0 commit comments

Comments
 (0)