Skip to content

Commit 63b586e

Browse files
thodson-usgs and claude authored
Add waterdata.get_combined_metadata for combined location + time-series inventory (#264)
Wraps the Water Data API's combined-metadata collection, which joins the monitoring-locations catalog with the time-series-metadata catalog and returns one row per (location, parameter, statistic) inventory entry. Each row carries every column from both source endpoints, so any location attribute (state, HUC, site type, drainage area, well depth, ...) can be combined with any time-series attribute (parameter code, statistic, data type, period of record, ...) in a single query. Mirrors R's read_waterdata_combined_meta. Implementation re-uses the existing get_ogc_data infrastructure: the function is a thin parameter declaration plus a service / output_id pair (combined-metadata, combined_meta_id), since _switch_arg_id, _switch_properties_id, _construct_api_requests, and _walk_pages are all already service-agnostic. Closes #263. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6df40f5 commit 63b586e

4 files changed

Lines changed: 269 additions & 0 deletions

File tree

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
**05/05/2026:** Added `waterdata.get_combined_metadata(...)` — wraps the Water Data API's `combined-metadata` collection, which joins the monitoring-locations catalog with the time-series-metadata catalog and returns one row per (location, parameter, statistic) inventory entry. This is the most flexible "what data is available" endpoint in the API: any location attribute (state, HUC, site type, drainage area, well-construction depth, …) can be combined with any time-series attribute (parameter code, statistic, data type, period of record, …) in a single query. Mirrors R's `read_waterdata_combined_meta`.
2+
13
**05/05/2026:** Added `waterdata.get_samples_summary(monitoringLocationIdentifier=...)` — wraps the Samples database `/summary/{id}` endpoint, returning per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location. Useful for taking inventory of available discrete-sample data before pulling observations with `get_samples`.
24

35
**05/01/2026:** The `nadp` module is now deprecated. Calling any of `get_annual_MDN_map`, `get_annual_NTN_map`, or `get_zip` will emit a `DeprecationWarning`. The module is scheduled for removal on or after **2026-11-01**. NADP is not a USGS data source; users should retrieve NADP data directly from https://nadp.slh.wisc.edu/.

dataretrieval/waterdata/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from .api import (
1414
get_channel,
1515
get_codes,
16+
get_combined_metadata,
1617
get_continuous,
1718
get_daily,
1819
get_field_measurements,
@@ -43,6 +44,7 @@
4344
"SERVICES",
4445
"get_channel",
4546
"get_codes",
47+
"get_combined_metadata",
4648
"get_continuous",
4749
"get_daily",
4850
"get_field_measurements",

dataretrieval/waterdata/api.py

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -932,6 +932,238 @@ def get_time_series_metadata(
932932
return get_ogc_data(args, output_id, service)
933933

934934

935+
def get_combined_metadata(
    monitoring_location_id: str | list[str] | None = None,
    parameter_code: str | list[str] | None = None,
    parameter_name: str | list[str] | None = None,
    parameter_description: str | list[str] | None = None,
    unit_of_measure: str | list[str] | None = None,
    statistic_id: str | list[str] | None = None,
    data_type: str | list[str] | None = None,
    computation_identifier: str | list[str] | None = None,
    thresholds: float | list[float] | None = None,
    sublocation_identifier: str | list[str] | None = None,
    primary: str | list[str] | None = None,
    parent_time_series_id: str | list[str] | None = None,
    web_description: str | list[str] | None = None,
    last_modified: str | list[str] | None = None,
    begin: str | list[str] | None = None,
    end: str | list[str] | None = None,
    agency_code: str | list[str] | None = None,
    agency_name: str | list[str] | None = None,
    monitoring_location_number: str | list[str] | None = None,
    monitoring_location_name: str | list[str] | None = None,
    district_code: str | list[str] | None = None,
    country_code: str | list[str] | None = None,
    country_name: str | list[str] | None = None,
    state_code: str | list[str] | None = None,
    state_name: str | list[str] | None = None,
    county_code: str | list[str] | None = None,
    county_name: str | list[str] | None = None,
    minor_civil_division_code: str | list[str] | None = None,
    site_type_code: str | list[str] | None = None,
    site_type: str | list[str] | None = None,
    hydrologic_unit_code: str | list[str] | None = None,
    basin_code: str | list[str] | None = None,
    altitude: str | list[str] | None = None,
    altitude_accuracy: str | list[str] | None = None,
    altitude_method_code: str | list[str] | None = None,
    altitude_method_name: str | list[str] | None = None,
    vertical_datum: str | list[str] | None = None,
    vertical_datum_name: str | list[str] | None = None,
    horizontal_positional_accuracy_code: str | list[str] | None = None,
    horizontal_positional_accuracy: str | list[str] | None = None,
    horizontal_position_method_code: str | list[str] | None = None,
    horizontal_position_method_name: str | list[str] | None = None,
    original_horizontal_datum: str | list[str] | None = None,
    original_horizontal_datum_name: str | list[str] | None = None,
    drainage_area: str | list[str] | None = None,
    contributing_drainage_area: str | list[str] | None = None,
    time_zone_abbreviation: str | list[str] | None = None,
    uses_daylight_savings: str | list[str] | None = None,
    construction_date: str | list[str] | None = None,
    aquifer_code: str | list[str] | None = None,
    national_aquifer_code: str | list[str] | None = None,
    aquifer_type_code: str | list[str] | None = None,
    well_constructed_depth: str | list[str] | None = None,
    hole_constructed_depth: str | list[str] | None = None,
    depth_source_code: str | list[str] | None = None,
    properties: str | list[str] | None = None,
    skip_geometry: bool | None = None,
    bbox: list[float] | None = None,
    limit: int | None = None,
    filter: str | None = None,
    filter_lang: FILTER_LANG | None = None,
    convert_type: bool = True,
) -> tuple[pd.DataFrame, BaseMetadata]:
    """Query the joined location / time-series inventory.

    This wraps the ``combined-metadata`` collection of the Water Data API.
    That collection merges the monitoring-locations catalog with the
    time-series-metadata catalog: each returned row is one
    (location, parameter, statistic) inventory entry and holds every
    column of both source endpoints. As a result, any location attribute
    (state, HUC, site type, drainage area, well-construction depth, ...)
    can be used together with any time-series attribute (parameter code,
    statistic, data type, period of record, ...) in one request, which
    makes this the most flexible "what data is available" endpoint of
    the API.

    The complete list of supported fields is in the OpenAPI reference:
    https://api.waterdata.usgs.gov/ogcapi/v0/openapi?f=html#/combined-metadata
    The equivalent R function is ``read_waterdata_combined_meta`` from
    https://github.com/DOI-USGS/dataRetrieval/.

    Roughly 35 location-catalog keyword arguments are accepted
    (``agency_code``, ``state_name``, ``drainage_area``,
    ``aquifer_code``, ...). Only the commonly used ones are described
    here; per-field descriptions are given in
    :func:`get_monitoring_locations`.

    Parameters
    ----------
    monitoring_location_id : string or list of strings, optional
        Unique identifier of one monitoring location, built from the
        agency code (e.g. ``USGS``) and the ID number
        (e.g. ``02238500``) joined by a hyphen
        (e.g. ``"USGS-02238500"``).
    parameter_code : string or list of strings, optional
        5-digit codes identifying the measured constituent and its
        units of measure. See
        https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
    parameter_name : string or list of strings, optional
        Human-understandable name that goes with ``parameter_code``.
    parameter_description : string or list of strings, optional
        Human-readable description of the quantity being measured.
    unit_of_measure : string or list of strings, optional
        Human-readable description of the measurement units attached
        to an observation.
    statistic_id : string or list of strings, optional
        Code for the statistic an observation represents
        (e.g. ``00001`` max, ``00002`` min, ``00003`` mean). The full
        list is at
        https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html.
    data_type : string or list of strings, optional
        Kind of data the time series holds, e.g.
        ``"Continuous values"``, ``"Daily values"``,
        ``"Field measurements"``.
    computation_identifier : string or list of strings, optional
        Tells whether the data of this time series come from a
        specific statistical computation.
    thresholds : numeric or list of numbers, optional
        Known numeric limits of a time series (e.g. historic maximum,
        below-which-the-sensor-is-non-operative).
    sublocation_identifier : string or list of strings, optional
        Not described here; see the OpenAPI reference linked above.
    primary : string or list of strings, optional
        Flag marking a time series as "primary". Primary time series
        are standard observations that went through Bureau review and
        approval. Non-primary (provisional) time series carry a
        missing ``primary`` value, are produced for timely
        best-science use, and are kept by this system for only
        120 days.
    parent_time_series_id : string or list of strings, optional
        Not described here; see the OpenAPI reference linked above.
    web_description : string or list of strings, optional
        Description of what the time series represents, as used by
        WDFN and other USGS data dissemination products.
    last_modified, begin, end : string, optional
        Datetime fields. Each accepts an RFC 3339 datetime, an
        interval (``"start/end"``, optionally half-bounded with
        ``..``), or an ISO 8601 duration (e.g. ``"P1M"``,
        ``"PT36H"``). The full grammar is documented in
        :func:`get_time_series_metadata`.
    state_name, county_name, hydrologic_unit_code, site_type, \
site_type_code : string or list of strings, optional
        Frequently used location-catalog filters inherited from the
        ``monitoring-locations`` collection. Every other
        location-catalog keyword argument is accepted as well (agency,
        district, altitude, vertical/horizontal datum, drainage area,
        aquifer, well construction, ...); each is described in
        :func:`get_monitoring_locations`.
    properties : string or list of strings, optional
        Columns to return. When omitted, every available property is
        returned.
    skip_geometry : boolean, optional
        Leave out per-feature geometries, so the result is a plain
        ``DataFrame`` without spatial information. The Water Data APIs
        use camelCase ``skipGeometry`` in CQL2 queries.
    bbox : list of numbers, optional
        Select only features whose geometry intersects this bounding
        box, given as ``[xmin, ymin, xmax, ymax]`` in CRS 4326
        (longitude/latitude, west-south-east-north).
    limit : numeric, optional
        Page size, at most 50000. The default (``None``) asks for the
        maximum allowable limit.
    filter, filter_lang : optional
        Server-side CQL filter, forwarded as the OGC ``filter`` /
        ``filter-lang`` query parameters. Syntax, auto-chunking, and
        the lexicographic-comparison pitfall are covered in
        :mod:`dataretrieval.waterdata.filters`.
    convert_type : boolean, optional
        When True, columns are converted to appropriate types.

    Returns
    -------
    df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame``
        Formatted data returned from the API query.
    md : :obj:`dataretrieval.utils.Metadata`
        A custom metadata object pertaining to the query.

    Examples
    --------
    .. code::

        >>> # Inventory of time series and field measurements at one
        >>> # surface-water site
        >>> df, md = dataretrieval.waterdata.get_combined_metadata(
        ...     monitoring_location_id="USGS-05407000"
        ... )

        >>> # The same query for a groundwater well: water-level and
        >>> # aquifer columns are filled in where the surface-water
        >>> # example has nulls
        >>> df, md = dataretrieval.waterdata.get_combined_metadata(
        ...     monitoring_location_id="USGS-375907091432201"
        ... )

        >>> # All series in one county, handy for area-of-interest
        >>> # workflows
        >>> df, md = dataretrieval.waterdata.get_combined_metadata(
        ...     state_name="Wisconsin", county_name="Dane County"
        ... )

        >>> # Inventory over several HUCs, limited to streams and springs
        >>> df, md = dataretrieval.waterdata.get_combined_metadata(
        ...     hydrologic_unit_code=["11010008", "11010009"],
        ...     site_type=["Stream", "Spring"],
        ... )

        >>> # Discharge time series at three sites that have at least
        >>> # one observation in the past month
        >>> df, md = dataretrieval.waterdata.get_combined_metadata(
        ...     monitoring_location_id=[
        ...         "USGS-07069000",
        ...         "USGS-07064000",
        ...         "USGS-07068000",
        ...     ],
        ...     end="P1M",
        ...     parameter_code="00060",
        ... )

        >>> # Two-step "what's available?" -> "fetch it" workflow:
        >>> # 1. inventory the sites in two HUCs
        >>> hucs, _ = dataretrieval.waterdata.get_combined_metadata(
        ...     hydrologic_unit_code=["11010008", "11010009"],
        ...     site_type="Stream",
        ... )
        >>> # 2. pull continuous discharge at every distinct site found
        >>> sites = hucs["monitoring_location_id"].unique().tolist()
        >>> df, md = dataretrieval.waterdata.get_continuous(
        ...     monitoring_location_id=sites,
        ...     parameter_code="00060",
        ...     time="P1D",
        ... )

    """
    # The names ``service`` and ``output_id`` are kept as-is because the
    # whole local namespace is handed to ``_get_args`` via ``locals()``.
    service, output_id = "combined-metadata", "combined_meta_id"

    return get_ogc_data(_get_args(locals()), output_id, service)
1165+
1166+
9351167
def get_latest_continuous(
9361168
monitoring_location_id: str | list[str] | None = None,
9371169
parameter_code: str | list[str] | None = None,

tests/waterdata_test.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from dataretrieval.waterdata import (
1111
get_channel,
12+
get_combined_metadata,
1213
get_continuous,
1314
get_daily,
1415
get_field_measurements,
@@ -335,6 +336,38 @@ def test_get_time_series_metadata():
335336
assert hasattr(md, "query_time")
336337

337338

339+
def test_get_combined_metadata():
    """Check a single-site query returns columns from both catalogs.

    The result must contain time-series columns (``parameter_code``,
    ``data_type``) and a location column (``drainage_area``), every row
    must belong to the requested site, and the metadata object must
    expose ``url`` and ``query_time``.
    """
    site = "USGS-05407000"
    df, md = get_combined_metadata(
        monitoring_location_id=site,
        skip_geometry=True,
    )

    expected_columns = (
        "monitoring_location_id",
        "parameter_code",
        "data_type",
        "drainage_area",
    )
    for column in expected_columns:
        assert column in df.columns

    assert (df["monitoring_location_id"] == site).all()

    for attribute in ("url", "query_time"):
        assert hasattr(md, attribute)
351+
352+
353+
def test_get_combined_metadata_multi_site_post():
    """Check a multi-site, single-parameter query returns exactly those sites.

    Three sites are requested with parameter code ``00060``; the result
    must cover all three sites and no others, and contain only that
    parameter code.
    """
    requested_sites = [
        "USGS-07069000",
        "USGS-07064000",
        "USGS-07068000",
    ]
    code = "00060"

    df, _ = get_combined_metadata(
        monitoring_location_id=requested_sites,
        parameter_code=code,
        skip_geometry=True,
    )

    returned_sites = set(df["monitoring_location_id"].unique())
    assert returned_sites == set(requested_sites)
    assert (df["parameter_code"] == code).all()
369+
370+
338371
def test_get_reference_table():
339372
df, md = get_reference_table("agency-codes")
340373
assert "agency_code" in df.columns

0 commit comments

Comments
 (0)