Skip to content

Commit 755f72a

Browse files
thodson-usgs and claude committed
Add waterdata.get_combined_metadata for combined location + time-series inventory
Wraps the Water Data API's combined-metadata collection, which joins the monitoring-locations catalog with the time-series-metadata catalog and returns one row per (location, parameter, statistic) inventory entry. Each row carries every column from both source endpoints, so any location attribute (state, HUC, site type, drainage area, well depth, ...) can be combined with any time-series attribute (parameter code, statistic, data type, period of record, ...) in a single query. Mirrors R's read_waterdata_combined_meta. Implementation re-uses the existing get_ogc_data infrastructure: the function is a thin parameter declaration plus a service / output_id pair (combined-metadata, combined_meta_id), since _switch_arg_id, _switch_properties_id, _construct_api_requests, and _walk_pages are all already service-agnostic. Closes #263. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6df40f5 commit 755f72a

4 files changed

Lines changed: 252 additions & 0 deletions

File tree

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
**05/05/2026:** Added `waterdata.get_combined_metadata(...)` — wraps the Water Data API's `combined-metadata` collection, which joins the monitoring-locations catalog with the time-series-metadata catalog and returns one row per (location, parameter, statistic) inventory entry. This is the most flexible "what data is available" endpoint in the API: any location attribute (state, HUC, site type, drainage area, well-construction depth, …) can be combined with any time-series attribute (parameter code, statistic, data type, period of record, …) in a single query. Mirrors R's `read_waterdata_combined_meta`.
2+
13
**05/05/2026:** Added `waterdata.get_samples_summary(monitoringLocationIdentifier=...)` — wraps the Samples database `/summary/{id}` endpoint, returning per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location. Useful for taking inventory of available discrete-sample data before pulling observations with `get_samples`.
24

35
**05/01/2026:** The `nadp` module is now deprecated. Calling any of `get_annual_MDN_map`, `get_annual_NTN_map`, or `get_zip` will emit a `DeprecationWarning`. The module is scheduled for removal on or after **2026-11-01**. NADP is not a USGS data source; users should retrieve NADP data directly from https://nadp.slh.wisc.edu/.

dataretrieval/waterdata/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from .api import (
1414
get_channel,
1515
get_codes,
16+
get_combined_metadata,
1617
get_continuous,
1718
get_daily,
1819
get_field_measurements,
@@ -43,6 +44,7 @@
4344
"SERVICES",
4445
"get_channel",
4546
"get_codes",
47+
"get_combined_metadata",
4648
"get_continuous",
4749
"get_daily",
4850
"get_field_measurements",

dataretrieval/waterdata/api.py

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -932,6 +932,219 @@ def get_time_series_metadata(
932932
return get_ogc_data(args, output_id, service)
933933

934934

935+
def get_combined_metadata(
    monitoring_location_id: str | list[str] | None = None,
    parameter_code: str | list[str] | None = None,
    parameter_name: str | list[str] | None = None,
    parameter_description: str | list[str] | None = None,
    unit_of_measure: str | list[str] | None = None,
    statistic_id: str | list[str] | None = None,
    data_type: str | list[str] | None = None,
    computation_identifier: str | list[str] | None = None,
    thresholds: float | list[float] | None = None,
    sublocation_identifier: str | list[str] | None = None,
    primary: str | list[str] | None = None,
    parent_time_series_id: str | list[str] | None = None,
    web_description: str | list[str] | None = None,
    last_modified: str | list[str] | None = None,
    begin: str | list[str] | None = None,
    end: str | list[str] | None = None,
    agency_code: str | list[str] | None = None,
    agency_name: str | list[str] | None = None,
    monitoring_location_number: str | list[str] | None = None,
    monitoring_location_name: str | list[str] | None = None,
    district_code: str | list[str] | None = None,
    country_code: str | list[str] | None = None,
    country_name: str | list[str] | None = None,
    state_code: str | list[str] | None = None,
    state_name: str | list[str] | None = None,
    county_code: str | list[str] | None = None,
    county_name: str | list[str] | None = None,
    minor_civil_division_code: str | list[str] | None = None,
    site_type_code: str | list[str] | None = None,
    site_type: str | list[str] | None = None,
    hydrologic_unit_code: str | list[str] | None = None,
    basin_code: str | list[str] | None = None,
    altitude: str | list[str] | None = None,
    altitude_accuracy: str | list[str] | None = None,
    altitude_method_code: str | list[str] | None = None,
    altitude_method_name: str | list[str] | None = None,
    vertical_datum: str | list[str] | None = None,
    vertical_datum_name: str | list[str] | None = None,
    horizontal_positional_accuracy_code: str | list[str] | None = None,
    horizontal_positional_accuracy: str | list[str] | None = None,
    horizontal_position_method_code: str | list[str] | None = None,
    horizontal_position_method_name: str | list[str] | None = None,
    original_horizontal_datum: str | list[str] | None = None,
    original_horizontal_datum_name: str | list[str] | None = None,
    drainage_area: str | list[str] | None = None,
    contributing_drainage_area: str | list[str] | None = None,
    time_zone_abbreviation: str | list[str] | None = None,
    uses_daylight_savings: str | list[str] | None = None,
    construction_date: str | list[str] | None = None,
    aquifer_code: str | list[str] | None = None,
    national_aquifer_code: str | list[str] | None = None,
    aquifer_type_code: str | list[str] | None = None,
    well_constructed_depth: str | list[str] | None = None,
    hole_constructed_depth: str | list[str] | None = None,
    depth_source_code: str | list[str] | None = None,
    properties: str | list[str] | None = None,
    skip_geometry: bool | None = None,
    bbox: list[float] | None = None,
    limit: int | None = None,
    filter: str | None = None,
    filter_lang: FILTER_LANG | None = None,
    convert_type: bool = True,
) -> tuple[pd.DataFrame, BaseMetadata]:
    """Get combined monitoring-location and time-series metadata.

    The ``combined-metadata`` collection joins the monitoring-locations
    catalog with the time-series-metadata catalog so that one row is
    returned per (location, parameter, statistic) inventory entry,
    carrying every column from both source endpoints. This makes it the
    most flexible "what data is available" endpoint in the Water Data
    API: any monitoring-location attribute (state, HUC, site type,
    drainage area, well-construction depth, …) can be combined with any
    time-series attribute (parameter code, statistic, data type, period
    of record, …) in a single query.

    See the OpenAPI reference for the full list of supported fields:
    https://api.waterdata.usgs.gov/ogcapi/v0/openapi?f=html#/combined-metadata
    The R analogue is ``read_waterdata_combined_meta`` in
    https://github.com/DOI-USGS/dataRetrieval/.

    Parameters
    ----------
    monitoring_location_id : string or list of strings, optional
        A unique identifier representing a single monitoring location.
        Created by combining the agency code (e.g. ``USGS``) with the ID
        number (e.g. ``02238500``), separated by a hyphen
        (e.g. ``"USGS-02238500"``).
    parameter_code : string or list of strings, optional
        5-digit codes used to identify the constituent measured and the
        units of measure. See
        https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
    parameter_name : string or list of strings, optional
        A human-understandable name corresponding to ``parameter_code``.
    parameter_description : string or list of strings, optional
        A human-readable description of what is being measured.
    unit_of_measure : string or list of strings, optional
        A human-readable description of the units of measurement
        associated with an observation.
    statistic_id : string or list of strings, optional
        A code corresponding to the statistic an observation represents
        (e.g. ``00001`` max, ``00002`` min, ``00003`` mean). Full list at
        https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html.
    data_type : string or list of strings, optional
        The type of data the time series represents, e.g.
        ``"Continuous"``, ``"Daily"``, ``"Field measurements"``.
    computation_identifier : string or list of strings, optional
        Indicates whether the data from this time series represent a
        specific statistical computation.
    thresholds : numeric or list of numbers, optional
        Numeric limits known for a time series (e.g. historic maximum,
        below-which-the-sensor-is-non-operative).
    sublocation_identifier : string or list of strings, optional
        An optional human-readable identifier used to specify where
        measurements are recorded at a monitoring location.
    primary : string or list of strings, optional
        A flag identifying whether the time series is "primary". Primary
        time series are standard observations that have undergone Bureau
        review and approval. Non-primary (provisional) time series have a
        missing ``primary`` value, are produced for timely best-science
        use, and are retained by this system for only 120 days.
    parent_time_series_id : string or list of strings, optional
        The unique identifier of the parent ("upchain") time series that
        a derived time series, such as a daily-values series, is
        generated from.
    web_description : string or list of strings, optional
        A description of what this time series represents, as used by
        WDFN and other USGS data dissemination products.
    last_modified, begin, end : string, optional
        Datetime fields that accept either an RFC 3339 datetime, an
        interval (``"start/end"``, optionally half-bounded with ``..``),
        or an ISO 8601 duration (e.g. ``"P1M"``, ``"PT36H"``). See
        :func:`get_time_series_metadata` for the full grammar.
    agency_code, agency_name, monitoring_location_number, \
    monitoring_location_name, district_code, country_code, country_name, \
    state_code, state_name, county_code, county_name, \
    minor_civil_division_code, site_type_code, site_type, \
    hydrologic_unit_code, basin_code : string or list of strings, optional
        Location-catalog filters carried over from the
        ``monitoring-locations`` collection.
    altitude, altitude_accuracy, altitude_method_code, \
    altitude_method_name, vertical_datum, vertical_datum_name, \
    horizontal_positional_accuracy_code, horizontal_positional_accuracy, \
    horizontal_position_method_code, horizontal_position_method_name, \
    original_horizontal_datum, original_horizontal_datum_name, \
    drainage_area, contributing_drainage_area, time_zone_abbreviation, \
    uses_daylight_savings, construction_date : string or list of strings, optional
        Spatial / datum / construction attributes carried over from the
        ``monitoring-locations`` collection.
    aquifer_code, national_aquifer_code, aquifer_type_code, \
    well_constructed_depth, hole_constructed_depth, depth_source_code : \
    string or list of strings, optional
        Groundwater-well attributes (only populated for well sites).
    properties : string or list of strings, optional
        Subset of columns to return. Defaults to every available
        property.
    skip_geometry : boolean, optional
        Skip per-feature geometries; the returned object will be a plain
        ``DataFrame`` with no spatial information. The Water Data APIs
        use camelCase ``skipGeometry`` in CQL2 queries.
    bbox : list of numbers, optional
        Only features whose geometry intersects the bounding box are
        selected. Format: ``[xmin, ymin, xmax, ymax]`` in CRS 4326
        (longitude/latitude, west-south-east-north).
    limit : numeric, optional
        Page size; the maximum allowable value is 50000. Default
        (``None``) requests the maximum allowable limit.
    filter, filter_lang : optional
        Server-side CQL filter passed through as the OGC ``filter`` /
        ``filter-lang`` query parameters. See
        :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking,
        and the lexicographic-comparison pitfall.
    convert_type : boolean, optional
        If True, converts columns to appropriate types.

    Returns
    -------
    df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame``
        Formatted data returned from the API query.
    md : :obj:`dataretrieval.utils.Metadata`
        A custom metadata object pertaining to the query.

    Examples
    --------
    .. code::

        >>> # All time series and field measurements at a single site
        >>> df, md = dataretrieval.waterdata.get_combined_metadata(
        ...     monitoring_location_id="USGS-05407000"
        ... )

        >>> # Inventory across multiple HUCs, restricted to streams and springs
        >>> df, md = dataretrieval.waterdata.get_combined_metadata(
        ...     hydrologic_unit_code=["11010008", "11010009"],
        ...     site_type=["Stream", "Spring"],
        ... )

        >>> # Discharge time series at three sites with at least one
        >>> # observation in the past month
        >>> df, md = dataretrieval.waterdata.get_combined_metadata(
        ...     monitoring_location_id=[
        ...         "USGS-07069000",
        ...         "USGS-07064000",
        ...         "USGS-07068000",
        ...     ],
        ...     end="P1M",
        ...     parameter_code="00060",
        ... )

    """
    service = "combined-metadata"
    output_id = "combined_meta_id"

    # locals() is snapshotted here, so every local name defined above this
    # line (the parameters plus ``service`` / ``output_id``) is handed to
    # _get_args. Do not introduce extra locals before this call.
    # NOTE(review): presumably _get_args strips ``service`` / ``output_id``
    # and unset values before the request is built -- confirm in its definition.
    args = _get_args(locals())

    return get_ogc_data(args, output_id, service)
1146+
1147+
9351148
def get_latest_continuous(
9361149
monitoring_location_id: str | list[str] | None = None,
9371150
parameter_code: str | list[str] | None = None,

tests/waterdata_test.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from dataretrieval.waterdata import (
1111
get_channel,
12+
get_combined_metadata,
1213
get_continuous,
1314
get_daily,
1415
get_field_measurements,
@@ -335,6 +336,40 @@ def test_get_time_series_metadata():
335336
assert hasattr(md, "query_time")
336337

337338

339+
def test_get_combined_metadata():
    """Single-site query returns columns from both source catalogs.

    Requests one monitoring location without geometry and checks that the
    result carries time-series columns as well as a location-only column
    (``drainage_area``), that every row belongs to the requested site, and
    that the metadata object exposes ``url`` and ``query_time``.
    """
    site = "USGS-05407000"
    df, md = get_combined_metadata(
        monitoring_location_id=site,
        skip_geometry=True,
    )

    # A mix of identifier, time-series, and location-catalog columns.
    expected_columns = (
        "monitoring_location_id",
        "parameter_code",
        "data_type",
        "drainage_area",
    )
    for column in expected_columns:
        assert column in df.columns

    # Every returned row must be for the site that was asked for.
    assert (df["monitoring_location_id"] == site).all()

    for attribute in ("url", "query_time"):
        assert hasattr(md, attribute)
353+
354+
355+
def test_get_combined_metadata_multi_site_post():
    """Multi-site query returns exactly the requested sites and parameter.

    Passes a list of three monitoring locations together with a single
    parameter code and checks that the set of sites in the result equals
    the set requested and that every row has the requested parameter code.
    """
    requested_sites = ("USGS-07069000", "USGS-07064000", "USGS-07068000")
    discharge_code = "00060"

    df, md = get_combined_metadata(
        monitoring_location_id=list(requested_sites),
        parameter_code=discharge_code,
        skip_geometry=True,
    )

    returned_sites = set(df["monitoring_location_id"].unique())
    assert returned_sites == set(requested_sites)
    assert (df["parameter_code"] == discharge_code).all()
371+
372+
338373
def test_get_reference_table():
339374
df, md = get_reference_table("agency-codes")
340375
assert "agency_code" in df.columns

0 commit comments

Comments
 (0)