Skip to content

Commit ee653e5

Browse files
thodson-usgsclaude
andauthored
feat(waterdata): add get_cql for generalized CQL2 queries (#284)
Adds get_cql(service, cql, ...), a single public entry point for querying any Water Data OGC API collection with an arbitrary CQL2 filter — for predicates the typed getters (get_daily, get_continuous, …) can't express: a top-level or, like with % wildcards, comparison operators, nested boolean trees, and geometry predicates beyond a bounding box. The CQL2 body (str or dict) is POSTed verbatim; the result is shaped like the typed getters (wire id renamed, columns ordered/sorted, dtypes coerced) and returned as (DataFrame, BaseMetadata). Addresses #198 Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent f903491 commit ee653e5

5 files changed

Lines changed: 466 additions & 42 deletions

File tree

dataretrieval/waterdata/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
get_codes,
1616
get_combined_metadata,
1717
get_continuous,
18+
get_cql,
1819
get_daily,
1920
get_field_measurements,
2021
get_field_measurements_metadata,
@@ -37,6 +38,7 @@
3738
PROFILE_LOOKUP,
3839
PROFILES,
3940
SERVICES,
41+
WATERDATA_SERVICES,
4042
)
4143

4244
__all__ = [
@@ -45,10 +47,12 @@
4547
"PROFILES",
4648
"PROFILE_LOOKUP",
4749
"SERVICES",
50+
"WATERDATA_SERVICES",
4851
"get_channel",
4952
"get_codes",
5053
"get_combined_metadata",
5154
"get_continuous",
55+
"get_cql",
5256
"get_daily",
5357
"get_field_measurements",
5458
"get_field_measurements_metadata",

dataretrieval/waterdata/api.py

Lines changed: 161 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,22 @@
2828
METADATA_COLLECTIONS,
2929
PROFILES,
3030
SERVICES,
31+
WATERDATA_SERVICES,
3132
)
3233
from dataretrieval.waterdata.utils import (
34+
_OUTPUT_ID_BY_SERVICE,
35+
GEOPANDAS,
3336
SAMPLES_URL,
37+
_as_str_list,
3438
_check_profiles,
39+
_construct_cql_request,
3540
_default_headers,
41+
_finalize_ogc,
3642
_get_args,
3743
_raise_for_non_200,
44+
_run_sync,
45+
_switch_properties_id,
46+
_walk_pages,
3847
get_ogc_data,
3948
get_stats_data,
4049
)
@@ -252,12 +261,11 @@ def get_daily(
252261
... )
253262
"""
254263
service = "daily"
255-
output_id = "daily_id"
256264

257265
# Build argument dictionary, omitting None values
258266
args = _get_args(locals())
259267

260-
return get_ogc_data(args, output_id, service)
268+
return get_ogc_data(args, service)
261269

262270

263271
def get_continuous(
@@ -440,12 +448,11 @@ def get_continuous(
440448
... )
441449
"""
442450
service = "continuous"
443-
output_id = "continuous_id"
444451

445452
# Build argument dictionary, omitting None values
446453
args = _get_args(locals())
447454

448-
return get_ogc_data(args, output_id, service)
455+
return get_ogc_data(args, service)
449456

450457

451458
def get_monitoring_locations(
@@ -738,12 +745,11 @@ def get_monitoring_locations(
738745
... )
739746
"""
740747
service = "monitoring-locations"
741-
output_id = "monitoring_location_id"
742748

743749
# Build argument dictionary, omitting None values
744750
args = _get_args(locals())
745751

746-
return get_ogc_data(args, output_id, service)
752+
return get_ogc_data(args, service)
747753

748754

749755
def get_time_series_metadata(
@@ -961,12 +967,11 @@ def get_time_series_metadata(
961967
... )
962968
"""
963969
service = "time-series-metadata"
964-
output_id = "time_series_id"
965970

966971
# Build argument dictionary, omitting None values
967972
args = _get_args(locals())
968973

969-
return get_ogc_data(args, output_id, service)
974+
return get_ogc_data(args, service)
970975

971976

972977
def get_combined_metadata(
@@ -1192,11 +1197,10 @@ def get_combined_metadata(
11921197
11931198
"""
11941199
service = "combined-metadata"
1195-
output_id = "combined_meta_id"
11961200

11971201
args = _get_args(locals())
11981202

1199-
return get_ogc_data(args, output_id, service)
1203+
return get_ogc_data(args, service)
12001204

12011205

12021206
def get_latest_continuous(
@@ -1386,12 +1390,11 @@ def get_latest_continuous(
13861390
... )
13871391
"""
13881392
service = "latest-continuous"
1389-
output_id = "latest_continuous_id"
13901393

13911394
# Build argument dictionary, omitting None values
13921395
args = _get_args(locals())
13931396

1394-
return get_ogc_data(args, output_id, service)
1397+
return get_ogc_data(args, service)
13951398

13961399

13971400
def get_latest_daily(
@@ -1582,12 +1585,11 @@ def get_latest_daily(
15821585
... )
15831586
"""
15841587
service = "latest-daily"
1585-
output_id = "latest_daily_id"
15861588

15871589
# Build argument dictionary, omitting None values
15881590
args = _get_args(locals())
15891591

1590-
return get_ogc_data(args, output_id, service)
1592+
return get_ogc_data(args, service)
15911593

15921594

15931595
def get_field_measurements(
@@ -1772,12 +1774,11 @@ def get_field_measurements(
17721774
... )
17731775
"""
17741776
service = "field-measurements"
1775-
output_id = "field_measurement_id"
17761777

17771778
# Build argument dictionary, omitting None values
17781779
args = _get_args(locals())
17791780

1780-
return get_ogc_data(args, output_id, service)
1781+
return get_ogc_data(args, service)
17811782

17821783

17831784
def get_field_measurements_metadata(
@@ -1888,11 +1889,10 @@ def get_field_measurements_metadata(
18881889
18891890
"""
18901891
service = "field-measurements-metadata"
1891-
output_id = "field_series_id"
18921892

18931893
args = _get_args(locals())
18941894

1895-
return get_ogc_data(args, output_id, service)
1895+
return get_ogc_data(args, service)
18961896

18971897

18981898
def get_peaks(
@@ -2008,11 +2008,10 @@ def get_peaks(
20082008
20092009
"""
20102010
service = "peaks"
2011-
output_id = "peak_id"
20122011

20132012
args = _get_args(locals())
20142013

2015-
return get_ogc_data(args, output_id, service)
2014+
return get_ogc_data(args, service)
20162015

20172016

20182017
def get_reference_table(
@@ -2842,8 +2841,148 @@ def get_channel(
28422841
... )
28432842
"""
28442843
service = "channel-measurements"
2845-
output_id = "channel_measurements_id"
28462844

28472845
args = _get_args(locals())
28482846

2849-
return get_ogc_data(args, output_id, service)
2847+
return get_ogc_data(args, service)
2848+
2849+
2850+
def get_cql(
2851+
service: WATERDATA_SERVICES,
2852+
cql: str | dict,
2853+
*,
2854+
properties: str | Iterable[str] | None = None,
2855+
bbox: list[float] | None = None,
2856+
limit: int | None = None,
2857+
skip_geometry: bool | None = None,
2858+
convert_type: bool = True,
2859+
) -> tuple[pd.DataFrame, BaseMetadata]:
2860+
"""Query a Water Data OGC API collection with an arbitrary CQL2 filter.
2861+
2862+
Sends ``cql`` as a CQL2 filter against ``service`` and returns the matching
2863+
features, shaped like the typed getters (``get_daily``, ``get_continuous``,
2864+
…): the wire ``id`` renamed to the service's id column, columns ordered and
2865+
sorted, and dtypes coerced. Use it when you need a predicate the typed
2866+
getters can't express — a top-level ``or``, ``like`` with ``%`` wildcards,
2867+
comparison operators, nested boolean trees, or a geometry predicate beyond a
2868+
bounding box; prefer a typed getter when one covers the query.
2869+
2870+
The request is a single POST with the ``cql`` body sent verbatim, so there
2871+
are no multi-value arguments to chunk: narrow a query whose URL or body
2872+
would exceed the server's size cap rather than relying on automatic
2873+
chunking.
2874+
2875+
The CQL2 grammar is documented at
2876+
https://api.waterdata.usgs.gov/docs/ogcapi/complex-queries/.
2877+
2878+
Parameters
2879+
----------
2880+
service : str
2881+
OGC collection name. Must be one of
2882+
:data:`dataretrieval.waterdata.types.WATERDATA_SERVICES`
2883+
(e.g. ``"daily"``, ``"monitoring-locations"``).
2884+
cql : str or dict
2885+
CQL2 query. A ``dict`` is JSON-serialized for transport; a ``str`` is
2886+
sent through unchanged. The query goes into the HTTP POST body with
2887+
``Content-Type: application/query-cql-json``.
2888+
properties : str or iterable of str, optional
2889+
Server-side property whitelist (passed as ``properties=`` on the URL).
2890+
Reduces payload size. ``"id"`` resolves to the service's ``output_id``
2891+
(e.g. ``daily_id``) the same way it does in the typed wrappers.
2892+
bbox : list of float, optional
2893+
Bounding box ``[xmin, ymin, xmax, ymax]`` in CRS 4326. Combines with the
2894+
CQL filter as an additional spatial predicate.
2895+
limit : int, optional
2896+
Page size, clamped server-side to 50,000.
2897+
skip_geometry : bool, optional
2898+
If True, the server omits geometry from each feature
2899+
(``skipGeometry=true``).
2900+
convert_type : bool, default True
2901+
Coerce date/datetime/numeric columns to typed dtypes after the
2902+
DataFrame is built.
2903+
2904+
Returns
2905+
-------
2906+
df : pandas.DataFrame or geopandas.GeoDataFrame
2907+
Result of the query. GeoDataFrame when ``geopandas`` is installed and
2908+
geometry is present.
2909+
md : :class:`dataretrieval.utils.BaseMetadata`
2910+
Request metadata (URL, query time, response headers).
2911+
2912+
Examples
2913+
--------
2914+
.. code::
2915+
2916+
>>> # Daily values for two parameter codes at two sites
2917+
>>> # (compound AND-of-INs).
2918+
>>> from dataretrieval import waterdata
2919+
>>> cql = {
2920+
... "op": "and",
2921+
... "args": [
2922+
... {
2923+
... "op": "in",
2924+
... "args": [
2925+
... {"property": "parameter_code"},
2926+
... ["00060", "00065"],
2927+
... ],
2928+
... },
2929+
... {
2930+
... "op": "in",
2931+
... "args": [
2932+
... {"property": "monitoring_location_id"},
2933+
... ["USGS-07367300", "USGS-03277200"],
2934+
... ],
2935+
... },
2936+
... ],
2937+
... }
2938+
>>> df, md = waterdata.get_cql(service="daily", cql=cql)
2939+
2940+
>>> # Monitoring locations whose HUC starts with "02070010"
2941+
>>> # (LIKE with the CQL2 ``%`` wildcard).
2942+
>>> df, md = waterdata.get_cql(
2943+
... service="monitoring-locations",
2944+
... cql='{"op": "like", "args": ['
2945+
... '{"property": "hydrologic_unit_code"},'
2946+
... ' "02070010%"]}',
2947+
... )
2948+
"""
2949+
if service not in _OUTPUT_ID_BY_SERVICE:
2950+
raise ValueError(
2951+
f"Unknown service {service!r}. Valid services: "
2952+
f"{sorted(_OUTPUT_ID_BY_SERVICE)}."
2953+
)
2954+
output_id = _OUTPUT_ID_BY_SERVICE[service]
2955+
2956+
# ``dict`` is the pythonic input — serialize on the way out. ``str`` is sent
2957+
# verbatim so callers who already have a CQL2 doc (e.g. imported from a
2958+
# config file) don't need to re-parse it.
2959+
body = json.dumps(cql, separators=(",", ":")) if isinstance(cql, dict) else cql
2960+
2961+
properties_list = _as_str_list(properties, "properties")
2962+
2963+
# Translate user-facing names (``daily_id``/``id``) to the wire ``id`` the
2964+
# OGC API expects, matching the typed getters.
2965+
wire_properties = _switch_properties_id(properties_list, output_id, service)
2966+
2967+
req = _construct_cql_request(
2968+
service,
2969+
body,
2970+
properties=wire_properties,
2971+
bbox=bbox,
2972+
limit=limit,
2973+
skip_geometry=skip_geometry,
2974+
)
2975+
2976+
async def _run() -> tuple[pd.DataFrame, httpx.Response]:
2977+
return await _walk_pages(geopd=GEOPANDAS, req=req)
2978+
2979+
df, response = _run_sync(_run, service=service)
2980+
2981+
return _finalize_ogc(
2982+
df,
2983+
response,
2984+
properties=properties_list,
2985+
output_id=output_id,
2986+
convert_type=convert_type,
2987+
service=service,
2988+
)

dataretrieval/waterdata/types.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,24 @@
4040
"results",
4141
]
4242

43+
# OGC API time-series/monitoring collections queryable via ``get_cql``.
44+
# Keep in sync with ``utils._OUTPUT_ID_BY_SERVICE`` (same keys): that dict maps
45+
# each service to its user-facing ``id`` column and is the runtime source of
46+
# truth ``get_cql`` validates against.
47+
WATERDATA_SERVICES = Literal[
48+
"channel-measurements",
49+
"combined-metadata",
50+
"continuous",
51+
"daily",
52+
"field-measurements",
53+
"field-measurements-metadata",
54+
"latest-continuous",
55+
"latest-daily",
56+
"monitoring-locations",
57+
"peaks",
58+
"time-series-metadata",
59+
]
60+
4361
PROFILES = Literal[
4462
"actgroup",
4563
"actmetric",

0 commit comments

Comments
 (0)