Skip to content

Commit 1383ce9

Browse files
committed
refactor(waterdata): simplify get_waterdata internals
Code-review pass on PR DOI-USGS#284. - Lift ``WATERDATA_SERVICES`` Literal into ``types.py``. Use it as the ``service`` arg type of ``get_waterdata`` so editors offer completion and type-checkers catch typos. The runtime source of truth (``_OUTPUT_ID_BY_SERVICE`` in utils.py) is unchanged; the Literal is kept in sync by hand and a comment notes that. - Extract ``_ogc_query_params(properties, bbox, limit, skip_geometry)`` in utils.py. The same ``skipGeometry``/``limit``/``bbox``/``properties`` block previously appeared twice — once in ``_construct_api_requests`` and once in the new ``_construct_cql_request`` — and is now built in one place. - Extract ``_finalize_ogc_frame(df, response, properties, service, output_id, convert_type)`` for the post-processing tail (``_deal_with_empty`` -> ``_type_cols`` -> ``_arrange_cols`` -> ``_sort_rows`` -> ``BaseMetadata``). Both ``get_ogc_data`` and ``get_waterdata`` route through it now, so the typed-kwargs and raw-CQL2 paths produce identically-shaped DataFrames by construction rather than by parallel maintenance. - Drop the ``client`` kwarg from ``get_waterdata``. None of the other public ``get_*`` getters expose it, and the rationale (HTTP session reuse) applies to all of them or none. If we want to expose session reuse, that's a separate PR that touches the whole family. - Collapse the ``properties`` normalization block to None-first ordering so the common case (no properties) reads first. - Drop the docstring breadcrumb to ``utils._OUTPUT_ID_BY_SERVICE``; point readers at ``types.WATERDATA_SERVICES`` (the user-facing Literal) instead. All 148 unit tests pass; ``_construct_api_requests`` and ``_construct_cql_request`` produce byte-identical requests to before.
1 parent dffb168 commit 1383ce9

3 files changed

Lines changed: 89 additions & 76 deletions

File tree

dataretrieval/waterdata/api.py

Lines changed: 16 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -24,21 +24,19 @@
2424
METADATA_COLLECTIONS,
2525
PROFILES,
2626
SERVICES,
27+
WATERDATA_SERVICES,
2728
)
2829
from dataretrieval.waterdata.utils import (
2930
_OUTPUT_ID_BY_SERVICE,
3031
GEOPANDAS,
3132
SAMPLES_URL,
32-
_arrange_cols,
3333
_check_profiles,
3434
_construct_cql_request,
35-
_deal_with_empty,
3635
_default_headers,
36+
_finalize_ogc_frame,
3737
_get_args,
3838
_normalize_str_iterable,
39-
_sort_rows,
4039
_switch_properties_id,
41-
_type_cols,
4240
_walk_pages,
4341
get_ogc_data,
4442
get_stats_data,
@@ -2845,15 +2843,14 @@ def get_channel(
28452843

28462844

28472845
def get_waterdata(
2848-
service: str,
2846+
service: WATERDATA_SERVICES,
28492847
cql: str | dict,
28502848
*,
28512849
properties: str | Iterable[str] | None = None,
28522850
bbox: list[float] | None = None,
28532851
limit: int | None = None,
28542852
skip_geometry: bool | None = None,
28552853
convert_type: bool = True,
2856-
client: requests.Session | None = None,
28572854
) -> tuple[pd.DataFrame, BaseMetadata]:
28582855
"""Generalized OGC API CQL2 query.
28592856
@@ -2871,9 +2868,9 @@ def get_waterdata(
28712868
Parameters
28722869
----------
28732870
service : str
2874-
OGC collection name (e.g. ``"daily"``, ``"monitoring-locations"``).
2875-
See :data:`dataretrieval.waterdata.utils._OUTPUT_ID_BY_SERVICE`
2876-
for the recognized list.
2871+
OGC collection name. Must be one of
2872+
:data:`dataretrieval.waterdata.types.WATERDATA_SERVICES`
2873+
(e.g. ``"daily"``, ``"monitoring-locations"``).
28772874
cql : str or dict
28782875
CQL2 query. A ``dict`` is JSON-serialized for transport; a
28792876
``str`` is sent through unchanged. The query goes into the
@@ -2886,7 +2883,8 @@ def get_waterdata(
28862883
to the service's ``output_id`` (e.g. ``daily_id``) the same way
28872884
it does in the typed wrappers.
28882885
bbox : list of float, optional
2889-
Bounding box ``[xmin, ymin, xmax, ymax]`` in CRS 4326.
2886+
Bounding box ``[xmin, ymin, xmax, ymax]`` in CRS 4326. Combines
2887+
with the CQL filter as an additional spatial predicate.
28902888
limit : int, optional
28912889
Page size, clamped server-side to 50,000.
28922890
skip_geometry : bool, optional
@@ -2895,10 +2893,6 @@ def get_waterdata(
28952893
convert_type : bool, default True
28962894
Coerce date/datetime/numeric columns to typed dtypes after the
28972895
DataFrame is built.
2898-
client : requests.Session, optional
2899-
Reuse an existing HTTP session (handy for batching or
2900-
injecting custom retry/auth adapters). A short-lived session
2901-
is created internally if not provided.
29022896
29032897
Returns
29042898
-------
@@ -2934,11 +2928,7 @@ def get_waterdata(
29342928
... },
29352929
... ],
29362930
... }
2937-
>>> df, md = waterdata.get_waterdata(
2938-
... service="daily",
2939-
... cql=cql,
2940-
... time=("2023-01-01", "2024-01-01"),
2941-
... )
2931+
>>> df, md = waterdata.get_waterdata(service="daily", cql=cql)
29422932
29432933
>>> # Monitoring locations whose HUC starts with "02070010"
29442934
>>> # (LIKE with the CQL2 ``%`` wildcard).
@@ -2961,10 +2951,10 @@ def get_waterdata(
29612951
# imported from a config file) don't need to re-parse it.
29622952
body = json.dumps(cql, separators=(",", ":")) if isinstance(cql, dict) else cql
29632953

2964-
if isinstance(properties, str):
2965-
properties_list = [properties]
2966-
elif properties is None:
2954+
if properties is None:
29672955
properties_list = None
2956+
elif isinstance(properties, str):
2957+
properties_list = [properties]
29682958
else:
29692959
properties_list = _normalize_str_iterable(properties, "properties")
29702960

@@ -2983,12 +2973,7 @@ def get_waterdata(
29832973
skip_geometry=skip_geometry,
29842974
)
29852975

2986-
df, response = _walk_pages(geopd=GEOPANDAS, req=req, client=client)
2987-
2988-
df = _deal_with_empty(df, properties_list, service)
2989-
if convert_type:
2990-
df = _type_cols(df)
2991-
df = _arrange_cols(df, properties_list, output_id)
2992-
df = _sort_rows(df)
2993-
2994-
return df, BaseMetadata(response)
2976+
df, response = _walk_pages(geopd=GEOPANDAS, req=req)
2977+
return _finalize_ogc_frame(
2978+
df, response, properties_list, service, output_id, convert_type
2979+
)

dataretrieval/waterdata/types.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,26 @@
4040
"results",
4141
]
4242

43+
# OGC API collection names that the typed waterdata getters cover.
44+
# Used as the ``service`` arg type of :func:`get_waterdata`. Keep in
45+
# sync with ``_OUTPUT_ID_BY_SERVICE`` in
46+
# :mod:`dataretrieval.waterdata.utils` — the dict is the source of
47+
# truth at runtime; this Literal exists so editors can offer
48+
# completion and type-checkers can catch typos at call sites.
49+
WATERDATA_SERVICES = Literal[
50+
"channel-measurements",
51+
"combined-metadata",
52+
"continuous",
53+
"daily",
54+
"field-measurements",
55+
"field-measurements-metadata",
56+
"latest-continuous",
57+
"latest-daily",
58+
"monitoring-locations",
59+
"peaks",
60+
"time-series-metadata",
61+
]
62+
4363
PROFILES = Literal[
4464
"actgroup",
4565
"actmetric",

dataretrieval/waterdata/utils.py

Lines changed: 53 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -512,14 +512,7 @@ def _construct_api_requests(
512512
for k, v in kwargs.items()
513513
}
514514

515-
params["skipGeometry"] = skip_geometry
516-
params["limit"] = 50000 if limit is None or limit > 50000 else limit
517-
518-
# `len()` instead of truthiness: a numpy ndarray would raise on `if bbox:`.
519-
if bbox is not None and len(bbox) > 0:
520-
params["bbox"] = ",".join(map(str, bbox))
521-
if properties:
522-
params["properties"] = ",".join(properties)
515+
params.update(_ogc_query_params(properties, bbox, limit, skip_geometry))
523516

524517
# Translate CQL filter Python names to the hyphenated URL parameter that
525518
# the OGC API expects. The Python kwarg is `filter_lang` because hyphens
@@ -548,6 +541,26 @@ def _construct_api_requests(
548541
return request.prepare()
549542

550543

544+
def _ogc_query_params(
545+
properties: list[str] | None = None,
546+
bbox: list[float] | None = None,
547+
limit: int | None = None,
548+
skip_geometry: bool | None = None,
549+
) -> dict[str, Any]:
550+
"""The ``skipGeometry``/``limit``/``bbox``/``properties`` block of
551+
an OGC URL, used by both :func:`_construct_api_requests` and
552+
:func:`_construct_cql_request`."""
553+
params: dict[str, Any] = {
554+
"skipGeometry": skip_geometry,
555+
"limit": 50000 if limit is None or limit > 50000 else limit,
556+
}
557+
if bbox is not None and len(bbox) > 0:
558+
params["bbox"] = ",".join(map(str, bbox))
559+
if properties:
560+
params["properties"] = ",".join(properties)
561+
return params
562+
563+
551564
def _construct_cql_request(
552565
service: str,
553566
cql_body: str,
@@ -558,45 +571,21 @@ def _construct_cql_request(
558571
) -> requests.PreparedRequest:
559572
"""Build a POST/CQL2 request for the generalized ``get_waterdata`` path.
560573
561-
Distinct from :func:`_construct_api_requests` because that function
562-
derives the CQL2 body from typed kwargs; here the body is passed
563-
through verbatim so a caller can express predicates the typed
564-
wrappers can't (top-level ``or``, ``like`` with ``%`` wildcards,
565-
comparison operators, …).
566-
567-
Parameters
568-
----------
569-
service : str
570-
Collection name, e.g. ``"daily"``.
571-
cql_body : str
572-
Pre-serialized CQL2-JSON. Sent as the request body unchanged.
573-
properties : list of str, optional
574-
Server-side property whitelist. Joined with commas.
575-
bbox : list of float, optional
576-
Bounding box ``[xmin, ymin, xmax, ymax]``. Joined with commas.
577-
limit : int, optional
578-
Page size; clamped to the server max of 50,000.
579-
skip_geometry : bool, optional
580-
If True, sets ``skipGeometry=true`` so the server omits
581-
geometry from each feature.
574+
Distinct from :func:`_construct_api_requests`: that function derives
575+
the CQL2 body from typed kwargs and also handles the GET-with-comma-
576+
separated-values path. Here the body is passed through verbatim so
577+
a caller can express predicates the typed wrappers can't (top-level
578+
``or``, ``like`` with ``%`` wildcards, comparison operators, …).
582579
"""
583580
service_url = f"{OGC_API_URL}/collections/{service}/items"
584-
params: dict[str, Any] = {
585-
"skipGeometry": bool(skip_geometry),
586-
"limit": 50000 if limit is None or limit > 50000 else limit,
587-
}
588-
if bbox is not None and len(bbox) > 0:
589-
params["bbox"] = ",".join(map(str, bbox))
590-
if properties:
591-
params["properties"] = ",".join(properties)
592581
headers = _default_headers()
593582
headers["Content-Type"] = "application/query-cql-json"
594583
request = requests.Request(
595584
method="POST",
596585
url=service_url,
597586
headers=headers,
598587
data=cql_body,
599-
params=params,
588+
params=_ogc_query_params(properties, bbox, limit, skip_geometry),
600589
)
601590
return request.prepare()
602591

@@ -941,6 +930,29 @@ def _sort_rows(df: pd.DataFrame) -> pd.DataFrame:
941930
return df
942931

943932

933+
def _finalize_ogc_frame(
934+
df: pd.DataFrame,
935+
response: requests.Response,
936+
properties: list[str] | None,
937+
service: str,
938+
output_id: str,
939+
convert_type: bool,
940+
) -> tuple[pd.DataFrame, BaseMetadata]:
941+
"""Apply the standard OGC post-processing tail to a raw frame and
942+
wrap the response in :class:`BaseMetadata`.
943+
944+
Shared by :func:`get_ogc_data` (typed-kwargs path) and
945+
:func:`dataretrieval.waterdata.api.get_waterdata` (raw-CQL2 path)
946+
so both surfaces produce identically-shaped DataFrames.
947+
"""
948+
df = _deal_with_empty(df, properties, service)
949+
if convert_type:
950+
df = _type_cols(df)
951+
df = _arrange_cols(df, properties, output_id)
952+
df = _sort_rows(df)
953+
return df, BaseMetadata(response)
954+
955+
944956
def get_ogc_data(
945957
args: dict[str, Any], output_id: str, service: str
946958
) -> tuple[pd.DataFrame, BaseMetadata]:
@@ -987,13 +999,9 @@ def get_ogc_data(
987999
args = {k: v for k, v in args.items() if v is not None}
9881000

9891001
return_list, response = _fetch_once(args)
990-
return_list = _deal_with_empty(return_list, properties, service)
991-
if convert_type:
992-
return_list = _type_cols(return_list)
993-
return_list = _arrange_cols(return_list, properties, output_id)
994-
return_list = _sort_rows(return_list)
995-
996-
return return_list, BaseMetadata(response)
1002+
return _finalize_ogc_frame(
1003+
return_list, response, properties, service, output_id, convert_type
1004+
)
9971005

9981006

9991007
@filters.chunked(build_request=_construct_api_requests)

0 commit comments

Comments
 (0)