Skip to content

Commit 665383f

Browse files
thodson-usgsclaude
andauthored
fix(waterdata): small coherence cleanups (annotations, column order, defensiveness) (#306)
Five small, low-risk fixes surfaced by the package review: 1. get_latest_continuous / get_latest_daily: `value` was annotated `int`, but every other getter (and the docstrings) use `str | Iterable[str]`; the `int` hint also rejected the multi-value list filtering the others advertise. 2. get_time_series_metadata: `thresholds` was annotated `int`, vs `float | list[float]` on get_combined_metadata for the same queryable. 3. _arrange_cols: the "move the synthetic per-record id column to the end" set was a hand-maintained literal that omitted peak_id, channel_measurements_id, combined_meta_id, and field_series_id, so those four getters left their id at the front instead of the end like daily_id. Derive the set from _OUTPUT_ID_BY_SERVICE (every output id except the user-facing monitoring_location_id and time_series_id) so it stays in sync and can't drift again when a service is added. 4. _next_req_url: returned a falsy `href` ("") instead of None, contradicting its Optional[str] contract. Return None. 5. _get_resp_data (geopandas branch): mirror the non-geopandas branch's `f.get("id")` so a feature missing a top-level id yields None rather than a KeyError. Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 18090d4 commit 665383f

2 files changed

Lines changed: 18 additions & 14 deletions

File tree

dataretrieval/waterdata/api.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -768,7 +768,7 @@ def get_time_series_metadata(
768768
unit_of_measure: str | Iterable[str] | None = None,
769769
computation_period_identifier: str | Iterable[str] | None = None,
770770
computation_identifier: str | Iterable[str] | None = None,
771-
thresholds: int | None = None,
771+
thresholds: float | list[float] | None = None,
772772
sublocation_identifier: str | Iterable[str] | None = None,
773773
primary: str | Iterable[str] | None = None,
774774
parent_time_series_id: str | Iterable[str] | None = None,
@@ -1213,7 +1213,7 @@ def get_latest_continuous(
12131213
approval_status: str | Iterable[str] | None = None,
12141214
unit_of_measure: str | Iterable[str] | None = None,
12151215
qualifier: str | Iterable[str] | None = None,
1216-
value: int | None = None,
1216+
value: str | Iterable[str] | None = None,
12171217
last_modified: str | Iterable[str] | None = None,
12181218
skip_geometry: bool | None = None,
12191219
time: str | Iterable[str] | None = None,
@@ -1407,7 +1407,7 @@ def get_latest_daily(
14071407
approval_status: str | Iterable[str] | None = None,
14081408
unit_of_measure: str | Iterable[str] | None = None,
14091409
qualifier: str | Iterable[str] | None = None,
1410-
value: int | None = None,
1410+
value: str | Iterable[str] | None = None,
14111411
last_modified: str | Iterable[str] | None = None,
14121412
skip_geometry: bool | None = None,
14131413
time: str | Iterable[str] | None = None,

dataretrieval/waterdata/utils.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,16 @@
8686
"time-series-metadata": "time_series_id",
8787
}
8888

89+
# Every service's output id EXCEPT the two that are genuinely user-facing
90+
# (``monitoring_location_id`` and ``time_series_id``). The rest are synthetic
91+
# per-record ids that ``_arrange_cols`` moves to the end of a result frame.
92+
# Derived from ``_OUTPUT_ID_BY_SERVICE`` so adding a service can't silently
93+
# leave a stray id column at the front again.
94+
_EXTRA_ID_COLS = set(_OUTPUT_ID_BY_SERVICE.values()) - {
95+
"monitoring_location_id",
96+
"time_series_id",
97+
}
98+
8999

90100
def _switch_arg_id(ls: dict[str, Any], id_name: str, service: str):
91101
"""
@@ -806,7 +816,7 @@ def _next_req_url(
806816
continue
807817
href = link.get("href")
808818
if not href:
809-
return href
819+
return None
810820
# Refuse to follow a next-page link to a different host —
811821
# the request's headers/auth were minted for the original
812822
# host and shouldn't leak to whatever a poisoned response
@@ -908,7 +918,9 @@ def _get_resp_data(
908918

909919
# Organize json into geodataframe and make sure id column comes along.
910920
df = gpd.GeoDataFrame.from_features(features)
911-
df["id"] = pd.json_normalize(features)["id"].values
921+
# Mirror the non-geopandas branch's defensive ``f.get("id")`` so a feature
922+
# missing a top-level ``id`` yields None rather than a KeyError.
923+
df["id"] = [f.get("id") for f in features]
912924
df = df[["id"] + [col for col in df.columns if col != "id"]]
913925

914926
# If no geometry present, then return pandas dataframe. A geodataframe
@@ -1295,15 +1307,7 @@ def _arrange_cols(
12951307

12961308
# Move meaningless-to-user, extra id columns to the end
12971309
# of the dataframe, if they exist
1298-
extra_id_col = set(df.columns).intersection(
1299-
{
1300-
"latest_continuous_id",
1301-
"latest_daily_id",
1302-
"daily_id",
1303-
"continuous_id",
1304-
"field_measurement_id",
1305-
}
1306-
)
1310+
extra_id_col = set(df.columns).intersection(_EXTRA_ID_COLS)
13071311

13081312
# If the arbitrary id column is returned (either due to properties
13091313
# being none or NaN), then move it to the end of the dataframe, but

0 commit comments

Comments
 (0)