fix(waterdata): small coherence cleanups (annotations, column order, defensiveness) (#306)

thodson-usgs · claude · web-flow · commit 665383f72594 · 2026-05-31T19:55:57.000-05:00
Five small, low-risk fixes surfaced by the package review:

1. get_latest_continuous / get_latest_daily: `value` was annotated `int`, but
   every other getter (and the docstrings) use `str | Iterable[str]`; the `int`
   hint also rejected the multi-value list filtering the others advertise.
2. get_time_series_metadata: `thresholds` was annotated `int`, vs
   `float | list[float]` on get_combined_metadata for the same queryable.
3. _arrange_cols: the set of synthetic per-record id columns to move to the end
   was a hand-maintained literal that omitted peak_id, channel_measurements_id,
   combined_meta_id, and field_series_id, so the four getters that return those
   ids left them at the front instead of moving them to the end, as is done for
   daily_id. Derive the set from _OUTPUT_ID_BY_SERVICE (every output id except
   the user-facing monitoring_location_id and time_series_id) so it stays in
   sync and can't drift again when a service is added.
4. _next_req_url: returned a falsy `href` ("") instead of None, contradicting
   its Optional[str] contract. Return None.
5. _get_resp_data (geopandas branch): mirror the non-geopandas branch's
   `f.get("id")` so a feature missing a top-level id yields None rather than a
   KeyError.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
@@ -768,7 +768,7 @@ def get_time_series_metadata(
     unit_of_measure: str | Iterable[str] | None = None,
     computation_period_identifier: str | Iterable[str] | None = None,
     computation_identifier: str | Iterable[str] | None = None,
-    thresholds: int | None = None,
+    thresholds: float | list[float] | None = None,
     sublocation_identifier: str | Iterable[str] | None = None,
     primary: str | Iterable[str] | None = None,
     parent_time_series_id: str | Iterable[str] | None = None,
@@ -1213,7 +1213,7 @@ def get_latest_continuous(
     approval_status: str | Iterable[str] | None = None,
     unit_of_measure: str | Iterable[str] | None = None,
     qualifier: str | Iterable[str] | None = None,
-    value: int | None = None,
+    value: str | Iterable[str] | None = None,
     last_modified: str | Iterable[str] | None = None,
     skip_geometry: bool | None = None,
     time: str | Iterable[str] | None = None,
@@ -1407,7 +1407,7 @@ def get_latest_daily(
     approval_status: str | Iterable[str] | None = None,
     unit_of_measure: str | Iterable[str] | None = None,
     qualifier: str | Iterable[str] | None = None,
-    value: int | None = None,
+    value: str | Iterable[str] | None = None,
     last_modified: str | Iterable[str] | None = None,
     skip_geometry: bool | None = None,
     time: str | Iterable[str] | None = None,
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
@@ -86,6 +86,16 @@
     "time-series-metadata": "time_series_id",
 }
 
+# Every service's output id EXCEPT the two that are genuinely user-facing
+# (``monitoring_location_id`` and ``time_series_id``). The rest are synthetic
+# per-record ids that ``_arrange_cols`` moves to the end of a result frame.
+# Derived from ``_OUTPUT_ID_BY_SERVICE`` so adding a service can't silently
+# leave a stray id column at the front again.
+_EXTRA_ID_COLS = set(_OUTPUT_ID_BY_SERVICE.values()) - {
+    "monitoring_location_id",
+    "time_series_id",
+}
+
 
 def _switch_arg_id(ls: dict[str, Any], id_name: str, service: str):
     """
@@ -806,7 +816,7 @@ def _next_req_url(
             continue
         href = link.get("href")
         if not href:
-            return href
+            return None
         # Refuse to follow a next-page link to a different host —
         # the request's headers/auth were minted for the original
         # host and shouldn't leak to whatever a poisoned response
@@ -908,7 +918,9 @@ def _get_resp_data(
 
     # Organize json into geodataframe and make sure id column comes along.
     df = gpd.GeoDataFrame.from_features(features)
-    df["id"] = pd.json_normalize(features)["id"].values
+    # Mirror the non-geopandas branch's defensive ``f.get("id")`` so a feature
+    # missing a top-level ``id`` yields None rather than a KeyError.
+    df["id"] = [f.get("id") for f in features]
     df = df[["id"] + [col for col in df.columns if col != "id"]]
 
     # If no geometry present, then return pandas dataframe. A geodataframe
@@ -1295,15 +1307,7 @@ def _arrange_cols(
 
     # Move meaningless-to-user, extra id columns to the end
     # of the dataframe, if they exist
-    extra_id_col = set(df.columns).intersection(
-        {
-            "latest_continuous_id",
-            "latest_daily_id",
-            "daily_id",
-            "continuous_id",
-            "field_measurement_id",
-        }
-    )
+    extra_id_col = set(df.columns).intersection(_EXTRA_ID_COLS)
 
     # If the arbitrary id column is returned (either due to properties
     # being none or NaN), then move it to the end of the dataframe, but