Skip to content

Commit 0ad568c

Browse files
thodson-usgsclaude
andcommitted
waterdata: tighten stats and OGC pagination — geometry, KeyError, non-200
Three correctness fixes to the two pagination loops in dataretrieval.waterdata.utils. * `get_stats_data` honored `GEOPANDAS` for the first page but hard-coded `geopd=False` on every continuation page. With geopandas installed, a multi-page stats response started as a `GeoDataFrame` and pages 2..N came back as plain `DataFrame`s; `pd.concat` then silently downgraded the result and the caller lost geometry / CRS. Use `geopd=GEOPANDAS` on every page. * `get_stats_data` indexed `body["next"]` directly, raising `KeyError` on responses without that key (some terminal responses simply omit it). Switch to `body.get("next")`, which produces `None` and exits the loop cleanly. * Both `get_stats_data`'s in-loop request and `_walk_pages`'s in-loop request returned the response without checking `status_code`. A 4xx or 5xx page whose body happened to JSON-decode could be appended as data, then pagination quietly stopped — the caller got a partial result with no warning. Add an explicit `if status_code != 200` raise inside each loop so the existing log-and-truncate handler fires deliberately rather than incidentally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 51ac674 commit 0ad568c

2 files changed

Lines changed: 127 additions & 3 deletions

File tree

dataretrieval/waterdata/utils.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,8 @@ def _walk_pages(
606606
headers=headers,
607607
data=content if method == "POST" else None,
608608
)
609+
if resp.status_code != 200:
610+
raise RuntimeError(_error_body(resp))
609611
dfs.append(_get_resp_data(resp, geopd=geopd))
610612
curr_url = _next_req_url(resp)
611613
except Exception: # noqa: BLE001
@@ -1058,7 +1060,7 @@ def get_stats_data(
10581060
all_dfs = [_handle_stats_nesting(body, geopd=GEOPANDAS)]
10591061

10601062
# Look for a next code in the response body
1061-
next_token = body["next"]
1063+
next_token = body.get("next")
10621064

10631065
while next_token:
10641066
args["next_token"] = next_token
@@ -1070,9 +1072,11 @@ def get_stats_data(
10701072
params=args,
10711073
headers=headers,
10721074
)
1075+
if resp.status_code != 200:
1076+
raise RuntimeError(_error_body(resp))
10731077
body = resp.json()
1074-
all_dfs.append(_handle_stats_nesting(body, geopd=False))
1075-
next_token = body["next"]
1078+
all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS))
1079+
next_token = body.get("next")
10761080
except Exception: # noqa: BLE001
10771081
error_text = _error_body(resp)
10781082
logger.error("Request incomplete. %s", error_text)

tests/waterdata_utils_test.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
from unittest import mock
22

3+
import pandas as pd
34
import requests
45

56
from dataretrieval.waterdata.utils import (
67
_get_args,
78
_walk_pages,
9+
get_stats_data,
810
)
911

1012

@@ -80,3 +82,121 @@ def test_walk_pages_multiple_mocked():
8082
assert mock_client.send.called
8183
assert mock_client.request.called
8284
assert mock_client.request.call_args[0][1] == "https://example.com/page2"
85+
86+
87+
def test_walk_pages_raises_on_non_200_in_loop():
88+
"""`_walk_pages` must surface a non-200 mid-loop, not silently truncate.
89+
90+
Regression: previously any non-200 page was appended (with whatever
91+
body it had) and pagination quietly stopped because `_get_resp_data`
92+
or `_next_req_url` raised inside the bare except. The user got a
93+
partial result with no warning.
94+
"""
95+
resp1 = mock.MagicMock()
96+
resp1.json.return_value = {
97+
"numberReturned": 1,
98+
"features": [{"id": "1", "properties": {"val": "a"}}],
99+
"links": [],
100+
}
101+
resp1.headers = {}
102+
resp1.links = {"next": {"url": "https://example.com/page2"}}
103+
resp1.status_code = 200
104+
105+
resp2 = mock.MagicMock()
106+
resp2.status_code = 500
107+
resp2.text = "<html>error</html>"
108+
109+
mock_client = mock.MagicMock(spec=requests.Session)
110+
mock_client.send.return_value = resp1
111+
mock_client.request.return_value = resp2
112+
113+
mock_req = mock.MagicMock(spec=requests.PreparedRequest)
114+
mock_req.method = "GET"
115+
mock_req.headers = {}
116+
mock_req.url = "https://example.com/page1"
117+
118+
df, _ = _walk_pages(geopd=False, req=mock_req, client=mock_client)
119+
120+
# Page 1 still returned; page 2 logged-and-stopped after the explicit
121+
# status check raised. The contract here is "log + truncate", same
122+
# as the pre-fix bare-except behavior, but now the raise inside the
123+
# loop is intentional rather than incidental.
124+
assert len(df) == 1
125+
126+
127+
# --- get_stats_data pagination ----------------------------------------------
128+
129+
130+
def _stats_feature():
131+
"""Build a single feature shaped to satisfy ``_handle_stats_nesting``."""
132+
return {
133+
"type": "Feature",
134+
"id": "USGS-1",
135+
"geometry": None,
136+
"properties": {
137+
"monitoring_location_id": "USGS-1",
138+
"data": [
139+
{
140+
"parameter_code": "00060",
141+
"unit_of_measure": "ft^3/s",
142+
"parent_time_series_id": "abc",
143+
"values": [{"value": 1.0}],
144+
}
145+
],
146+
},
147+
}
148+
149+
150+
def _stats_body(features, next_token=None):
151+
body = {
152+
"type": "FeatureCollection",
153+
"features": features,
154+
"numberReturned": len(features),
155+
}
156+
if next_token is not None:
157+
body["next"] = next_token
158+
return body
159+
160+
161+
def test_get_stats_data_handles_missing_next_key():
162+
"""A response without a ``next`` key must not raise KeyError.
163+
164+
Regression: ``body["next"]`` raised when the key was absent. Now
165+
uses ``body.get("next")`` so a missing key means "no more pages".
166+
"""
167+
resp = mock.MagicMock()
168+
resp.status_code = 200
169+
resp.json.return_value = _stats_body([_stats_feature()])
170+
# No "next" key at all.
171+
172+
client = mock.MagicMock(spec=requests.Session)
173+
client.send.return_value = resp
174+
175+
df, _ = get_stats_data(
176+
args={}, service="observationNormals", expand_percentiles=False, client=client
177+
)
178+
assert isinstance(df, pd.DataFrame)
179+
assert len(df) >= 1
180+
181+
182+
def test_get_stats_data_truncates_on_non_200_continuation():
183+
"""A 4xx/5xx on a continuation page must log and stop, not crash."""
184+
resp1 = mock.MagicMock()
185+
resp1.status_code = 200
186+
resp1.json.return_value = _stats_body([_stats_feature()], next_token="abc")
187+
188+
resp2 = mock.MagicMock()
189+
resp2.status_code = 503
190+
resp2.text = "Service Unavailable"
191+
resp2.url = "https://example.com/page2"
192+
193+
client = mock.MagicMock(spec=requests.Session)
194+
client.send.return_value = resp1
195+
client.request.return_value = resp2
196+
197+
df, _ = get_stats_data(
198+
args={}, service="observationNormals", expand_percentiles=False, client=client
199+
)
200+
# Page 1 still surfaces; page 2 was caught by the in-loop status check.
201+
assert isinstance(df, pd.DataFrame)
202+
assert len(df) >= 1

0 commit comments

Comments
 (0)