Skip to content

Commit 404b8ce

Browse files
thodson-usgsclaude
andcommitted
Add get_stats_data failure tests and tighten PR #273
Follow-up cleanups on top of the silent-truncation fix: - New tests for the get_stats_data parallel pagination loop, mirroring the _walk_pages ConnectionError + 5xx pair (closes coverage gap from the prior commits). - Lift the hard-coded logger-name string to a module-level _LOGGER_NAME constant derived from the utils module's __name__ (self-syncing). - logger.error → logger.exception in both pagination except clauses so the traceback is automatically attached to the log record. - _raise_for_non_200 docstring: drop the historical bug narration, add a one-line note distinguishing it from Response.raise_for_status (we route through _error_body for USGS-API-aware messages). - Drop the inconsistent "resp may be stale" comment; the `as e` and variable naming already convey intent, and get_stats_data already sidesteps it by logging `url` instead of `resp.url`. - _run_get_stats_data_with_failure: make monkeypatch a required arg (both callers pass it), and swap the lambda for MagicMock so future signature drift in _handle_stats_nesting isn't silently masked. - Tighten test docstrings that narrated the PR/bug. - Extract _error_log_messages(caplog) helper (used 4 times). - Lift the conditional links ternary out of the dict literal in _resp_ok. - NEWS.md: bump date to today. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 13a63fc commit 404b8ce

3 files changed

Lines changed: 96 additions & 25 deletions

File tree

NEWS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
**05/08/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved.
1+
**05/14/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved.
22

33
**05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**.
44

dataretrieval/waterdata/utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -371,9 +371,10 @@ def _error_body(resp: requests.Response):
371371
def _raise_for_non_200(resp: requests.Response) -> None:
372372
"""Raise ``RuntimeError(_error_body(resp))`` if ``resp`` is not 200.
373373
374-
Used by both the initial-request gate and the pagination loops; without
375-
the loop-side check, a 5xx body lacking ``numberReturned`` would be
376-
silently treated as an empty page and pagination would stop quietly.
374+
Routes through ``_error_body`` (USGS-API-aware: handles 429/403
375+
specially, extracts ``code``/``description`` from JSON error bodies)
376+
rather than ``Response.raise_for_status``, which raises
377+
``HTTPError`` with a generic message.
377378
"""
378379
if resp.status_code != 200:
379380
raise RuntimeError(_error_body(resp))
@@ -649,8 +650,7 @@ def _walk_pages(
649650
dfs.append(_get_resp_data(resp, geopd=geopd))
650651
curr_url = _next_req_url(resp)
651652
except Exception as e: # noqa: BLE001
652-
# `resp` may be stale (prior page) if client.request() raised.
653-
logger.error("Request incomplete: %s", e)
653+
logger.exception("Request incomplete: %s", e)
654654
logger.warning(
655655
"Request failed for URL: %s. Data download interrupted.", curr_url
656656
)
@@ -1114,7 +1114,7 @@ def get_stats_data(
11141114
all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS))
11151115
next_token = body["next"]
11161116
except Exception as e: # noqa: BLE001
1117-
logger.error("Request incomplete: %s", e)
1117+
logger.exception("Request incomplete: %s", e)
11181118
logger.warning(
11191119
"Request failed for URL: %s. Data download interrupted.", url
11201120
)

tests/waterdata_utils_test.py

Lines changed: 89 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pandas as pd
55
import requests
66

7+
import dataretrieval.waterdata.utils as _utils_module
78
from dataretrieval.waterdata.utils import (
89
_arrange_cols,
910
_format_api_dates,
@@ -12,6 +13,8 @@
1213
_walk_pages,
1314
)
1415

16+
_LOGGER_NAME = _utils_module.__name__
17+
1518

1619
def test_get_args_basic():
1720
local_vars = {
@@ -89,21 +92,25 @@ def test_walk_pages_multiple_mocked():
8992

9093
def _resp_ok(features):
9194
"""Build a 200-OK mock response carrying the given features list."""
95+
links = [{"rel": "next", "href": "https://example.com/page2"}] if features else []
9296
resp = mock.MagicMock()
9397
resp.json.return_value = {
9498
"numberReturned": len(features),
9599
"features": features,
96-
"links": [{"rel": "next", "href": "https://example.com/page2"}]
97-
if features
98-
else [],
100+
"links": links,
99101
}
100102
resp.headers = {}
101-
resp.links = {"next": {"url": "https://example.com/page2"}} if features else {}
102103
resp.status_code = 200
103104
resp.url = "https://example.com/page1"
104105
return resp
105106

106107

108+
def _error_log_messages(caplog):
109+
"""Pull ERROR-and-above message strings out of caplog. The four
110+
pagination-failure tests below all assert against the same shape."""
111+
return [r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR]
112+
113+
107114
def _walk_pages_with_failure(failure_resp_or_exc):
108115
"""Run _walk_pages where page 1 succeeds and page 2 fails as given."""
109116
resp1 = _resp_ok([{"id": "1", "properties": {"val": "a"}}])
@@ -125,22 +132,20 @@ def _walk_pages_with_failure(failure_resp_or_exc):
125132

126133
def test_walk_pages_logs_actual_exception_when_request_raises(caplog):
127134
"""Exception from client.request() must be logged with its actual message."""
128-
caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils")
135+
caplog.set_level(logging.ERROR, logger=_LOGGER_NAME)
129136

130137
df, _ = _walk_pages_with_failure(requests.ConnectionError("boom"))
131138

132139
# First page's data is preserved (best-effort behavior).
133140
assert list(df["val"]) == ["a"]
134141
# Logged error mentions the actual ConnectionError, not a stale page body.
135-
error_messages = [
136-
r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR
137-
]
138-
assert any("boom" in m for m in error_messages), error_messages
142+
messages = _error_log_messages(caplog)
143+
assert any("boom" in m for m in messages), messages
139144

140145

141146
def test_walk_pages_surfaces_5xx_mid_pagination(caplog):
142147
"""A non-200 mid-pagination response must be logged, not silently swallowed."""
143-
caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils")
148+
caplog.set_level(logging.ERROR, logger=_LOGGER_NAME)
144149

145150
page2_500 = mock.MagicMock()
146151
page2_500.status_code = 503
@@ -153,16 +158,82 @@ def test_walk_pages_surfaces_5xx_mid_pagination(caplog):
153158
df, _ = _walk_pages_with_failure(page2_500)
154159

155160
assert list(df["val"]) == ["a"]
156-
error_messages = [
157-
r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR
158-
]
159-
# The 5xx is now visible in the error log (it would previously have
160-
# been silently swallowed because _get_resp_data returned an empty
161-
# frame and the loop stopped quietly).
162-
assert any("503" in m or "ServiceUnavailable" in m for m in error_messages), (
163-
error_messages
161+
messages = _error_log_messages(caplog)
162+
assert any("503" in m or "ServiceUnavailable" in m for m in messages), messages
163+
164+
165+
def _stats_initial_ok():
166+
"""A 200-OK initial stats response: empty data list, signals one more page."""
167+
resp = mock.MagicMock()
168+
resp.status_code = 200
169+
resp.json.return_value = {
170+
"next": "tok2",
171+
"features": [],
172+
}
173+
resp.headers = {}
174+
resp.url = "https://example.com/stats?service=foo"
175+
return resp
176+
177+
178+
def _run_get_stats_data_with_failure(failure_resp_or_exc, monkeypatch):
179+
"""Exercise get_stats_data where the initial response succeeds and the
180+
paginated follow-up fails as given. Mirrors _walk_pages_with_failure.
181+
`monkeypatch` stubs ``_handle_stats_nesting`` so the synthetic minimal
182+
response body doesn't need to parse — these tests only assert on the
183+
pagination loop's error surfacing."""
184+
from dataretrieval.waterdata.utils import get_stats_data
185+
186+
monkeypatch.setattr(
187+
_utils_module,
188+
"_handle_stats_nesting",
189+
mock.MagicMock(return_value=pd.DataFrame()),
164190
)
165191

192+
mock_client = mock.MagicMock(spec=requests.Session)
193+
mock_client.send.return_value = _stats_initial_ok()
194+
if isinstance(failure_resp_or_exc, BaseException):
195+
mock_client.request.side_effect = failure_resp_or_exc
196+
else:
197+
mock_client.request.return_value = failure_resp_or_exc
198+
199+
return get_stats_data(
200+
args={"monitoring_location_id": "USGS-1"},
201+
service="observationNormals",
202+
expand_percentiles=False,
203+
client=mock_client,
204+
)
205+
206+
207+
def test_get_stats_data_logs_actual_exception_when_request_raises(caplog, monkeypatch):
208+
"""get_stats_data variant of the connection-error scenario."""
209+
caplog.set_level(logging.ERROR, logger=_LOGGER_NAME)
210+
211+
_run_get_stats_data_with_failure(
212+
requests.ConnectionError("stats-boom"),
213+
monkeypatch,
214+
)
215+
216+
messages = _error_log_messages(caplog)
217+
assert any("stats-boom" in m for m in messages), messages
218+
219+
220+
def test_get_stats_data_surfaces_5xx_mid_pagination(caplog, monkeypatch):
221+
"""get_stats_data variant of the mid-pagination 5xx scenario."""
222+
caplog.set_level(logging.ERROR, logger=_LOGGER_NAME)
223+
224+
page2_503 = mock.MagicMock()
225+
page2_503.status_code = 503
226+
page2_503.json.return_value = {
227+
"code": "ServiceUnavailable",
228+
"description": "upstream timeout",
229+
}
230+
page2_503.url = "https://example.com/stats?service=foo&next_token=tok2"
231+
232+
_run_get_stats_data_with_failure(page2_503, monkeypatch)
233+
234+
messages = _error_log_messages(caplog)
235+
assert any("503" in m or "ServiceUnavailable" in m for m in messages), messages
236+
166237

167238
def test_handle_stats_nesting_tolerates_missing_drop_columns():
168239
"""If the upstream stats response shape ever changes such that one of

0 commit comments

Comments
 (0)