Skip to content
20 changes: 13 additions & 7 deletions dataretrieval/waterdata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,12 @@ def _error_body(resp: requests.Response):
)


def _raise_if_not_ok(resp: requests.Response) -> None:
"""Raise ``RuntimeError(_error_body(resp))`` for any non-200 response."""
if resp.status_code != 200:
raise RuntimeError(_error_body(resp))

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Already fixed (in this PR's main commit dd5e00e): _error_body now wraps resp.json() in a try and falls back to f"{status}: {reason}. {snippet}" on JSONDecodeError/ValueError. The pagination except blocks call _error_body(resp) once, get a string back, and the request is logged-and-truncated cleanly — no second crash path.



def _construct_api_requests(
service: str,
properties: list[str] | None = None,
Expand Down Expand Up @@ -583,8 +589,7 @@ def _walk_pages(
client = client or requests.Session()
try:
resp = client.send(req)
if resp.status_code != 200:
raise RuntimeError(_error_body(resp))
_raise_if_not_ok(resp)

# Store the initial response for metadata
initial_response = resp
Expand All @@ -606,6 +611,7 @@ def _walk_pages(
headers=headers,
data=content if method == "POST" else None,
)
_raise_if_not_ok(resp)
dfs.append(_get_resp_data(resp, geopd=geopd))
curr_url = _next_req_url(resp)
except Exception: # noqa: BLE001
Expand Down Expand Up @@ -1043,8 +1049,7 @@ def get_stats_data(

try:
resp = client.send(req)
if resp.status_code != 200:
raise RuntimeError(_error_body(resp))
_raise_if_not_ok(resp)

# Store the initial response for metadata
initial_response = resp
Expand All @@ -1058,7 +1063,7 @@ def get_stats_data(
all_dfs = [_handle_stats_nesting(body, geopd=GEOPANDAS)]

# Look for a next code in the response body
next_token = body["next"]
next_token = body.get("next")

while next_token:
args["next_token"] = next_token
Expand All @@ -1070,9 +1075,10 @@ def get_stats_data(
params=args,
headers=headers,
)
_raise_if_not_ok(resp)
body = resp.json()
all_dfs.append(_handle_stats_nesting(body, geopd=False))
next_token = body["next"]
all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS))

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Already covered: test_get_stats_data_preserves_geometry_across_pages (added in this PR) explicitly mocks two pages with point geometries, runs them through get_stats_data with GEOPANDAS=True, and asserts the result is a gpd.GeoDataFrame with df.geometry.notna().all() across all 2 sites. Also verified against the live API: with state=US:42, parameter=00060, page_size=1, the buggy code leaves 0/1890 page-2 rows with geometry while the fix preserves 1890/1890.

next_token = body.get("next")
except Exception: # noqa: BLE001
error_text = _error_body(resp)
logger.error("Request incomplete. %s", error_text)
Expand Down
120 changes: 120 additions & 0 deletions tests/waterdata_utils_test.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from unittest import mock

import pandas as pd
import requests

from dataretrieval.waterdata.utils import (
_get_args,
_walk_pages,
get_stats_data,
)


Expand Down Expand Up @@ -80,3 +82,121 @@ def test_walk_pages_multiple_mocked():
assert mock_client.send.called
assert mock_client.request.called
assert mock_client.request.call_args[0][1] == "https://example.com/page2"


def test_walk_pages_raises_on_non_200_in_loop():
"""`_walk_pages` must surface a non-200 mid-loop, not silently truncate.

Regression: previously any non-200 page was appended (with whatever
body it had) and pagination quietly stopped because `_get_resp_data`
or `_next_req_url` raised inside the bare except. The user got a
partial result with no warning.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Already addressed: the test was renamed to test_walk_pages_truncates_on_non_200_continuation and its docstring now describes the truncate-not-extend contract (page 1 is still returned; page 2 is dropped). The earlier 'raises' wording was an artifact of an earlier iteration.

"""
resp1 = mock.MagicMock()
resp1.json.return_value = {
"numberReturned": 1,
"features": [{"id": "1", "properties": {"val": "a"}}],
"links": [],
}
resp1.headers = {}
resp1.links = {"next": {"url": "https://example.com/page2"}}
resp1.status_code = 200

resp2 = mock.MagicMock()
resp2.status_code = 500
resp2.text = "<html>error</html>"

mock_client = mock.MagicMock(spec=requests.Session)
mock_client.send.return_value = resp1
mock_client.request.return_value = resp2

mock_req = mock.MagicMock(spec=requests.PreparedRequest)
mock_req.method = "GET"
mock_req.headers = {}
mock_req.url = "https://example.com/page1"

df, _ = _walk_pages(geopd=False, req=mock_req, client=mock_client)

# Page 1 still returned; page 2 logged-and-stopped after the explicit
# status check raised. The contract here is "log + truncate", same
# as the pre-fix bare-except behavior, but now the raise inside the
# loop is intentional rather than incidental.
assert len(df) == 1


# --- get_stats_data pagination ----------------------------------------------


def _stats_feature():
"""Build a single feature shaped to satisfy ``_handle_stats_nesting``."""
return {
"type": "Feature",
"id": "USGS-1",
"geometry": None,
"properties": {
"monitoring_location_id": "USGS-1",
"data": [
{
"parameter_code": "00060",
"unit_of_measure": "ft^3/s",
"parent_time_series_id": "abc",
"values": [{"value": 1.0}],
}
],
},
}


def _stats_body(features, next_token=None):
body = {
"type": "FeatureCollection",
"features": features,
"numberReturned": len(features),
}
if next_token is not None:
body["next"] = next_token
return body


def test_get_stats_data_handles_missing_next_key():
"""A response without a ``next`` key must not raise KeyError.

Regression: ``body["next"]`` raised when the key was absent. Now
uses ``body.get("next")`` so a missing key means "no more pages".
"""
resp = mock.MagicMock()
resp.status_code = 200
resp.json.return_value = _stats_body([_stats_feature()])
# No "next" key at all.

client = mock.MagicMock(spec=requests.Session)
client.send.return_value = resp

df, _ = get_stats_data(
args={}, service="observationNormals", expand_percentiles=False, client=client
)
assert isinstance(df, pd.DataFrame)
assert len(df) >= 1

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 1bfea67: the test now pins to len(df) == 1 (the single feature flattens to exactly one stats row) and asserts client.request.assert_not_called(), so a regression that kept fetching after a missing next-token would fail.



def test_get_stats_data_truncates_on_non_200_continuation():
"""A 4xx/5xx on a continuation page must log and stop, not crash."""
resp1 = mock.MagicMock()
resp1.status_code = 200
resp1.json.return_value = _stats_body([_stats_feature()], next_token="abc")

resp2 = mock.MagicMock()
resp2.status_code = 503
resp2.text = "Service Unavailable"
resp2.url = "https://example.com/page2"

client = mock.MagicMock(spec=requests.Session)
client.send.return_value = resp1
client.request.return_value = resp2

df, _ = get_stats_data(
args={}, service="observationNormals", expand_percentiles=False, client=client
)
# Page 1 still surfaces; page 2 was caught by the in-loop status check.
assert isinstance(df, pd.DataFrame)

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 1bfea67: pinned to len(df) == 1 and client.request.call_count == 1, so a regression that silently appended page-2 data or kept looping after the 503 would fail.

assert len(df) >= 1
Loading