-
Notifications
You must be signed in to change notification settings - Fork 59
waterdata: get_stats_data preserve geometry across continuation pages #255
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
0ad568c
0fa758e
af00705
dd5e00e
1bfea67
22c60de
949b4eb
b7285e6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -339,6 +339,12 @@ def _error_body(resp: requests.Response): | |
| ) | ||
|
|
||
|
|
||
| def _raise_if_not_ok(resp: requests.Response) -> None: | ||
| """Raise ``RuntimeError(_error_body(resp))`` for any non-200 response.""" | ||
| if resp.status_code != 200: | ||
| raise RuntimeError(_error_body(resp)) | ||
|
|
||
|
|
||
| def _construct_api_requests( | ||
| service: str, | ||
| properties: list[str] | None = None, | ||
|
|
@@ -583,8 +589,7 @@ def _walk_pages( | |
| client = client or requests.Session() | ||
| try: | ||
| resp = client.send(req) | ||
| if resp.status_code != 200: | ||
| raise RuntimeError(_error_body(resp)) | ||
| _raise_if_not_ok(resp) | ||
|
|
||
| # Store the initial response for metadata | ||
| initial_response = resp | ||
|
|
@@ -606,6 +611,7 @@ def _walk_pages( | |
| headers=headers, | ||
| data=content if method == "POST" else None, | ||
| ) | ||
| _raise_if_not_ok(resp) | ||
| dfs.append(_get_resp_data(resp, geopd=geopd)) | ||
| curr_url = _next_req_url(resp) | ||
| except Exception: # noqa: BLE001 | ||
|
|
@@ -1043,8 +1049,7 @@ def get_stats_data( | |
|
|
||
| try: | ||
| resp = client.send(req) | ||
| if resp.status_code != 200: | ||
| raise RuntimeError(_error_body(resp)) | ||
| _raise_if_not_ok(resp) | ||
|
|
||
| # Store the initial response for metadata | ||
| initial_response = resp | ||
|
|
@@ -1058,7 +1063,7 @@ def get_stats_data( | |
| all_dfs = [_handle_stats_nesting(body, geopd=GEOPANDAS)] | ||
|
|
||
| # Look for a next code in the response body | ||
| next_token = body["next"] | ||
| next_token = body.get("next") | ||
|
|
||
| while next_token: | ||
| args["next_token"] = next_token | ||
|
|
@@ -1070,9 +1075,10 @@ def get_stats_data( | |
| params=args, | ||
| headers=headers, | ||
| ) | ||
| _raise_if_not_ok(resp) | ||
| body = resp.json() | ||
| all_dfs.append(_handle_stats_nesting(body, geopd=False)) | ||
| next_token = body["next"] | ||
| all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Already covered: |
||
| next_token = body.get("next") | ||
| except Exception: # noqa: BLE001 | ||
| error_text = _error_body(resp) | ||
| logger.error("Request incomplete. %s", error_text) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,10 +1,12 @@ | ||
| from unittest import mock | ||
|
|
||
| import pandas as pd | ||
| import requests | ||
|
|
||
| from dataretrieval.waterdata.utils import ( | ||
| _get_args, | ||
| _walk_pages, | ||
| get_stats_data, | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -80,3 +82,121 @@ def test_walk_pages_multiple_mocked(): | |
| assert mock_client.send.called | ||
| assert mock_client.request.called | ||
| assert mock_client.request.call_args[0][1] == "https://example.com/page2" | ||
|
|
||
|
|
||
| def test_walk_pages_raises_on_non_200_in_loop(): | ||
| """`_walk_pages` must surface a non-200 mid-loop, not silently truncate. | ||
|
|
||
| Regression: previously any non-200 page was appended (with whatever | ||
| body it had) and pagination quietly stopped because `_get_resp_data` | ||
| or `_next_req_url` raised inside the bare except. The user got a | ||
| partial result with no warning. | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Already addressed: the test was renamed to |
||
| """ | ||
| resp1 = mock.MagicMock() | ||
| resp1.json.return_value = { | ||
| "numberReturned": 1, | ||
| "features": [{"id": "1", "properties": {"val": "a"}}], | ||
| "links": [], | ||
| } | ||
| resp1.headers = {} | ||
| resp1.links = {"next": {"url": "https://example.com/page2"}} | ||
| resp1.status_code = 200 | ||
|
|
||
| resp2 = mock.MagicMock() | ||
| resp2.status_code = 500 | ||
| resp2.text = "<html>error</html>" | ||
|
|
||
| mock_client = mock.MagicMock(spec=requests.Session) | ||
| mock_client.send.return_value = resp1 | ||
| mock_client.request.return_value = resp2 | ||
|
|
||
| mock_req = mock.MagicMock(spec=requests.PreparedRequest) | ||
| mock_req.method = "GET" | ||
| mock_req.headers = {} | ||
| mock_req.url = "https://example.com/page1" | ||
|
|
||
| df, _ = _walk_pages(geopd=False, req=mock_req, client=mock_client) | ||
|
|
||
| # Page 1 still returned; page 2 logged-and-stopped after the explicit | ||
| # status check raised. The contract here is "log + truncate", same | ||
| # as the pre-fix bare-except behavior, but now the raise inside the | ||
| # loop is intentional rather than incidental. | ||
| assert len(df) == 1 | ||
|
|
||
|
|
||
| # --- get_stats_data pagination ---------------------------------------------- | ||
|
|
||
|
|
||
| def _stats_feature(): | ||
| """Build a single feature shaped to satisfy ``_handle_stats_nesting``.""" | ||
| return { | ||
| "type": "Feature", | ||
| "id": "USGS-1", | ||
| "geometry": None, | ||
| "properties": { | ||
| "monitoring_location_id": "USGS-1", | ||
| "data": [ | ||
| { | ||
| "parameter_code": "00060", | ||
| "unit_of_measure": "ft^3/s", | ||
| "parent_time_series_id": "abc", | ||
| "values": [{"value": 1.0}], | ||
| } | ||
| ], | ||
| }, | ||
| } | ||
|
|
||
|
|
||
| def _stats_body(features, next_token=None): | ||
| body = { | ||
| "type": "FeatureCollection", | ||
| "features": features, | ||
| "numberReturned": len(features), | ||
| } | ||
| if next_token is not None: | ||
| body["next"] = next_token | ||
| return body | ||
|
|
||
|
|
||
| def test_get_stats_data_handles_missing_next_key(): | ||
| """A response without a ``next`` key must not raise KeyError. | ||
|
|
||
| Regression: ``body["next"]`` raised when the key was absent. Now | ||
| uses ``body.get("next")`` so a missing key means "no more pages". | ||
| """ | ||
| resp = mock.MagicMock() | ||
| resp.status_code = 200 | ||
| resp.json.return_value = _stats_body([_stats_feature()]) | ||
| # No "next" key at all. | ||
|
|
||
| client = mock.MagicMock(spec=requests.Session) | ||
| client.send.return_value = resp | ||
|
|
||
| df, _ = get_stats_data( | ||
| args={}, service="observationNormals", expand_percentiles=False, client=client | ||
| ) | ||
| assert isinstance(df, pd.DataFrame) | ||
| assert len(df) >= 1 | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed in |
||
|
|
||
|
|
||
| def test_get_stats_data_truncates_on_non_200_continuation(): | ||
| """A 4xx/5xx on a continuation page must log and stop, not crash.""" | ||
| resp1 = mock.MagicMock() | ||
| resp1.status_code = 200 | ||
| resp1.json.return_value = _stats_body([_stats_feature()], next_token="abc") | ||
|
|
||
| resp2 = mock.MagicMock() | ||
| resp2.status_code = 503 | ||
| resp2.text = "Service Unavailable" | ||
| resp2.url = "https://example.com/page2" | ||
|
|
||
| client = mock.MagicMock(spec=requests.Session) | ||
| client.send.return_value = resp1 | ||
| client.request.return_value = resp2 | ||
|
|
||
| df, _ = get_stats_data( | ||
| args={}, service="observationNormals", expand_percentiles=False, client=client | ||
| ) | ||
| # Page 1 still surfaces; page 2 was caught by the in-loop status check. | ||
| assert isinstance(df, pd.DataFrame) | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed in |
||
| assert len(df) >= 1 | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Already fixed (in this PR's main commit
`dd5e00e`): `_error_body` now wraps `resp.json()` in a `try` and falls back to `f"{status}: {reason}. {snippet}"` on `JSONDecodeError`/`ValueError`. The pagination `except` blocks call `_error_body(resp)` once, get a string back, and the request is logged-and-truncated cleanly — no second crash path.