Skip to content

Commit c475452

Browse files
thodson-usgsclaude
andcommitted
refactor(waterdata): /simplify pass — typed RateLimited exception, drop sentinels
Pulls cleanup items out of /simplify's review: - **Replace string-sniffing 429 detection with a typed exception.** ``_raise_for_non_200`` now raises ``utils.RateLimited`` (a ``RuntimeError`` subclass) for HTTP 429 specifically, plain ``RuntimeError`` for everything else. The chunker's ``_is_429`` walks ``__cause__`` looking for ``isinstance(_, RateLimited)`` instead of ``str(cur).startswith("429")``; ``_paginated_failure_message`` similarly switches to ``isinstance``. Any future reformatting of the 429 message can no longer silently break quota handling. - **Drop the ``_QUOTA_UNKNOWN = -1`` sentinel.** ``_read_remaining`` now returns ``int | None``; the wrapper branches on ``remaining is not None`` instead of comparing against a magic number. - **Collapse ``_finalize_response`` into ``_combine_chunk_responses``.** The two were a 3-line wrapper around a 6-line helper; merging them removes one call and the indirection. The combiner now takes ``canonical_url`` and sets ``.url`` directly. - **Simplify ``_FetchOnce``** from a constrained ``TypeVar`` to a plain ``Callable`` alias. The TypeVar required ``# type: ignore`` at the return site anyway and bought no callsite type safety. - Update the ``waterdata_test.py`` flaky-rerun regex to match the new ``RateLimited:`` prefix as well as ``RuntimeError:``. Items considered and skipped: - Planning-phase efficiency findings (redundant ``build_request`` probes, ``_filter_candidates`` joining discarded groups, dict-copy cost in ``_iter_sub_args``) — all cold-path next to the actual HTTP round-trips that follow. Premature optimization. - Unifying ``_combine_chunk_responses`` with ``_finalize_paginated_response`` — they take different inputs (one accumulates wall-clock externally, the other builds it from the response list); unification would be churn. - Test docstring trim — separate pass. All 142 chunker / filter / utils unit tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent f615db8 commit c475452

4 files changed

Lines changed: 65 additions & 74 deletions

File tree

dataretrieval/waterdata/chunking.py

Lines changed: 32 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
import itertools
3535
import math
3636
from collections.abc import Callable, Iterator
37-
from typing import Any, TypeVar
37+
from typing import Any
3838
from urllib.parse import quote_plus
3939

4040
import pandas as pd
@@ -87,12 +87,6 @@
8787
# Response header USGS uses to advertise remaining hourly quota.
8888
_QUOTA_HEADER = "x-ratelimit-remaining"
8989

90-
# Sentinel returned by ``_read_remaining`` when the response has no
91-
# parseable header. The wrapper treats this as "no quota signal" and
92-
# skips the post-first-chunk plan-vs-quota check, so the value just
93-
# needs to be distinct from any plausible real remaining count.
94-
_QUOTA_UNKNOWN = -1
95-
9690
# Separators the two chunking dimensions use to compose their atoms
9791
# into URL-encoded blobs. List dims comma-join values
9892
# (``site=USGS-A,USGS-B``); filter dims OR-join clauses
@@ -101,11 +95,7 @@
10195
_LIST_SEP = ","
10296
_OR_SEP = " OR "
10397

104-
105-
_FetchOnce = TypeVar(
106-
"_FetchOnce",
107-
bound=Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]],
108-
)
98+
_FetchOnce = Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]]
10999

110100

111101
class RequestTooLarge(ValueError):
@@ -437,27 +427,34 @@ def _plan_joint(
437427
return best[1], best[2]
438428

439429

440-
def _read_remaining(response: requests.Response) -> int:
441-
"""Parse ``x-ratelimit-remaining`` from a response. Missing or
442-
malformed header → ``_QUOTA_UNKNOWN`` (treated as "no signal"; the
443-
post-first-chunk quota check skips when this is returned)."""
430+
def _read_remaining(response: requests.Response) -> int | None:
431+
"""Parse ``x-ratelimit-remaining`` from a response. Returns ``None``
432+
when the header is missing or unparseable; the wrapper treats that
433+
as "no quota signal" and skips the post-first-chunk plan check."""
444434
raw = response.headers.get(_QUOTA_HEADER)
445435
if raw is None:
446-
return _QUOTA_UNKNOWN
436+
return None
447437
try:
448438
return int(raw)
449439
except (TypeError, ValueError):
450-
return _QUOTA_UNKNOWN
440+
return None
451441

452442

453443
def _is_429(exc: BaseException) -> bool:
454-
"""True iff ``exc`` or any link in its ``__cause__`` chain is a
455-
``RuntimeError`` whose message starts with ``429`` — the shape
456-
``_raise_for_non_200`` produces and ``_walk_pages`` re-wraps for
457-
mid-pagination failures."""
444+
"""True iff ``exc`` or anywhere along its ``__cause__`` chain is a
445+
``utils.RateLimited``. ``_walk_pages`` re-wraps mid-pagination
446+
failures as ``RuntimeError`` with the original ``RateLimited``
447+
linked as ``__cause__``, so the chunker has to follow the chain
448+
rather than just ``isinstance(exc, RateLimited)``.
449+
450+
Lazy import: ``utils`` imports this module to decorate
451+
``_fetch_once``, so a top-level import would be circular.
452+
"""
453+
from .utils import RateLimited
454+
458455
cur: BaseException | None = exc
459456
while cur is not None:
460-
if isinstance(cur, RuntimeError) and str(cur).startswith("429"):
457+
if isinstance(cur, RateLimited):
461458
return True
462459
cur = cur.__cause__
463460
return False
@@ -499,38 +496,22 @@ def _combine_chunk_frames(frames: list[pd.DataFrame]) -> pd.DataFrame:
499496

500497

501498
def _combine_chunk_responses(
502-
responses: list[requests.Response],
499+
responses: list[requests.Response], canonical_url: str
503500
) -> requests.Response:
504-
"""Return one response with the last chunk's headers (for current
505-
rate-limit state) and summed ``elapsed`` (for total wall-clock).
506-
507-
The returned response's ``.url`` is the *first chunk's* URL, which
508-
only reflects the first slice of the user's query. ``_finalize_response``
509-
overwrites ``.url`` with the canonical original-query URL so
510-
``BaseMetadata`` reflects the user's request, not the first sub-chunk.
511-
512-
Mutates the first response in place: ``.headers`` is replaced with
513-
the last response's headers and ``.elapsed`` is accumulated across
514-
all chunks. Downstream reads ``.url``, ``.headers``, and
515-
``.elapsed`` (via ``BaseMetadata``).
516-
"""
501+
"""Fold per-sub-request responses into one. The first response is
502+
mutated in place: ``.headers`` becomes the last response's (so
503+
``x-ratelimit-remaining`` reflects current state), ``.elapsed``
504+
accumulates total wall-clock, and ``.url`` is set to the canonical
505+
original-query URL so ``BaseMetadata`` reflects the user's full
506+
request rather than the first sub-chunk."""
517507
head = responses[0]
518508
if len(responses) > 1:
519509
head.headers = responses[-1].headers
520510
head.elapsed = sum((r.elapsed for r in responses[1:]), start=head.elapsed)
511+
head.url = canonical_url
521512
return head
522513

523514

524-
def _finalize_response(
525-
responses: list[requests.Response], canonical_url: str
526-
) -> requests.Response:
527-
"""Aggregate per-sub-request responses and restore the canonical
528-
URL representing the user's full original query."""
529-
combined = _combine_chunk_responses(responses)
530-
combined.url = canonical_url
531-
return combined
532-
533-
534515
def multi_value_chunked(
535516
*,
536517
build_request: Callable[..., Any],
@@ -585,7 +566,7 @@ def wrapper(
585566
raise QuotaExhausted(
586567
partial_frame=_combine_chunk_frames(frames),
587568
partial_response=(
588-
_finalize_response(responses, canonical_url)
569+
_combine_chunk_responses(responses, canonical_url)
589570
if responses
590571
else None
591572
),
@@ -596,7 +577,7 @@ def wrapper(
596577
responses.append(response)
597578
if i == 0 and total > 1:
598579
remaining = _read_remaining(response)
599-
if remaining != _QUOTA_UNKNOWN and remaining < total - 1:
580+
if remaining is not None and remaining < total - 1:
600581
raise RequestExceedsQuota(
601582
planned_chunks=total,
602583
available=remaining + 1,
@@ -605,9 +586,9 @@ def wrapper(
605586

606587
return (
607588
_combine_chunk_frames(frames),
608-
_finalize_response(responses, canonical_url),
589+
_combine_chunk_responses(responses, canonical_url),
609590
)
610591

611-
return wrapper # type: ignore[return-value]
592+
return wrapper
612593

613594
return decorator

dataretrieval/waterdata/utils.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -410,16 +410,30 @@ def _error_body(resp: requests.Response):
410410
)
411411

412412

413+
class RateLimited(RuntimeError):
414+
"""A USGS Water Data API request was rejected with HTTP 429. Exposed
415+
as a typed exception so callers (notably the multi-value chunker)
416+
can detect rate-limit failures via ``isinstance`` instead of
417+
string-matching error messages."""
418+
419+
413420
def _raise_for_non_200(resp: requests.Response) -> None:
414-
"""Raise ``RuntimeError(_error_body(resp))`` if ``resp`` is not 200.
421+
"""Raise on a non-200 response. ``RateLimited`` for 429 (so the
422+
chunker can branch on it without parsing the message); plain
423+
``RuntimeError`` for every other failure.
415424
416425
Routes through ``_error_body`` (USGS-API-aware: handles 429/403
417426
specially, extracts ``code``/``description`` from JSON error bodies)
418427
rather than ``Response.raise_for_status``, which raises
419428
``HTTPError`` with a generic message.
420429
"""
421-
if resp.status_code != 200:
422-
raise RuntimeError(_error_body(resp))
430+
status = resp.status_code
431+
if status == 200:
432+
return
433+
body = _error_body(resp)
434+
if status == 429:
435+
raise RateLimited(body)
436+
raise RuntimeError(body)
423437

424438

425439
def _paginated_failure_message(pages_collected: int, cause: BaseException) -> str:
@@ -436,7 +450,7 @@ def _paginated_failure_message(pages_collected: int, cause: BaseException) -> st
436450
# message is always informative.
437451
if not cause_str.strip():
438452
cause_str = type(cause).__name__
439-
if cause_str.startswith("429"):
453+
if isinstance(cause, RateLimited):
440454
action = "wait for the rate-limit window to reset and retry"
441455
else:
442456
action = "retry the request (possibly after a short backoff)"

tests/waterdata_chunking_test.py

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
_read_remaining,
4343
multi_value_chunked,
4444
)
45-
from dataretrieval.waterdata.utils import _construct_api_requests
45+
from dataretrieval.waterdata.utils import RateLimited, _construct_api_requests
4646

4747

4848
class _FakeReq:
@@ -310,21 +310,16 @@ def test_read_remaining_parses_header():
310310
assert _read_remaining(_quota_response(42)) == 42
311311

312312

313-
def test_read_remaining_treats_missing_header_as_unknown():
314-
"""Servers that don't echo a rate-limit header must not synthesize
315-
a remaining count — the wrapper checks for the unknown sentinel
316-
explicitly and skips the post-first-chunk plan-vs-quota check."""
317-
from dataretrieval.waterdata.chunking import _QUOTA_UNKNOWN
313+
def test_read_remaining_returns_none_when_header_missing():
314+
"""No rate-limit header → ``None`` so the wrapper can branch on
315+
``is None`` instead of comparing against a magic sentinel."""
316+
assert _read_remaining(_quota_response(None)) is None
318317

319-
assert _read_remaining(_quota_response(None)) == _QUOTA_UNKNOWN
320318

321-
322-
def test_read_remaining_treats_malformed_header_as_unknown():
323-
"""Defensive: non-integer header value → unknown sentinel, so the
324-
quota check is skipped rather than tripping on a parse failure."""
325-
from dataretrieval.waterdata.chunking import _QUOTA_UNKNOWN
326-
327-
assert _read_remaining(_quota_response("not-a-number")) == _QUOTA_UNKNOWN
319+
def test_read_remaining_returns_none_on_malformed_header():
320+
"""Non-integer header value → ``None`` so a parse failure doesn't
321+
trip the quota check."""
322+
assert _read_remaining(_quota_response("not-a-number")) is None
328323

329324

330325
def test_request_exceeds_quota_after_first_chunk():
@@ -399,10 +394,11 @@ def fetch(args):
399394
i = state["i"]
400395
state["i"] += 1
401396
if i == 2:
402-
inner = RuntimeError("429: Too many requests made.")
397+
# Match _walk_pages's wrapping: a generic mid-pagination
398+
# RuntimeError with the typed RateLimited as __cause__.
403399
try:
404-
raise inner
405-
except RuntimeError as cause:
400+
raise RateLimited("429: Too many requests made.")
401+
except RateLimited as cause:
406402
raise RuntimeError(
407403
"Paginated request failed after collecting 0 page(s): "
408404
"429: Too many requests made."
@@ -430,7 +426,7 @@ def test_quota_exhausted_on_first_chunk_429_has_no_partial_response():
430426
any data arrived" from "abort after partial collection"."""
431427

432428
def fetch(args):
433-
raise RuntimeError("429: Too many requests made.")
429+
raise RateLimited("429: Too many requests made.")
434430

435431
decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch)
436432
with pytest.raises(QuotaExhausted) as excinfo:

tests/waterdata_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
reruns=2,
5050
reruns_delay=5,
5151
only_rerun=[
52-
r"RuntimeError:\s*(?:429|5\d\d):", # _raise_for_non_200 output
52+
r"(?:RateLimited|RuntimeError):\s*(?:429|5\d\d):", # _raise_for_non_200 output
5353
r"ConnectionError",
5454
r"ReadTimeout|ConnectTimeout|Timeout",
5555
],

0 commit comments

Comments
 (0)