Skip to content

Commit 5d931fa

Browse files
thodson-usgsclaude
andcommitted
refactor(waterdata): /simplify pass on ChunkPlan — skip work on the passthrough hot path
Aggregated and applied the meaningful items from the review: - **Trivial-passthrough skips ``build_request`` entirely.** Previously ``ChunkPlan.from_args`` called ``build_request(**args)`` up front to capture ``canonical_url`` and to size the request, even when there was nothing to chunk. Reorder so the "no multi-value lists, no top-level-OR filter" check runs first; on that path the plan is built with ``canonical_url=None`` and no request preparation. The ~20-80 µs ``Request.prepare()`` overhead is removed from the dominant Water Data call shape. ``_combine_chunk_responses`` now treats ``canonical_url=None`` as "skip the override" — fine because ``_walk_pages`` already pinned the response's ``.url`` to the canonical request URL. - **``iter_sub_args`` short-circuits the trivial-passthrough case** — yields ``self.args`` directly instead of allocating a dict copy and spinning through an empty cartesian product. - **``_ChunkExecution`` now owns ``fetch_once``** instead of receiving it per-call on ``issue()``. ``fetch_once`` is constant across the loop, so threading it through every call was needless. ``issue(sub_args)`` and ``run()`` are now zero- and one-arg respectively. Converted from ``@dataclass`` to a plain class (the auto-generated repr/eq weren't earning their keep). The ``completed`` property was inlined to its one remaining caller as ``len(self.responses)``. - **Hoist ``_FILTER_KEY = "filter"``** so the planner and ``iter_sub_args`` substitute on the same constant, matching the existing ``_LIST_SEP``/``_OR_SEP``/``_QUOTA_HEADER`` convention. - **``utils._next_req_url``** now references ``chunking._QUOTA_HEADER`` instead of repeating the ``"x-ratelimit-remaining"`` literal. - Stale ``_NEVER_CHUNK`` comment that pointed at the removed ``_plan_joint`` now points at ``ChunkPlan.from_args``. Items considered and skipped: - ``ChunkPlan.canonical_url`` derivable from ``args`` — keeping it avoids the extra ``build_request`` call on every ``finalize``. - ``_plan_list_chunks`` dual-meaning ``None`` return — fixing it would touch unrelated callers; the current ``continue`` guard is clearly commented. - ``args: dict`` mutability on the frozen dataclass — internal use only; ``MappingProxyType`` adds churn without value. - ``ChunkPlan.from_args`` length / search-loop extraction — the search loop reads well in place; pulling it out would only push state through a helper signature. - ``_count_subrequests`` helper to DRY the ``list_count * len(...)`` math — used in two adjacent places; not worth a helper. All 145 unit tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 24fd158 commit 5d931fa

2 files changed

Lines changed: 60 additions & 37 deletions

File tree

dataretrieval/waterdata/chunking.py

Lines changed: 59 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
import itertools
3737
import math
3838
from collections.abc import Callable, Iterator
39-
from dataclasses import dataclass, field
39+
from dataclasses import dataclass
4040
from typing import Any
4141
from urllib.parse import quote_plus
4242

@@ -65,8 +65,8 @@
6565
# - structured: ``bbox`` is a fixed 4-element coord tuple.
6666
# - intervals: date/time ranges are not enumerable sets.
6767
# - handled elsewhere: ``filter`` gets OR-clause partitioning in
68-
# ``_plan_joint``; comma-joining CQL clauses
69-
# would emit malformed expressions.
68+
# ``ChunkPlan.from_args``; comma-joining CQL
69+
# clauses would emit malformed expressions.
7070
# - scalar by contract: ``limit``, ``skip_geometry``, ``filter_lang``
7171
# — a list value would be a type-erasure smuggle.
7272
_NEVER_CHUNK = frozenset(
@@ -98,6 +98,10 @@
9898
_LIST_SEP = ","
9999
_OR_SEP = " OR "
100100

101+
# Args-dict key for the CQL filter. Hoisted so the planner and the
102+
# wrapper substitute on the same key.
103+
_FILTER_KEY = "filter"
104+
101105
_FetchOnce = Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]]
102106

103107

@@ -384,16 +388,19 @@ class ChunkPlan:
384388
Filter sub-expressions to substitute one per sub-request.
385389
``[None]`` means "leave ``args['filter']`` as-is" (passthrough
386390
and single-clause cases).
387-
canonical_url : str
391+
canonical_url : str | None
388392
URL of the full original request, used to overwrite the first
389393
chunk's ``response.url`` so ``BaseMetadata`` reflects the
390-
user's full query.
394+
user's full query. ``None`` on the nothing-to-chunk passthrough
395+
path: ``fetch_once``'s response already carries the canonical
396+
URL, so the override is skipped to avoid an extra
397+
``build_request`` call on the hot path.
391398
"""
392399

393400
args: dict[str, Any]
394401
list_chunks: dict[str, list[list[Any]]]
395402
filter_chunks: list[str | None]
396-
canonical_url: str
403+
canonical_url: str | None
397404

398405
@property
399406
def total(self) -> int:
@@ -407,6 +414,12 @@ def iter_sub_args(self) -> Iterator[dict[str, Any]]:
407414
order: list-dim cartesian product (dict insertion order) crossed
408415
with filter chunks. Same plan → same sequence — resume is
409416
well-defined."""
417+
# Trivial-passthrough fast path: nothing to substitute, just
418+
# yield the original args. Skips a wasted dict copy on the
419+
# most common Water Data call shape.
420+
if not self.list_chunks and self.filter_chunks == [None]:
421+
yield self.args
422+
return
410423
list_combos = (
411424
itertools.product(*self.list_chunks.values()) if self.list_chunks else [()]
412425
)
@@ -416,12 +429,12 @@ def iter_sub_args(self) -> Iterator[dict[str, Any]]:
416429
if filter_chunk is None:
417430
yield base
418431
else:
419-
yield {**base, "filter": filter_chunk}
432+
yield {**base, _FILTER_KEY: filter_chunk}
420433

421434
def execute(self, fetch_once: _FetchOnce) -> tuple[pd.DataFrame, requests.Response]:
422435
"""Run the plan and return the combined result. See
423436
``_ChunkExecution`` for the per-sub-request semantics."""
424-
return _ChunkExecution(self).run(fetch_once)
437+
return _ChunkExecution(self, fetch_once).run()
425438

426439
@classmethod
427440
def from_args(
@@ -442,21 +455,29 @@ def from_args(
442455
and plan list chunking with greedy halving. Keep the candidate
443456
whose ``list_count × k`` is smallest.
444457
"""
445-
initial_request = build_request(**args)
446-
canonical_url = initial_request.url
447-
448-
filter_expr = args.get("filter")
458+
filter_expr = args.get(_FILTER_KEY)
449459
clauses: list[str] = []
450460
if _is_chunkable(filter_expr, args.get("filter_lang")):
451461
_check_numeric_filter_pitfall(filter_expr)
452462
clauses = _split_top_level_or(filter_expr)
453463

454-
# Passthrough: either nothing's chunkable, or the request
455-
# already fits as-is. Trivial plan, single sub-request, original
456-
# args flow through unchanged.
457-
if (not _chunkable_params(args) and len(clauses) < 2) or (
458-
_request_bytes(initial_request) <= url_limit
459-
):
464+
# Trivial passthrough: no multi-value lists and no top-level-OR
465+
# filter to split, so chunking has no leverage. Skip the
466+
# ``build_request`` call entirely — ``fetch_once``'s response
467+
# will carry the canonical URL already (set by
468+
# ``_finalize_paginated_response``), so the wrapper can elide
469+
# the override. This is the common Water Data call shape, so
470+
# the saved request prep is worth a small branch here.
471+
if not _chunkable_params(args) and len(clauses) < 2:
472+
return cls(
473+
args=args, list_chunks={}, filter_chunks=[None], canonical_url=None
474+
)
475+
476+
initial_request = build_request(**args)
477+
canonical_url = initial_request.url
478+
479+
# Already-fits passthrough: chunking is possible but unnecessary.
480+
if _request_bytes(initial_request) <= url_limit:
460481
return cls(
461482
args=args,
462483
list_chunks={},
@@ -469,7 +490,7 @@ def from_args(
469490

470491
for filter_chunks, worst_filter in _filter_candidates(clauses, filter_expr):
471492
plan_args = (
472-
args if worst_filter is None else {**args, "filter": worst_filter}
493+
args if worst_filter is None else {**args, _FILTER_KEY: worst_filter}
473494
)
474495
try:
475496
list_chunks = _plan_list_chunks(plan_args, build_request, url_limit)
@@ -560,53 +581,55 @@ def _combine_chunk_frames(frames: list[pd.DataFrame]) -> pd.DataFrame:
560581

561582

562583
def _combine_chunk_responses(
563-
responses: list[requests.Response], canonical_url: str
584+
responses: list[requests.Response], canonical_url: str | None
564585
) -> requests.Response:
565586
"""Fold per-sub-request responses into one. The first response is
566587
mutated in place: ``.headers`` becomes the last response's (so
567588
``x-ratelimit-remaining`` reflects current state), ``.elapsed``
568589
accumulates total wall-clock, and ``.url`` is set to the canonical
569590
original-query URL so ``BaseMetadata`` reflects the user's full
570-
request rather than the first sub-chunk."""
591+
request rather than the first sub-chunk.
592+
593+
``canonical_url=None`` skips the URL override — used by the
594+
trivial-passthrough path where ``fetch_once`` already returns a
595+
response whose ``.url`` is the original-query URL."""
571596
head = responses[0]
572597
if len(responses) > 1:
573598
head.headers = responses[-1].headers
574599
head.elapsed = sum((r.elapsed for r in responses[1:]), start=head.elapsed)
575-
head.url = canonical_url
600+
if canonical_url is not None:
601+
head.url = canonical_url
576602
return head
577603

578604

579-
@dataclass
580605
class _ChunkExecution:
581606
"""In-flight execution of a ``ChunkPlan``. Issues each sub-request,
582607
accumulates frames and responses, translates 429s into
583608
``QuotaExhausted`` with the partial state captured so far, and
584609
raises ``RequestExceedsQuota`` after the first sub-request if the
585610
rest of the plan won't fit the current rate-limit window."""
586611

587-
plan: ChunkPlan
588-
frames: list[pd.DataFrame] = field(default_factory=list)
589-
responses: list[requests.Response] = field(default_factory=list)
590-
591-
@property
592-
def completed(self) -> int:
593-
return len(self.responses)
612+
def __init__(self, plan: ChunkPlan, fetch_once: _FetchOnce) -> None:
613+
self.plan = plan
614+
self.fetch_once = fetch_once
615+
self.frames: list[pd.DataFrame] = []
616+
self.responses: list[requests.Response] = []
594617

595-
def run(self, fetch_once: _FetchOnce) -> tuple[pd.DataFrame, requests.Response]:
618+
def run(self) -> tuple[pd.DataFrame, requests.Response]:
596619
for sub_args in self.plan.iter_sub_args():
597-
self.issue(fetch_once, sub_args)
620+
self.issue(sub_args)
598621
return self.finalize()
599622

600-
def issue(self, fetch_once: _FetchOnce, sub_args: dict[str, Any]) -> None:
623+
def issue(self, sub_args: dict[str, Any]) -> None:
601624
try:
602-
frame, response = fetch_once(sub_args)
625+
frame, response = self.fetch_once(sub_args)
603626
except RuntimeError as exc:
604627
if not _is_429(exc):
605628
raise
606629
raise self._quota_exhausted() from exc
607630
self.frames.append(frame)
608631
self.responses.append(response)
609-
if self.completed == 1 and self.plan.total > 1:
632+
if len(self.responses) == 1 and self.plan.total > 1:
610633
self._check_quota_after_first()
611634

612635
def finalize(self) -> tuple[pd.DataFrame, requests.Response]:
@@ -633,7 +656,7 @@ def _quota_exhausted(self) -> QuotaExhausted:
633656
if self.responses
634657
else None
635658
),
636-
completed_chunks=self.completed,
659+
completed_chunks=len(self.responses),
637660
total_chunks=self.plan.total,
638661
)
639662

dataretrieval/waterdata/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -599,7 +599,7 @@ def _next_req_url(resp: requests.Response) -> str | None:
599599
if os.getenv("API_USGS_PAT", ""):
600600
logger.info(
601601
"Remaining requests this hour: %s",
602-
header_info.get("x-ratelimit-remaining", ""),
602+
header_info.get(chunking._QUOTA_HEADER, ""),
603603
)
604604
for link in body.get("links", []):
605605
if link.get("rel") == "next":

0 commit comments

Comments
 (0)