refactor(waterdata): /simplify pass on ChunkPlan — skip work on the passthrough hot path

thodson-usgs · claude · thodson-usgs · commit 5d931fa34131 · 2026-05-18T21:18:06.000-05:00
Aggregated and applied the meaningful items from the review:

- **Trivial-passthrough skips ``build_request`` entirely.** Previously
  ``ChunkPlan.from_args`` called ``build_request(**args)`` up front to
  capture ``canonical_url`` and to size the request, even when there
  was nothing to chunk. Reorder so the "no multi-value lists, no
  top-level-OR filter" check runs first; on that path the plan is
  built with ``canonical_url=None`` and no request preparation. The
  ~20-80 µs ``Request.prepare()`` overhead is removed from the
  dominant Water Data call shape. ``_combine_chunk_responses`` now
  treats ``canonical_url=None`` as "skip the override" — fine because
  ``_walk_pages`` already pinned the response's ``.url`` to the
  canonical request URL.

- **``iter_sub_args`` short-circuits the trivial-passthrough case** —
  yields ``self.args`` directly instead of allocating a dict copy and
  spinning through an empty cartesian product.

- **``_ChunkExecution`` now owns ``fetch_once``** instead of receiving
  it per-call on ``issue()``. ``fetch_once`` is constant across the
  loop, so threading it through every call was needless. ``run()`` and
  ``issue(sub_args)`` are now zero- and one-arg respectively. Converted
  from ``@dataclass`` to a plain class (the auto-generated repr/eq
  weren't earning their keep). The ``completed`` property was inlined
  to its one remaining caller as ``len(self.responses)``.

- **Hoist ``_FILTER_KEY = "filter"``** so the planner and
  ``iter_sub_args`` substitute on the same constant, matching the
  existing ``_LIST_SEP``/``_OR_SEP``/``_QUOTA_HEADER`` convention.

- **``utils._next_req_url``** now references ``chunking._QUOTA_HEADER``
  instead of repeating the ``"x-ratelimit-remaining"`` literal.

- Stale ``_NEVER_CHUNK`` comment that pointed at the removed
  ``_plan_joint`` now points at ``ChunkPlan.from_args``.

Items considered and skipped:

- ``ChunkPlan.canonical_url`` derivable from ``args`` — keeping it
  avoids the extra ``build_request`` call on every ``finalize``.
- ``_plan_list_chunks`` dual-meaning ``None`` return — fixing it
  would touch unrelated callers; the current ``continue`` guard is
  clearly commented.
- ``args: dict`` mutability on the frozen dataclass — internal use
  only; ``MappingProxyType`` adds churn without value.
- ``ChunkPlan.from_args`` length / search-loop extraction — the
  search loop reads well in place; pulling it out would only push
  state through a helper signature.
- ``_count_subrequests`` helper to DRY the ``list_count * len(...)``
  math — used in two adjacent places; not worth a helper.

All 145 unit tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py
@@ -36,7 +36,7 @@
 import itertools
 import math
 from collections.abc import Callable, Iterator
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import Any
 from urllib.parse import quote_plus
 
@@ -65,8 +65,8 @@
 #   - structured:      ``bbox`` is a fixed 4-element coord tuple.
 #   - intervals:       date/time ranges are not enumerable sets.
 #   - handled elsewhere: ``filter`` gets OR-clause partitioning in
-#                         ``_plan_joint``; comma-joining CQL clauses
-#                         would emit malformed expressions.
+#                         ``ChunkPlan.from_args``; comma-joining CQL
+#                         clauses would emit malformed expressions.
 #   - scalar by contract: ``limit``, ``skip_geometry``, ``filter_lang``
 #                          — a list value would be a type-erasure smuggle.
 _NEVER_CHUNK = frozenset(
@@ -98,6 +98,10 @@
 _LIST_SEP = ","
 _OR_SEP = " OR "
 
+# Args-dict key for the CQL filter. Hoisted so the planner and the
+# wrapper substitute on the same key.
+_FILTER_KEY = "filter"
+
 _FetchOnce = Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]]
 
 
@@ -384,16 +388,19 @@ class ChunkPlan:
         Filter sub-expressions to substitute one per sub-request.
         ``[None]`` means "leave ``args['filter']`` as-is" (passthrough
         and single-clause cases).
-    canonical_url : str
+    canonical_url : str | None
         URL of the full original request, used to overwrite the first
         chunk's ``response.url`` so ``BaseMetadata`` reflects the
-        user's full query.
+        user's full query. ``None`` on the nothing-to-chunk passthrough
+        path: ``fetch_once``'s response already carries the canonical
+        URL, so the override is skipped to avoid an extra
+        ``build_request`` call on the hot path.
     """
 
     args: dict[str, Any]
     list_chunks: dict[str, list[list[Any]]]
     filter_chunks: list[str | None]
-    canonical_url: str
+    canonical_url: str | None
 
     @property
     def total(self) -> int:
@@ -407,6 +414,12 @@ def iter_sub_args(self) -> Iterator[dict[str, Any]]:
         order: list-dim cartesian product (dict insertion order) crossed
         with filter chunks. Same plan → same sequence — resume is
         well-defined."""
+        # Trivial-passthrough fast path: nothing to substitute, just
+        # yield the original args. Skips a wasted dict copy on the
+        # most common Water Data call shape.
+        if not self.list_chunks and self.filter_chunks == [None]:
+            yield self.args
+            return
         list_combos = (
             itertools.product(*self.list_chunks.values()) if self.list_chunks else [()]
         )
@@ -416,12 +429,12 @@ def iter_sub_args(self) -> Iterator[dict[str, Any]]:
                 if filter_chunk is None:
                     yield base
                 else:
-                    yield {**base, "filter": filter_chunk}
+                    yield {**base, _FILTER_KEY: filter_chunk}
 
     def execute(self, fetch_once: _FetchOnce) -> tuple[pd.DataFrame, requests.Response]:
         """Run the plan and return the combined result. See
         ``_ChunkExecution`` for the per-sub-request semantics."""
-        return _ChunkExecution(self).run(fetch_once)
+        return _ChunkExecution(self, fetch_once).run()
 
     @classmethod
     def from_args(
@@ -442,21 +455,29 @@ def from_args(
         and plan list chunking with greedy halving. Keep the candidate
         whose ``list_count × k`` is smallest.
         """
-        initial_request = build_request(**args)
-        canonical_url = initial_request.url
-
-        filter_expr = args.get("filter")
+        filter_expr = args.get(_FILTER_KEY)
         clauses: list[str] = []
         if _is_chunkable(filter_expr, args.get("filter_lang")):
             _check_numeric_filter_pitfall(filter_expr)
             clauses = _split_top_level_or(filter_expr)
 
-        # Passthrough: either nothing's chunkable, or the request
-        # already fits as-is. Trivial plan, single sub-request, original
-        # args flow through unchanged.
-        if (not _chunkable_params(args) and len(clauses) < 2) or (
-            _request_bytes(initial_request) <= url_limit
-        ):
+        # Trivial passthrough: no multi-value lists and no top-level-OR
+        # filter to split, so chunking has no leverage. Skip the
+        # ``build_request`` call entirely — ``fetch_once``'s response
+        # will carry the canonical URL already (set by
+        # ``_finalize_paginated_response``), so the wrapper can elide
+        # the override. This is the common Water Data call shape, so
+        # the saved request prep is worth a small branch here.
+        if not _chunkable_params(args) and len(clauses) < 2:
+            return cls(
+                args=args, list_chunks={}, filter_chunks=[None], canonical_url=None
+            )
+
+        initial_request = build_request(**args)
+        canonical_url = initial_request.url
+
+        # Already-fits passthrough: chunking is possible but unnecessary.
+        if _request_bytes(initial_request) <= url_limit:
             return cls(
                 args=args,
                 list_chunks={},
@@ -469,7 +490,7 @@ def from_args(
 
         for filter_chunks, worst_filter in _filter_candidates(clauses, filter_expr):
             plan_args = (
-                args if worst_filter is None else {**args, "filter": worst_filter}
+                args if worst_filter is None else {**args, _FILTER_KEY: worst_filter}
             )
             try:
                 list_chunks = _plan_list_chunks(plan_args, build_request, url_limit)
@@ -560,53 +581,55 @@ def _combine_chunk_frames(frames: list[pd.DataFrame]) -> pd.DataFrame:
 
 
 def _combine_chunk_responses(
-    responses: list[requests.Response], canonical_url: str
+    responses: list[requests.Response], canonical_url: str | None
 ) -> requests.Response:
     """Fold per-sub-request responses into one. The first response is
     mutated in place: ``.headers`` becomes the last response's (so
     ``x-ratelimit-remaining`` reflects current state), ``.elapsed``
     accumulates total wall-clock, and ``.url`` is set to the canonical
     original-query URL so ``BaseMetadata`` reflects the user's full
-    request rather than the first sub-chunk."""
+    request rather than the first sub-chunk.
+
+    ``canonical_url=None`` skips the URL override — used by the
+    trivial-passthrough path where ``fetch_once`` already returns a
+    response whose ``.url`` is the original-query URL."""
     head = responses[0]
     if len(responses) > 1:
         head.headers = responses[-1].headers
         head.elapsed = sum((r.elapsed for r in responses[1:]), start=head.elapsed)
-    head.url = canonical_url
+    if canonical_url is not None:
+        head.url = canonical_url
     return head
 
 
-@dataclass
 class _ChunkExecution:
     """In-flight execution of a ``ChunkPlan``. Issues each sub-request,
     accumulates frames and responses, translates 429s into
     ``QuotaExhausted`` with the partial state captured so far, and
     raises ``RequestExceedsQuota`` after the first sub-request if the
     rest of the plan won't fit the current rate-limit window."""
 
-    plan: ChunkPlan
-    frames: list[pd.DataFrame] = field(default_factory=list)
-    responses: list[requests.Response] = field(default_factory=list)
-
-    @property
-    def completed(self) -> int:
-        return len(self.responses)
+    def __init__(self, plan: ChunkPlan, fetch_once: _FetchOnce) -> None:
+        self.plan = plan
+        self.fetch_once = fetch_once
+        self.frames: list[pd.DataFrame] = []
+        self.responses: list[requests.Response] = []
 
-    def run(self, fetch_once: _FetchOnce) -> tuple[pd.DataFrame, requests.Response]:
+    def run(self) -> tuple[pd.DataFrame, requests.Response]:
         for sub_args in self.plan.iter_sub_args():
-            self.issue(fetch_once, sub_args)
+            self.issue(sub_args)
         return self.finalize()
 
-    def issue(self, fetch_once: _FetchOnce, sub_args: dict[str, Any]) -> None:
+    def issue(self, sub_args: dict[str, Any]) -> None:
         try:
-            frame, response = fetch_once(sub_args)
+            frame, response = self.fetch_once(sub_args)
         except RuntimeError as exc:
             if not _is_429(exc):
                 raise
             raise self._quota_exhausted() from exc
         self.frames.append(frame)
         self.responses.append(response)
-        if self.completed == 1 and self.plan.total > 1:
+        if len(self.responses) == 1 and self.plan.total > 1:
             self._check_quota_after_first()
 
     def finalize(self) -> tuple[pd.DataFrame, requests.Response]:
@@ -633,7 +656,7 @@ def _quota_exhausted(self) -> QuotaExhausted:
                 if self.responses
                 else None
             ),
-            completed_chunks=self.completed,
+            completed_chunks=len(self.responses),
             total_chunks=self.plan.total,
         )
 
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
@@ -599,7 +599,7 @@ def _next_req_url(resp: requests.Response) -> str | None:
     if os.getenv("API_USGS_PAT", ""):
         logger.info(
             "Remaining requests this hour: %s",
-            header_info.get("x-ratelimit-remaining", ""),
+            header_info.get(chunking._QUOTA_HEADER, ""),
         )
     for link in body.get("links", []):
         if link.get("rel") == "next":

Original file line number	Diff line number	Diff line change
`@@ -599,7 +599,7 @@ def _next_req_url(resp: requests.Response) -> str \| None:`
`599`	`599`	`if os.getenv("API_USGS_PAT", ""):`
`600`	`600`	`logger.info(`
`601`	`601`	`"Remaining requests this hour: %s",`
`602`		`- header_info.get("x-ratelimit-remaining", ""),`
	`602`	`+ header_info.get(chunking._QUOTA_HEADER, ""),`
`603`	`603`	`)`
`604`	`604`	`for link in body.get("links", []):`
`605`	`605`	`if link.get("rel") == "next":`