Simplify chunking module: shared helpers, idiomatic max(), tighter types

thodson-usgs · claude · thodson-usgs · commit aeb0f0a07fc3 · 2026-05-16T08:18:34.000-05:00
Five targeted cleanups from review, no behavior change:

- Drop the duplicate ``_FetchOnce`` TypeVar in chunking.py; import the
  one already defined in filters.py. The two had identical bodies.
- Extract ``_max_per_clause_encoding_ratio(parts)`` in filters.py.
  Both ``_effective_filter_budget`` and the outer
  ``_filter_aware_probe_args`` need the same worst-case ratio
  formula; pinning it in one place keeps them from drifting.
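- Replace the manual ``best: tuple | None`` sentinel + nested-loop
  scan in ``_plan_chunks`` with a generator + ``max(..., key=...,
  default=None)``. Removes the hand-maintained running maximum and
  its conditional-update branch; the post-loop ``None`` check stays
  (now on ``biggest``) and still raises ``RequestTooLarge``. Ties
  resolve as before: ``max`` returns the first maximal item, matching
  the old strict ``>`` comparison.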
- Extract ``_finalize_paginated_response`` in utils.py so the
  4-line "carry last page's headers + cumulative elapsed onto the
  initial response" pattern lives in one spot instead of duplicated
  across ``_walk_pages`` and the stats helper.
- Parametrize bare ``list`` type hints: ``dict[str, list]`` becomes
  ``dict[str, list[Any]]`` and the planner's ``dict[str, list[list]]``
  plan/return type becomes ``dict[str, list[list[Any]]]``, using
  PEP 585 builtin generics.

Also trimmed the 17-line ``_filter_aware_probe_args`` docstring to
9 lines; the substance is preserved, the prose is leaner.

All 209 waterdata tests pass; ruff clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py
@@ -39,7 +39,7 @@
 import itertools
 import math
 from collections.abc import Callable
-from typing import Any, TypeVar
+from typing import Any
 from urllib.parse import quote_plus
 
 import pandas as pd
@@ -49,7 +49,9 @@
 from .filters import (
     _combine_chunk_frames,
     _combine_chunk_responses,
+    _FetchOnce,
     _is_chunkable,
+    _max_per_clause_encoding_ratio,
     _split_top_level_or,
 )
 
@@ -161,7 +163,7 @@ def __init__(
         self.remaining = remaining
 
 
-def _chunkable_params(args: dict[str, Any]) -> dict[str, list]:
+def _chunkable_params(args: dict[str, Any]) -> dict[str, list[Any]]:
     """Return ``{name: list(values)}`` for every list/tuple kwarg with
     >1 element that is allowed to chunk."""
     return {
@@ -173,24 +175,16 @@ def _chunkable_params(args: dict[str, Any]) -> dict[str, list]:
 
 def _filter_aware_probe_args(args: dict[str, Any]) -> dict[str, Any]:
     """Substitute the filter with a synthetic ASCII clause sized to the
-    inner chunker's bail floor if the filter is chunkable, otherwise
-    return ``args`` unchanged.
-
-    The inner ``filters.chunked`` decorator splits a filter into chunks
-    each whose URL-encoded length is ≤ the per-sub-request budget, but
-    bails (emits the full filter unchanged) when ANY single OR-clause's
-    URL-encoded length exceeds the budget. Mirroring ``filters._
-    effective_filter_budget``, the bail floor on the longest clause is
-    ``len(longest) * max(per_clause_encoding_ratio)``: even a clause
-    whose own ratio is low inherits the worst per-call ratio because
-    the budget is computed against the heaviest-encoding clause.
-
-    Substituting a synthetic ASCII clause of that exact length (ASCII
-    has a 1:1 encoding ratio, so ``quote_plus`` is a no-op) makes the
-    planner's URL probe and the inner chunker's bail condition agree
-    on worst-case size — the planner won't approve a plan the inner
-    chunker would then refuse to emit, and won't prematurely raise
-    when the inner chunker could have made it fit.
+    inner chunker's bail floor, so the planner's URL probe matches what
+    the inner chunker would emit.
+
+    The inner ``filters.chunked`` bails (emits the full filter) when any
+    single OR-clause's URL-encoded length exceeds the per-sub-request
+    budget. Mirroring ``filters._effective_filter_budget``, that floor
+    is ``len(longest_clause) * max(per-clause encoding ratio)``.
+    Substituting an ASCII clause of that exact length makes
+    ``quote_plus`` a no-op, so the URL builder sees exactly the
+    bail-floor byte count.
     """
     filter_expr = args.get("filter")
     filter_lang = args.get("filter_lang")
@@ -199,9 +193,8 @@ def _filter_aware_probe_args(args: dict[str, Any]) -> dict[str, Any]:
     parts = _split_top_level_or(filter_expr)
     if len(parts) < 2:
         return args  # one-clause filter — inner chunker can't shrink it
-    encoding_ratio_max = max(len(quote_plus(p)) / len(p) for p in parts)
     longest_raw = max(len(p) for p in parts)
-    probe_size = math.ceil(longest_raw * encoding_ratio_max)
+    probe_size = math.ceil(longest_raw * _max_per_clause_encoding_ratio(parts))
     return {**args, "filter": "x" * probe_size}
 
 
@@ -239,7 +232,7 @@ def _request_bytes(req: requests.PreparedRequest) -> int:
 
 
 def _worst_case_args(
-    probe_args: dict[str, Any], plan: dict[str, list[list]]
+    probe_args: dict[str, Any], plan: dict[str, list[list[Any]]]
 ) -> dict[str, Any]:
     """Args dict using the LARGEST chunk from each dim — represents the
     most byte-heavy sub-request the plan will issue, with the filter
@@ -255,7 +248,7 @@ def _plan_chunks(
     build_request: Callable[..., Any],
     url_limit: int,
     max_chunks: int | None = None,
-) -> dict[str, list[list]] | None:
+) -> dict[str, list[list[Any]]] | None:
     """Greedy halving until the worst-case sub-request URL fits.
 
     Returns ``None`` when no chunking is needed (request as-is fits or
@@ -280,34 +273,31 @@ def _plan_chunks(
     if _request_bytes(build_request(**probe_args)) <= url_limit:
         return None
 
-    plan: dict[str, list[list]] = {k: [v] for k, v in chunkable.items()}
+    plan: dict[str, list[list[Any]]] = {k: [v] for k, v in chunkable.items()}
 
     while True:
         worst = _worst_case_args(probe_args, plan)
         if _request_bytes(build_request(**worst)) <= url_limit:
             return plan
 
-        # Find the single biggest chunk across all dims and halve it.
-        best: tuple[str, int, int] | None = None  # (dim, chunk_index, size)
-        for dim, dim_chunks in plan.items():
-            for idx, chunk in enumerate(dim_chunks):
-                if len(chunk) <= 1:
-                    continue
-                size = _chunk_bytes(chunk)
-                if best is None or size > best[2]:
-                    best = (dim, idx, size)
-
-        if best is None:
+        # Largest splittable chunk across all dims, by URL-encoded bytes.
+        splittable = (
+            (dim, idx, chunk)
+            for dim, dim_chunks in plan.items()
+            for idx, chunk in enumerate(dim_chunks)
+            if len(chunk) > 1
+        )
+        biggest = max(splittable, key=lambda t: _chunk_bytes(t[2]), default=None)
+        if biggest is None:
             raise RequestTooLarge(
                 f"Request exceeds {url_limit} bytes (URL + body) even "
                 f"with every multi-value parameter at a singleton chunk "
                 f"and any chunkable filter reduced to one OR-clause. "
                 f"Reduce the number of values or split the call manually."
             )
-        dim, idx, _ = best
-        big = plan[dim][idx]
-        mid = len(big) // 2
-        plan[dim] = plan[dim][:idx] + [big[:mid], big[mid:]] + plan[dim][idx + 1 :]
+        dim, idx, chunk = biggest
+        mid = len(chunk) // 2
+        plan[dim] = plan[dim][:idx] + [chunk[:mid], chunk[mid:]] + plan[dim][idx + 1 :]
 
         # Each split only grows the cartesian product, so once we
         # cross max_chunks we can never come back under. Bail now
@@ -323,12 +313,6 @@ def _plan_chunks(
             )
 
 
-_FetchOnce = TypeVar(
-    "_FetchOnce",
-    bound=Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]],
-)
-
-
 def _read_remaining(response: requests.Response) -> int:
     """Parse ``x-ratelimit-remaining`` from a response. Missing or
     malformed header → return ``_QUOTA_UNKNOWN`` so the safety check
diff --git a/dataretrieval/waterdata/filters.py b/dataretrieval/waterdata/filters.py
@@ -152,6 +152,18 @@ def _chunk_cql_or(expr: str, max_len: int = _CQL_FILTER_CHUNK_LEN) -> list[str]:
     return chunks
 
 
+def _max_per_clause_encoding_ratio(parts: list[str]) -> float:
+    """Worst per-clause ``len(quote_plus(p)) / len(p)`` across OR-clauses.
+
+    Any sub-request chunk could end up containing only the heavier-encoding
+    clauses, so per-sub-request byte budgets must be sized against the
+    worst (not average) ratio to avoid overflow. Used by both this
+    module's filter chunker and the outer ``chunking._filter_aware_probe_args``;
+    pinning the formula here keeps the two from drifting.
+    """
+    return max(len(quote_plus(p)) / len(p) for p in parts)
+
+
 def _effective_filter_budget(
     args: dict[str, Any],
     filter_expr: str,
@@ -163,8 +175,7 @@ def _effective_filter_budget(
     non-filter URL bytes by building the request with a 1-byte placeholder
     filter, subtract from the URL limit to get the bytes available for the
     encoded filter, then convert back to raw CQL bytes via the *maximum*
-    per-clause encoding ratio (a chunk could contain only the heavier-encoding
-    clauses, so budgeting by the average ratio could overflow).
+    per-clause encoding ratio.
     """
     # Fast path: encoded filter clearly fits with room for any plausible
     # non-filter URL. Skips the PreparedRequest build and splitter scan.
@@ -179,7 +190,7 @@ def _effective_filter_budget(
         # the caller sees one 414 instead of N parallel sub-request failures.
         return len(filter_expr) + 1
     parts = _split_top_level_or(filter_expr) or [filter_expr]
-    encoding_ratio = max(len(quote_plus(p)) / len(p) for p in parts)
+    encoding_ratio = _max_per_clause_encoding_ratio(parts)
     return max(100, int(available_url_bytes / encoding_ratio))
 
 
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
@@ -618,6 +618,26 @@ def _get_resp_data(resp: requests.Response, geopd: bool) -> pd.DataFrame:
     return df
 
 
+def _finalize_paginated_response(
+    initial: requests.Response,
+    last: requests.Response,
+    total_elapsed,
+) -> None:
+    """Carry the last page's headers + cumulative elapsed onto the initial
+    response in place.
+
+    The initial response stays canonical for ``md.url`` (user's original
+    query), but its ``.headers`` and ``.elapsed`` are overwritten so the
+    multi-value chunker's ``QuotaExhausted`` guard sees current
+    ``x-ratelimit-remaining`` and ``md.query_time`` reflects total
+    wall-clock across pages. No-op when ``initial is last`` (single page).
+    """
+    if last is initial:
+        return
+    initial.headers = last.headers
+    initial.elapsed = total_elapsed
+
+
 def _walk_pages(
     geopd: bool,
     req: requests.PreparedRequest,
@@ -703,9 +723,7 @@ def _walk_pages(
                 )
                 curr_url = None
 
-        if resp is not initial_response:
-            initial_response.headers = resp.headers
-            initial_response.elapsed = total_elapsed
+        _finalize_paginated_response(initial_response, resp, total_elapsed)
 
         # Concatenate all pages at once for efficiency
         return pd.concat(dfs, ignore_index=True), initial_response
@@ -1180,9 +1198,7 @@ def get_stats_data(
                 )
                 next_token = None
 
-        if resp is not initial_response:
-            initial_response.headers = resp.headers
-            initial_response.elapsed = total_elapsed
+        _finalize_paginated_response(initial_response, resp, total_elapsed)
 
         dfs = pd.concat(all_dfs, ignore_index=True) if len(all_dfs) > 1 else all_dfs[0]