refactor(waterdata): Address PR #283 review — relocate chunker helpers, clarify docs

thodson-usgs · claude · thodson-usgs · commit f16555d1ab61 · 2026-05-18T14:17:13.000-05:00
Three review responses bundled together:

- chunking.py module docstring: define ``k`` as the candidate filter chunk count before using it in the planner description.
- ``QuotaExhausted`` docstring: drop the stale "silently truncate" framing. PR #273 / #279 already raise on a mid-pagination 429, so this exception is the structured-recovery alternative (partial frames in hand) rather than a defense against silent truncation.
- Move chunker-only orphans from filters.py to chunking.py: ``_WATERDATA_URL_BYTE_LIMIT`` (the URL byte ceiling), ``_FetchOnce`` TypeVar, ``_combine_chunk_frames``, and ``_combine_chunk_responses``. filters.py was a leftover home from the pre-unification two-decorator stack; these helpers have no callers outside the chunker. Test ``test_multi_value_chunked_lazy_url_limit`` now monkeypatches the constant on its new module.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py
@@ -12,12 +12,14 @@
 - Chunkable dims include multi-value list params (sites, parameter
   codes, ...) and the cql-text ``filter`` (split at top-level ``OR``
   to keep each chunk valid CQL).
-- The planner enumerates candidate filter chunk counts
-  (``k = 1, 2, 4, ..., total_clauses``); for each, partitions clauses
-  into ``k`` roughly-balanced groups joined by ``OR``, substitutes the
-  worst (longest, URL-encoded) group as the filter, then plans list
+- For a filter with ``n_clauses`` top-level OR clauses, the planner
+  enumerates candidate filter chunk counts ``k`` (the number of
+  sub-filters to split into) at powers of two from 1 through
+  ``n_clauses``. For each ``k``, it partitions clauses into ``k``
+  roughly-balanced groups joined by ``OR``, substitutes the worst
+  (longest, URL-encoded) group as the filter, then plans list
   chunking by greedy halving. The candidate that minimizes
-  ``list_count × k`` wins.
+  ``list_count × k`` (total sub-request count) wins.
 - Sub-chunks of the same list dim never overlap, so frame concat needs
   no dedup across list chunks. Filter sub-chunks can match overlapping
   records (a row matching both ``a=1`` and ``b=2`` returns from both),
@@ -35,22 +37,23 @@
 import itertools
 import math
 from collections.abc import Callable, Iterator
-from typing import Any
+from typing import Any, TypeVar
 from urllib.parse import quote_plus
 
 import pandas as pd
 import requests
 
-from . import filters
 from .filters import (
     _check_numeric_filter_pitfall,
-    _combine_chunk_frames,
-    _combine_chunk_responses,
-    _FetchOnce,
     _is_chunkable,
     _split_top_level_or,
 )
 
+# Empirically the API replies HTTP 414 above ~8200 bytes of full URL —
+# matches nginx's default ``large_client_header_buffers`` of 8 KB. 8000
+# leaves ~200 bytes for request-line framing and proxy variance.
+_WATERDATA_URL_BYTE_LIMIT = 8000
+
 # Default rule: any list-shaped kwarg with >1 element is chunked across
 # sub-requests — each chunk becomes a comma-joined sub-list in the URL.
 # The OGC getters expose ~90 such list-shaped params (IDs, codes,
@@ -114,6 +117,12 @@
 _OR_SEP = " OR "
 
 
+_FetchOnce = TypeVar(
+    "_FetchOnce",
+    bound=Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]],
+)
+
+
 class RequestTooLarge(ValueError):
     """Raised when a chunked request cannot be issued. Either the URL
     exceeds the byte limit even at the smallest reducible plan (every
@@ -125,9 +134,14 @@ class RequestTooLarge(ValueError):
 class QuotaExhausted(RuntimeError):
     """Raised mid-chunked-call when the API's reported remaining quota
     (``x-ratelimit-remaining`` header) drops below the configured safety
-    floor. The chunker stops before issuing the next sub-request to
-    avoid a mid-call HTTP 429 that would silently truncate paginated
-    results.
+    floor. The chunker stops before issuing the next sub-request and
+    surfaces the partial result so callers can resume after the hourly
+    window resets.
+
+    A bare 429 raised by ``_walk_pages`` would also abort the call but
+    discard the chunks completed so far; this exception is the
+    structured-recovery alternative, triggered pre-emptively while the
+    accumulated frames are still in hand.
 
     Attributes
     ----------
@@ -472,6 +486,49 @@ def _iter_sub_args(
             yield base if filter_chunk is None else {**base, "filter": filter_chunk}
 
 
+def _combine_chunk_frames(frames: list[pd.DataFrame]) -> pd.DataFrame:
+    """Concatenate per-chunk frames, dropping empties and deduping by ``id``.
+
+    ``_get_resp_data`` returns a plain ``pd.DataFrame()`` on empty responses;
+    concat'ing it with real GeoDataFrames downgrades the result to plain
+    DataFrame and strips geometry/CRS, so empties are dropped first. Dedup
+    on the pre-rename feature ``id`` keeps overlapping user OR-clauses from
+    producing duplicate rows across chunks.
+    """
+    non_empty = [f for f in frames if not f.empty]
+    if not non_empty:
+        return pd.DataFrame()
+    if len(non_empty) == 1:
+        return non_empty[0]
+    combined = pd.concat(non_empty, ignore_index=True)
+    if "id" in combined.columns:
+        combined = combined.drop_duplicates(subset="id", ignore_index=True)
+    return combined
+
+
+def _combine_chunk_responses(
+    responses: list[requests.Response],
+) -> requests.Response:
+    """Return one response with the last chunk's headers (for current
+    rate-limit state) and summed ``elapsed`` (for total wall-clock).
+
+    The returned response's ``.url`` is the *first chunk's* URL, which
+    only reflects the first slice of the user's query. ``_finalize_response``
+    overwrites ``.url`` with the canonical original-query URL so
+    ``BaseMetadata`` reflects the user's request, not the first sub-chunk.
+
+    Mutates the first response in place: ``.headers`` is replaced with
+    the last response's headers and ``.elapsed`` is accumulated across
+    all chunks. Downstream reads ``.url``, ``.headers``, and
+    ``.elapsed`` (via ``BaseMetadata``).
+    """
+    head = responses[0]
+    if len(responses) > 1:
+        head.headers = responses[-1].headers
+        head.elapsed = sum((r.elapsed for r in responses[1:]), start=head.elapsed)
+    return head
+
+
 def _finalize_response(
     responses: list[requests.Response], canonical_url: str
 ) -> requests.Response:
@@ -491,7 +548,7 @@ def multi_value_chunked(
 ) -> Callable[[_FetchOnce], _FetchOnce]:
     """Decorator that splits multi-value list params and cql-text
     filters across sub-requests so each sub-request URL fits
-    ``url_limit`` bytes (defaults to ``filters._WATERDATA_URL_BYTE_LIMIT``)
+    ``url_limit`` bytes (defaults to ``_WATERDATA_URL_BYTE_LIMIT``)
     and the joint cartesian-product plan stays ≤ ``max_chunks``
     sub-requests (defaults to ``_DEFAULT_MAX_CHUNKS``). All defaults are
     resolved at call time so tests/users that patch the module constants
@@ -522,9 +579,7 @@ def decorator(fetch_once: _FetchOnce) -> _FetchOnce:
         def wrapper(
             args: dict[str, Any],
         ) -> tuple[pd.DataFrame, requests.Response]:
-            limit = (
-                filters._WATERDATA_URL_BYTE_LIMIT if url_limit is None else url_limit
-            )
+            limit = _WATERDATA_URL_BYTE_LIMIT if url_limit is None else url_limit
             floor = (
                 _DEFAULT_QUOTA_SAFETY_FLOOR
                 if quota_safety_floor is None
diff --git a/dataretrieval/waterdata/filters.py b/dataretrieval/waterdata/filters.py
@@ -6,10 +6,8 @@
 
 Internal helpers used by ``chunking.multi_value_chunked``'s joint
 planner: ``_split_top_level_or`` (clause partitioning),
-``_is_chunkable`` (filter-language gate), ``_check_numeric_filter_pitfall``
-(the lexicographic-comparison guard), ``_combine_chunk_frames`` /
-``_combine_chunk_responses`` (aggregation), and the constant
-``_WATERDATA_URL_BYTE_LIMIT``.
+``_is_chunkable`` (filter-language gate), and
+``_check_numeric_filter_pitfall`` (the lexicographic-comparison guard).
 
 Other CQL shapes (``AND``, ``NOT``, ``LIKE``, spatial/temporal
 predicates, function calls) are forwarded verbatim — only top-level
@@ -20,19 +18,10 @@
 from __future__ import annotations
 
 import re
-from collections.abc import Callable
-from typing import Any, Literal, TypeVar
-
-import pandas as pd
-import requests
+from typing import Any, Literal
 
 FILTER_LANG = Literal["cql-text", "cql-json"]
 
-# Empirically the API replies HTTP 414 above ~8200 bytes of full URL —
-# matches nginx's default ``large_client_header_buffers`` of 8 KB. 8000
-# leaves ~200 bytes for request-line framing and proxy variance.
-_WATERDATA_URL_BYTE_LIMIT = 8000
-
 
 _NUM = r"-?(?:\d+(?:\.\d+)?|\.\d+)(?:[eE][+-]?\d+)?"
 _IDENT = r"[A-Za-z_]\w*"
@@ -171,53 +160,3 @@ def _is_chunkable(filter_expr: Any, filter_lang: Any) -> bool:
         and bool(filter_expr)
         and filter_lang in {None, "cql-text"}
     )
-
-
-def _combine_chunk_frames(frames: list[pd.DataFrame]) -> pd.DataFrame:
-    """Concatenate per-chunk frames, dropping empties and deduping by ``id``.
-
-    ``_get_resp_data`` returns a plain ``pd.DataFrame()`` on empty responses;
-    concat'ing it with real GeoDataFrames downgrades the result to plain
-    DataFrame and strips geometry/CRS, so empties are dropped first. Dedup
-    on the pre-rename feature ``id`` keeps overlapping user OR-clauses from
-    producing duplicate rows across chunks.
-    """
-    non_empty = [f for f in frames if not f.empty]
-    if not non_empty:
-        return pd.DataFrame()
-    if len(non_empty) == 1:
-        return non_empty[0]
-    combined = pd.concat(non_empty, ignore_index=True)
-    if "id" in combined.columns:
-        combined = combined.drop_duplicates(subset="id", ignore_index=True)
-    return combined
-
-
-def _combine_chunk_responses(
-    responses: list[requests.Response],
-) -> requests.Response:
-    """Return one response with the last chunk's headers (for current
-    rate-limit state) and summed ``elapsed`` (for total wall-clock).
-
-    The returned response's ``.url`` is the *first chunk's* URL, which
-    only reflects the first slice of the user's query. Callers wanting
-    the canonical original-query URL on ``BaseMetadata`` must overwrite
-    ``.url`` themselves; ``chunking.multi_value_chunked``'s wrapper does
-    this via ``build_request(**original_args).url``.
-
-    Mutates the first response in place: ``.headers`` is replaced with
-    the last response's headers and ``.elapsed`` is accumulated across
-    all chunks. Downstream reads ``.url``, ``.headers``, and
-    ``.elapsed`` (via ``BaseMetadata``).
-    """
-    head = responses[0]
-    if len(responses) > 1:
-        head.headers = responses[-1].headers
-        head.elapsed = sum((r.elapsed for r in responses[1:]), start=head.elapsed)
-    return head
-
-
-_FetchOnce = TypeVar(
-    "_FetchOnce",
-    bound=Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]],
-)
diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
@@ -13,7 +13,7 @@
 if sys.version_info < (3, 10):
     pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True)
 
-from dataretrieval.waterdata import filters as _filters
+from dataretrieval.waterdata import chunking as _chunking
 from dataretrieval.waterdata import (
     get_channel,
     get_combined_metadata,
@@ -475,7 +475,7 @@ def fetch(args):
 
 
 def test_multi_value_chunked_lazy_url_limit(monkeypatch):
-    """``url_limit=None`` → resolve filters._WATERDATA_URL_BYTE_LIMIT at call
+    """``url_limit=None`` → resolve chunking._WATERDATA_URL_BYTE_LIMIT at call
     time, so tests that patch the constant affect this decorator too."""
     calls = []
 
@@ -484,7 +484,7 @@ def fetch(args):
         calls.append(args)
         return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1))
 
-    monkeypatch.setattr(_filters, "_WATERDATA_URL_BYTE_LIMIT", 240)
+    monkeypatch.setattr(_chunking, "_WATERDATA_URL_BYTE_LIMIT", 240)
     # 4 sites of 10 chars → exceeds 240 → planner splits.
     fetch({"sites": ["S" * 10 + str(i) for i in range(4)]})
     assert len(calls) > 1, "patched constant should drive chunking"