Add multi-value GET-parameter chunker for waterdata OGC API

thodson-usgs · claude · thodson-usgs · commit 22a09c723368 · 2026-05-14T21:09:23.000-05:00
Wraps _fetch_once with a cartesian-product chunker that sits OUTSIDE @filters.chunked. Splits multi-value list params (monitoring_location_id, parameter_code, statistic_id, etc.) across sub-requests so each URL fits the server's ~8 KB byte limit. Coordination with @filters.chunked: the planner's URL probe substitutes the filter with its longest top-level OR-clause via _filter_aware_probe_args, modeling the per-sub-request URL the inner filter chunker will actually emit. Without this coordination, a long OR-filter plus multi-value lists triggered premature RequestTooLarge even when the combined chunkers would have made things fit. Two safety guards: - max_chunks=1000 cap on cartesian-product size (matches USGS API hourly quota; raises RequestTooLarge with the actual count when exceeded). - QuotaExhausted abort: between sub-requests, reads x-ratelimit-remaining; if below quota_safety_floor (default 50), raises with the partial frame and chunk offset so callers can resume instead of crashing into a mid-call HTTP 429. 30 unit tests cover the planner, filter-aware coordination, the cap, and the quota-aware abort. Live tests in /tmp verify a 3-dim equivalence case (chunked == unchunked, 16 sub-requests, all axes split), 6 edge-case stress scenarios, and 3 mv/filter composition regimes. Depends on #273 (paginated silent-truncation fix) — this PR multiplies the frequency at which the silent-truncation bug class would have surfaced. Merge order: #273 -> #233 -> this PR. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py
@@ -0,0 +1,354 @@
+"""Multi-value GET-parameter chunking for the Water Data OGC getters.
+
+PR 233 routes most services through GET with comma-separated values
+(e.g. ``monitoring_location_id=USGS-A,USGS-B,...``). Long lists can blow
+the server's ~8 KB URL byte limit. This module adds a decorator that
+sits OUTSIDE ``filters.chunked`` and splits multi-value list params
+across multiple sub-requests so each URL fits.
+
+Design (orthogonal to filter chunking):
+
+- N-dimensional cartesian product: for each chunkable list param, the
+  values are partitioned into sub-lists; the planner emits the cartesian
+  product of those partitions. Sub-chunks of the same dim never overlap,
+  so frame concat needs no dedup across multi-value chunks.
+- Greedy halving of the largest chunk in any dim until the worst-case
+  sub-request URL fits the limit. Minimises total request count.
+- Date params, ``bbox``, and ``properties`` are not chunked: dates are
+  intervals not enumerable sets; bbox is a coord array; ``properties``
+  determines output schema and chunking it would shard columns.
+
+Coordination with ``filters.chunked``:
+The planner probes URL length with a single top-level OR-clause (see
+``_filter_aware_probe_args``) when a chunkable filter is present, not
+the full filter. ``filters.chunked`` (inner) will split the filter per
+sub-request, so probing with one clause models the per-sub-request URL
+the stack will actually produce. Without this, a long OR-filter plus
+multi-value lists would trigger a premature ``RequestTooLarge`` even
+though the combined chunkers would have made things fit.
+"""
+
+from __future__ import annotations
+
+import functools
+import itertools
+from collections.abc import Callable
+from typing import Any, TypeVar
+
+import pandas as pd
+import requests
+
+from . import filters
+from .filters import (
+    _combine_chunk_frames,
+    _combine_chunk_responses,
+    _is_chunkable,
+    _split_top_level_or,
+)
+
+# Params that look like lists but must NOT be chunked. ``properties`` is
+# excluded because it defines the response schema; chunking it would
+# return frames with different columns per sub-request. ``bbox`` is a
+# coordinate array, not a set of alternatives. Date params are
+# intervals, not sets. The CQL ``filter`` (and its ``filter_lang``) is a
+# string that has its own inner chunker (``filters.chunked``); if a
+# caller passes ``filter`` as a list, treating it as a multi-value param
+# would emit malformed CQL.
+_NEVER_CHUNK = frozenset(
+    {
+        "properties",
+        "bbox",
+        "datetime",
+        "last_modified",
+        "begin",
+        "begin_utc",
+        "end",
+        "end_utc",
+        "time",
+        "filter",
+        "filter_lang",
+    }
+)
+
+# Default cap on the number of sub-requests a single chunked call may
+# emit. The USGS Water Data API rate-limits each HTTP request (including
+# pagination), so the true budget is ``hourly_quota / avg_pages_per_chunk``.
+# 1000 matches the default hourly quota and is a reasonable upper bound
+# for single-page sub-requests; tune lower if your queries paginate.
+# Override per-decorator via ``max_chunks=`` or by monkeypatching this
+# module attribute (read lazily in the wrapper). The planner rejects a
+# plan only when its size is strictly greater than the cap, so a plan of
+# exactly this many sub-requests is allowed.
+_DEFAULT_MAX_CHUNKS = 1000
+
+# When ``x-ratelimit-remaining`` drops below this between sub-requests,
+# the chunker bails with ``QuotaExhausted`` rather than risk a mid-call
+# HTTP 429. Carries the partial result so callers can resume from a
+# known offset instead of retrying the whole chunked call from scratch.
+# The comparison is strict: a remaining value equal to the floor does
+# not abort.
+_DEFAULT_QUOTA_SAFETY_FLOOR = 50
+
+
+class RequestTooLarge(ValueError):
+    """Raised when a chunked request cannot be issued.
+
+    Two cases:
+    (1) the URL exceeds the byte limit even with every multi-value param
+    at a singleton chunk and any chunkable filter reduced to a single
+    top-level OR-clause; (2) the cartesian-product plan would issue more
+    than ``max_chunks`` sub-requests."""
+
+
+class QuotaExhausted(RuntimeError):
+    """Raised mid-chunked-call when the API's reported remaining quota
+    (``x-ratelimit-remaining`` header) drops below the configured safety
+    floor. The chunker stops before issuing the next sub-request to
+    avoid a mid-call HTTP 429 that would silently truncate paginated
+    results (see PR #273 for the pagination side of that bug).
+
+    The exception carries everything needed to resume: the combined
+    partial frame from completed sub-requests, the metadata for the
+    last successful sub-request, the number of chunks completed out of
+    the plan total, and the last-observed ``remaining`` value.
+
+    Attributes
+    ----------
+    partial_frame : pd.DataFrame
+        Concatenated, deduplicated result of every sub-request that
+        completed before the floor was crossed.
+    partial_response : requests.Response
+        Aggregated response (URL/headers of the first sub-request,
+        summed ``elapsed``). Wrap in ``BaseMetadata`` to surface to
+        the caller alongside the partial frame.
+    completed_chunks : int
+        Number of sub-requests successfully completed.
+    total_chunks : int
+        Total sub-requests in the cartesian-product plan.
+    remaining : int
+        Last observed ``x-ratelimit-remaining`` value.
+    """
+
+    def __init__(
+        self,
+        *,
+        partial_frame: pd.DataFrame,
+        partial_response: requests.Response,
+        completed_chunks: int,
+        total_chunks: int,
+        remaining: int,
+    ) -> None:
+        super().__init__(
+            f"x-ratelimit-remaining dropped to {remaining} after "
+            f"{completed_chunks}/{total_chunks} chunks; aborting to avoid "
+            f"mid-call HTTP 429. Catch QuotaExhausted to access "
+            f".partial_frame and resume from chunk {completed_chunks}."
+        )
+        self.partial_frame = partial_frame
+        self.partial_response = partial_response
+        self.completed_chunks = completed_chunks
+        self.total_chunks = total_chunks
+        self.remaining = remaining
+
+
+def _chunkable_params(args: dict[str, Any]) -> dict[str, list]:
+    """Return ``{name: list(values)}`` for every list/tuple kwarg with
+    >1 element that is allowed to chunk."""
+    return {
+        k: list(v)
+        for k, v in args.items()
+        if k not in _NEVER_CHUNK and isinstance(v, (list, tuple)) and len(v) > 1
+    }
+
+
+def _filter_aware_probe_args(args: dict[str, Any]) -> dict[str, Any]:
+    """Substitute the filter with its shortest top-level OR-clause if the
+    filter is chunkable, otherwise return ``args`` unchanged.
+
+    The inner ``filters.chunked`` decorator will reduce the filter per
+    sub-request to at most one OR-clause (its hard floor — see
+    ``_chunk_cql_or``). Probing with that minimum models the per-sub-
+    request URL the decorator stack will actually emit, so we don't
+    plan around bytes the filter chunker has already promised to remove.
+    """
+    filter_expr = args.get("filter")
+    filter_lang = args.get("filter_lang")
+    if not _is_chunkable(filter_expr, filter_lang):
+        return args
+    parts = _split_top_level_or(filter_expr)
+    if len(parts) < 2:
+        return args  # one-clause filter — filter chunker can't shrink it
+    return {**args, "filter": min(parts, key=len)}
+
+
+def _worst_case_args(
+    probe_args: dict[str, Any], plan: dict[str, list[list]]
+) -> dict[str, Any]:
+    """Args dict using the LARGEST chunk from each dim — represents the
+    most byte-heavy sub-request the plan will issue, with the filter
+    already reduced to its filter-chunker floor."""
+    out = dict(probe_args)
+    for k, chunks in plan.items():
+        out[k] = max(chunks, key=lambda c: len(",".join(map(str, c))))
+    return out
+
+
+def _plan_chunks(
+    args: dict[str, Any],
+    build_request: Callable[..., Any],
+    url_limit: int,
+    max_chunks: int = _DEFAULT_MAX_CHUNKS,
+) -> dict[str, list[list]] | None:
+    """Greedy halving until the worst-case sub-request URL fits.
+
+    Returns ``None`` when no chunking is needed (request as-is fits or
+    no chunkable lists). Raises ``RequestTooLarge`` when:
+    - every multi-value param is already a singleton chunk AND the
+      filter (if any) is already at its smallest OR-clause and the URL
+      still exceeds ``url_limit`` (irreducible), or
+    - the converged cartesian-product plan would issue more than
+      ``max_chunks`` sub-requests (hourly API budget).
+    """
+    chunkable = _chunkable_params(args)
+    if not chunkable:
+        return None
+    probe_args = _filter_aware_probe_args(args)
+    if len(build_request(**probe_args).url) <= url_limit:
+        return None
+
+    plan: dict[str, list[list]] = {k: [v] for k, v in chunkable.items()}
+
+    while True:
+        worst = _worst_case_args(probe_args, plan)
+        if len(build_request(**worst).url) <= url_limit:
+            break
+
+        # Find the single biggest chunk across all dims and halve it.
+        best: tuple[str, int, int] | None = None  # (dim, chunk_index, size)
+        for dim, dim_chunks in plan.items():
+            for idx, chunk in enumerate(dim_chunks):
+                if len(chunk) <= 1:
+                    continue
+                size = len(",".join(map(str, chunk)))
+                if best is None or size > best[2]:
+                    best = (dim, idx, size)
+
+        if best is None:
+            raise RequestTooLarge(
+                f"Request URL exceeds {url_limit} bytes even with every "
+                f"multi-value parameter at a singleton chunk and any "
+                f"chunkable filter reduced to one OR-clause. Reduce the "
+                f"number of values or split the call manually."
+            )
+        dim, idx, _ = best
+        big = plan[dim][idx]
+        mid = len(big) // 2
+        plan[dim] = plan[dim][:idx] + [big[:mid], big[mid:]] + plan[dim][idx + 1 :]
+
+    total = 1
+    for chunks in plan.values():
+        total *= len(chunks)
+    if total > max_chunks:
+        raise RequestTooLarge(
+            f"Chunked plan would issue {total} sub-requests, exceeding "
+            f"max_chunks={max_chunks} (USGS API's default hourly rate "
+            f"limit per key). Reduce input list sizes, narrow the time "
+            f"window, or raise max_chunks if you have a higher quota."
+        )
+    return plan
+
+
+# Type of the function ``multi_value_chunked`` wraps: it takes the
+# prepared-args dict and returns ``(frame, response)``. Declared as a
+# bound TypeVar so the decorator is typed as returning the same callable
+# type it receives.
+_FetchOnce = TypeVar(
+    "_FetchOnce",
+    bound=Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]],
+)
+
+
+def _read_remaining(response: requests.Response) -> int:
+    """Parse ``x-ratelimit-remaining`` from a response. Missing or
+    malformed header → return a large sentinel so the safety check
+    treats it as 'plenty of quota' (don't abort on header glitches)."""
+    raw = response.headers.get("x-ratelimit-remaining")
+    if raw is None:
+        return 10**9
+    try:
+        return int(raw)
+    except (TypeError, ValueError):
+        return 10**9
+
+
+def multi_value_chunked(
+    *,
+    build_request: Callable[..., Any],
+    url_limit: int | None = None,
+    max_chunks: int | None = None,
+    quota_safety_floor: int | None = None,
+) -> Callable[[_FetchOnce], _FetchOnce]:
+    """Decorator that splits multi-value list params across sub-requests so
+    each URL fits ``url_limit`` bytes (defaults to ``filters._WATERDATA_
+    URL_BYTE_LIMIT``) and the cartesian-product plan stays ≤ ``max_chunks``
+    sub-requests (defaults to ``_DEFAULT_MAX_CHUNKS``). All defaults are
+    resolved at call time so tests/users that patch the module constants
+    affect this decorator uniformly.
+
+    Between sub-requests the wrapper reads ``x-ratelimit-remaining`` from
+    each response. If it drops below ``quota_safety_floor`` (default
+    ``_DEFAULT_QUOTA_SAFETY_FLOOR``), the wrapper raises ``QuotaExhausted``
+    carrying the combined partial result and the chunk offset so callers
+    can resume after the hourly window resets, instead of crashing into
+    a mid-pagination HTTP 429 (which the upstream pagination loop in
+    ``_walk_pages`` historically truncated silently — see PR #273).
+
+    Sits OUTSIDE ``@filters.chunked``: list-chunking is the outer loop,
+    filter-chunking is the inner loop. The wrapped function has the same
+    signature as ``filters.chunked`` expects — ``(args: dict) -> (frame,
+    response)`` — so the two decorators compose cleanly. The planner is
+    filter-aware so it doesn't raise prematurely when the inner filter
+    chunker would have shrunk the per-sub-request URL on its own.
+    """
+
+    def decorator(fetch_once: _FetchOnce) -> _FetchOnce:
+        @functools.wraps(fetch_once)
+        def wrapper(
+            args: dict[str, Any],
+        ) -> tuple[pd.DataFrame, requests.Response]:
+            limit = (
+                url_limit
+                if url_limit is not None
+                else filters._WATERDATA_URL_BYTE_LIMIT
+            )
+            cap = max_chunks if max_chunks is not None else _DEFAULT_MAX_CHUNKS
+            floor = (
+                quota_safety_floor
+                if quota_safety_floor is not None
+                else _DEFAULT_QUOTA_SAFETY_FLOOR
+            )
+            plan = _plan_chunks(args, build_request, limit, cap)
+            if plan is None:
+                return fetch_once(args)
+
+            keys = list(plan)
+            total = 1
+            for k in keys:
+                total *= len(plan[k])
+            frames: list[pd.DataFrame] = []
+            responses: list[requests.Response] = []
+            for i, combo in enumerate(itertools.product(*(plan[k] for k in keys))):
+                sub_args = {**args, **dict(zip(keys, combo))}
+                frame, response = fetch_once(sub_args)
+                frames.append(frame)
+                responses.append(response)
+                # Quota check happens BETWEEN sub-requests: skip on the
+                # last iteration because there's nothing left to abort.
+                if i < total - 1:
+                    remaining = _read_remaining(response)
+                    if remaining < floor:
+                        raise QuotaExhausted(
+                            partial_frame=_combine_chunk_frames(frames),
+                            partial_response=_combine_chunk_responses(responses),
+                            completed_chunks=i + 1,
+                            total_chunks=total,
+                            remaining=remaining,
+                        )
+
+            return (
+                _combine_chunk_frames(frames),
+                _combine_chunk_responses(responses),
+            )
+
+        return wrapper  # type: ignore[return-value]
+
+    return decorator
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
@@ -14,7 +14,7 @@
 
 from dataretrieval import __version__
 from dataretrieval.utils import BaseMetadata
-from dataretrieval.waterdata import filters
+from dataretrieval.waterdata import chunking, filters
 from dataretrieval.waterdata.types import (
     PROFILE_LOOKUP,
     PROFILES,
@@ -912,17 +912,20 @@ def get_ogc_data(
     return return_list, BaseMetadata(response)
 
 
+@chunking.multi_value_chunked(build_request=_construct_api_requests)
 @filters.chunked(build_request=_construct_api_requests)
 def _fetch_once(
     args: dict[str, Any],
 ) -> tuple[pd.DataFrame, requests.Response]:
     """Send one prepared-args OGC request; return the frame + response.
 
-    Filter chunking is added orthogonally by the ``@filters.chunked``
-    decorator: with no filter (or an un-chunkable one) the decorator
-    passes ``args`` through to this body; with a chunkable filter it
-    fans out and calls this body once per sub-filter, then combines.
-    Either way the return shape is ``(frame, response)``.
+    Two orthogonal chunkers wrap this body. ``@chunking.multi_value_chunked``
+    (outer) splits multi-value list params (e.g. ``monitoring_location_id``)
+    across sub-requests so each URL fits the server byte limit; the
+    cartesian product of per-dim chunks is iterated. ``@filters.chunked``
+    (inner) splits long cql-text filters at top-level ``OR``. With no
+    chunkable inputs both pass through unchanged. Either way the return
+    shape is ``(frame, response)``.
     """
     req = _construct_api_requests(**args)
     return _walk_pages(geopd=GEOPANDAS, req=req)
diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py