refactor(waterdata): Hone OO shape — ChunkPlan.__init__, _ChunkExecutor, axis-symmetric docstring

thodson-usgs · claude · thodson-usgs · commit 7850186554dc · 2026-05-19T09:40:15.000-05:00
Addresses three PR DOI-USGS#283 review comments:

- **Module docstring reframed for axis symmetry.** The previous text read as "filter is the outer loop, list dims are inner," which obscured that both axis kinds are chunkable dimensions. The new framing leads with "every multi-value list parameter and the filter are chunkable axes" and explains *why* the algorithm enumerates filter counts in the outer loop (filter chunking is discrete in OR-clause cardinality; list dims are continuously halvable) rather than presenting the asymmetry as arbitrary.
- **``ChunkPlan.from_args`` → ``ChunkPlan.__init__``.** Now that the passthrough case is just a trivial plan (never ``None``), the classmethod-constructor pattern was unjustified. ``__init__`` does the planning directly: ``ChunkPlan(args, build_request, url_limit)`` reads as "construct a plan for these args." Dropped ``@dataclass``; the fields are still simple attributes, just assigned in ``__init__``. Extracted the search loop to a free helper ``_search_best_chunking`` so ``__init__`` stays readable.
- **``_ChunkExecution`` → ``_ChunkExecutor``.** Classes should be nouns; "Execution" reads as an event, "Executor" as an actor. Pairs cleanly with ``ChunkPlan`` — the plan is the recipe, the executor runs it.

The wrapper is unchanged in shape:

    return ChunkPlan(args, build_request, limit).execute(fetch_once)

Tests updated to use the direct constructor; all 145 unit tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py
@@ -1,22 +1,25 @@
 """Joint URL-byte chunking for the Water Data OGC getters.
 
-Long multi-value list params (sites, parameter codes, ...) and long
-top-level-``OR`` CQL filters independently risk overflowing the
-server's ~8 KB URL byte limit. ``multi_value_chunked`` builds a
-``ChunkPlan`` that plans both chunking dimensions together, picks the
-allocation that minimizes total sub-requests, and iterates the joint
-cartesian product so every sub-request URL fits. Requests that
-already fit get a trivial single-step plan — the wrapper has one code
-path either way.
-
-Planning: for a filter with ``n_clauses`` top-level OR clauses, try
-candidate filter chunk counts ``k = 1, 2, 4, ..., n_clauses``. For
-each, partition clauses into ``k`` count-balanced groups joined by
-``OR``, take the longest (URL-encoded) group as the worst-case filter,
-then plan list-dim chunking by greedy halving against the remaining
-budget. Keep the candidate with the smallest ``list_count × k``.
-
-Quota: after the first sub-request the execution reads
+A Water Data query has several chunkable axes — every multi-value list
+parameter (sites, parameter codes, …) and the cql-text ``filter``
+(splittable along its top-level OR clauses) — each of which can push
+the URL past the server's ~8 KB byte limit. ``ChunkPlan`` picks a
+fan-out for each axis that minimizes total sub-requests under the URL
+budget; ``_ChunkExecutor`` iterates the joint cartesian product so
+every sub-request URL fits. Requests that already fit get a trivial
+single-step plan — the executor has one code path either way.
+
+Planning treats the two axis kinds symmetrically as "dimensions to
+split," but their granularity differs: a list dim can be halved
+repeatedly, while a filter can only be split into groups of whole
+top-level OR clauses (a clause is never cut in half). The
+planner therefore enumerates candidate filter chunk counts
+(``k = 1, 2, 4, …, n_clauses``); for each, it commits the worst-case
+(longest, URL-encoded) clause group as the filter and greedy-halves
+the list dims against the remaining budget. The candidate with the
+smallest total sub-request count (``list_count × k``) wins.
+
+Quota: after the first sub-request the executor reads
 ``x-ratelimit-remaining``; if the rest of the plan won't fit, it
 raises ``RequestExceedsQuota`` before burning more budget. A 429
 on any sub-request surfaces as ``QuotaExhausted`` carrying whatever
@@ -36,7 +39,6 @@
 import itertools
 import math
 from collections.abc import Callable, Iterator
-from dataclasses import dataclass
 from typing import Any
 from urllib.parse import quote_plus
 
@@ -364,17 +366,62 @@ def _filter_candidates(
         yield [_OR_SEP.join(g) for g in groups], _OR_SEP.join(worst)
 
 
-@dataclass(frozen=True)
-class ChunkPlan:
-    """A precomputed strategy for issuing one user-level request as a
-    sequence of sub-requests whose URLs each fit ``url_limit``.
+def _search_best_chunking(
+    args: dict[str, Any],
+    build_request: Callable[..., Any],
+    url_limit: int,
+    clauses: list[str],
+    filter_expr: str | None,
+) -> tuple[dict[str, list[list[Any]]], list[str | None]]:
+    """Enumerate filter chunk counts and greedy-halve list dims for each;
+    return the ``(list_chunks, filter_chunks)`` pair with the smallest
+    total sub-request count. Raises ``RequestTooLarge`` if no candidate
+    fits ``url_limit``."""
+    best: tuple[int, dict[str, list[list[Any]]], list[str | None]] | None = None
+    last_error: RequestTooLarge | None = None
+
+    for filter_chunks, worst_filter in _filter_candidates(clauses, filter_expr):
+        plan_args = (
+            args if worst_filter is None else {**args, _FILTER_KEY: worst_filter}
+        )
+        try:
+            list_chunks = _plan_list_chunks(plan_args, build_request, url_limit)
+        except RequestTooLarge as exc:
+            last_error = exc
+            continue
+        if list_chunks is None:
+            list_chunks = {}
+        # ``_plan_list_chunks`` returns ``None`` both when no list dims
+        # are chunkable AND when the request fits. In the former case
+        # filter chunking alone has to close the gap — verify the
+        # request fits before committing to a list-empty candidate.
+        if not list_chunks and _request_bytes(build_request(**plan_args)) > url_limit:
+            continue
+        list_count = math.prod((len(c) for c in list_chunks.values()), start=1)
+        total = list_count * len(filter_chunks)
+        if best is None or total < best[0]:
+            best = (total, list_chunks, filter_chunks)
+
+    if best is None:
+        raise last_error or RequestTooLarge(
+            "No filter-chunking candidate produces a fitting plan. "
+            "Reduce list sizes or simplify the filter."
+        )
+    return best[1], best[2]
 
-    ``ChunkPlan.from_args`` always returns a plan, even when no
-    chunking is needed: the passthrough case is represented by empty
-    ``list_chunks`` and a single-element ``filter_chunks=[None]`` so
-    ``total == 1`` and ``iter_sub_args`` yields the original args
-    unchanged. The wrapper's loop is therefore the same shape whether
-    chunking was needed or not.
+
+class ChunkPlan:
+    """A strategy for issuing one user-level request as a sequence of
+    sub-requests whose URLs each fit ``url_limit``. Constructing a plan
+    *is* planning: ``ChunkPlan(args, build_request, url_limit)`` runs
+    the joint search and stores the result, raising ``RequestTooLarge``
+    only when chunking is needed but no candidate plan fits.
+
+    Passthrough requests (nothing to chunk, or already fitting) are
+    represented as a trivial plan with ``list_chunks={}``,
+    ``filter_chunks=[None]``, and ``total == 1``; ``iter_sub_args``
+    yields the original args unchanged. The executor's loop has one
+    shape either way.
 
     Attributes
     ----------
@@ -392,15 +439,46 @@ class ChunkPlan:
         URL of the full original request, used to overwrite the first
         chunk's ``response.url`` so ``BaseMetadata`` reflects the
         user's full query. ``None`` on the nothing-to-chunk passthrough
-        path: ``fetch_once``'s response already carries the canonical
-        URL, so the override is skipped to avoid an extra
+        path — ``fetch_once``'s response already carries the canonical
+        URL there, so the executor skips the override to avoid an extra
         ``build_request`` call on the hot path.
     """
 
-    args: dict[str, Any]
-    list_chunks: dict[str, list[list[Any]]]
-    filter_chunks: list[str | None]
-    canonical_url: str | None
+    def __init__(
+        self,
+        args: dict[str, Any],
+        build_request: Callable[..., Any],
+        url_limit: int,
+    ) -> None:
+        self.args = args
+        # Defaults model the trivial-passthrough shape. ``canonical_url``
+        # is set once a request is built; chunks only if it overflows.
+        self.list_chunks: dict[str, list[list[Any]]] = {}
+        self.filter_chunks: list[str | None] = [None]
+        self.canonical_url: str | None = None
+
+        filter_expr = args.get(_FILTER_KEY)
+        clauses: list[str] = []
+        if _is_chunkable(filter_expr, args.get("filter_lang")):
+            _check_numeric_filter_pitfall(filter_expr)
+            clauses = _split_top_level_or(filter_expr)
+
+        # Trivial passthrough: chunking has no leverage. Skip the
+        # ``build_request`` call entirely — the common Water Data call
+        # shape doesn't pay for an unused request prep.
+        if not _chunkable_params(args) and len(clauses) < 2:
+            return
+
+        initial_request = build_request(**args)
+        self.canonical_url = initial_request.url
+
+        # Already-fits passthrough: chunking is possible but unnecessary.
+        if _request_bytes(initial_request) <= url_limit:
+            return
+
+        self.list_chunks, self.filter_chunks = _search_best_chunking(
+            args, build_request, url_limit, clauses, filter_expr
+        )
 
     @property
     def total(self) -> int:
@@ -433,98 +511,8 @@ def iter_sub_args(self) -> Iterator[dict[str, Any]]:
 
     def execute(self, fetch_once: _FetchOnce) -> tuple[pd.DataFrame, requests.Response]:
         """Run the plan and return the combined result. See
-        ``_ChunkExecution`` for the per-sub-request semantics."""
-        return _ChunkExecution(self, fetch_once).run()
-
-    @classmethod
-    def from_args(
-        cls,
-        args: dict[str, Any],
-        build_request: Callable[..., Any],
-        url_limit: int,
-    ) -> ChunkPlan:
-        """Compute the cheapest joint plan for ``args``. Returns a
-        passthrough plan when the request already fits or nothing's
-        chunkable; raises ``RequestTooLarge`` only when chunking *is*
-        needed but no candidate plan fits ``url_limit``.
-
-        Algorithm: enumerate filter chunk counts ``k = 1, 2, 4, ...,
-        n_clauses``; for each, partition clauses into ``k``
-        count-balanced groups joined by ``OR`` and pick the worst
-        (longest URL-encoded) group; substitute that as the filter
-        and plan list chunking with greedy halving. Keep the candidate
-        whose ``list_count × k`` is smallest.
-        """
-        filter_expr = args.get(_FILTER_KEY)
-        clauses: list[str] = []
-        if _is_chunkable(filter_expr, args.get("filter_lang")):
-            _check_numeric_filter_pitfall(filter_expr)
-            clauses = _split_top_level_or(filter_expr)
-
-        # Trivial passthrough: no multi-value lists and no top-level-OR
-        # filter to split, so chunking has no leverage. Skip the
-        # ``build_request`` call entirely — ``fetch_once``'s response
-        # will carry the canonical URL already (set by
-        # ``_finalize_paginated_response``), so the wrapper can elide
-        # the override. This is the common Water Data call shape, so
-        # the saved request prep is worth a small branch here.
-        if not _chunkable_params(args) and len(clauses) < 2:
-            return cls(
-                args=args, list_chunks={}, filter_chunks=[None], canonical_url=None
-            )
-
-        initial_request = build_request(**args)
-        canonical_url = initial_request.url
-
-        # Already-fits passthrough: chunking is possible but unnecessary.
-        if _request_bytes(initial_request) <= url_limit:
-            return cls(
-                args=args,
-                list_chunks={},
-                filter_chunks=[None],
-                canonical_url=canonical_url,
-            )
-
-        best: tuple[int, dict[str, list[list[Any]]], list[str | None]] | None = None
-        last_error: RequestTooLarge | None = None
-
-        for filter_chunks, worst_filter in _filter_candidates(clauses, filter_expr):
-            plan_args = (
-                args if worst_filter is None else {**args, _FILTER_KEY: worst_filter}
-            )
-            try:
-                list_chunks = _plan_list_chunks(plan_args, build_request, url_limit)
-            except RequestTooLarge as exc:
-                last_error = exc
-                continue
-            if list_chunks is None:
-                list_chunks = {}
-            # ``_plan_list_chunks`` returns ``None`` both when no list
-            # dims are chunkable AND when the request fits. Filter
-            # chunking alone has to close the gap — verify before
-            # committing to a list-empty candidate.
-            if (
-                not list_chunks
-                and _request_bytes(build_request(**plan_args)) > url_limit
-            ):
-                continue
-            list_count = math.prod((len(c) for c in list_chunks.values()), start=1)
-            total = list_count * len(filter_chunks)
-            if best is None or total < best[0]:
-                best = (total, list_chunks, filter_chunks)
-
-        if best is None:
-            raise last_error or RequestTooLarge(
-                "No filter-chunking candidate produces a fitting plan. "
-                "Reduce list sizes or simplify the filter."
-            )
-
-        return cls(
-            args=args,
-            list_chunks=best[1],
-            filter_chunks=best[2],
-            canonical_url=canonical_url,
-        )
+        ``_ChunkExecutor`` for the per-sub-request semantics."""
+        return _ChunkExecutor(self, fetch_once).run()
 
 
 def _read_remaining(response: requests.Response) -> int | None:
@@ -602,12 +590,12 @@ def _combine_chunk_responses(
     return head
 
 
-class _ChunkExecution:
-    """In-flight execution of a ``ChunkPlan``. Issues each sub-request,
-    accumulates frames and responses, translates 429s into
-    ``QuotaExhausted`` with the partial state captured so far, and
-    raises ``RequestExceedsQuota`` after the first sub-request if the
-    rest of the plan won't fit the current rate-limit window."""
+class _ChunkExecutor:
+    """Runs a ``ChunkPlan`` against a ``fetch_once`` callable. Issues
+    each sub-request, accumulates frames and responses, translates 429s
+    into ``QuotaExhausted`` carrying the partial state captured so far,
+    and raises ``RequestExceedsQuota`` after the first sub-request when
+    the rest of the plan won't fit the current rate-limit window."""
 
     def __init__(self, plan: ChunkPlan, fetch_once: _FetchOnce) -> None:
         self.plan = plan
@@ -673,7 +661,7 @@ def multi_value_chunked(
     requests are a trivial single-step plan, so there's one code path
     either way.
 
-    See ``ChunkPlan`` and ``_ChunkExecution`` for planning and
+    See ``ChunkPlan`` and ``_ChunkExecutor`` for planning and
     rate-limit semantics. Exceptions: ``RequestTooLarge`` if no plan
     fits, ``RequestExceedsQuota`` if the remaining plan can't fit the
     current rate-limit window, ``QuotaExhausted`` on a 429 mid-execution.
@@ -685,7 +673,7 @@ def wrapper(
             args: dict[str, Any],
         ) -> tuple[pd.DataFrame, requests.Response]:
             limit = _WATERDATA_URL_BYTE_LIMIT if url_limit is None else url_limit
-            return ChunkPlan.from_args(args, build_request, limit).execute(fetch_once)
+            return ChunkPlan(args, build_request, limit).execute(fetch_once)
 
         return wrapper
 
diff --git a/tests/waterdata_chunking_test.py b/tests/waterdata_chunking_test.py
@@ -144,7 +144,7 @@ def test_chunk_plan_fans_out_filter_when_list_alone_cannot_fit():
     }
     # Singleton list + full filter ≈ 200 + 10 + 86 = 296 (over limit 240).
     # Joint planner must split the filter into k >= 2 groups.
-    plan = ChunkPlan.from_args(args, _fake_build, url_limit=240)
+    plan = ChunkPlan(args, _fake_build, url_limit=240)
     # Either the filter was chunked, the list was chunked, or both.
     assert len(plan.filter_chunks) > 1 or any(
         len(v) > 1 for v in plan.list_chunks.values()
@@ -165,7 +165,7 @@ def test_chunk_plan_minimizes_total_sub_requests():
         "filter": " OR ".join(clauses),
     }
     # Tight limit forces both dims to participate.
-    plan = ChunkPlan.from_args(args, _fake_build, url_limit=380)
+    plan = ChunkPlan(args, _fake_build, url_limit=380)
     # Plan must beat the bail-floor-style worst case (8 singletons × 16
     # filter chunks = 128 sub-requests) by a healthy margin.
     assert plan.total < 128
@@ -182,15 +182,15 @@ def test_chunk_plan_raises_when_smallest_plan_doesnt_fit():
     # Base 200 + singleton site (10) + singleton clause (9) = 219; limit
     # below 219 → no joint plan can fit.
     with pytest.raises(RequestTooLarge):
-        ChunkPlan.from_args(args, _fake_build, url_limit=210)
+        ChunkPlan(args, _fake_build, url_limit=210)
 
 
 def test_chunk_plan_passthrough_when_request_fits():
     """A request that already fits gets a trivial single-step plan:
     no list chunks, ``filter_chunks=[None]``, ``total == 1``. The
     wrapper still iterates it through one fetch_once call."""
     args = {"monitoring_location_id": ["A", "B", "C"]}
-    plan = ChunkPlan.from_args(args, _fake_build, url_limit=8000)
+    plan = ChunkPlan(args, _fake_build, url_limit=8000)
     assert plan.list_chunks == {}
     assert plan.filter_chunks == [None]
     assert plan.total == 1
@@ -202,7 +202,7 @@ def test_chunk_plan_passthrough_when_nothing_chunkable():
     the limit (the server may 414, but the chunker has nothing to
     split)."""
     args = {"monitoring_location_id": "scalar-only"}
-    plan = ChunkPlan.from_args(args, _fake_build, url_limit=10)
+    plan = ChunkPlan(args, _fake_build, url_limit=10)
     assert plan.list_chunks == {}
     assert plan.filter_chunks == [None]
     assert plan.total == 1
@@ -212,7 +212,7 @@ def test_chunk_plan_iter_sub_args_passthrough_yields_original_args_once():
     """The passthrough plan's ``iter_sub_args`` yields exactly one
     sub-args dict equal to the original args (modulo dict identity)."""
     args = {"monitoring_location_id": ["A", "B", "C"], "limit": 100}
-    plan = ChunkPlan.from_args(args, _fake_build, url_limit=8000)
+    plan = ChunkPlan(args, _fake_build, url_limit=8000)
     subs = list(plan.iter_sub_args())
     assert len(subs) == 1
     assert subs[0] == args
@@ -627,7 +627,7 @@ def test_joint_planner_url_construction_long_filter_and_long_sites():
     }
     url_limit = 8000
 
-    plan = ChunkPlan.from_args(args, _construct_api_requests, url_limit)
+    plan = ChunkPlan(args, _construct_api_requests, url_limit)
     assert plan.total > 1, "expected non-trivial plan for over-limit request"
     list_plan = plan.list_chunks
     filter_chunks = plan.filter_chunks