Skip to content

Commit 4e82722

Browse files
committed
refactor(waterdata): Tighten the joint chunker
Three small extractions and one minor optimization. No behavior change; 130 chunker/filter tests stay green. _iter_sub_args generator yields per-sub-request args dicts; the wrapper's nested-loop-with-manual-counter collapses to ``for i, sub_args in enumerate(...)``. The "is this the last sub-request" branch in the quota-floor check flips to ``if i == total - 1: continue`` so the gate is a guard clause rather than the body of an inverse condition. _finalize_response folds the ``_combine_chunk_responses(responses); response.url = canonical_url`` pattern (used in both the success path and the QuotaExhausted partial-state payload) into one helper. _filter_candidates generator emits ``(filter_chunks, worst_filter)`` pairs for each candidate filter chunk count; ``_plan_joint`` then iterates candidates uniformly without the ``if filter_chunkable: ... else: ...`` fork. The redundant ``filter_chunkable`` flag is gone — ``len(clauses) >= 2`` is the single truth. Per-iteration optimization: ``{**args, **list_overrides}`` was being recomputed for every filter chunk; now built once per outer combo and reused (or shallow-overridden when a filter substitution applies). Module constants ``_LIST_SEP = ","`` and ``_OR_SEP = " OR "`` replace the scattered string literals — the two chunking dimensions are now self-documenting at every call site that sizes them.
1 parent 10858e9 commit 4e82722

1 file changed

Lines changed: 98 additions & 58 deletions

File tree

dataretrieval/waterdata/chunking.py

Lines changed: 98 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
import functools
3535
import itertools
3636
import math
37-
from collections.abc import Callable
37+
from collections.abc import Callable, Iterator
3838
from typing import Any
3939
from urllib.parse import quote_plus
4040

@@ -98,6 +98,14 @@
9898
# so a missing/malformed header doesn't trigger spurious aborts.
9999
_QUOTA_UNKNOWN = 10**9
100100

101+
# Separators the two chunking dimensions use to compose their atoms
102+
# into URL-encoded blobs. List dims comma-join values
103+
# (``site=USGS-A,USGS-B``); filter dims OR-join clauses
104+
# (``filter=a='1' OR a='2'``). Pinned as constants so the URL-byte
105+
# sizing helper and the partition logic agree on the join shape.
106+
_LIST_SEP = ","
107+
_OR_SEP = " OR "
108+
101109

102110
class RequestTooLarge(ValueError):
103111
"""Raised when a chunked request cannot be issued. Either the URL
@@ -214,7 +222,7 @@ def _worst_case_args(
214222
in by the caller)."""
215223
out = dict(base_args)
216224
for k, chunks in list_plan.items():
217-
out[k] = max(chunks, key=lambda c: _joined_url_bytes(c, ","))
225+
out[k] = max(chunks, key=lambda c: _joined_url_bytes(c, _LIST_SEP))
218226
return out
219227

220228

@@ -282,7 +290,9 @@ def _plan_list_chunks(
282290
if len(chunk) > 1
283291
)
284292
biggest = max(
285-
splittable, key=lambda t: _joined_url_bytes(t[2], ","), default=None
293+
splittable,
294+
key=lambda t: _joined_url_bytes(t[2], _LIST_SEP),
295+
default=None,
286296
)
287297
if biggest is None:
288298
raise RequestTooLarge(
@@ -320,6 +330,32 @@ def _filter_chunk_counts(n_clauses: int) -> list[int]:
320330
return counts
321331

322332

333+
def _filter_candidates(
334+
clauses: list[str], original_filter: str | None
335+
) -> Iterator[tuple[list[str | None], str | None]]:
336+
"""Yield ``(filter_chunks, worst_filter)`` for each candidate filter
337+
chunk count. ``filter_chunks`` is the list of OR-joined sub-filters
338+
the wrapper will substitute one at a time; ``worst_filter`` is the
339+
longest URL-encoded chunk, used by the planner to size the list dims
340+
against the most demanding sub-request.
341+
342+
Falls through to a single ``(filter_chunks=[original_filter], None)``
343+
candidate when the filter has no top-level OR splits (single clause,
344+
cql-json, missing filter): the wrapper still iterates that one
345+
"chunk" but the planner can skip substituting a filter into the
346+
URL probe."""
347+
if len(clauses) < 2:
348+
chunks: list[str | None] = (
349+
[original_filter] if original_filter is not None else [None]
350+
)
351+
yield chunks, None
352+
return
353+
for k in _filter_chunk_counts(len(clauses)):
354+
groups = _partition_clauses(clauses, k)
355+
worst = max(groups, key=lambda g: _joined_url_bytes(g, _OR_SEP))
356+
yield [_OR_SEP.join(g) for g in groups], _OR_SEP.join(worst)
357+
358+
323359
def _plan_joint(
324360
args: dict[str, Any],
325361
build_request: Callable[..., Any],
@@ -358,42 +394,30 @@ def _plan_joint(
358394
if _is_chunkable(filter_expr, args.get("filter_lang")):
359395
_check_numeric_filter_pitfall(filter_expr)
360396
clauses = _split_top_level_or(filter_expr)
361-
# Filter is chunkable only when there are ≥2 top-level OR clauses;
362-
# a single clause can't be split losslessly.
363-
filter_chunkable = len(clauses) >= 2
364397

365-
if not _chunkable_params(args) and not filter_chunkable:
398+
if not _chunkable_params(args) and len(clauses) < 2:
366399
return None
367400
if _request_bytes(build_request(**args)) <= url_limit:
368401
return None
369402

370-
candidate_counts = _filter_chunk_counts(len(clauses)) if filter_chunkable else [1]
371403
best: tuple[int, dict[str, list[list[Any]]], list[str | None]] | None = None
372404
last_error: RequestTooLarge | None = None
373405

374-
for k in candidate_counts:
375-
if filter_chunkable:
376-
groups = _partition_clauses(clauses, k)
377-
worst_group = max(groups, key=lambda g: _joined_url_bytes(g, " OR "))
378-
filter_chunks: list[str | None] = [" OR ".join(g) for g in groups]
379-
plan_args = {**args, "filter": " OR ".join(worst_group)}
380-
else:
381-
filter_chunks = [filter_expr] if filter_expr is not None else [None]
382-
plan_args = args
383-
384-
per_filter_cap = max(1, max_chunks // k)
406+
for filter_chunks, worst_filter in _filter_candidates(clauses, filter_expr):
407+
k = len(filter_chunks)
408+
plan_args = args if worst_filter is None else {**args, "filter": worst_filter}
385409
try:
386410
list_plan = _plan_list_chunks(
387-
plan_args, build_request, url_limit, per_filter_cap
411+
plan_args, build_request, url_limit, max(1, max_chunks // k)
388412
)
389413
except RequestTooLarge as exc:
390414
last_error = exc
391415
continue
392416
if list_plan is None:
393417
list_plan = {}
394-
# When there are no list dims to shrink, ``_plan_list_chunks``
395-
# returns ``None`` regardless of whether the request actually
396-
# fits. Filter chunking alone has to close the gap — verify it.
418+
# ``_plan_list_chunks`` returns ``None`` when no list dims are
419+
# chunkable, regardless of whether the request actually fits.
420+
# Filter chunking alone has to close the gap — verify it.
397421
if not list_plan and _request_bytes(build_request(**plan_args)) > url_limit:
398422
continue
399423
total = _plan_total(list_plan, k)
@@ -425,6 +449,34 @@ def _read_remaining(response: requests.Response) -> int:
425449
return _QUOTA_UNKNOWN
426450

427451

452+
def _iter_sub_args(
453+
args: dict[str, Any],
454+
list_plan: dict[str, list[list[Any]]],
455+
filter_chunks: list[str | None],
456+
) -> Iterator[dict[str, Any]]:
457+
"""Yield the substituted ``args`` for each sub-request in the joint
458+
plan, in deterministic order: list-dim cartesian product (insertion
459+
order, Python 3.7+ guarantee) crossed with filter chunks."""
460+
list_keys = list(list_plan)
461+
list_combos = (
462+
itertools.product(*(list_plan[k] for k in list_keys)) if list_plan else [()]
463+
)
464+
for combo in list_combos:
465+
base = {**args, **dict(zip(list_keys, combo))}
466+
for filter_chunk in filter_chunks:
467+
yield base if filter_chunk is None else {**base, "filter": filter_chunk}
468+
469+
470+
def _finalize_response(
471+
responses: list[requests.Response], canonical_url: str
472+
) -> requests.Response:
473+
"""Aggregate per-sub-request responses and restore the canonical
474+
URL representing the user's full original query."""
475+
combined = _combine_chunk_responses(responses)
476+
combined.url = canonical_url
477+
return combined
478+
479+
428480
def multi_value_chunked(
429481
*,
430482
build_request: Callable[..., Any],
@@ -481,44 +533,32 @@ def wrapper(
481533

482534
list_plan, filter_chunks = plan
483535
canonical_url = build_request(**args).url
484-
485-
list_keys = list(list_plan)
486-
list_combos = (
487-
list(itertools.product(*(list_plan[k] for k in list_keys)))
488-
if list_plan
489-
else [()]
490-
)
491-
total = len(list_combos) * len(filter_chunks)
536+
total = _plan_total(list_plan, len(filter_chunks))
492537

493538
frames: list[pd.DataFrame] = []
494539
responses: list[requests.Response] = []
495-
i = 0
496-
for combo in list_combos:
497-
for filter_chunk in filter_chunks:
498-
sub_args = dict(args)
499-
sub_args.update(zip(list_keys, combo))
500-
if filter_chunk is not None:
501-
sub_args["filter"] = filter_chunk
502-
frame, response = fetch_once(sub_args)
503-
frames.append(frame)
504-
responses.append(response)
505-
if i < total - 1:
506-
remaining = _read_remaining(response)
507-
if remaining < floor:
508-
partial = _combine_chunk_responses(responses)
509-
partial.url = canonical_url
510-
raise QuotaExhausted(
511-
partial_frame=_combine_chunk_frames(frames),
512-
partial_response=partial,
513-
completed_chunks=i + 1,
514-
total_chunks=total,
515-
remaining=remaining,
516-
)
517-
i += 1
518-
519-
combined = _combine_chunk_responses(responses)
520-
combined.url = canonical_url
521-
return _combine_chunk_frames(frames), combined
540+
for i, sub_args in enumerate(
541+
_iter_sub_args(args, list_plan, filter_chunks)
542+
):
543+
frame, response = fetch_once(sub_args)
544+
frames.append(frame)
545+
responses.append(response)
546+
if i == total - 1:
547+
continue # last chunk; no next sub-request to gate
548+
remaining = _read_remaining(response)
549+
if remaining < floor:
550+
raise QuotaExhausted(
551+
partial_frame=_combine_chunk_frames(frames),
552+
partial_response=_finalize_response(responses, canonical_url),
553+
completed_chunks=i + 1,
554+
total_chunks=total,
555+
remaining=remaining,
556+
)
557+
558+
return (
559+
_combine_chunk_frames(frames),
560+
_finalize_response(responses, canonical_url),
561+
)
522562

523563
return wrapper # type: ignore[return-value]
524564

0 commit comments

Comments
 (0)