Skip to content

Commit ee550be

Browse files
committed
refactor(waterdata): Polish — extract _resolve_max_chunks, tidy iter_sub_args
Three micro-refinements after the previous pass settles. No behavior change; 130 tests stay green. - Extract _resolve_max_chunks() so the default + validation rule for ``max_chunks`` lives in one place, called from both _plan_list_chunks and _plan_joint. The 5-line if-None/if-<1 block was duplicated verbatim. - _iter_sub_args drops its explicit ``list_keys = list(list_plan)`` cache; iterating ``list_plan`` directly gives the same insertion-order sequence (Python 3.7+ dict guarantee), and ``zip(list_plan, combo)`` reads as "pair each list-dim name with its chunk for this combo". - Tighten the wrapper's option resolution to the "default if None else passed" form so each line reads in argument order. - Categorize the _NEVER_CHUNK comment so future additions land in the right category instead of a flat narrative.
1 parent 4e82722 commit ee550be

1 file changed

Lines changed: 33 additions & 35 deletions

File tree

dataretrieval/waterdata/chunking.py

Lines changed: 33 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -51,14 +51,16 @@
5151
_split_top_level_or,
5252
)
5353

54-
# Params the multi-value chunker must NOT split. ``properties`` defines
55-
# the response schema; chunking it would shard columns. ``bbox`` is a
56-
# fixed 4-element coord tuple. Date params are intervals not sets.
57-
# ``filter`` has its own OR-clause partitioning inside ``_plan_joint``;
58-
# treating it as a multi-value list would emit malformed CQL.
59-
# ``filter_lang`` is a scalar string. ``limit`` and ``skip_geometry``
60-
# are scalars by contract (smuggling a list through type erasure would
61-
# silently fan into queries with conflicting per-request semantics).
54+
# Params the multi-value chunker must NOT comma-join across sub-requests.
55+
# Categorized to make additions intentional:
56+
# - response shape: ``properties`` defines columns; sharding would
57+
# give different columns per chunk.
58+
# - structured: ``bbox`` is a fixed 4-element coord tuple.
59+
# - intervals: date/time ranges are not enumerable sets.
60+
# - handled elsewhere: ``filter`` gets OR-clause partitioning in
61+
# ``_plan_joint``; multi-value semantics would
62+
# emit malformed CQL.
63+
# - scalar by contract: ``limit``, ``skip_geometry``, ``filter_lang``.
6264
_NEVER_CHUNK = frozenset(
6365
{
6466
"properties",
@@ -213,6 +215,19 @@ def _plan_total(list_plan: dict[str, list[list[Any]]], n_filter_chunks: int) ->
213215
return list_count * n_filter_chunks
214216

215217

218+
def _resolve_max_chunks(max_chunks: int | None) -> int:
219+
"""Apply ``max_chunks`` default + validation. Both planner entry
220+
points share this rule, so pinning it here keeps them in sync."""
221+
if max_chunks is None:
222+
return _DEFAULT_MAX_CHUNKS
223+
if max_chunks < 1:
224+
raise ValueError(
225+
f"max_chunks must be >= 1; got {max_chunks}. Zero or negative "
226+
f"values would silently bypass the cap."
227+
)
228+
return max_chunks
229+
230+
216231
def _worst_case_args(
217232
base_args: dict[str, Any], list_plan: dict[str, list[list[Any]]]
218233
) -> dict[str, Any]:
@@ -263,13 +278,7 @@ def _plan_list_chunks(
263278
when the smallest reducible plan still doesn't fit or when the
264279
plan would exceed ``max_chunks`` sub-requests.
265280
"""
266-
if max_chunks is None:
267-
max_chunks = _DEFAULT_MAX_CHUNKS
268-
if max_chunks < 1:
269-
raise ValueError(
270-
f"max_chunks must be >= 1; got {max_chunks}. Zero or negative "
271-
f"values would silently bypass the cap."
272-
)
281+
max_chunks = _resolve_max_chunks(max_chunks)
273282
chunkable = _chunkable_params(args)
274283
if not chunkable:
275284
return None
@@ -381,13 +390,7 @@ def _plan_joint(
381390
Raises ``RequestTooLarge`` when no candidate fits or the best plan
382391
would exceed ``max_chunks``.
383392
"""
384-
if max_chunks is None:
385-
max_chunks = _DEFAULT_MAX_CHUNKS
386-
if max_chunks < 1:
387-
raise ValueError(
388-
f"max_chunks must be >= 1; got {max_chunks}. Zero or negative "
389-
f"values would silently bypass the cap."
390-
)
393+
max_chunks = _resolve_max_chunks(max_chunks)
391394

392395
filter_expr = args.get("filter")
393396
clauses: list[str] = []
@@ -455,14 +458,11 @@ def _iter_sub_args(
455458
filter_chunks: list[str | None],
456459
) -> Iterator[dict[str, Any]]:
457460
"""Yield the substituted ``args`` for each sub-request in the joint
458-
plan, in deterministic order: list-dim cartesian product (insertion
459-
order, Python 3.7+ guarantee) crossed with filter chunks."""
460-
list_keys = list(list_plan)
461-
list_combos = (
462-
itertools.product(*(list_plan[k] for k in list_keys)) if list_plan else [()]
463-
)
461+
plan, in deterministic order: list-dim cartesian product (dict
462+
insertion order — Python 3.7+ guarantee) crossed with filter chunks."""
463+
list_combos = itertools.product(*list_plan.values()) if list_plan else [()]
464464
for combo in list_combos:
465-
base = {**args, **dict(zip(list_keys, combo))}
465+
base = {**args, **dict(zip(list_plan, combo))}
466466
for filter_chunk in filter_chunks:
467467
yield base if filter_chunk is None else {**base, "filter": filter_chunk}
468468

@@ -518,14 +518,12 @@ def wrapper(
518518
args: dict[str, Any],
519519
) -> tuple[pd.DataFrame, requests.Response]:
520520
limit = (
521-
url_limit
522-
if url_limit is not None
523-
else filters._WATERDATA_URL_BYTE_LIMIT
521+
filters._WATERDATA_URL_BYTE_LIMIT if url_limit is None else url_limit
524522
)
525523
floor = (
526-
quota_safety_floor
527-
if quota_safety_floor is not None
528-
else _DEFAULT_QUOTA_SAFETY_FLOOR
524+
_DEFAULT_QUOTA_SAFETY_FLOOR
525+
if quota_safety_floor is None
526+
else quota_safety_floor
529527
)
530528
plan = _plan_joint(args, build_request, limit, max_chunks)
531529
if plan is None:

0 commit comments

Comments
 (0)