Skip to content

Commit 559b466

Browse files
thodson-usgsclaude
andcommitted
Tighten filters.py per /simplify pass
Four small wins from the review agents' findings: - Type ``chunked``'s decorator factory with a ``TypeVar`` bound to the fetch-once signature. Replaces the three-line ``Callable[[Callable[[dict], tuple[DF, Response]]], Callable[...]]`` dance with ``Callable[[_FetchOnce], _FetchOnce]``. The wrapper's ``@functools.wraps`` already preserves the signature; the TypeVar makes that explicit to type-checkers. - Drop the pointless ``build_request=lambda **kw: _construct_api_requests(**kw)`` wrapper in ``utils.get_ogc_data`` — the function reference itself matches the callable shape. - Factor the shared ``(?!NOT\b)(?P<field>…)\s+(?P<negated>NOT\s+)?`` subpattern used by ``_IN_NUMERIC_RE`` and ``_BETWEEN_NUMERIC_RE`` into a single module-level ``_FIELD_NEGATED`` constant. The two regexes now compose it rather than repeat it. - Drop the dead ``if p`` filter in ``_effective_filter_budget``'s encoding-ratio scan. ``_split_top_level_or`` already drops empties, and ``filter_expr`` is non-empty at that call site (guarded by ``_is_chunkable``). Note the invariant in a one-line comment. - Trim the module docstring's redundant "Exports" section — the two public names (``FILTER_LANG``, ``chunked``) are self-evident at the declarations. All 176 non-live tests still pass. No behaviour change. Skipped from the review (per agents' own guidance or false positives): - Collapsing the three pitfall regex scans into one alternation — different group shapes, different error-message branches; no perf gain worth the complexity. - Unifying ``_iter_or_boundaries`` quote tracking with the pitfall check's mask — narrow duplication, only worth factoring if another CQL helper appears. - Moving test helpers to ``conftest.py`` — not actively duplicated (waterdata_utils_test.py no longer has them). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 7002420 commit 559b466

2 files changed

Lines changed: 30 additions & 31 deletions

File tree

dataretrieval/waterdata/filters.py

Lines changed: 29 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -57,21 +57,16 @@
5757
getters.
5858
- ``__init__.py``: drop the ``FILTER_LANG`` re-export.
5959
60-
Exports
61-
~~~~~~~
62-
63-
- ``FILTER_LANG`` — type alias used by ``api.py`` and re-exported.
64-
- ``chunked`` — decorator used by ``utils.py`` on its fetch helper.
65-
66-
Everything else in this module is private (leading underscore).
60+
Only two names are imported by other modules — ``FILTER_LANG`` and
61+
``chunked``. Everything else is package-private.
6762
"""
6863

6964
from __future__ import annotations
7065

7166
import functools
7267
import re
7368
from collections.abc import Callable, Iterator
74-
from typing import Any, Literal
69+
from typing import Any, Literal, TypeVar
7570
from urllib.parse import quote_plus
7671

7772
import pandas as pd
@@ -139,6 +134,14 @@
139134
_IDENT = r"[A-Za-z_]\w*"
140135
_OP = r">=|<=|<>|!=|==|=|>|<"
141136

137+
# ``<field>`` in ``IN`` / ``BETWEEN`` contexts, with optional ``NOT``
138+
# keyword after the field. ``(?!NOT\b)`` keeps the bare keyword
139+
# ``NOT`` from being captured as the field when the caller writes
140+
# ``value NOT IN (…)``; the ``(?P<negated>NOT\s+)?`` after the field
141+
# captures the negation so the error message can report the offending
142+
# form accurately.
143+
_FIELD_NEGATED = rf"\b(?!NOT\b)(?P<field>{_IDENT})\s+(?P<negated>NOT\s+)?"
144+
142145
_NUMERIC_COMPARE_RE = re.compile(
143146
rf"""
144147
(?:
@@ -155,22 +158,16 @@
155158
)
156159

157160
# ``<field> [NOT] IN (<numeric>, ...)`` — same footgun as a simple
158-
# comparison but using the list form. Caught separately because ``IN``
159-
# isn't one of the comparison operators in ``_OP``. The ``(?!NOT\b)``
160-
# lookahead keeps the bare keyword ``NOT`` from being captured as the
161-
# field when the caller writes ``value NOT IN (…)``; the optional
162-
# ``(?:NOT\s+)?`` after the field captures the negation so the error
163-
# message can report the offending form accurately.
161+
# comparison but using the list form. Caught separately because
162+
# ``IN`` isn't one of the comparison operators in ``_OP``.
164163
_IN_NUMERIC_RE = re.compile(
165-
rf"\b(?!NOT\b)(?P<field>{_IDENT})\s+(?P<negated>NOT\s+)?IN\s*\(\s*{_NUM}",
164+
rf"{_FIELD_NEGATED}IN\s*\(\s*{_NUM}",
166165
re.IGNORECASE,
167166
)
168167

169-
# ``<field> [NOT] BETWEEN <numeric> AND <numeric>`` — range form with
170-
# the same ``NOT`` handling as above.
168+
# ``<field> [NOT] BETWEEN <numeric> AND <numeric>`` — range form.
171169
_BETWEEN_NUMERIC_RE = re.compile(
172-
rf"\b(?!NOT\b)(?P<field>{_IDENT})\s+(?P<negated>NOT\s+)?"
173-
rf"BETWEEN\s+{_NUM}\s+AND\s+{_NUM}\b",
170+
rf"{_FIELD_NEGATED}BETWEEN\s+{_NUM}\s+AND\s+{_NUM}\b",
174171
re.IGNORECASE,
175172
)
176173

@@ -322,8 +319,11 @@ def _effective_filter_budget(
322319
# filter so _chunk_cql_or passes it through unchanged — one 414
323320
# from the server is clearer than a burst of N failing sub-requests.
324321
return len(filter_expr) + 1
322+
# ``_split_top_level_or`` already drops empty clauses, and
323+
# ``filter_expr`` is non-empty here (guarded by ``_is_chunkable``),
324+
# so every ``p`` below is non-empty.
325325
parts = _split_top_level_or(filter_expr) or [filter_expr]
326-
encoding_ratio = max(len(quote_plus(p)) / len(p) for p in parts if p)
326+
encoding_ratio = max(len(quote_plus(p)) / len(p) for p in parts)
327327
return max(100, int(available_url_bytes / encoding_ratio))
328328

329329

@@ -445,12 +445,13 @@ def _aggregate_chunk_responses(
445445
return metadata_response
446446

447447

448-
def chunked(
449-
*, build_request: Callable[..., Any]
450-
) -> Callable[
451-
[Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]]],
452-
Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]],
453-
]:
448+
_FetchOnce = TypeVar(
449+
"_FetchOnce",
450+
bound=Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]],
451+
)
452+
453+
454+
def chunked(*, build_request: Callable[..., Any]) -> Callable[[_FetchOnce], _FetchOnce]:
454455
"""Decorator that adds CQL-filter chunking to a single-request fetch.
455456
456457
The wrapped function must have signature
@@ -480,9 +481,7 @@ def chunked(
480481
``.url`` attribute.
481482
"""
482483

483-
def decorator(
484-
fetch_once: Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]],
485-
) -> Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]]:
484+
def decorator(fetch_once: _FetchOnce) -> _FetchOnce:
486485
@functools.wraps(fetch_once)
487486
def wrapper(
488487
args: dict[str, Any],
@@ -504,6 +503,6 @@ def wrapper(
504503

505504
return _combine_chunk_frames(frames), _aggregate_chunk_responses(responses)
506505

507-
return wrapper
506+
return wrapper # type: ignore[return-value]
508507

509508
return decorator

dataretrieval/waterdata/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -845,7 +845,7 @@ def get_ogc_data(
845845
return return_list, BaseMetadata(response)
846846

847847

848-
@filters.chunked(build_request=lambda **kw: _construct_api_requests(**kw))
848+
@filters.chunked(build_request=_construct_api_requests)
849849
def _fetch_once(
850850
args: dict[str, Any],
851851
) -> tuple[pd.DataFrame, requests.Response]:

0 commit comments

Comments
 (0)