Skip to content

Commit 9314d34

Browse files
thodson-usgsclaude
andcommitted
Extend pitfall check: IN/BETWEEN, scientific notation, short-circuit
From a /simplify pass with edge-case exploration. Empirical probe showed the original check let three real footguns slip past: - ``parameter_code IN (60, 61)`` — the common multi-value code pattern silently matches nothing, because the server stores values like ``'00060'``. - ``value BETWEEN 5 AND 10`` — same footgun via range syntax. - ``value > 1e5`` — scientific-notation floats skipped the numeric literal pattern. Add ``_IN_NUMERIC_RE`` and ``_BETWEEN_NUMERIC_RE`` to catch the list and range forms; extend the numeric literal to cover ``-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?``; factor the error path into ``_raise_pitfall`` so the three regex branches share one message shape. Also: skip the ``re.sub`` masking allocation when the filter has no single quotes (common for short ad-hoc filters that never contain a string literal). Factor ``_NUM`` / ``_IDENT`` / ``_OP`` subpatterns so the three regexes share them. Agent-flagged cases that didn't need fixing (empirically verified): doubled-quote escapes (``'O''Reilly 1000'``), CAST expressions, function-call RHS, digit-starting identifiers. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 40441a9 commit 9314d34

2 files changed

Lines changed: 93 additions & 39 deletions

File tree

dataretrieval/waterdata/utils.py

Lines changed: 77 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -262,21 +262,44 @@ def _format_api_dates(
262262
# enforce client-side is the general one: any ``<identifier> <op>
263263
# <unquoted numeric>`` is a bug — quote the literal or drop the
264264
# comparison and filter in pandas.
265+
266+
# Unquoted numeric literal: integer, decimal, or scientific notation.
267+
_NUM = r"-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?"
268+
_IDENT = r"[A-Za-z_]\w*"
269+
_OP = r">=|<=|<>|!=|==|=|>|<"
270+
265271
_NUMERIC_COMPARE_RE = re.compile(
266-
r"""
272+
rf"""
267273
(?:
268-
\b(?P<field1>[A-Za-z_]\w*)\s*
269-
(?P<op1>>=|<=|<>|!=|==|=|>|<)\s*
270-
(?P<num1>-?\d+(?:\.\d+)?)\b
274+
\b(?P<field1>{_IDENT})\s*
275+
(?P<op1>{_OP})\s*
276+
(?P<num1>{_NUM})\b
271277
|
272-
\b(?P<num2>-?\d+(?:\.\d+)?)\s*
273-
(?P<op2>>=|<=|<>|!=|==|=|>|<)\s*
274-
(?P<field2>[A-Za-z_]\w*)\b
278+
\b(?P<num2>{_NUM})\s*
279+
(?P<op2>{_OP})\s*
280+
(?P<field2>{_IDENT})\b
275281
)
276282
""",
277283
re.VERBOSE,
278284
)
279285

286+
# ``<field> IN (<numeric>, ...)`` — same footgun as simple comparison
287+
# but using the list form. Caught separately because ``IN`` isn't one
288+
# of the comparison operators in ``_OP``. We only need to see one
289+
# unquoted numeric inside the parentheses to know the user intends
290+
# numeric membership.
291+
_IN_NUMERIC_RE = re.compile(
292+
rf"\b(?P<field>{_IDENT})\s+IN\s*\(\s*{_NUM}",
293+
re.IGNORECASE,
294+
)
295+
296+
# ``<field> BETWEEN <numeric> AND <numeric>`` — range form of the same
297+
# footgun.
298+
_BETWEEN_NUMERIC_RE = re.compile(
299+
rf"\b(?P<field>{_IDENT})\s+BETWEEN\s+{_NUM}\s+AND\s+{_NUM}\b",
300+
re.IGNORECASE,
301+
)
302+
280303

281304
def _iter_or_boundaries(expr: str) -> Iterator[tuple[int, int]]:
282305
"""Yield ``(start, end)`` spans of each top-level ``OR`` separator.
@@ -417,43 +440,58 @@ def _effective_filter_budget(args: dict[str, Any], filter_expr: str) -> int:
417440

418441

419442
def _check_numeric_filter_pitfall(filter_expr: str) -> None:
420-
"""Raise if the filter compares any field to an unquoted numeric literal.
443+
"""Raise if the filter pairs any field with an unquoted numeric literal.
421444
422445
Every queryable property on this API is typed as a string on the
423-
server, so an unquoted numeric comparison like ``value >= 1000``
424-
or ``parameter_code = 60`` sorts **lexicographically** rather than
425-
numerically. The failure modes are silent and nasty:
426-
427-
- ``value >= 1000`` matches ``value='12'`` (``'12'`` > ``'1000'``).
428-
- ``parameter_code = 60`` matches no rows, because the actual codes
429-
are zero-padded strings like ``'00060'``.
430-
- ``district_code = 1`` matches only the rare unpadded ``'1'``.
431-
432-
Raising here turns those silent bugs into a loud error. Explicit
433-
string comparisons (``value >= '1000'``) are not flagged — the
434-
quoted literal signals the caller knows the column is textual.
446+
server, so any numeric-looking comparison — ``value >= 1000``,
447+
``parameter_code = 60``, ``parameter_code IN (60, 61)``,
448+
``value BETWEEN 5 AND 10`` — either gets rejected with HTTP 500
449+
or silently produces lexicographic results. Zero-padded codes are
450+
especially nasty (``parameter_code = '60'`` matches nothing because
451+
the real codes are ``'00060'``-shaped).
452+
453+
Explicit string comparisons with quoted literals
454+
(``value >= '1000'``) are not flagged — the caller has signalled
455+
they know the column is textual.
435456
"""
436457
# Blank out single-quoted string literals so ``name = 'value > 5'``
437-
# doesn't false-positive on its own text.
438-
masked = re.sub(r"'[^']*'", "''", filter_expr)
439-
match = _NUMERIC_COMPARE_RE.search(masked)
440-
if not match:
441-
return
442-
field = match.group("field1") or match.group("field2")
443-
op = match.group("op1") or match.group("op2")
444-
num = match.group("num1") or match.group("num2")
458+
# doesn't false-positive. The ``"'" in`` pre-check saves the
459+
# allocation on the common auto-chunked case (many-target OR chains
460+
# always contain quotes, but short ad-hoc filters often don't).
461+
masked = (
462+
re.sub(r"'[^']*'", "''", filter_expr) if "'" in filter_expr else filter_expr
463+
)
464+
465+
compare = _NUMERIC_COMPARE_RE.search(masked)
466+
if compare:
467+
field = compare.group("field1") or compare.group("field2")
468+
offense = (
469+
f"{field} {compare.group('op1') or compare.group('op2')} "
470+
f"{compare.group('num1') or compare.group('num2')}"
471+
)
472+
_raise_pitfall(field, offense)
473+
474+
membership = _IN_NUMERIC_RE.search(masked)
475+
if membership:
476+
field = membership.group("field")
477+
_raise_pitfall(field, f"{field} IN (…)")
478+
479+
between = _BETWEEN_NUMERIC_RE.search(masked)
480+
if between:
481+
field = between.group("field")
482+
_raise_pitfall(field, f"{field} BETWEEN …")
483+
484+
485+
def _raise_pitfall(field: str, offense: str) -> None:
445486
raise ValueError(
446-
f"Filter compares {field!r} to unquoted numeric {num}. Every "
447-
f"queryable on the Water Data API is typed as a string, so "
448-
f"``{field} {op} {num}`` is not a valid numeric comparison — "
449-
f"empirically the server rejects unquoted numeric literals "
450-
f"with HTTP 500. Even if you quote the literal "
451-
f"(``{field} {op} '{num}'``) the comparison is lexicographic, "
452-
f"which silently misses zero-padded codes (e.g. "
453-
f"``parameter_code = '60'`` matches nothing because the real "
454-
f"codes are ``'00060'``-shaped) and sorts ``value='12'`` above "
455-
f"``value='1000'``. For a numeric filter, fetch a wider result "
456-
f"and reduce in pandas after the call."
487+
f"Filter uses an unquoted numeric comparison against {field!r} "
488+
f"(``{offense}``). Every queryable on the Water Data API is "
489+
f"typed as a string, so the server rejects unquoted numeric "
490+
f"literals with HTTP 500; even quoting the literal gives a "
491+
f"lexicographic comparison (``value > '10'`` matches "
492+
f"``value='34.52'``, ``parameter_code = '60'`` matches nothing "
493+
f"because the real codes are ``'00060'``-shaped). For a true "
494+
f"numeric filter, fetch a wider result and reduce in pandas."
457495
)
458496

459497

tests/waterdata_utils_test.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -549,6 +549,17 @@ def fake_construct_api_requests(**kwargs):
549549
# Channel-measurements numeric-looking string fields
550550
"channel_flow > 500",
551551
"channel_velocity >= 1.5",
552+
# Scientific notation — floats expressed as 1e5, 1.5e-3
553+
"value > 1e5",
554+
"value >= 2.5E+3",
555+
"value < 1.5e-3",
556+
# ``IN`` list form — same footgun, common pattern for codes
557+
"parameter_code IN (60, 61)",
558+
"value IN (10, 20, 30)",
559+
"statistic_id in (11)", # case-insensitive, single-element
560+
# ``BETWEEN`` range form — same footgun
561+
"value BETWEEN 5 AND 10",
562+
"channel_flow between 100 and 500",
552563
# Composite expressions
553564
"time >= '2023-01-01T00:00:00Z' AND value >= 1000",
554565
"value > 1000 OR value < 0",
@@ -580,12 +591,17 @@ def test_check_numeric_filter_pitfall_raises(expr):
580591
"monitoring_location_id = 'USGS-02238500'",
581592
"approval_status = 'Approved'",
582593
"qualifier IN ('A', 'P')",
594+
"parameter_code IN ('00060', '00065')",
595+
"value BETWEEN '1' AND '9'",
583596
# Footgun identifiers appearing only inside string literals
584597
"monitoring_location_id = 'USGS-value >= 1000'",
585598
"name = 'why I care about parameter_code = 60'",
586599
"note = 'see district_code = 1 in docs'",
600+
"note = 'quoted: value IN (10, 20) within literal'",
587601
# Multi-clause where every comparison is quoted
588602
"parameter_code = '00060' AND statistic_id = '00011'",
603+
# CQL escape-quote (``O''Reilly``) within a quoted literal
604+
"name = 'O''Reilly 1000'",
589605
],
590606
)
591607
def test_check_numeric_filter_pitfall_allows(expr):

0 commit comments

Comments
 (0)