Extend pitfall check: IN/BETWEEN, scientific notation, short-circuit

thodson-usgs · claude · thodson-usgs · commit 9314d3435aac · 2026-04-24T11:14:35.000-05:00
From a /simplify pass with edge-case exploration. Empirical probe
showed the original check let three real footguns slip past:

- ``parameter_code IN (60, 61)`` — the common multi-value code
  pattern silently matches nothing, because the server stores
  values like ``'00060'``.
- ``value BETWEEN 5 AND 10`` — same footgun via range syntax.
- ``value > 1e5`` — scientific-notation floats skipped the
  numeric literal pattern.

Add ``_IN_NUMERIC_RE`` and ``_BETWEEN_NUMERIC_RE`` to catch the
list and range forms; extend the numeric literal to cover
``-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?``; factor the error path into
``_raise_pitfall`` so the three regex branches share one message
shape.

Also: skip the ``re.sub`` masking allocation when the filter has
no single quotes (common for short ad-hoc filters that never
contain a string literal). Factor ``_NUM`` / ``_IDENT`` / ``_OP``
subpatterns so the three regexes share them.

Agent-flagged cases that didn't need fixing (empirically
verified): doubled-quote escapes (``'O''Reilly 1000'``), CAST
expressions, function-call RHS, digit-starting identifiers.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
@@ -262,21 +262,44 @@ def _format_api_dates(
 # enforce client-side is the general one: any ``<identifier> <op>
 # <unquoted numeric>`` is a bug — quote the literal or drop the
 # comparison and filter in pandas.
+
+# Unquoted numeric literal: integer, decimal, or scientific notation.
+_NUM = r"-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?"
+_IDENT = r"[A-Za-z_]\w*"
+_OP = r">=|<=|<>|!=|==|=|>|<"
+
 _NUMERIC_COMPARE_RE = re.compile(
-    r"""
+    rf"""
     (?:
-        \b(?P<field1>[A-Za-z_]\w*)\s*
-        (?P<op1>>=|<=|<>|!=|==|=|>|<)\s*
-        (?P<num1>-?\d+(?:\.\d+)?)\b
+        \b(?P<field1>{_IDENT})\s*
+        (?P<op1>{_OP})\s*
+        (?P<num1>{_NUM})\b
     |
-        \b(?P<num2>-?\d+(?:\.\d+)?)\s*
-        (?P<op2>>=|<=|<>|!=|==|=|>|<)\s*
-        (?P<field2>[A-Za-z_]\w*)\b
+        \b(?P<num2>{_NUM})\s*
+        (?P<op2>{_OP})\s*
+        (?P<field2>{_IDENT})\b
     )
     """,
     re.VERBOSE,
 )
 
+# ``<field> IN (<numeric>, ...)`` — same footgun as simple comparison
+# but using the list form. Caught separately because ``IN`` isn't one
+# of the comparison operators in ``_OP``. Only the first list element
+# is inspected: an unquoted numeric in that position is taken as the
+# signal that the user intends numeric membership.
+_IN_NUMERIC_RE = re.compile(
+    rf"\b(?P<field>{_IDENT})\s+IN\s*\(\s*{_NUM}",
+    re.IGNORECASE,
+)
+
+# ``<field> BETWEEN <numeric> AND <numeric>`` — range form of the same
+# footgun.
+_BETWEEN_NUMERIC_RE = re.compile(
+    rf"\b(?P<field>{_IDENT})\s+BETWEEN\s+{_NUM}\s+AND\s+{_NUM}\b",
+    re.IGNORECASE,
+)
+
 
 def _iter_or_boundaries(expr: str) -> Iterator[tuple[int, int]]:
     """Yield ``(start, end)`` spans of each top-level ``OR`` separator.
@@ -417,43 +440,58 @@ def _effective_filter_budget(args: dict[str, Any], filter_expr: str) -> int:
 
 
 def _check_numeric_filter_pitfall(filter_expr: str) -> None:
-    """Raise if the filter compares any field to an unquoted numeric literal.
+    """Raise if the filter pairs any field with an unquoted numeric literal.
 
     Every queryable property on this API is typed as a string on the
-    server, so an unquoted numeric comparison like ``value >= 1000``
-    or ``parameter_code = 60`` sorts **lexicographically** rather than
-    numerically. The failure modes are silent and nasty:
-
-    - ``value >= 1000`` matches ``value='12'`` (``'12'`` > ``'1000'``).
-    - ``parameter_code = 60`` matches no rows, because the actual codes
-      are zero-padded strings like ``'00060'``.
-    - ``district_code = 1`` matches only the rare unpadded ``'1'``.
-
-    Raising here turns those silent bugs into a loud error. Explicit
-    string comparisons (``value >= '1000'``) are not flagged — the
-    quoted literal signals the caller knows the column is textual.
+    server, so any numeric-looking comparison — ``value >= 1000``,
+    ``parameter_code = 60``, ``parameter_code IN (60, 61)``,
+    ``value BETWEEN 5 AND 10`` — either gets rejected with HTTP 500
+    or silently produces lexicographic results. Zero-padded codes are
+    especially nasty (``parameter_code = '60'`` matches nothing because
+    the real codes are ``'00060'``-shaped).
+
+    Explicit string comparisons with quoted literals
+    (``value >= '1000'``) are not flagged — the caller has signalled
+    they know the column is textual.
     """
     # Blank out single-quoted string literals so ``name = 'value > 5'``
-    # doesn't false-positive on its own text.
-    masked = re.sub(r"'[^']*'", "''", filter_expr)
-    match = _NUMERIC_COMPARE_RE.search(masked)
-    if not match:
-        return
-    field = match.group("field1") or match.group("field2")
-    op = match.group("op1") or match.group("op2")
-    num = match.group("num1") or match.group("num2")
+    # doesn't false-positive. The ``"'" in`` pre-check skips the
+    # substitution (and its allocation) for quote-free filters: short
+    # ad-hoc filters often have none; many-target OR chains always do.
+    masked = (
+        re.sub(r"'[^']*'", "''", filter_expr) if "'" in filter_expr else filter_expr
+    )
+
+    compare = _NUMERIC_COMPARE_RE.search(masked)
+    if compare:
+        field = compare.group("field1") or compare.group("field2")
+        offense = (
+            f"{field} {compare.group('op1') or compare.group('op2')} "
+            f"{compare.group('num1') or compare.group('num2')}"
+        )
+        _raise_pitfall(field, offense)
+
+    membership = _IN_NUMERIC_RE.search(masked)
+    if membership:
+        field = membership.group("field")
+        _raise_pitfall(field, f"{field} IN (…)")
+
+    between = _BETWEEN_NUMERIC_RE.search(masked)
+    if between:
+        field = between.group("field")
+        _raise_pitfall(field, f"{field} BETWEEN …")
+
+
+def _raise_pitfall(field: str, offense: str) -> None:
     raise ValueError(
-        f"Filter compares {field!r} to unquoted numeric {num}. Every "
-        f"queryable on the Water Data API is typed as a string, so "
-        f"``{field} {op} {num}`` is not a valid numeric comparison — "
-        f"empirically the server rejects unquoted numeric literals "
-        f"with HTTP 500. Even if you quote the literal "
-        f"(``{field} {op} '{num}'``) the comparison is lexicographic, "
-        f"which silently misses zero-padded codes (e.g. "
-        f"``parameter_code = '60'`` matches nothing because the real "
-        f"codes are ``'00060'``-shaped) and sorts ``value='12'`` above "
-        f"``value='1000'``. For a numeric filter, fetch a wider result "
-        f"and reduce in pandas after the call."
+        f"Filter uses an unquoted numeric comparison against {field!r} "
+        f"(``{offense}``). Every queryable on the Water Data API is "
+        f"typed as a string, so the server rejects unquoted numeric "
+        f"literals with HTTP 500; even quoting the literal gives a "
+        f"lexicographic comparison (``value > '10'`` matches "
+        f"``value='34.52'``, ``parameter_code = '60'`` matches nothing "
+        f"because the real codes are ``'00060'``-shaped). For a true "
+        f"numeric filter, fetch a wider result and reduce in pandas."
     )
 
 
diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py
@@ -549,6 +549,17 @@ def fake_construct_api_requests(**kwargs):
         # Channel-measurements numeric-looking string fields
         "channel_flow > 500",
         "channel_velocity >= 1.5",
+        # Scientific notation — floats expressed as 1e5, 1.5e-3
+        "value > 1e5",
+        "value >= 2.5E+3",
+        "value < 1.5e-3",
+        # ``IN`` list form — same footgun, common pattern for codes
+        "parameter_code IN (60, 61)",
+        "value IN (10, 20, 30)",
+        "statistic_id in (11)",  # case-insensitive, single-element
+        # ``BETWEEN`` range form — same footgun
+        "value BETWEEN 5 AND 10",
+        "channel_flow between 100 and 500",
         # Composite expressions
         "time >= '2023-01-01T00:00:00Z' AND value >= 1000",
         "value > 1000 OR value < 0",
@@ -580,12 +591,17 @@ def test_check_numeric_filter_pitfall_raises(expr):
         "monitoring_location_id = 'USGS-02238500'",
         "approval_status = 'Approved'",
         "qualifier IN ('A', 'P')",
+        "parameter_code IN ('00060', '00065')",
+        "value BETWEEN '1' AND '9'",
         # Footgun identifiers appearing only inside string literals
         "monitoring_location_id = 'USGS-value >= 1000'",
         "name = 'why I care about parameter_code = 60'",
         "note = 'see district_code = 1 in docs'",
+        "note = 'quoted: value IN (10, 20) within literal'",
         # Multi-clause where every comparison is quoted
         "parameter_code = '00060' AND statistic_id = '00011'",
+        # CQL escape-quote (``O''Reilly``) within a quoted literal
+        "name = 'O''Reilly 1000'",
     ],
 )
 def test_check_numeric_filter_pitfall_allows(expr):