Generalize pitfall check to every field, not just value

thodson-usgs · claude · thodson-usgs · commit 40441a93fa15 · 2026-04-24T10:50:38.000-05:00
The empirical sweep of ``/collections/<svc>/queryables`` + sample
data shows 11+ string-typed-but-numeric-looking fields across the
OGC endpoints — ``value``, ``parameter_code``, ``statistic_id``,
``district_code``, ``county_code``, ``hydrologic_unit_code``,
``monitoring_location_number``, ``measurement_number``,
``channel_flow``, ``channel_width``, ``channel_velocity`` — and
zero-padded codes like ``parameter_code='00060'`` silently match
nothing on ``parameter_code = 60``.

Since *every* queryable on this API is typed as a string, there's
no such thing as a legitimate numeric comparison. The cleanest rule
is the universal one: flag any ``<identifier> <op> <unquoted
numeric literal>`` (or the reverse), regardless of which field.

Live-probed the server to confirm the failure mode: unquoted
numeric RHS doesn't silently lex — it returns HTTP 500. Quoted
literals (``value > '10'``) return 200 with real lex results. The
client-side check turns an opaque 500 into a clear ValueError with
the fix suggested inline, and its message also warns about the
silent-lex case, where users who *do* quote the literal pass the
check but still get lexicographic rather than numeric semantics.

Dropped the ``_LEXICOGRAPHIC_NUMERIC_FIELDS`` watchlist and
widened the regex to match any identifier. Extended tests cover
eight of the 11 real-world fields listed above plus zero-padded
codes, multi-clause composites, and the original false-positive guards
(identifiers inside quoted string literals, quoted RHS bypasses).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
@@ -246,30 +246,34 @@ def _format_api_dates(
 # filter is small enough that the expensive budget probe can be skipped.
 _NON_FILTER_URL_HEADROOM = 1000
 
-# Properties that *look* numeric but are typed as strings server-side,
-# so any unquoted numeric comparison against them (e.g. ``value >= 1000``)
-# resolves lexicographically rather than numerically and silently
-# produces wrong results. The server queryables list ``value`` as
-# ``type: string`` on every data endpoint (continuous, daily,
-# field-measurements); see ``/collections/<svc>/queryables``.
-_LEXICOGRAPHIC_NUMERIC_FIELDS = frozenset({"value"})
-
-# Match ``<field> <op> <numeric literal>`` or the reverse. Captures any
-# of the footgun fields from ``_LEXICOGRAPHIC_NUMERIC_FIELDS`` alongside
-# an unquoted numeric literal; quoted literals (``value >= '1000'``) are
-# explicit string comparisons and are not flagged.
+# Every queryable property on every OGC collection for the Water Data
+# API is ``type: string`` (confirmed across ``continuous``, ``daily``,
+# ``field-measurements``, ``monitoring-locations``,
+# ``time-series-metadata``, ``latest-continuous``, ``latest-daily``,
+# ``channel-measurements`` — see ``/collections/<svc>/queryables``).
+# That includes fields whose *values* look numeric — ``value``,
+# ``parameter_code`` (``'00060'``), ``statistic_id`` (``'00011'``),
+# ``district_code`` (``'01'``), ``hydrologic_unit_code``,
+# ``channel_flow``, and more. Comparing any of them to an *unquoted*
+# numeric literal (``value >= 1000``) is rejected by the server with
+# HTTP 500, and even a quoted literal compares lexicographically —
+# zero-padded codes are especially nasty (``parameter_code = '60'``
+# matches nothing; the real values are ``'00060'``-shaped). So the rule we
+# enforce client-side is the general one: any ``<identifier> <op>
+# <unquoted numeric>`` is a bug — quote the literal or drop the
+# comparison and filter in pandas.
 _NUMERIC_COMPARE_RE = re.compile(
     r"""
     (?:
-        \b(?P<field1>{fields})\s*
+        \b(?P<field1>[A-Za-z_]\w*)\s*
         (?P<op1>>=|<=|<>|!=|==|=|>|<)\s*
         (?P<num1>-?\d+(?:\.\d+)?)\b
     |
         \b(?P<num2>-?\d+(?:\.\d+)?)\s*
         (?P<op2>>=|<=|<>|!=|==|=|>|<)\s*
-        (?P<field2>{fields})\b
+        (?P<field2>[A-Za-z_]\w*)\b
     )
-    """.format(fields="|".join(_LEXICOGRAPHIC_NUMERIC_FIELDS)),
+    """,
     re.VERBOSE,
 )
 
@@ -413,22 +417,24 @@ def _effective_filter_budget(args: dict[str, Any], filter_expr: str) -> int:
 
 
 def _check_numeric_filter_pitfall(filter_expr: str) -> None:
-    """Raise if the filter compares a string-typed field to an unquoted
-    numeric literal.
-
-    The Water Data API types ``value`` as a string on every data
-    collection's ``/queryables`` schema, so ``value >= 1000`` is a
-    *lexicographic* comparison, not a numeric one. ``'12'`` sorts above
-    ``'1000'``, ``'9'`` sorts above ``'1000'``, and so on — the caller
-    almost always wants numeric semantics and would get a silently wrong
-    result set. Raising here turns that silent bug into a loud error.
-
-    Explicit string comparisons (``value >= '1000'``, with the literal
-    quoted) are not flagged — the caller has signalled they know the
-    column is textual and want sort-order semantics.
+    """Raise if the filter compares any field to an unquoted numeric literal.
+
+    Every queryable property on this API is typed as a string on the
+    server, so an unquoted numeric comparison like ``value >= 1000``
+    or ``parameter_code = 60`` sorts **lexicographically** rather than
+    numerically. The failure modes are silent and nasty:
+
+    - ``value >= 1000`` matches ``value='12'`` (``'12'`` > ``'1000'``).
+    - ``parameter_code = 60`` matches no rows, because the actual codes
+      are zero-padded strings like ``'00060'``.
+    - ``district_code = 1`` matches only the rare unpadded ``'1'``.
+
+    Raising here turns those silent bugs into a loud error. Explicit
+    string comparisons (``value >= '1000'``) are not flagged — the
+    quoted literal signals the caller knows the column is textual.
     """
-    # Blank out single-quoted string literals so a filter containing
-    # ``name = 'value >= 1000'`` doesn't false-positive on its own text.
+    # Blank out single-quoted string literals so ``name = 'value > 5'``
+    # doesn't false-positive on its own text.
     masked = re.sub(r"'[^']*'", "''", filter_expr)
     match = _NUMERIC_COMPARE_RE.search(masked)
     if not match:
@@ -437,14 +443,17 @@ def _check_numeric_filter_pitfall(filter_expr: str) -> None:
     op = match.group("op1") or match.group("op2")
     num = match.group("num1") or match.group("num2")
     raise ValueError(
-        f"Filter compares {field!r} to unquoted numeric {num}, but "
-        f"{field!r} is typed as a string on the Water Data API. "
-        f"``{field} {op} {num}`` would sort lexicographically — e.g. "
-        f"``value >= 1000`` includes ``value='12'`` because ``'12'`` "
-        f"sorts above ``'1000'``. If you really want a string "
-        f"comparison, quote the literal: ``{field} {op} '{num}'``. "
-        f"For a numeric filter, fetch a wider window and reduce in "
-        f"pandas after the call."
+        f"Filter compares {field!r} to unquoted numeric {num}. Every "
+        f"queryable on the Water Data API is typed as a string, so "
+        f"``{field} {op} {num}`` is not a valid numeric comparison — "
+        f"empirically the server rejects unquoted numeric literals "
+        f"with HTTP 500. Even if you quote the literal "
+        f"(``{field} {op} '{num}'``) the comparison is lexicographic, "
+        f"which silently misses zero-padded codes (e.g. "
+        f"``parameter_code = '60'`` matches nothing because the real "
+        f"codes are ``'00060'``-shaped) and sorts ``value='12'`` above "
+        f"``value='1000'``. For a numeric filter, fetch a wider result "
+        f"and reduce in pandas after the call."
     )
 
 
diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py
@@ -530,6 +530,7 @@ def fake_construct_api_requests(**kwargs):
 @pytest.mark.parametrize(
     "expr",
     [
+        # The motivating case — numeric-valued string field
         "value >= 1000",
         "value > 1000",
         "value <= 1000",
@@ -538,41 +539,58 @@ def fake_construct_api_requests(**kwargs):
         "value != 1000",
         "value >= 1000.5",
         "value >= -50",
-        # With surrounding clauses
+        # Zero-padded codes: `parameter_code = 60` matches nothing
+        # because the real values are all `'00060'`-shaped
+        "parameter_code = 60",
+        "statistic_id = 11",
+        "district_code = 1",
+        "county_code != 0",
+        "hydrologic_unit_code = 20301030401",
+        # Channel-measurements numeric-looking string fields
+        "channel_flow > 500",
+        "channel_velocity >= 1.5",
+        # Composite expressions
         "time >= '2023-01-01T00:00:00Z' AND value >= 1000",
         "value > 1000 OR value < 0",
-        # Reverse order
+        "parameter_code = 60 AND statistic_id = 11",
+        # Reverse (literal on the left)
         "1000 <= value",
+        "60 = parameter_code",
     ],
 )
 def test_check_numeric_filter_pitfall_raises(expr):
-    """Unquoted numeric comparisons against ``value`` resolve
-    lexicographically on the server, so reject them with a clear
-    message before the request is sent."""
+    """Unquoted numeric comparisons against any field resolve
+    lexicographically on this API — every queryable is string-typed —
+    so reject them with a clear message before the request is sent."""
     with pytest.raises(ValueError, match="lexicographic"):
         _check_numeric_filter_pitfall(expr)
 
 
 @pytest.mark.parametrize(
     "expr",
     [
-        # Quoted literal — caller has opted into the string comparison
+        # Quoted literals — caller has opted into string comparison
         "value >= '1000'",
         "value = '42.5'",
-        # No value comparison at all
+        "parameter_code = '00060'",
+        "district_code = '01'",
+        "hydrologic_unit_code = '020301030401'",
+        # Pure string comparisons
         "time >= '2023-01-01T00:00:00Z' AND time <= '2023-01-02T00:00:00Z'",
         "monitoring_location_id = 'USGS-02238500'",
-        # ``value`` appears only inside a string literal
-        "monitoring_location_id = 'USGS-value >= 1000'",
-        "name = 'why I care about value >= 1000'",
-        # Other string-typed fields aren't in the footgun list
         "approval_status = 'Approved'",
         "qualifier IN ('A', 'P')",
+        # Footgun identifiers appearing only inside string literals
+        "monitoring_location_id = 'USGS-value >= 1000'",
+        "name = 'why I care about parameter_code = 60'",
+        "note = 'see district_code = 1 in docs'",
+        # Multi-clause where every comparison is quoted
+        "parameter_code = '00060' AND statistic_id = '00011'",
     ],
 )
 def test_check_numeric_filter_pitfall_allows(expr):
-    """Quoted literals, unrelated comparisons, and ``value`` substrings
-    inside string literals must not trigger the check."""
+    """Quoted literals and comparisons that don't pair a field with an
+    unquoted numeric literal must not trigger the check."""
     _check_numeric_filter_pitfall(expr)  # must not raise