Check for numeric comparisons against string-typed value

thodson-usgs · claude · thodson-usgs · commit 3ff39dbc118f · 2026-04-24T08:55:41.000-05:00
The Water Data API types ``value`` as ``"string"`` on every data collection's ``/queryables`` schema, so an unquoted numeric comparison like ``filter=value >= 1000`` resolves *lexicographically* on the server — ``'12'`` sorts above ``'1000'``, ``'9'`` sorts above ``'1000'``, and the caller gets silently wrong rows. Flagged during review of the R-package PR DOI-USGS/dataRetrieval#880: the concern there was that exposing a generic ``filter`` kwarg invites exactly this kind of silent bug. Add a conservative check that runs once per cql-text filter in ``_plan_filter_chunks`` and raises ``ValueError`` when the filter contains ``<value> <op> <numeric literal>`` (or the reverse) without quotes. Explicit string comparisons (``value >= '1000'``) are not flagged — the caller has opted into sort-order semantics. The check only looks at ``value`` today (the only ``/queryables`` field that's string-typed but universally treated as numeric by users); other string fields like ``qualifier`` or ``approval_status`` are genuinely character and wouldn't naturally take a numeric RHS. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
@@ -246,6 +246,33 @@ def _format_api_dates(
 # filter is small enough that the expensive budget probe can be skipped.
 _NON_FILTER_URL_HEADROOM = 1000
 
+# Properties that *look* numeric but are typed as strings server-side,
+# so any unquoted numeric comparison against them (e.g. ``value >= 1000``)
+# resolves lexicographically rather than numerically and silently
+# produces wrong results. The server queryables list ``value`` as
+# ``type: string`` on every data endpoint (continuous, daily,
+# field-measurements); see ``/collections/<svc>/queryables``.
+_LEXICOGRAPHIC_NUMERIC_FIELDS = frozenset({"value"})
+
+# Match ``<field> <op> <numeric literal>`` or the reverse. The literal may
+# be signed, a bare decimal (``.5``) or in exponent form (``1e3``). A quoted
+# literal after the operator (``value >= '1000'``) is an explicit string
+# comparison and does not match. Lookarounds (not ``\b``) keep the sign.
+_NUMERIC_COMPARE_RE = re.compile(
+    r"""
+    (?:
+        \b(?P<field1>{fields})\s*
+        (?P<op1>>=|<=|<>|!=|==|=|>|<)\s*
+        (?P<num1>[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)(?!\w)
+    |
+        (?<!\w)(?P<num2>[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)\s*
+        (?P<op2>>=|<=|<>|!=|==|=|>|<)\s*
+        (?P<field2>{fields})\b
+    )
+    """.format(fields="|".join(map(re.escape, sorted(_LEXICOGRAPHIC_NUMERIC_FIELDS)))),
+    re.VERBOSE,
+)
+
 
 def _iter_or_boundaries(expr: str) -> Iterator[tuple[int, int]]:
     """Yield ``(start, end)`` spans of each top-level ``OR`` separator.
@@ -385,6 +412,42 @@ def _effective_filter_budget(args: dict[str, Any], filter_expr: str) -> int:
     return max(100, int(available_url_bytes / encoding_ratio))
 
 
+def _check_numeric_filter_pitfall(filter_expr: str) -> None:
+    """Reject unquoted numeric comparisons against string-typed fields.
+
+    The Water Data API types ``value`` as a string on every data
+    collection's ``/queryables`` schema, so ``value >= 1000`` compares
+    *lexicographically* (``'12'`` and ``'9'`` both sort above ``'1000'``)
+    and silently returns the wrong rows. Raises ``ValueError`` for such
+    a comparison in either operand order; quoted literals such as
+    ``value >= '1000'`` pass, as the caller opted into string ordering.
+    """
+    # Blank out single-quoted string literals so a filter containing
+    # ``name = 'value >= 1000'`` doesn't false-positive on its own text.
+    masked = re.sub(r"'[^']*'", "''", filter_expr)
+    match = _NUMERIC_COMPARE_RE.search(masked)
+    if not match:
+        return
+    # Echo the comparison in the caller's operand order: rewriting
+    # ``1000 <= value`` as ``value <= 1000`` would invert its meaning.
+    if match.group("field1"):
+        field, op, num = match.group("field1", "op1", "num1")
+        shown, quoted = f"{field} {op} {num}", f"{field} {op} '{num}'"
+    else:
+        num, op, field = match.group("num2", "op2", "field2")
+        shown, quoted = f"{num} {op} {field}", f"'{num}' {op} {field}"
+    raise ValueError(
+        f"Filter compares {field!r} to unquoted numeric {num}, but "
+        f"{field!r} is typed as a string on the Water Data API. "
+        f"``{shown}`` would sort lexicographically — e.g. "
+        f"``value >= 1000`` includes ``value='12'`` because ``'12'`` "
+        f"sorts above ``'1000'``. If you really want a string "
+        f"comparison, quote the literal: ``{quoted}``. "
+        f"For a numeric filter, fetch a wider window and reduce in "
+        f"pandas after the call."
+    )
+
+
 def _cql2_param(args: dict[str, Any]) -> str:
     """
     Convert query parameters to CQL2 JSON format for POST requests.
@@ -1028,6 +1091,7 @@ def _plan_filter_chunks(args: dict[str, Any]) -> list[str | None]:
     )
     if not chunkable:
         return [None]
+    _check_numeric_filter_pitfall(filter_expr)
     raw_budget = _effective_filter_budget(args, filter_expr)
     return _chunk_cql_or(filter_expr, max_len=raw_budget)
 
diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py
@@ -10,6 +10,7 @@
 from dataretrieval.waterdata.utils import (
     _CQL_FILTER_CHUNK_LEN,
     _WATERDATA_URL_BYTE_LIMIT,
+    _check_numeric_filter_pitfall,
     _chunk_cql_or,
     _construct_api_requests,
     _effective_filter_budget,
@@ -524,3 +525,69 @@ def fake_construct_api_requests(**kwargs):
         )
 
     assert sent_filters == [expr]
+
+
+@pytest.mark.parametrize(
+    "expr",
+    [
+        "value >= 1000",
+        "value > 1000",
+        "value <= 1000",
+        "value < 1000",
+        "value = 1000",
+        "value != 1000",
+        "value >= 1000.5",  # decimal literal
+        "value >= -50",  # negative literal
+        # Offending comparison combined with other clauses via AND / OR
+        "time >= '2023-01-01T00:00:00Z' AND value >= 1000",
+        "value > 1000 OR value < 0",
+        # Numeric literal on the left-hand side
+        "1000 <= value",
+    ],
+)
+def test_check_numeric_filter_pitfall_raises(expr):
+    """Each unquoted numeric comparison against ``value`` must raise
+    ``ValueError`` with a message mentioning the lexicographic
+    ordering, whichever operator or operand order the filter uses."""
+    with pytest.raises(ValueError, match="lexicographic"):
+        _check_numeric_filter_pitfall(expr)
+
+
+@pytest.mark.parametrize(
+    "expr",
+    [
+        # Quoted literal: the caller has opted into a string comparison
+        "value >= '1000'",
+        "value = '42.5'",
+        # Filters that never compare ``value`` at all
+        "time >= '2023-01-01T00:00:00Z' AND time <= '2023-01-02T00:00:00Z'",
+        "monitoring_location_id = 'USGS-02238500'",
+        # ``value >= 1000`` occurs only inside a single-quoted string literal
+        "monitoring_location_id = 'USGS-value >= 1000'",
+        "name = 'why I care about value >= 1000'",
+        # Other fields, compared against quoted literals only
+        "approval_status = 'Approved'",
+        "qualifier IN ('A', 'P')",
+    ],
+)
+def test_check_numeric_filter_pitfall_allows(expr):
+    """Quoted literals, unrelated comparisons, and ``value`` text inside
+    string literals must pass; returning without raising is the check."""
+    _check_numeric_filter_pitfall(expr)  # must not raise
+
+
+def test_get_continuous_surfaces_pitfall_to_caller():
+    """End-to-end: ``get_continuous`` raises ``ValueError`` for an
+    unquoted numeric ``value`` filter, and the patched
+    ``_construct_api_requests`` is never called."""
+    from dataretrieval.waterdata import get_continuous
+
+    with mock.patch("dataretrieval.waterdata.utils._construct_api_requests") as build:
+        with pytest.raises(ValueError, match="lexicographic"):
+            get_continuous(
+                monitoring_location_id="USGS-02238500",
+                parameter_code="00060",
+                filter="value >= 1000",
+                filter_lang="cql-text",
+            )
+        build.assert_not_called()  # the error surfaced before any request was built