Skip to content

Commit 3ff39db

Browse files
thodson-usgsclaude
andcommitted
Check for numeric comparisons against string-typed value
The Water Data API types ``value`` as ``"string"`` on every data collection's ``/queryables`` schema, so an unquoted numeric comparison like ``filter=value >= 1000`` resolves *lexicographically* on the server — ``'12'`` sorts above ``'1000'``, ``'9'`` sorts above ``'1000'``, and the caller gets silently wrong rows. Flagged during review of the R-package PR DOI-USGS/dataRetrieval#880: the concern there was that exposing a generic ``filter`` kwarg invites exactly this kind of silent bug. Add a conservative check that runs once per cql-text filter in ``_plan_filter_chunks`` and raises ``ValueError`` when the filter contains ``<value> <op> <numeric literal>`` (or the reverse) without quotes. Explicit string comparisons (``value >= '1000'``) are not flagged — the caller has opted into sort-order semantics. The check only looks at ``value`` today (the only ``/queryables`` field that's string-typed but universally typed numerically by users); other string fields like ``qualifier`` or ``approval_status`` are genuinely character and wouldn't naturally take a numeric RHS. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 33e928d commit 3ff39db

2 files changed

Lines changed: 131 additions & 0 deletions

File tree

dataretrieval/waterdata/utils.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,33 @@ def _format_api_dates(
246246
# filter is small enough that the expensive budget probe can be skipped.
247247
_NON_FILTER_URL_HEADROOM = 1000
248248

249+
# Properties that *look* numeric but are typed as strings server-side,
250+
# so any unquoted numeric comparison against them (e.g. ``value >= 1000``)
251+
# resolves lexicographically rather than numerically and silently
252+
# produces wrong results. The server queryables list ``value`` as
253+
# ``type: string`` on every data endpoint (continuous, daily,
254+
# field-measurements); see ``/collections/<svc>/queryables``.
255+
_LEXICOGRAPHIC_NUMERIC_FIELDS = frozenset({"value"})
256+
257+
# Match ``<field> <op> <numeric literal>`` or the reverse. Captures any
258+
# of the footgun fields from ``_LEXICOGRAPHIC_NUMERIC_FIELDS`` alongside
259+
# an unquoted numeric literal; quoted literals (``value >= '1000'``) are
260+
# explicit string comparisons and are not flagged.
261+
_NUMERIC_COMPARE_RE = re.compile(
262+
r"""
263+
(?:
264+
\b(?P<field1>{fields})\s*
265+
(?P<op1>>=|<=|<>|!=|==|=|>|<)\s*
266+
(?P<num1>-?\d+(?:\.\d+)?)\b
267+
|
268+
\b(?P<num2>-?\d+(?:\.\d+)?)\s*
269+
(?P<op2>>=|<=|<>|!=|==|=|>|<)\s*
270+
(?P<field2>{fields})\b
271+
)
272+
""".format(fields="|".join(_LEXICOGRAPHIC_NUMERIC_FIELDS)),
273+
re.VERBOSE,
274+
)
275+
249276

250277
def _iter_or_boundaries(expr: str) -> Iterator[tuple[int, int]]:
251278
"""Yield ``(start, end)`` spans of each top-level ``OR`` separator.
@@ -385,6 +412,42 @@ def _effective_filter_budget(args: dict[str, Any], filter_expr: str) -> int:
385412
return max(100, int(available_url_bytes / encoding_ratio))
386413

387414

415+
def _check_numeric_filter_pitfall(filter_expr: str) -> None:
416+
"""Raise if the filter compares a string-typed field to an unquoted
417+
numeric literal.
418+
419+
The Water Data API types ``value`` as a string on every data
420+
collection's ``/queryables`` schema, so ``value >= 1000`` is a
421+
*lexicographic* comparison, not a numeric one. ``'12'`` sorts above
422+
``'1000'``, ``'9'`` sorts above ``'1000'``, and so on — the caller
423+
almost always wants numeric semantics and would get a silently wrong
424+
result set. Raising here turns that silent bug into a loud error.
425+
426+
Explicit string comparisons (``value >= '1000'``, with the literal
427+
quoted) are not flagged — the caller has signalled they know the
428+
column is textual and want sort-order semantics.
429+
"""
430+
# Blank out single-quoted string literals so a filter containing
431+
# ``name = 'value >= 1000'`` doesn't false-positive on its own text.
432+
masked = re.sub(r"'[^']*'", "''", filter_expr)
433+
match = _NUMERIC_COMPARE_RE.search(masked)
434+
if not match:
435+
return
436+
field = match.group("field1") or match.group("field2")
437+
op = match.group("op1") or match.group("op2")
438+
num = match.group("num1") or match.group("num2")
439+
raise ValueError(
440+
f"Filter compares {field!r} to unquoted numeric {num}, but "
441+
f"{field!r} is typed as a string on the Water Data API. "
442+
f"``{field} {op} {num}`` would sort lexicographically — e.g. "
443+
f"``value >= 1000`` includes ``value='12'`` because ``'12'`` "
444+
f"sorts above ``'1000'``. If you really want a string "
445+
f"comparison, quote the literal: ``{field} {op} '{num}'``. "
446+
f"For a numeric filter, fetch a wider window and reduce in "
447+
f"pandas after the call."
448+
)
449+
450+
388451
def _cql2_param(args: dict[str, Any]) -> str:
389452
"""
390453
Convert query parameters to CQL2 JSON format for POST requests.
@@ -1028,6 +1091,7 @@ def _plan_filter_chunks(args: dict[str, Any]) -> list[str | None]:
10281091
)
10291092
if not chunkable:
10301093
return [None]
1094+
_check_numeric_filter_pitfall(filter_expr)
10311095
raw_budget = _effective_filter_budget(args, filter_expr)
10321096
return _chunk_cql_or(filter_expr, max_len=raw_budget)
10331097

tests/waterdata_utils_test.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from dataretrieval.waterdata.utils import (
1111
_CQL_FILTER_CHUNK_LEN,
1212
_WATERDATA_URL_BYTE_LIMIT,
13+
_check_numeric_filter_pitfall,
1314
_chunk_cql_or,
1415
_construct_api_requests,
1516
_effective_filter_budget,
@@ -524,3 +525,69 @@ def fake_construct_api_requests(**kwargs):
524525
)
525526

526527
assert sent_filters == [expr]
528+
529+
530+
@pytest.mark.parametrize(
531+
"expr",
532+
[
533+
"value >= 1000",
534+
"value > 1000",
535+
"value <= 1000",
536+
"value < 1000",
537+
"value = 1000",
538+
"value != 1000",
539+
"value >= 1000.5",
540+
"value >= -50",
541+
# With surrounding clauses
542+
"time >= '2023-01-01T00:00:00Z' AND value >= 1000",
543+
"value > 1000 OR value < 0",
544+
# Reverse order
545+
"1000 <= value",
546+
],
547+
)
548+
def test_check_numeric_filter_pitfall_raises(expr):
549+
"""Unquoted numeric comparisons against ``value`` resolve
550+
lexicographically on the server, so reject them with a clear
551+
message before the request is sent."""
552+
with pytest.raises(ValueError, match="lexicographic"):
553+
_check_numeric_filter_pitfall(expr)
554+
555+
556+
@pytest.mark.parametrize(
557+
"expr",
558+
[
559+
# Quoted literal — caller has opted into the string comparison
560+
"value >= '1000'",
561+
"value = '42.5'",
562+
# No value comparison at all
563+
"time >= '2023-01-01T00:00:00Z' AND time <= '2023-01-02T00:00:00Z'",
564+
"monitoring_location_id = 'USGS-02238500'",
565+
# ``value`` appears only inside a string literal
566+
"monitoring_location_id = 'USGS-value >= 1000'",
567+
"name = 'why I care about value >= 1000'",
568+
# Other string-typed fields aren't in the footgun list
569+
"approval_status = 'Approved'",
570+
"qualifier IN ('A', 'P')",
571+
],
572+
)
573+
def test_check_numeric_filter_pitfall_allows(expr):
574+
"""Quoted literals, unrelated comparisons, and ``value`` substrings
575+
inside string literals must not trigger the check."""
576+
_check_numeric_filter_pitfall(expr) # must not raise
577+
578+
579+
def test_get_continuous_surfaces_pitfall_to_caller():
580+
"""End-to-end: the check runs at the ``get_continuous`` boundary,
581+
not as a deep internal-only protection, so callers see the error
582+
before any HTTP traffic."""
583+
from dataretrieval.waterdata import get_continuous
584+
585+
with mock.patch("dataretrieval.waterdata.utils._construct_api_requests") as build:
586+
with pytest.raises(ValueError, match="lexicographic"):
587+
get_continuous(
588+
monitoring_location_id="USGS-02238500",
589+
parameter_code="00060",
590+
filter="value >= 1000",
591+
filter_lang="cql-text",
592+
)
593+
build.assert_not_called()

0 commit comments

Comments
 (0)