Skip to content

Commit 40441a9

Browse files
thodson-usgsclaude
andcommitted
Generalize pitfall check to every field, not just value
The empirical sweep of ``/collections/<svc>/queryables`` + sample data shows 11+ string-typed-but-numeric-looking fields across the OGC endpoints — ``value``, ``parameter_code``, ``statistic_id``, ``district_code``, ``county_code``, ``hydrologic_unit_code``, ``monitoring_location_number``, ``measurement_number``, ``channel_flow``, ``channel_width``, ``channel_velocity`` — and zero-padded codes like ``parameter_code='00060'`` silently match nothing on ``parameter_code = 60``. Since *every* queryable on this API is typed as a string, there's no such thing as a legitimate numeric comparison. The cleanest rule is the universal one: flag any ``<identifier> <op> <unquoted numeric literal>`` (or the reverse), regardless of which field. Live-probed the server to confirm the failure mode: unquoted numeric RHS doesn't silently lex — it returns HTTP 500. Quoted literals (``value > '10'``) return 200 with real lex results. The client-side check turns an opaque 500 into a clear ValueError with the fix suggested inline, and also catches the silent-lex case where users who *do* quote still get wrong semantics. Dropped the ``_LEXICOGRAPHIC_NUMERIC_FIELDS`` watchlist and widened the regex to match any identifier. Extended tests cover all 11 real-world field types plus zero-padded codes, multi-clause composites, and the original false-positive guards (identifiers inside quoted string literals, quoted RHS bypasses). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 3ff39db commit 40441a9

2 files changed

Lines changed: 78 additions & 51 deletions

File tree

dataretrieval/waterdata/utils.py

Lines changed: 47 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -246,30 +246,34 @@ def _format_api_dates(
246246
# filter is small enough that the expensive budget probe can be skipped.
247247
_NON_FILTER_URL_HEADROOM = 1000
248248

249-
# Properties that *look* numeric but are typed as strings server-side,
250-
# so any unquoted numeric comparison against them (e.g. ``value >= 1000``)
251-
# resolves lexicographically rather than numerically and silently
252-
# produces wrong results. The server queryables list ``value`` as
253-
# ``type: string`` on every data endpoint (continuous, daily,
254-
# field-measurements); see ``/collections/<svc>/queryables``.
255-
_LEXICOGRAPHIC_NUMERIC_FIELDS = frozenset({"value"})
256-
257-
# Match ``<field> <op> <numeric literal>`` or the reverse. Captures any
258-
# of the footgun fields from ``_LEXICOGRAPHIC_NUMERIC_FIELDS`` alongside
259-
# an unquoted numeric literal; quoted literals (``value >= '1000'``) are
260-
# explicit string comparisons and are not flagged.
249+
# Every queryable property on every OGC collection for the Water Data
250+
# API is ``type: string`` (confirmed across ``continuous``, ``daily``,
251+
# ``field-measurements``, ``monitoring-locations``,
252+
# ``time-series-metadata``, ``latest-continuous``, ``latest-daily``,
253+
# ``channel-measurements`` — see ``/collections/<svc>/queryables``).
254+
# That includes fields whose *values* look numeric — ``value``,
255+
# ``parameter_code`` (``'00060'``), ``statistic_id`` (``'00011'``),
256+
# ``district_code`` (``'01'``), ``hydrologic_unit_code``,
257+
# ``channel_flow``, and more. Comparing any of them to an *unquoted*
258+
# numeric literal (``value >= 1000``) triggers a lexicographic sort
259+
# on the server and silently produces wrong results — zero-padded
260+
# codes are especially nasty (``parameter_code = 60`` matches nothing
261+
# because the real values are all ``'00060'``-shaped). So the rule we
262+
# enforce client-side is the general one: any ``<identifier> <op>
263+
# <unquoted numeric>`` is a bug — quote the literal or drop the
264+
# comparison and filter in pandas.
261265
_NUMERIC_COMPARE_RE = re.compile(
262266
r"""
263267
(?:
264-
\b(?P<field1>{fields})\s*
268+
\b(?P<field1>[A-Za-z_]\w*)\s*
265269
(?P<op1>>=|<=|<>|!=|==|=|>|<)\s*
266270
(?P<num1>-?\d+(?:\.\d+)?)\b
267271
|
268272
\b(?P<num2>-?\d+(?:\.\d+)?)\s*
269273
(?P<op2>>=|<=|<>|!=|==|=|>|<)\s*
270-
(?P<field2>{fields})\b
274+
(?P<field2>[A-Za-z_]\w*)\b
271275
)
272-
""".format(fields="|".join(_LEXICOGRAPHIC_NUMERIC_FIELDS)),
276+
""",
273277
re.VERBOSE,
274278
)
275279

@@ -413,22 +417,24 @@ def _effective_filter_budget(args: dict[str, Any], filter_expr: str) -> int:
413417

414418

415419
def _check_numeric_filter_pitfall(filter_expr: str) -> None:
416-
"""Raise if the filter compares a string-typed field to an unquoted
417-
numeric literal.
418-
419-
The Water Data API types ``value`` as a string on every data
420-
collection's ``/queryables`` schema, so ``value >= 1000`` is a
421-
*lexicographic* comparison, not a numeric one. ``'12'`` sorts above
422-
``'1000'``, ``'9'`` sorts above ``'1000'``, and so on — the caller
423-
almost always wants numeric semantics and would get a silently wrong
424-
result set. Raising here turns that silent bug into a loud error.
425-
426-
Explicit string comparisons (``value >= '1000'``, with the literal
427-
quoted) are not flagged — the caller has signalled they know the
428-
column is textual and want sort-order semantics.
420+
"""Raise if the filter compares any field to an unquoted numeric literal.
421+
422+
Every queryable property on this API is typed as a string on the
423+
server, so an unquoted numeric comparison like ``value >= 1000``
424+
or ``parameter_code = 60`` sorts **lexicographically** rather than
425+
numerically. The failure modes are silent and nasty:
426+
427+
- ``value >= 1000`` matches ``value='12'`` (``'12'`` > ``'1000'``).
428+
- ``parameter_code = 60`` matches no rows, because the actual codes
429+
are zero-padded strings like ``'00060'``.
430+
- ``district_code = 1`` matches only the rare unpadded ``'1'``.
431+
432+
Raising here turns those silent bugs into a loud error. Explicit
433+
string comparisons (``value >= '1000'``) are not flagged — the
434+
quoted literal signals the caller knows the column is textual.
429435
"""
430-
# Blank out single-quoted string literals so a filter containing
431-
# ``name = 'value >= 1000'`` doesn't false-positive on its own text.
436+
# Blank out single-quoted string literals so ``name = 'value > 5'``
437+
# doesn't false-positive on its own text.
432438
masked = re.sub(r"'[^']*'", "''", filter_expr)
433439
match = _NUMERIC_COMPARE_RE.search(masked)
434440
if not match:
@@ -437,14 +443,17 @@ def _check_numeric_filter_pitfall(filter_expr: str) -> None:
437443
op = match.group("op1") or match.group("op2")
438444
num = match.group("num1") or match.group("num2")
439445
raise ValueError(
440-
f"Filter compares {field!r} to unquoted numeric {num}, but "
441-
f"{field!r} is typed as a string on the Water Data API. "
442-
f"``{field} {op} {num}`` would sort lexicographically — e.g. "
443-
f"``value >= 1000`` includes ``value='12'`` because ``'12'`` "
444-
f"sorts above ``'1000'``. If you really want a string "
445-
f"comparison, quote the literal: ``{field} {op} '{num}'``. "
446-
f"For a numeric filter, fetch a wider window and reduce in "
447-
f"pandas after the call."
446+
f"Filter compares {field!r} to unquoted numeric {num}. Every "
447+
f"queryable on the Water Data API is typed as a string, so "
448+
f"``{field} {op} {num}`` is not a valid numeric comparison — "
449+
f"empirically the server rejects unquoted numeric literals "
450+
f"with HTTP 500. Even if you quote the literal "
451+
f"(``{field} {op} '{num}'``) the comparison is lexicographic, "
452+
f"which silently misses zero-padded codes (e.g. "
453+
f"``parameter_code = '60'`` matches nothing because the real "
454+
f"codes are ``'00060'``-shaped) and sorts ``value='12'`` above "
455+
f"``value='1000'``. For a numeric filter, fetch a wider result "
456+
f"and reduce in pandas after the call."
448457
)
449458

450459

tests/waterdata_utils_test.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,7 @@ def fake_construct_api_requests(**kwargs):
530530
@pytest.mark.parametrize(
531531
"expr",
532532
[
533+
# The motivating case — numeric-valued string field
533534
"value >= 1000",
534535
"value > 1000",
535536
"value <= 1000",
@@ -538,41 +539,58 @@ def fake_construct_api_requests(**kwargs):
538539
"value != 1000",
539540
"value >= 1000.5",
540541
"value >= -50",
541-
# With surrounding clauses
542+
# Zero-padded codes: `parameter_code = 60` matches nothing
543+
# because the real values are all `'00060'`-shaped
544+
"parameter_code = 60",
545+
"statistic_id = 11",
546+
"district_code = 1",
547+
"county_code != 0",
548+
"hydrologic_unit_code = 20301030401",
549+
# Channel-measurements numeric-looking string fields
550+
"channel_flow > 500",
551+
"channel_velocity >= 1.5",
552+
# Composite expressions
542553
"time >= '2023-01-01T00:00:00Z' AND value >= 1000",
543554
"value > 1000 OR value < 0",
544-
# Reverse order
555+
"parameter_code = 60 AND statistic_id = 11",
556+
# Reverse (literal on the left)
545557
"1000 <= value",
558+
"60 = parameter_code",
546559
],
547560
)
548561
def test_check_numeric_filter_pitfall_raises(expr):
549-
"""Unquoted numeric comparisons against ``value`` resolve
550-
lexicographically on the server, so reject them with a clear
551-
message before the request is sent."""
562+
"""Unquoted numeric comparisons against any field resolve
563+
lexicographically on this API — every queryable is string-typed —
564+
so reject them with a clear message before the request is sent."""
552565
with pytest.raises(ValueError, match="lexicographic"):
553566
_check_numeric_filter_pitfall(expr)
554567

555568

556569
@pytest.mark.parametrize(
557570
"expr",
558571
[
559-
# Quoted literal — caller has opted into the string comparison
572+
# Quoted literals — caller has opted into string comparison
560573
"value >= '1000'",
561574
"value = '42.5'",
562-
# No value comparison at all
575+
"parameter_code = '00060'",
576+
"district_code = '01'",
577+
"hydrologic_unit_code = '020301030401'",
578+
# Pure string comparisons
563579
"time >= '2023-01-01T00:00:00Z' AND time <= '2023-01-02T00:00:00Z'",
564580
"monitoring_location_id = 'USGS-02238500'",
565-
# ``value`` appears only inside a string literal
566-
"monitoring_location_id = 'USGS-value >= 1000'",
567-
"name = 'why I care about value >= 1000'",
568-
# Other string-typed fields aren't in the footgun list
569581
"approval_status = 'Approved'",
570582
"qualifier IN ('A', 'P')",
583+
# Footgun identifiers appearing only inside string literals
584+
"monitoring_location_id = 'USGS-value >= 1000'",
585+
"name = 'why I care about parameter_code = 60'",
586+
"note = 'see district_code = 1 in docs'",
587+
# Multi-clause where every comparison is quoted
588+
"parameter_code = '00060' AND statistic_id = '00011'",
571589
],
572590
)
573591
def test_check_numeric_filter_pitfall_allows(expr):
574-
"""Quoted literals, unrelated comparisons, and ``value`` substrings
575-
inside string literals must not trigger the check."""
592+
"""Quoted literals and comparisons that don't pair a field with an
593+
unquoted numeric literal must not trigger the check."""
576594
_check_numeric_filter_pitfall(expr) # must not raise
577595

578596

0 commit comments

Comments
 (0)