@@ -262,21 +262,44 @@ def _format_api_dates(
262262# enforce client-side is the general one: any ``<identifier> <op>
263263# <unquoted numeric>`` is a bug — quote the literal or drop the
264264# comparison and filter in pandas.
265+
# Building blocks shared by the numeric-pitfall patterns below.
# Unquoted numeric literal: integer, decimal, or scientific notation.
_NUM = r"-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?"
# Bare property name.
_IDENT = r"[A-Za-z_]\w*"
# Comparison operators; two-character forms come first so ``>=`` is not
# consumed as ``>`` followed by a stray ``=``.
_OP = r">=|<=|<>|!=|==|=|>|<"

# ``<field> <op> <numeric>`` (groups field1/op1/num1) or the literal-first
# spelling ``<numeric> <op> <field>`` (groups num2/op2/field2). Exactly one
# of the two group families is populated on a match.
_NUMERIC_COMPARE_RE = re.compile(
    rf"""
    (?:
        \b(?P<field1>{_IDENT})\s*
        (?P<op1>{_OP})\s*
        (?P<num1>{_NUM})\b
      |
        \b(?P<num2>{_NUM})\s*
        (?P<op2>{_OP})\s*
        (?P<field2>{_IDENT})\b
    )
    """,
    re.VERBOSE,
)

# ``<field> [NOT] IN (..., <numeric>, ...)`` — same footgun as simple
# comparison but using the list form. Caught separately because ``IN``
# isn't one of the comparison operators in ``_OP``. One unquoted numeric
# at *any* position inside the parentheses is enough to know the user
# intends numeric membership, so the pattern lazily skips leading
# ``element,`` groups before looking for the literal. The skipped elements
# may not contain parentheses or commas, which keeps the scan inside this
# one list and keeps backtracking linear per element.
# The optional ``NOT`` is consumed explicitly; otherwise ``code NOT IN (1)``
# would capture the keyword ``NOT`` as the field name.
_IN_NUMERIC_RE = re.compile(
    rf"\b(?P<field>{_IDENT})(?:\s+NOT)?\s+IN\s*\((?:[^(),]*,)*?\s*{_NUM}",
    re.IGNORECASE,
)

# ``<field> [NOT] BETWEEN <numeric> AND <numeric>`` — range form of the
# same footgun. ``NOT`` is consumed for the same reason as above.
_BETWEEN_NUMERIC_RE = re.compile(
    rf"\b(?P<field>{_IDENT})(?:\s+NOT)?\s+BETWEEN\s+{_NUM}\s+AND\s+{_NUM}\b",
    re.IGNORECASE,
)
302+
280303
281304def _iter_or_boundaries (expr : str ) -> Iterator [tuple [int , int ]]:
282305 """Yield ``(start, end)`` spans of each top-level ``OR`` separator.
@@ -417,43 +440,58 @@ def _effective_filter_budget(args: dict[str, Any], filter_expr: str) -> int:
417440
418441
def _check_numeric_filter_pitfall(filter_expr: str) -> None:
    """Raise if the filter pairs any field with an unquoted numeric literal.

    Every queryable property on this API is typed as a string on the
    server, so any numeric-looking comparison — ``value >= 1000``,
    ``parameter_code = 60``, ``parameter_code IN (60, 61)``,
    ``value BETWEEN 5 AND 10`` — either gets rejected with HTTP 500
    or silently produces lexicographic results. Zero-padded codes are
    especially nasty (``parameter_code = '60'`` matches nothing because
    the real codes are ``'00060'``-shaped).

    Explicit string comparisons with quoted literals
    (``value >= '1000'``) are not flagged — the caller has signalled
    they know the column is textual.

    The three forms are checked in order (simple comparison, ``IN``,
    ``BETWEEN``); the first one found is reported through
    ``_raise_pitfall``, which raises ``ValueError``. Returns ``None``
    when nothing matches.
    """
    # Blank out single-quoted string literals so ``name = 'value > 5'``
    # doesn't false-positive. The ``"'" in`` pre-check saves the
    # allocation when the filter contains no quotes at all.
    masked = (
        re.sub(r"'[^']*'", "''", filter_expr) if "'" in filter_expr else filter_expr
    )

    compare = _NUMERIC_COMPARE_RE.search(masked)
    if compare:
        # Quote the comparison in the order the caller wrote it. Rebuilding
        # a literal-first match such as ``5 < value`` as ``value < 5`` would
        # show the user a comparison with the opposite meaning.
        if compare.group("field1") is not None:
            field = compare.group("field1")
            offense = f"{field} {compare.group('op1')} {compare.group('num1')}"
        else:
            field = compare.group("field2")
            offense = f"{compare.group('num2')} {compare.group('op2')} {field}"
        _raise_pitfall(field, offense)

    membership = _IN_NUMERIC_RE.search(masked)
    if membership:
        field = membership.group("field")
        _raise_pitfall(field, f"{field} IN (…)")

    between = _BETWEEN_NUMERIC_RE.search(masked)
    if between:
        field = between.group("field")
        _raise_pitfall(field, f"{field} BETWEEN …")
483+
484+
def _raise_pitfall(field: str, offense: str) -> None:
    """Raise the ``ValueError`` describing an unquoted-numeric filter.

    ``field`` is the property name the filter tested and ``offense`` is
    the short rendering of the offending expression that gets quoted
    back to the caller. This function never returns normally.
    """
    # Assemble the explanation first so the raise itself stays a one-liner.
    explanation = "".join(
        (
            f"Filter uses an unquoted numeric comparison against {field!r} ",
            f"(``{offense}``). Every queryable on the Water Data API is ",
            "typed as a string, so the server rejects unquoted numeric ",
            "literals with HTTP 500; even quoting the literal gives a ",
            "lexicographic comparison (``value > '10'`` matches ",
            "``value='34.52'``, ``parameter_code = '60'`` matches nothing ",
            "because the real codes are ``'00060'``-shaped). For a true ",
            "numeric filter, fetch a wider result and reduce in pandas.",
        )
    )
    raise ValueError(explanation)
458496
459497
0 commit comments