Skip to content

Commit 6868247

Browse files
thodson-usgs and claude committed
Budget CQL chunks against the URL byte limit, not raw filter length
The previous 5 KB raw-filter budget was a static approximation. Empirically the Water Data API returns HTTP 414 at ~8,200 bytes of total URL, matching nginx's default 8 KB large_client_header_buffers. The raw-filter budget leaves unknown headroom that varies with:

- URL encoding (a uniform time-interval filter inflates ~1.4x; heavy special-char content inflates more)
- the URL space consumed by other query params

Expose ``_WATERDATA_URL_BYTE_LIMIT = 8000`` with a comment describing what the limit represents, and add ``_effective_filter_budget``, which probes each request's non-filter URL cost and converts the remaining URL budget back to raw CQL bytes via the filter's own encoding ratio. ``get_ogc_data`` now uses that per-request budget instead of the fixed constant.

Verified live: a 34 KB OR-chain that previously split into 8 chunks now packs into 7, with every produced URL staying at ~7.9 KB (well under the 8 KB limit and below the 8.2 KB observed 414 cliff).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 8b9d7e9 commit 6868247

2 files changed

Lines changed: 114 additions & 19 deletions

File tree

dataretrieval/waterdata/utils.py

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from collections.abc import Iterator
88
from datetime import datetime
99
from typing import Any, get_args
10+
from urllib.parse import quote_plus
1011

1112
import pandas as pd
1213
import requests
@@ -225,13 +226,20 @@ def _format_api_dates(
225226
raise ValueError("datetime_input should only include 1-2 values")
226227

227228

228-
# Conservative budget (characters) for a single CQL `filter` query
229-
# parameter before the URL risks exceeding the server's URI length limit.
230-
# The continuous endpoint has been observed to return HTTP 414 around ~7 KB
231-
# of filter text; 5000 leaves headroom for URL encoding and the other
232-
# query parameters.
229+
# Conservative fallback budget (characters) for a single CQL ``filter``
230+
# query parameter, used when the caller invokes ``_chunk_cql_or`` without
231+
# a ``max_len``. ``get_ogc_data`` computes a tighter per-request budget
232+
# from ``_WATERDATA_URL_BYTE_LIMIT`` below.
233233
_CQL_FILTER_CHUNK_LEN = 5000
234234

235+
# Total URL byte limit the Water Data API will accept before replying
236+
# HTTP 414 (Request-URI Too Large). Empirically the cliff sits at
237+
# ~8,200 bytes of full URL, which lines up with nginx's default
238+
# ``large_client_header_buffers`` of 8 KB (8192). 8000 leaves ~200 bytes
239+
# of headroom for request-line framing ("GET ... HTTP/1.1\r\n") and any
240+
# intermediate proxy variance.
241+
_WATERDATA_URL_BYTE_LIMIT = 8000
242+
235243

236244
def _iter_or_boundaries(expr: str) -> Iterator[tuple[int, int]]:
237245
"""Yield ``(start, end)`` spans of each top-level ``OR`` separator.
@@ -331,6 +339,28 @@ def _chunk_cql_or(expr: str, max_len: int = _CQL_FILTER_CHUNK_LEN) -> list[str]:
331339
return chunks
332340

333341

342+
def _effective_filter_budget(args: dict[str, Any], filter_expr: str) -> int:
    """Compute the raw CQL length budget for ``filter_expr`` in this request.

    The server limits total URL length (see ``_WATERDATA_URL_BYTE_LIMIT``),
    not raw CQL length. To derive a raw-length budget we can hand to
    ``_chunk_cql_or``:

    1. Probe the URL space consumed by the other query params by building
       the request with a 1-character placeholder filter.
    2. Subtract from the URL limit to get the bytes available for the
       encoded filter value.
    3. Convert back to raw CQL length using the filter's own URL-encoding
       ratio (e.g. uniform time-interval clauses inflate ~1.4x; heavy
       special-char clauses can inflate more).

    Parameters
    ----------
    args : dict[str, Any]
        Keyword arguments forwarded to ``_construct_api_requests``. Any
        ``filter`` entry is overridden by the placeholder for the probe.
    filter_expr : str
        The full, unchunked CQL text expression. It is only used to
        measure how much URL encoding inflates this particular filter.

    Returns
    -------
    int
        Maximum raw length (``len()`` of the unencoded text) a single
        filter chunk may have. Never less than 100.

    Notes
    -----
    The ratio is an average over the whole expression, measured with
    ``quote_plus``. NOTE(review): a chunk whose clauses encode more densely
    than that average could still exceed the URL limit, and this assumes
    the request builder encodes query values the same way ``quote_plus``
    does -- confirm against ``_construct_api_requests``.
    """
    # Build the request with a 1-character filter; subtracting that one
    # character leaves the URL cost of everything except the filter value.
    probe = _construct_api_requests(**{**args, "filter": "x"})
    non_filter_url_bytes = len(probe.url) - 1
    available_url_bytes = _WATERDATA_URL_BYTE_LIMIT - non_filter_url_bytes

    # An empty expression has no measurable inflation; treat it as 1:1
    # instead of dividing by zero.
    if filter_expr:
        encoding_ratio = len(quote_plus(filter_expr)) / len(filter_expr)
    else:
        encoding_ratio = 1.0

    # Floor at 100 so the chunker always receives a positive budget, even
    # when the other query params leave little or no room under the limit.
    return max(100, int(available_url_bytes / encoding_ratio))
362+
363+
334364
def _cql2_param(args: dict[str, Any]) -> str:
335365
"""
336366
Convert query parameters to CQL2 JSON format for POST requests.
@@ -947,11 +977,16 @@ def get_ogc_data(
947977
# Overlapping user OR-clauses are deduplicated by feature id further below.
948978
filter_expr = args.get("filter")
949979
filter_lang = args.get("filter_lang")
950-
should_chunk_filter = isinstance(filter_expr, str) and filter_lang in {
951-
None,
952-
"cql-text",
953-
}
954-
filter_chunks = _chunk_cql_or(filter_expr) if should_chunk_filter else [None]
980+
should_chunk_filter = (
981+
isinstance(filter_expr, str)
982+
and filter_expr
983+
and filter_lang in {None, "cql-text"}
984+
)
985+
if should_chunk_filter:
986+
raw_budget = _effective_filter_budget(args, filter_expr)
987+
filter_chunks = _chunk_cql_or(filter_expr, max_len=raw_budget)
988+
else:
989+
filter_chunks = [None]
955990

956991
frames = []
957992
first_response = None

tests/waterdata_utils_test.py

Lines changed: 69 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@
99

1010
from dataretrieval.waterdata.utils import (
1111
_CQL_FILTER_CHUNK_LEN,
12+
_WATERDATA_URL_BYTE_LIMIT,
1213
_chunk_cql_or,
1314
_construct_api_requests,
15+
_effective_filter_budget,
1416
_get_args,
1517
_split_top_level_or,
1618
_walk_pages,
@@ -236,6 +238,9 @@ def fake_walk_pages(*_args, **_kwargs):
236238
side_effect=fake_construct_api_requests,
237239
), mock.patch(
238240
"dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages
241+
), mock.patch(
242+
"dataretrieval.waterdata.utils._effective_filter_budget",
243+
return_value=_CQL_FILTER_CHUNK_LEN,
239244
):
240245
df, _ = get_continuous(
241246
monitoring_location_id="USGS-07374525",
@@ -244,12 +249,18 @@ def fake_walk_pages(*_args, **_kwargs):
244249
filter_lang="cql-text",
245250
)
246251

247-
# Mirror the library's splitter so the test doesn't hardcode a chunk count.
248-
expected_chunks = _chunk_cql_or(expr)
249-
assert len(expected_chunks) > 1
250-
assert len(sent_filters) == len(expected_chunks)
251-
assert sent_filters == expected_chunks
252-
assert len(df) == len(expected_chunks)
252+
# Mocking _effective_filter_budget bypasses the URL-length probe, so
253+
# sent_filters contains only real chunk requests. Assert invariants:
254+
# chunking happened, every original clause is preserved exactly once
255+
# in order, each chunk stays under the budget, and the mock's
256+
# one-row-per-chunk responses concatenate to a row per chunk.
257+
expected_parts = _split_top_level_or(expr)
258+
assert len(sent_filters) > 1
259+
rejoined_parts = []
260+
for chunk in sent_filters:
261+
rejoined_parts.extend(_split_top_level_or(chunk))
262+
assert rejoined_parts == expected_parts
263+
assert len(df) == len(sent_filters)
253264
assert all(len(chunk) <= _CQL_FILTER_CHUNK_LEN for chunk in sent_filters)
254265

255266

@@ -283,6 +294,9 @@ def fake_walk_pages(*_args, **_kwargs):
283294
),
284295
), mock.patch(
285296
"dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages
297+
), mock.patch(
298+
"dataretrieval.waterdata.utils._effective_filter_budget",
299+
return_value=_CQL_FILTER_CHUNK_LEN,
286300
):
287301
df, _ = get_continuous(
288302
monitoring_location_id="USGS-07374525",
@@ -291,13 +305,59 @@ def fake_walk_pages(*_args, **_kwargs):
291305
filter_lang="cql-text",
292306
)
293307

294-
expected_chunks = _chunk_cql_or(expr)
295-
assert len(expected_chunks) > 1
296-
assert call_count["n"] == len(expected_chunks)
308+
# Chunking must have happened (otherwise dedup wouldn't be exercised).
309+
assert call_count["n"] > 1
297310
# Even though each chunk returned a feature, dedup by id collapses them.
298311
assert len(df) == 1
299312

300313

314+
def test_effective_filter_budget_respects_url_limit():
    """The computed budget, once encoded, fits within the URL byte limit
    alongside the other query params."""
    from urllib.parse import quote_plus

    filter_expr = "(time >= '2023-01-15T00:00:00Z' AND time <= '2023-01-15T00:30:00Z')"
    args = {
        "service": "continuous",
        "monitoring_location_id": "USGS-02238500",
        "parameter_code": "00060",
        "filter": filter_expr,
        "filter_lang": "cql-text",
    }
    raw_budget = _effective_filter_budget(args, filter_expr)

    # Build a chunk exactly at the raw budget (the clause repeated, then
    # truncated) and confirm the full URL it produces stays under the limit.
    padded = (" OR ".join([filter_expr] * 200))[:raw_budget]
    # Guard against a vacuous pass: the repeated clause must be long enough
    # that truncation really lands on the budget.
    assert len(padded) == raw_budget
    req = _construct_api_requests(**{**args, "filter": padded})
    assert len(req.url) <= _WATERDATA_URL_BYTE_LIMIT
    # URL encoding never shrinks text and the other params consume URL
    # space, so the raw budget must sit strictly below the URL limit.
    assert raw_budget < _WATERDATA_URL_BYTE_LIMIT
    # The encoded filter value alone must also fit inside the limit.
    assert len(quote_plus(padded)) <= _WATERDATA_URL_BYTE_LIMIT
338+
339+
340+
def test_effective_filter_budget_shrinks_with_more_url_params():
    """Extra scalar query params take up URL bytes, so the raw filter
    budget for the same clause must come out smaller."""
    clause = "(time >= '2023-01-15T00:00:00Z' AND time <= '2023-01-15T00:30:00Z')"

    # Minimal request: just the service, one location, and the filter.
    base_args = {
        "service": "continuous",
        "monitoring_location_id": "USGS-02238500",
        "filter": clause,
        "filter_lang": "cql-text",
    }
    # Additional params that lengthen the non-filter part of the URL.
    extra_params = {
        "parameter_code": "00060",
        "statistic_id": "00003",
        "last_modified": "2023-01-01T00:00:00Z/2023-12-31T23:59:59Z",
    }

    budget_without_extras = _effective_filter_budget(base_args, clause)
    budget_with_extras = _effective_filter_budget(
        {**base_args, **extra_params}, clause
    )

    assert budget_with_extras < budget_without_extras
359+
360+
301361
def test_cql_json_filter_is_not_chunked():
302362
"""Chunking applies only to cql-text; cql-json is passed through unchanged."""
303363
from dataretrieval.waterdata import get_continuous

0 commit comments

Comments
 (0)