Budget CQL chunks against the URL byte limit, not raw filter length

thodson-usgs · claude · thodson-usgs · commit 686824784a12 · 2026-04-22T19:34:00.000-05:00
The previous 5 KB raw-filter budget was a static approximation.
Empirically the Water Data API returns HTTP 414 at ~8,200 bytes of
total URL, matching nginx's default 8 KB large_client_header_buffers.
The raw-filter budget leaves unknown headroom that varies with:
  - URL encoding (a uniform time-interval filter inflates ~1.4x; heavy
    special-char content inflates more)
  - the URL space consumed by other query params

Expose ``_WATERDATA_URL_BYTE_LIMIT = 8000`` with a comment describing
what the limit represents, and add ``_effective_filter_budget`` which
probes each request's non-filter URL cost and converts the remaining
URL budget back to raw CQL bytes via the filter's own encoding ratio.
``get_ogc_data`` now uses that per-request budget instead of the fixed
constant.

Verified live: a 34 KB OR-chain that previously split into 8 chunks now
packs into 7, with every produced URL staying at ~7.9 KB (just under
the 8,000-byte budget and below the ~8.2 KB observed 414 cliff).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
@@ -7,6 +7,7 @@
 from collections.abc import Iterator
 from datetime import datetime
 from typing import Any, get_args
+from urllib.parse import quote_plus
 
 import pandas as pd
 import requests
@@ -225,13 +226,20 @@ def _format_api_dates(
         raise ValueError("datetime_input should only include 1-2 values")
 
 
-# Conservative budget (characters) for a single CQL `filter` query
-# parameter before the URL risks exceeding the server's URI length limit.
-# The continuous endpoint has been observed to return HTTP 414 around ~7 KB
-# of filter text; 5000 leaves headroom for URL encoding and the other
-# query parameters.
+# Conservative fallback budget (characters) for a single CQL ``filter``
+# query parameter, used when the caller invokes ``_chunk_cql_or`` without
+# a ``max_len``. ``get_ogc_data`` computes a tighter per-request budget
+# from ``_WATERDATA_URL_BYTE_LIMIT`` below.
 _CQL_FILTER_CHUNK_LEN = 5000
 
+# Total URL byte limit the Water Data API will accept before replying
+# HTTP 414 (Request-URI Too Large). Empirically the cliff sits at
+# ~8,200 bytes of full URL, which lines up with nginx's default
+# ``large_client_header_buffers`` of 8 KB (8192). 8000 leaves ~200 bytes
+# of headroom for request-line framing ("GET ... HTTP/1.1\r\n") and any
+# intermediate proxy variance.
+_WATERDATA_URL_BYTE_LIMIT = 8000
+
 
 def _iter_or_boundaries(expr: str) -> Iterator[tuple[int, int]]:
     """Yield ``(start, end)`` spans of each top-level ``OR`` separator.
@@ -331,6 +339,28 @@ def _chunk_cql_or(expr: str, max_len: int = _CQL_FILTER_CHUNK_LEN) -> list[str]:
     return chunks
 
 
+def _effective_filter_budget(args: dict[str, Any], filter_expr: str) -> int:
+    """Return the raw CQL character budget for ``filter_expr`` in this request.
+
+    The server limits total URL length (see ``_WATERDATA_URL_BYTE_LIMIT``),
+    not raw CQL length, so the budget for ``_chunk_cql_or`` is derived by:
+
+    1. Probing the URL space used by the other query params: build the
+       request with a 1-character placeholder filter.
+    2. Subtracting that from the URL limit to get the room left for the
+       encoded filter value.
+    3. Dividing by the filter's average ``quote_plus`` inflation ratio to
+       get back to raw characters (1.0 is used for an empty filter).
+
+    The result is floored at 100, even when less URL space remains.
+    """
+    probe = _construct_api_requests(**{**args, "filter": "x"})
+    non_filter_url_bytes = len(probe.url) - 1
+    available_url_bytes = _WATERDATA_URL_BYTE_LIMIT - non_filter_url_bytes
+    ratio = len(quote_plus(filter_expr)) / len(filter_expr) if filter_expr else 1.0
+    return max(100, int(available_url_bytes / ratio))
+
+
 def _cql2_param(args: dict[str, Any]) -> str:
     """
     Convert query parameters to CQL2 JSON format for POST requests.
@@ -947,11 +977,16 @@ def get_ogc_data(
     # Overlapping user OR-clauses are deduplicated by feature id further below.
     filter_expr = args.get("filter")
     filter_lang = args.get("filter_lang")
-    should_chunk_filter = isinstance(filter_expr, str) and filter_lang in {
-        None,
-        "cql-text",
-    }
-    filter_chunks = _chunk_cql_or(filter_expr) if should_chunk_filter else [None]
+    should_chunk_filter = (
+        isinstance(filter_expr, str)
+        and filter_expr
+        and filter_lang in {None, "cql-text"}
+    )
+    if should_chunk_filter:
+        raw_budget = _effective_filter_budget(args, filter_expr)
+        filter_chunks = _chunk_cql_or(filter_expr, max_len=raw_budget)
+    else:
+        filter_chunks = [None]
 
     frames = []
     first_response = None
diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py
@@ -9,8 +9,10 @@
 
 from dataretrieval.waterdata.utils import (
     _CQL_FILTER_CHUNK_LEN,
+    _WATERDATA_URL_BYTE_LIMIT,
     _chunk_cql_or,
     _construct_api_requests,
+    _effective_filter_budget,
     _get_args,
     _split_top_level_or,
     _walk_pages,
@@ -236,6 +238,9 @@ def fake_walk_pages(*_args, **_kwargs):
         side_effect=fake_construct_api_requests,
     ), mock.patch(
         "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages
+    ), mock.patch(
+        "dataretrieval.waterdata.utils._effective_filter_budget",
+        return_value=_CQL_FILTER_CHUNK_LEN,
     ):
         df, _ = get_continuous(
             monitoring_location_id="USGS-07374525",
@@ -244,12 +249,18 @@ def fake_walk_pages(*_args, **_kwargs):
             filter_lang="cql-text",
         )
 
-    # Mirror the library's splitter so the test doesn't hardcode a chunk count.
-    expected_chunks = _chunk_cql_or(expr)
-    assert len(expected_chunks) > 1
-    assert len(sent_filters) == len(expected_chunks)
-    assert sent_filters == expected_chunks
-    assert len(df) == len(expected_chunks)
+    # Mocking _effective_filter_budget bypasses the URL-length probe, so
+    # sent_filters contains only real chunk requests. Assert invariants:
+    # chunking happened, every original clause is preserved exactly once
+    # in order, each chunk stays under the budget, and the mock's
+    # one-row-per-chunk responses concatenate to a row per chunk.
+    expected_parts = _split_top_level_or(expr)
+    assert len(sent_filters) > 1
+    rejoined_parts = []
+    for chunk in sent_filters:
+        rejoined_parts.extend(_split_top_level_or(chunk))
+    assert rejoined_parts == expected_parts
+    assert len(df) == len(sent_filters)
     assert all(len(chunk) <= _CQL_FILTER_CHUNK_LEN for chunk in sent_filters)
 
 
@@ -283,6 +294,9 @@ def fake_walk_pages(*_args, **_kwargs):
         ),
     ), mock.patch(
         "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages
+    ), mock.patch(
+        "dataretrieval.waterdata.utils._effective_filter_budget",
+        return_value=_CQL_FILTER_CHUNK_LEN,
     ):
         df, _ = get_continuous(
             monitoring_location_id="USGS-07374525",
@@ -291,13 +305,59 @@ def fake_walk_pages(*_args, **_kwargs):
             filter_lang="cql-text",
         )
 
-    expected_chunks = _chunk_cql_or(expr)
-    assert len(expected_chunks) > 1
-    assert call_count["n"] == len(expected_chunks)
+    # Chunking must have happened (otherwise dedup wouldn't be exercised).
+    assert call_count["n"] > 1
     # Even though each chunk returned a feature, dedup by id collapses them.
     assert len(df) == 1
 
 
+def test_effective_filter_budget_respects_url_limit():
+    """A filter truncated to the computed budget yields a full request URL
+    (other query params included) no longer than the URL byte limit."""
+    from urllib.parse import quote_plus
+
+    filter_expr = "(time >= '2023-01-15T00:00:00Z' AND time <= '2023-01-15T00:30:00Z')"
+    args = {
+        "service": "continuous",
+        "monitoring_location_id": "USGS-02238500",
+        "parameter_code": "00060",
+        "filter": filter_expr,
+        "filter_lang": "cql-text",
+    }
+    raw_budget = _effective_filter_budget(args, filter_expr)
+
+    # Repeat the clause well past the budget, then cut the text to exactly
+    # raw_budget characters; the URL built from it must fit the byte limit.
+    padded = (" OR ".join([filter_expr] * 200))[:raw_budget]
+    req = _construct_api_requests(**{**args, "filter": padded})
+    assert len(req.url) <= _WATERDATA_URL_BYTE_LIMIT
+    # Sanity: the raw budget is smaller than the whole-URL limit itself.
+    assert raw_budget < _WATERDATA_URL_BYTE_LIMIT
+    # The URL-encoded filter text on its own also fits within the limit.
+    assert len(quote_plus(padded)) <= _WATERDATA_URL_BYTE_LIMIT
+
+
+def test_effective_filter_budget_shrinks_with_more_url_params():
+    """With extra scalar query params in ``args``, the budget computed for
+    the same clause must be strictly smaller than without them."""
+    clause = "(time >= '2023-01-15T00:00:00Z' AND time <= '2023-01-15T00:30:00Z')"
+    sparse_args = {
+        "service": "continuous",
+        "monitoring_location_id": "USGS-02238500",
+        "filter": clause,
+        "filter_lang": "cql-text",
+    }
+    dense_args = {
+        **sparse_args,
+        "parameter_code": "00060",
+        "statistic_id": "00003",
+        "last_modified": "2023-01-01T00:00:00Z/2023-12-31T23:59:59Z",
+    }
+    sparse_budget = _effective_filter_budget(sparse_args, clause)
+    dense_budget = _effective_filter_budget(dense_args, clause)
+    assert dense_budget < sparse_budget
+
+
 def test_cql_json_filter_is_not_chunked():
     """Chunking applies only to cql-text; cql-json is passed through unchanged."""
     from dataretrieval.waterdata import get_continuous