Skip to content

Commit a65e94d

Browse files
thodson-usgsclaude
andcommitted
Skip chunking for non-cql-text filters; simplify tests
- `_chunk_cql_or` splits on the literal substring " OR " and only quote-aware for single quotes (CQL-text). Applying it to CQL-JSON would corrupt JSON string values or produce nonsense sub-requests. Gate chunking to `filter_lang in {None, "cql-text"}` and pass other languages through as a single request. - Replace the `requests_mock`-based fan-out/dedup tests with lighter `mock.patch` stubs of `_construct_api_requests` / `_walk_pages`, which also removes the py<3.10 skip (the tests no longer touch any HTTP or py3.10-only paths). Strengthen the fan-out assertion to `sent_filters == expected_chunks`. - Add `test_cql_json_filter_is_not_chunked` to pin the new guard. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 850f14d commit a65e94d

2 files changed

Lines changed: 103 additions & 75 deletions

File tree

dataretrieval/waterdata/utils.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -930,11 +930,16 @@ def get_ogc_data(
930930
convert_type = args.pop("convert_type", False)
931931
# Create fresh dictionary of args without any None values
932932
args = {k: v for k, v in args.items() if v is not None}
933+
# Only cql-text filters can be safely chunked by splitting top-level OR
934+
# chains. For cql-json (or unknown languages), pass through unchanged.
933935
# Overlapping user OR-clauses are deduplicated by feature id further below.
934936
filter_expr = args.get("filter")
935-
filter_chunks = (
936-
_chunk_cql_or(filter_expr) if isinstance(filter_expr, str) else [None]
937-
)
937+
filter_lang = args.get("filter_lang")
938+
should_chunk_filter = isinstance(filter_expr, str) and filter_lang in {
939+
None,
940+
"cql-text",
941+
}
942+
filter_chunks = _chunk_cql_or(filter_expr) if should_chunk_filter else [None]
938943

939944
frames = []
940945
first_response = None

tests/waterdata_utils_test.py

Lines changed: 95 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
import sys
1+
from datetime import timedelta
2+
from types import SimpleNamespace
23
from unittest import mock
34
from urllib.parse import parse_qs, urlsplit
45

6+
import pandas as pd
57
import pytest
68
import requests
79

@@ -14,10 +16,6 @@
1416
_walk_pages,
1517
)
1618

17-
OGC_CONTINUOUS_URL = (
18-
"https://api.waterdata.usgs.gov/ogcapi/v0/collections/continuous/items"
19-
)
20-
2119

2220
def _query_params(prepared_request):
2321
return parse_qs(urlsplit(prepared_request.url).query)
@@ -205,11 +203,7 @@ def test_construct_filter_on_all_ogc_services(service):
205203
assert qs["filter-lang"] == ["cql-text"]
206204

207205

208-
@pytest.mark.skipif(
209-
sys.version_info < (3, 10),
210-
reason="get_continuous requires py>=3.10 (see tests/waterdata_test.py)",
211-
)
212-
def test_long_filter_fans_out_into_multiple_requests(requests_mock):
206+
def test_long_filter_fans_out_into_multiple_requests():
213207
"""An oversized top-level OR filter triggers multiple HTTP requests
214208
whose results are concatenated."""
215209
from dataretrieval.waterdata import get_continuous
@@ -221,49 +215,45 @@ def test_long_filter_fans_out_into_multiple_requests(requests_mock):
221215
expr = " OR ".join(clause.format(day=(i % 28) + 1) for i in range(300))
222216
assert len(expr) > _CQL_FILTER_CHUNK_LEN
223217

224-
call_count = {"n": 0}
225-
226-
def respond(request, context):
227-
context.status_code = 200
228-
call_count["n"] += 1
229-
return {
230-
"type": "FeatureCollection",
231-
"numberReturned": 1,
232-
"features": [
233-
{
234-
"type": "Feature",
235-
"id": f"chunk-{call_count['n']}",
236-
"geometry": None,
237-
"properties": {"value": call_count["n"]},
238-
}
239-
],
240-
"links": [],
241-
}
242-
243-
requests_mock.get(OGC_CONTINUOUS_URL, json=respond)
244-
245-
df, _ = get_continuous(
246-
monitoring_location_id="USGS-07374525",
247-
parameter_code="72255",
248-
filter=expr,
249-
filter_lang="cql-text",
250-
)
218+
sent_filters = []
219+
220+
def fake_construct_api_requests(**kwargs):
221+
sent_filters.append(kwargs.get("filter"))
222+
return SimpleNamespace(url="https://example.test", method="GET", headers={})
223+
224+
def fake_walk_pages(*_args, **_kwargs):
225+
idx = len(sent_filters)
226+
frame = pd.DataFrame({"id": [f"chunk-{idx}"], "value": [idx]})
227+
resp = SimpleNamespace(
228+
url="https://example.test",
229+
elapsed=timedelta(milliseconds=1),
230+
headers={},
231+
)
232+
return frame, resp
233+
234+
with mock.patch(
235+
"dataretrieval.waterdata.utils._construct_api_requests",
236+
side_effect=fake_construct_api_requests,
237+
), mock.patch(
238+
"dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages
239+
):
240+
df, _ = get_continuous(
241+
monitoring_location_id="USGS-07374525",
242+
parameter_code="72255",
243+
filter=expr,
244+
filter_lang="cql-text",
245+
)
251246

252247
# Mirror the library's splitter so the test doesn't hardcode a chunk count.
253248
expected_chunks = _chunk_cql_or(expr)
254249
assert len(expected_chunks) > 1
255-
assert call_count["n"] == len(expected_chunks)
250+
assert len(sent_filters) == len(expected_chunks)
251+
assert sent_filters == expected_chunks
256252
assert len(df) == len(expected_chunks)
257-
for req in requests_mock.request_history:
258-
filter_qs = parse_qs(urlsplit(req.url).query).get("filter", [""])[0]
259-
assert len(filter_qs) <= _CQL_FILTER_CHUNK_LEN
253+
assert all(len(chunk) <= _CQL_FILTER_CHUNK_LEN for chunk in sent_filters)
260254

261255

262-
@pytest.mark.skipif(
263-
sys.version_info < (3, 10),
264-
reason="get_continuous requires py>=3.10 (see tests/waterdata_test.py)",
265-
)
266-
def test_long_filter_deduplicates_cross_chunk_overlap(requests_mock):
256+
def test_long_filter_deduplicates_cross_chunk_overlap():
267257
"""Features returned by multiple chunks (same feature `id`) are
268258
deduplicated in the concatenated result."""
269259
from dataretrieval.waterdata import get_continuous
@@ -276,36 +266,69 @@ def test_long_filter_deduplicates_cross_chunk_overlap(requests_mock):
276266

277267
call_count = {"n": 0}
278268

279-
def respond(request, context):
280-
context.status_code = 200
269+
def fake_walk_pages(*_args, **_kwargs):
281270
call_count["n"] += 1
282-
# Every chunk returns the same feature id so dedup should collapse
283-
# the concatenated frame down to a single row.
284-
return {
285-
"type": "FeatureCollection",
286-
"numberReturned": 1,
287-
"features": [
288-
{
289-
"type": "Feature",
290-
"id": "shared-feature",
291-
"geometry": None,
292-
"properties": {"value": 1},
293-
}
294-
],
295-
"links": [],
296-
}
297-
298-
requests_mock.get(OGC_CONTINUOUS_URL, json=respond)
299-
300-
df, _ = get_continuous(
301-
monitoring_location_id="USGS-07374525",
302-
parameter_code="72255",
303-
filter=expr,
304-
filter_lang="cql-text",
305-
)
271+
frame = pd.DataFrame({"id": ["shared-feature"], "value": [1]})
272+
resp = SimpleNamespace(
273+
url="https://example.test",
274+
elapsed=timedelta(milliseconds=1),
275+
headers={},
276+
)
277+
return frame, resp
278+
279+
with mock.patch(
280+
"dataretrieval.waterdata.utils._construct_api_requests",
281+
return_value=SimpleNamespace(
282+
url="https://example.test", method="GET", headers={}
283+
),
284+
), mock.patch(
285+
"dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages
286+
):
287+
df, _ = get_continuous(
288+
monitoring_location_id="USGS-07374525",
289+
parameter_code="72255",
290+
filter=expr,
291+
filter_lang="cql-text",
292+
)
306293

307294
expected_chunks = _chunk_cql_or(expr)
308295
assert len(expected_chunks) > 1
309296
assert call_count["n"] == len(expected_chunks)
310297
# Even though each chunk returned a feature, dedup by id collapses them.
311298
assert len(df) == 1
299+
300+
301+
def test_cql_json_filter_is_not_chunked():
302+
"""Chunking applies only to cql-text; cql-json is passed through unchanged."""
303+
from dataretrieval.waterdata import get_continuous
304+
305+
clause = "(time >= '2023-01-01T00:00:00Z' AND time <= '2023-01-01T00:30:00Z')"
306+
expr = " OR ".join([clause] * 300)
307+
sent_filters = []
308+
309+
def fake_construct_api_requests(**kwargs):
310+
sent_filters.append(kwargs.get("filter"))
311+
return SimpleNamespace(url="https://example.test", method="GET", headers={})
312+
313+
with mock.patch(
314+
"dataretrieval.waterdata.utils._construct_api_requests",
315+
side_effect=fake_construct_api_requests,
316+
), mock.patch(
317+
"dataretrieval.waterdata.utils._walk_pages",
318+
return_value=(
319+
pd.DataFrame({"id": ["row-1"], "value": [1]}),
320+
SimpleNamespace(
321+
url="https://example.test",
322+
elapsed=timedelta(milliseconds=1),
323+
headers={},
324+
),
325+
),
326+
):
327+
get_continuous(
328+
monitoring_location_id="USGS-07374525",
329+
parameter_code="72255",
330+
filter=expr,
331+
filter_lang="cql-json",
332+
)
333+
334+
assert sent_filters == [expr]

0 commit comments

Comments
 (0)