Address Copilot review on chunked CQL filter

thodson-usgs · claude · thodson-usgs · commit 850f14dc0b7e · 2026-04-22T14:14:37.000-05:00
- Dedupe on pre-rename feature `id` (always present at that stage)
  instead of `output_id`, which is the post-rename name and may not
  be on every OGC service's response.
- Aggregate elapsed time across chunk responses so the returned
  metadata's query_time reflects the whole operation rather than
  just the last chunk.
- Drop the redundant `continuous_id` from the fan-out test's mock
  properties so the assertion exercises the real `id`-based dedup
  path, and add a separate test that forces cross-chunk duplicate
  feature ids to prove they collapse to a single row.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
@@ -930,28 +930,35 @@ def get_ogc_data(
     convert_type = args.pop("convert_type", False)
     # Create fresh dictionary of args without any None values
     args = {k: v for k, v in args.items() if v is not None}
-    # Overlapping user OR-clauses are deduplicated by output_id further below.
+    # Overlapping user OR-clauses are deduplicated by feature id further below.
     filter_expr = args.get("filter")
     filter_chunks = (
         _chunk_cql_or(filter_expr) if isinstance(filter_expr, str) else [None]
     )
 
     frames = []
-    response = None
+    first_response = None
+    total_elapsed = None
     for chunk in filter_chunks:
         chunk_args = args if chunk is None else {**args, "filter": chunk}
         req = _construct_api_requests(**chunk_args)
-        chunk_df, response = _walk_pages(geopd=GEOPANDAS, req=req)
+        chunk_df, chunk_response = _walk_pages(geopd=GEOPANDAS, req=req)
         frames.append(chunk_df)
+        if first_response is None:
+            first_response = chunk_response
+            total_elapsed = chunk_response.elapsed
+        else:
+            total_elapsed = total_elapsed + chunk_response.elapsed
 
     if len(frames) == 1:
         return_list = frames[0]
     else:
         return_list = pd.concat(frames, ignore_index=True)
-        if output_id in return_list.columns:
-            return_list = return_list.drop_duplicates(
-                subset=output_id, ignore_index=True
-            )
+        # The top-level feature "id" is always present at this stage (the
+        # rename to output_id happens later in _arrange_cols), so dedup on
+        # it directly to catch overlapping OR-clauses across chunks.
+        if "id" in return_list.columns:
+            return_list = return_list.drop_duplicates(subset="id", ignore_index=True)
     # Manage some aspects of the returned dataset
     return_list = _deal_with_empty(return_list, properties, service)
 
@@ -961,8 +968,12 @@ def get_ogc_data(
     return_list = _arrange_cols(return_list, properties, output_id)
 
     return_list = _sort_rows(return_list)
-    # Create metadata object from response
-    metadata = BaseMetadata(response)
+    # Create metadata object from the first response. When the filter was
+    # chunked into multiple sub-requests, query_time reflects the total
+    # elapsed time across all chunks rather than just the first.
+    if len(frames) > 1:
+        first_response.elapsed = total_elapsed
+    metadata = BaseMetadata(first_response)
     return return_list, metadata
 
 
diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py
@@ -234,10 +234,7 @@ def respond(request, context):
                     "type": "Feature",
                     "id": f"chunk-{call_count['n']}",
                     "geometry": None,
-                    "properties": {
-                        "continuous_id": f"chunk-{call_count['n']}",
-                        "value": call_count["n"],
-                    },
+                    "properties": {"value": call_count["n"]},
                 }
             ],
             "links": [],
@@ -260,3 +257,55 @@ def respond(request, context):
     for req in requests_mock.request_history:
         filter_qs = parse_qs(urlsplit(req.url).query).get("filter", [""])[0]
         assert len(filter_qs) <= _CQL_FILTER_CHUNK_LEN
+
+
+@pytest.mark.skipif(
+    sys.version_info < (3, 10),
+    reason="get_continuous requires py>=3.10 (see tests/waterdata_test.py)",
+)
+def test_long_filter_deduplicates_cross_chunk_overlap(requests_mock):
+    """Features returned by multiple chunks (same feature `id`) are
+    deduplicated in the concatenated result."""
+    from dataretrieval.waterdata import get_continuous
+
+    clause = (
+        "(time >= '2023-01-{day:02d}T00:00:00Z' "
+        "AND time <= '2023-01-{day:02d}T00:30:00Z')"
+    )
+    expr = " OR ".join(clause.format(day=(i % 28) + 1) for i in range(300))
+
+    call_count = {"n": 0}
+
+    def respond(request, context):
+        context.status_code = 200
+        call_count["n"] += 1
+        # Every chunk returns the same feature id so dedup should collapse
+        # the concatenated frame down to a single row.
+        return {
+            "type": "FeatureCollection",
+            "numberReturned": 1,
+            "features": [
+                {
+                    "type": "Feature",
+                    "id": "shared-feature",
+                    "geometry": None,
+                    "properties": {"value": 1},
+                }
+            ],
+            "links": [],
+        }
+
+    requests_mock.get(OGC_CONTINUOUS_URL, json=respond)
+
+    df, _ = get_continuous(
+        monitoring_location_id="USGS-07374525",
+        parameter_code="72255",
+        filter=expr,
+        filter_lang="cql-text",
+    )
+
+    expected_chunks = _chunk_cql_or(expr)
+    assert len(expected_chunks) > 1
+    assert call_count["n"] == len(expected_chunks)
+    # Even though each chunk returned a feature, dedup by id collapses them.
+    assert len(df) == 1