feat(wateruse): fetch fanned-out locations concurrently (no backoff needed)

thodson-usgs · claude · thodson-usgs · commit 63a13ba3ef7f · 2026-06-24T10:17:48.000-05:00
A multi-value `state`/`county`/`huc` selector now fans out over a `ThreadPoolExecutor` instead of a serial loop. Concurrency is capped by a module-level `MAX_CONCURRENT_REQUESTS` (default 4; set to 1 for serial) — kept in this module rather than honoring the OGC engine's `API_USGS_CONCURRENT`, so wateruse stays decoupled from the engine. The locations are independent single requests over the synchronous `_get`, so the thread pool needs no shared state; `pool.map` preserves input order and re-raises the first failure. Stress-tested against the live NWDC at concurrency 1/2/4/8/16 over 16 distinct locations: all 200s, zero rate-limit/connection errors, and the rate budget depletes one token per request regardless of concurrency — so no request backoff/retry is required. End-to-end results are concurrency-invariant (byte-identical at conc 1/4/8) with a ~3.6x speedup at the default of 4. The single-location common path skips the pool entirely. Tests route each location to its own mocked response so the fan-out assertions are deterministic under thread races, and cover both the concurrent and serial (cap=1) paths. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01Sjb14HkwuCydKSKMsaXsgd
diff --git a/dataretrieval/wateruse.py b/dataretrieval/wateruse.py
@@ -42,6 +42,7 @@
 import io
 import logging
 from collections.abc import Iterable
+from concurrent.futures import ThreadPoolExecutor
 from typing import Any
 
 import httpx
@@ -75,6 +76,13 @@
 #: Temporal resolutions: monthly, annual calendar year, annual water year.
 TIME_RESOLUTIONS = ("monthly", "annualcy", "annualwy")
 
+#: Maximum locations fetched concurrently when a list of state/county/huc
+#: selectors is fanned out (one request per location). Kept conservative
+#: because this module intentionally carries no request backoff/retry; the
+#: NWDC tolerates this level of concurrency without rate-limit errors (verified
+#: by stress test). Set ``wateruse.MAX_CONCURRENT_REQUESTS = 1`` for serial.
+MAX_CONCURRENT_REQUESTS = 4
+
 # Page responses carry the HUC12 identifier in this column; it must stay a
 # string so leading zeros (e.g. "010900020502") survive the round trip.
 _HUC12_COLUMN = "huc12_id"
@@ -104,8 +112,9 @@ def get_wateruse(
     frame.
 
     Each selector also accepts a list of values. The NWDC queries one area per
-    request, so a list is fanned out into one request per value and the results
-    are concatenated — convenient, but proportionally slower for many areas.
+    request, so a list is fanned out into one request per value — up to
+    :data:`MAX_CONCURRENT_REQUESTS` in parallel — and the results are
+    concatenated in the order given.
 
     Parameters
     ----------
@@ -209,17 +218,25 @@ def get_wateruse(
     # The NWDC queries one location per request, so fan a multi-value selector
     # out into a request per location and concatenate the results.
     locations = _resolve_locations(state, county, huc)
-    frame, first_response = _fetch_all_pages(
-        {**base_params, "location": locations[0]}, ssl_check=ssl_check
-    )
-    frames = [frame]
-    for location in locations[1:]:
-        frame, _ = _fetch_all_pages(
+
+    def _fetch(location: str) -> tuple[pd.DataFrame, httpx.Response]:
+        return _fetch_all_pages(
             {**base_params, "location": location}, ssl_check=ssl_check
         )
-        frames.append(frame)
-    df = pd.concat(frames, ignore_index=True) if len(frames) > 1 else frames[0]
-    return df, BaseMetadata(first_response)
+
+    if len(locations) == 1:
+        # Common case: no pool, and no extra concat copy of the whole result.
+        frame, response = _fetch(locations[0])
+        return frame, BaseMetadata(response)
+
+    # Fan out concurrently (bounded), preserving input order. The locations are
+    # independent single requests, so a thread pool over the synchronous fetch needs no
+    # shared state or backoff; ``pool.map`` re-raises the first failure in input order.
+    workers = min(len(locations), max(1, MAX_CONCURRENT_REQUESTS))
+    with ThreadPoolExecutor(max_workers=workers) as pool:
+        results = list(pool.map(_fetch, locations))
+    df = pd.concat([frame for frame, _ in results], ignore_index=True)
+    return df, BaseMetadata(results[0][1])
 
 
 # Valid HUC code lengths (digits) → the hydrologic-unit level they query.
diff --git a/tests/wateruse_test.py b/tests/wateruse_test.py
@@ -220,19 +220,49 @@ def test_state_selector_builds_location_query(httpx_mock):
     assert qs["location"] == ["stateCd:RI"]
 
 
-def test_multiple_states_fan_out_into_separate_requests(httpx_mock):
-    """A list selector issues one request per location and concatenates them."""
-    httpx_mock.add_response(method="GET", url=WU_RE, text=_CSV_P1)  # first state
-    httpx_mock.add_response(method="GET", url=WU_RE, text=_CSV_P2)  # second state
+def test_multiple_states_fan_out_preserves_input_order(httpx_mock):
+    """A list selector fans out one request per location and concatenates the
+    results in the order given — even though the requests run concurrently and
+    may reach the server out of order. Each location is routed to its own
+    response so attribution is deterministic regardless of arrival order."""
+    httpx_mock.add_response(
+        method="GET", url=re.compile(r".*location=stateCd%3ARI.*"), text=_CSV_P1
+    )
+    httpx_mock.add_response(
+        method="GET", url=re.compile(r".*location=stateCd%3AWI.*"), text=_CSV_P2
+    )
 
     df, _ = get_wateruse(model="wu-public-supply-wd", state=["RI", "Wisconsin"])
 
-    # _CSV_P1 (2 rows) + _CSV_P2 (1 row), one request per state.
-    assert len(df) == 3
+    # RI's rows (_CSV_P1) precede WI's (_CSV_P2) regardless of which request the
+    # thread pool completed first.
+    assert df["huc12_id"].tolist() == [
+        "010900020502",
+        "010900020503",
+        "010900020504",
+    ]
     reqs = httpx_mock.get_requests()
     assert len(reqs) == 2
-    locations = [parse_qs(urlsplit(str(r.url)).query)["location"][0] for r in reqs]
-    assert locations == ["stateCd:RI", "stateCd:WI"]
+    assert {parse_qs(urlsplit(str(r.url)).query)["location"][0] for r in reqs} == {
+        "stateCd:RI",
+        "stateCd:WI",
+    }
+
+
+def test_fan_out_is_serial_when_concurrency_is_one(httpx_mock, monkeypatch):
+    """``MAX_CONCURRENT_REQUESTS = 1`` still fans out correctly (serial path)."""
+    monkeypatch.setattr(wateruse, "MAX_CONCURRENT_REQUESTS", 1)
+    httpx_mock.add_response(
+        method="GET", url=re.compile(r".*location=stateCd%3ARI.*"), text=_CSV_P1
+    )
+    httpx_mock.add_response(
+        method="GET", url=re.compile(r".*location=stateCd%3AWI.*"), text=_CSV_P2
+    )
+
+    df, _ = get_wateruse(model="wu-public-supply-wd", state=["RI", "WI"])
+
+    assert len(df) == 3
+    assert len(httpx_mock.get_requests()) == 2
 
 
 # --- _resolve_locations unit tests (no HTTP) -------------------------------