refactor(waterdata): /simplify pass 2 — encapsulate session sharing, drop duplicate json parse

thodson-usgs · claude · thodson-usgs · commit 47c82ec0e602 · 2026-05-21T20:08:43.000-05:00
Follow-up cleanup on the chunker module driven by another code review:

- _publish_session contextmanager wraps the _chunked_session
  ContextVar set/reset token dance. ChunkedCall.resume becomes a single
  `with requests.Session() as s, _publish_session(s):` line.

- Push the ContextVar lookup INTO `_session()` itself (resolution order:
  caller-provided -> chunker's shared -> fresh temp). _paginate no
  longer reaches across modules into `chunking._chunked_session`.

- Add `body` kwarg to `_get_resp_data` and `_next_req_url` so
  `_walk_pages` can `resp.json()` once and reuse the body across both
  helpers — eliminates a per-page redundant JSON parse on the OGC
  pagination path (~halves JSON-decode CPU per page).

- TypeVar `_Cursor` on `_paginate` so the two callbacks
  (parse_response, follow_up) are linked through the type system
  rather than `Any → Any`. Type checkers can now catch a cursor-type
  mismatch at a single call site.

- get_stats_data's follow_up no longer mutates the caller's args dict
  — uses `params={**args, "next_token": cursor}` instead of
  `args["next_token"] = cursor`.

- Switch _walk_pages's lambdas to named inner functions to match
  get_stats_data's style.

- Hoist the `RateLimited` import in
  test_chunked_session_isolated_per_resume to module level (was inside
  the fetch closure).

- Collapse the redundant multi-line "PreparedRequest.method is already
  upper-cased" comment in _walk_pages into a one-line inline comment
  (the inline form in get_stats_data is enough context for the codebase).

80 chunker + utils unit tests pass; ruff clean.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py
@@ -39,6 +39,7 @@
 import math
 import os
 from collections.abc import Callable, Iterator
+from contextlib import contextmanager
 from contextvars import ContextVar
 from dataclasses import dataclass
 from typing import Any, ClassVar
@@ -96,7 +97,7 @@
 _QUOTA_HEADER = "x-ratelimit-remaining"
 
 # Session shared across all sub-requests of a single chunked call.
-# Set by ``ChunkedCall.resume`` so paginated-loop helpers downstream
+# Published by ``_publish_session`` so paginated-loop helpers downstream
 # (``_walk_pages``) reuse the same connection pool across the entire
 # fan-out instead of opening a fresh ``requests.Session`` per
 # sub-request. ``None`` when not inside a chunked call — paginated
@@ -108,6 +109,21 @@
     "_chunked_session", default=None
 )
 
+
+@contextmanager
+def _publish_session(session: requests.Session) -> Iterator[None]:
+    """
+    Make ``session`` visible to :func:`dataretrieval.waterdata.utils._session`
+    for the duration of the ``with`` block via the ``_chunked_session``
+    ContextVar. Wraps the set/reset token dance so callers don't have to.
+    """
+    token = _chunked_session.set(session)
+    try:
+        yield
+    finally:
+        _chunked_session.reset(token)
+
+
 # Separators the two axis kinds use to join their atoms back into
 # URL text. List axes comma-join values
 # (``site=USGS-A,USGS-B``); the filter axis OR-joins clauses
@@ -1022,22 +1038,18 @@ def resume(self) -> tuple[pd.DataFrame, requests.Response]:
             When the rate-limit window can't cover the remaining plan
             (checked after the first sub-request).
         """
-        with requests.Session() as session:
-            token = _chunked_session.set(session)
-            try:
-                completed = len(self._chunks)
-                for i, sub_args in enumerate(self.plan.iter_sub_args()):
-                    if i < completed:
-                        continue
-                    self._issue(sub_args)
-                frames = [frame for frame, _ in self._chunks]
-                responses = [resp for _, resp in self._chunks]
-                return (
-                    _combine_chunk_frames(frames),
-                    _combine_chunk_responses(responses, self.plan.canonical_url),
-                )
-            finally:
-                _chunked_session.reset(token)
+        with requests.Session() as session, _publish_session(session):
+            completed = len(self._chunks)
+            for i, sub_args in enumerate(self.plan.iter_sub_args()):
+                if i < completed:
+                    continue
+                self._issue(sub_args)
+            frames = [frame for frame, _ in self._chunks]
+            responses = [resp for _, resp in self._chunks]
+            return (
+                _combine_chunk_frames(frames),
+                _combine_chunk_responses(responses, self.plan.canonical_url),
+            )
 
     def _issue(self, sub_args: dict[str, Any]) -> None:
         try:
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
@@ -7,7 +7,7 @@
 from collections.abc import Callable, Iterable, Iterator, Mapping
 from contextlib import contextmanager
 from datetime import datetime, timedelta
-from typing import Any, get_args
+from typing import Any, TypeVar, get_args
 from zoneinfo import ZoneInfo
 
 import pandas as pd
@@ -678,7 +678,9 @@ def _construct_api_requests(
     return request.prepare()
 
 
-def _next_req_url(resp: requests.Response) -> str | None:
+def _next_req_url(
+    resp: requests.Response, *, body: dict[str, Any] | None = None
+) -> str | None:
     """
     Extracts the URL for the next page of results from an HTTP response from a
     water data endpoint.
@@ -687,6 +689,10 @@ def _next_req_url(resp: requests.Response) -> str | None:
     ----------
     resp : requests.Response
         The HTTP response object containing JSON data and headers.
+    body : dict, optional
+        Pre-parsed JSON body for ``resp``. When provided, skips the
+        ``resp.json()`` call — useful when the caller has already
+        decoded the body for its own use (avoids a second parse pass).
 
     Returns
     -------
@@ -702,7 +708,8 @@ def _next_req_url(resp: requests.Response) -> str | None:
     "rel" and "href" keys.
     - Checks for the "next" relation in the "links" to determine the next URL.
     """
-    body = resp.json()
+    if body is None:
+        body = resp.json()
     if not body.get("numberReturned"):
         return None
     header_info = resp.headers
@@ -719,7 +726,12 @@ def _next_req_url(resp: requests.Response) -> str | None:
     return None
 
 
-def _get_resp_data(resp: requests.Response, geopd: bool) -> pd.DataFrame:
+def _get_resp_data(
+    resp: requests.Response,
+    geopd: bool,
+    *,
+    body: dict[str, Any] | None = None,
+) -> pd.DataFrame:
     """
     Extracts and normalizes data from an HTTP response containing GeoJSON features.
 
@@ -731,6 +743,10 @@ def _get_resp_data(resp: requests.Response, geopd: bool) -> pd.DataFrame:
     geopd : bool
         Indicates whether geopandas is installed and should be used to
         handle geometries.
+    body : dict, optional
+        Pre-parsed JSON body for ``resp``. When provided, skips the
+        ``resp.json()`` call — useful when the caller has already
+        decoded the body for its own use (avoids a second parse pass).
 
     Returns
     -------
@@ -739,8 +755,8 @@ def _get_resp_data(resp: requests.Response, geopd: bool) -> pd.DataFrame:
         containing the feature properties and each row's service-specific id.
         Returns an empty pandas DataFrame if no features are returned.
     """
-    # Check if it's an empty response
-    body = resp.json()
+    if body is None:
+        body = resp.json()
     if not body.get("numberReturned"):
         return pd.DataFrame()
 
@@ -771,28 +787,36 @@ def _get_resp_data(resp: requests.Response, geopd: bool) -> pd.DataFrame:
 @contextmanager
 def _session(client: requests.Session | None) -> Iterator[requests.Session]:
     """
-    Yield a usable session, opening a temporary one when needed.
+    Yield a usable session, picking the best available source.
+
+    Resolution order:
 
-    Lets paginated-loop callers borrow a caller-provided session
-    (without closing it) or fall back to a short-lived one with a
-    single ``with`` statement, instead of repeating the
-    ``close_client = client is None`` pattern.
+    1. ``client`` if the caller supplied one (borrowed; not closed
+       here — the caller owns its lifecycle).
+    2. The chunker's shared session if we're inside a ``ChunkedCall``
+       fan-out (published via :func:`chunking._publish_session`).
+       Borrowed; ``ChunkedCall.resume`` closes it on exit.
+    3. A fresh short-lived ``requests.Session`` opened here and closed
+       on context exit.
 
     Parameters
     ----------
     client : requests.Session or None
-        A caller-owned session to borrow, or ``None`` to open a
-        temporary one.
+        A caller-owned session to borrow, or ``None`` to defer to the
+        chunker's shared session or a temporary one.
 
     Yields
     ------
     requests.Session
-        ``client`` itself when provided; otherwise a freshly opened
-        session that is closed on context exit.
+        The chosen session.
     """
     if client is not None:
         yield client
         return
+    shared = chunking._chunked_session.get()
+    if shared is not None:
+        yield shared
+        return
     with requests.Session() as new:
         yield new
 
@@ -833,12 +857,15 @@ def _finalize_paginated_response(
     initial.elapsed = total_elapsed
 
 
+_Cursor = TypeVar("_Cursor")
+
+
 def _paginate(
     initial_req: requests.PreparedRequest,
     *,
     geopd: bool,
-    parse_response: Callable[[requests.Response], tuple[pd.DataFrame, Any]],
-    follow_up: Callable[[Any, requests.Session], requests.Response],
+    parse_response: Callable[[requests.Response], tuple[pd.DataFrame, _Cursor | None]],
+    follow_up: Callable[[_Cursor, requests.Session], requests.Response],
     client: requests.Session | None = None,
 ) -> tuple[pd.DataFrame, requests.Response]:
     """
@@ -898,12 +925,6 @@ def _paginate(
             "into pandas DataFrames."
         )
 
-    # Inside a chunker fan-out, reuse the shared session so every
-    # sub-request rides the same connection pool. The fallback path
-    # (``client=None`` and no chunker context) opens a temp session.
-    if client is None:
-        client = chunking._chunked_session.get()
-
     with _session(client) as sess:
         resp = sess.send(initial_req)
         _raise_for_non_200(resp)
@@ -973,22 +994,25 @@ def _walk_pages(
     requests.exceptions.RequestException
         See :func:`_paginate`.
     """
-    # ``PreparedRequest.method`` is already upper-cased by
-    # ``requests`` during preparation, so no need to normalize again.
-    method = req.method
+    method = req.method  # ``PreparedRequest.method`` is already upper-cased.
     headers = dict(req.headers)
     content = req.body if method == "POST" else None
 
+    def parse_response(resp: requests.Response) -> tuple[pd.DataFrame, str | None]:
+        body = resp.json()
+        return (
+            _get_resp_data(resp, geopd=geopd, body=body),
+            _next_req_url(resp, body=body),
+        )
+
+    def follow_up(cursor: str, sess: requests.Session) -> requests.Response:
+        return sess.request(method, cursor, headers=headers, data=content)
+
     return _paginate(
         req,
         geopd=geopd,
-        parse_response=lambda resp: (
-            _get_resp_data(resp, geopd=geopd),
-            _next_req_url(resp),
-        ),
-        follow_up=lambda cursor, sess: sess.request(
-            method, cursor, headers=headers, data=content
-        ),
+        parse_response=parse_response,
+        follow_up=follow_up,
         client=client,
     )
 
@@ -1409,8 +1433,11 @@ def parse_response(resp: requests.Response) -> tuple[pd.DataFrame, str | None]:
         return _handle_stats_nesting(body, geopd=GEOPANDAS), body.get("next")
 
     def follow_up(cursor: str, sess: requests.Session) -> requests.Response:
-        args["next_token"] = cursor
-        return sess.request(method, url=url, params=args, headers=headers)
+        # Build a fresh params dict per page so the caller's ``args``
+        # is never mutated (the closure used to do ``args["next_token"] = ...``).
+        return sess.request(
+            method, url=url, params={**args, "next_token": cursor}, headers=headers
+        )
 
     df, response = _paginate(
         req,
diff --git a/tests/waterdata_chunking_test.py b/tests/waterdata_chunking_test.py
@@ -369,8 +369,6 @@ def fetch(args):
         i = state["i"]
         state["i"] += 1
         if i == 1 and state["blow_up"]:
-            from dataretrieval.waterdata.utils import RateLimited
-
             raise RateLimited("429: Too many requests.")
         return (
             pd.DataFrame({"sites": list(args["sites"])}),