Skip to content

Commit 243a794

Browse files
thodson-usgsclaude
andcommitted
Merge upstream/main; take upstream's chunker (B1 superseded), keep B2/B3
Resolves conflicts in waterdata/chunking.py, waterdata/utils.py, and tests/waterdata_chunking_test.py. Upstream's async parallel chunker (3e9ebdc) independently implemented B1's fix — a `finalize` hook bound at construction (utils._finalize_ogc -> get_ogc_data -> ChunkedCall(finalize=...)), described as the single source of result shape, applied on both the normal return and resume(), with partial_frame/partial_response kept raw to avoid masking. That is the same design this branch converged on, so the conflicted files take upstream's (async-aware) version and this branch's now-redundant B1 work is dropped. B2 (wqp.WQP_Metadata.site_info) and B3 (streamstats.Watershed) do not conflict and are preserved — they remain the PR's contribution over upstream. Verified: 276 mocked tests pass (upstream's chunker suite + the B2/B3 regressions); ruff clean. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2 parents 459f391 + 9cf7251 commit 243a794

31 files changed

Lines changed: 2653 additions & 3820 deletions

.pre-commit-config.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,12 @@ repos:
2626
- id: ruff-check
2727
args: [--fix]
2828
- id: ruff-format
29+
30+
# Strip cell outputs + execution_count from notebooks on commit so the
31+
# diff is the source, not the rendered run. Demos still execute fine
32+
# locally; clean commits keep PRs reviewable and avoid quota/timestamp
33+
# churn on every re-run.
34+
- repo: https://github.com/kynan/nbstripout
35+
rev: 0.8.1
36+
hooks:
37+
- id: nbstripout

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,16 +125,16 @@ Access water quality data from multiple agencies:
125125
```python
126126
from dataretrieval import wqp
127127

128-
# Find water quality monitoring sites
129-
sites = wqp.what_sites(
128+
# Find water quality monitoring sites (returns a DataFrame and metadata)
129+
sites, metadata = wqp.what_sites(
130130
statecode='US:55', # Wisconsin
131131
siteType='Stream'
132132
)
133133

134134
print(f"Found {len(sites)} stream monitoring sites in Wisconsin")
135135

136136
# Get water quality results
137-
results = wqp.get_results(
137+
results, metadata = wqp.get_results(
138138
siteid='USGS-05427718',
139139
characteristicName='Temperature, water'
140140
)

dataretrieval/waterdata/_progress.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,9 @@ def __init__(
121121
# The hourly request quota (``x-ratelimit-limit``), shown as the
122122
# denominator when the server reports it.
123123
self.rate_limit: str | None = None
124+
# Transient note shown while a sub-request backs off before a
125+
# retry; cleared by the next page/chunk so it doesn't linger.
126+
self.retry_note: str | None = None
124127
self._last_len = 0
125128
# Whether anything was actually written to the stream — drives whether
126129
# close() needs a terminating newline. (``current_chunk`` is a poor
@@ -140,13 +143,33 @@ def start_chunk(self, index: int) -> None:
140143
avoids a premature "0 pages" frame before the first page arrives.
141144
"""
142145
self.current_chunk = index
146+
self.retry_note = None
143147
if self.total_chunks > 1:
144148
self._render()
145149

146150
def add_page(self, rows: int = 0) -> None:
147151
"""Record one fetched page carrying ``rows`` rows and redraw."""
148152
self.pages += 1
149153
self.rows += int(rows)
154+
self.retry_note = None
155+
self._render()
156+
157+
def note_retry(self, *, attempt: int, wait: float) -> None:
158+
"""Show that a sub-request is backing off before retry ``attempt``.
159+
160+
Cleared by the next :meth:`add_page` / :meth:`start_chunk` (or by
161+
:meth:`close`) so the line returns to normal once the retry resolves.
162+
"""
163+
# Keep sub-second waits explicit (avoid misleading ``0s``) while
164+
# rendering whole-second waits without unnecessary ``.0`` noise.
165+
# ``float()`` to support Python 3.9-3.11: ``round(int, 1)`` returns an
166+
# int and ``int.is_integer()`` (used below) only exists on 3.12+.
167+
wait_1dp = round(float(wait), 1)
168+
if wait_1dp < 1 or not wait_1dp.is_integer():
169+
secs = f"{wait_1dp:.1f}s"
170+
else:
171+
secs = f"{wait_1dp:.0f}s"
172+
self.retry_note = f"retrying (attempt {attempt}, waiting {secs})"
150173
self._render()
151174

152175
def set_rate_remaining(
@@ -179,6 +202,8 @@ def _format(self) -> str:
179202
else:
180203
segment = f"{remaining} requests remaining"
181204
parts.append(segment)
205+
if self.retry_note is not None:
206+
parts.append(self.retry_note)
182207
if self.service:
183208
return f"Retrieving: {self.service} · " + " · ".join(parts)
184209
return "Progress: " + " · ".join(parts)
@@ -209,6 +234,13 @@ def close(self) -> None:
209234
"""
210235
if self._closed:
211236
return
237+
# A retry note set during the final backoff would otherwise freeze as
238+
# the persisted last line of a call that has since completed or given
239+
# up; clear it and redraw (while still un-closed, so ``_render`` runs)
240+
# so the final state isn't a stale "retrying".
241+
if self.enabled and self._rendered and self.retry_note is not None:
242+
self.retry_note = None
243+
self._render()
212244
self._closed = True
213245
if not (self.enabled and self._rendered):
214246
return

dataretrieval/waterdata/api.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2022,6 +2022,7 @@ def get_reference_table(
20222022
collection: str,
20232023
limit: int | None = None,
20242024
query: dict | None = None,
2025+
max_rows: int | None = None,
20252026
) -> tuple[pd.DataFrame, BaseMetadata]:
20262027
"""Get metadata reference tables for the USGS Water Data API.
20272028
@@ -2046,6 +2047,12 @@ def get_reference_table(
20462047
query: dictionary, optional
20472048
The optional args parameter can be used to pass a dictionary of
20482049
query parameters to the collection API call.
2050+
max_rows : int, optional
2051+
Cap the total number of rows returned, stopping pagination early
2052+
instead of downloading the whole table. Useful for cheaply
2053+
previewing large tables (e.g. ``hydrologic-unit-codes`` has ~125k
2054+
rows). Unlike ``limit`` (the per-page size), this bounds the total
2055+
result. The default (None) downloads every page.
20492056
20502057
Returns
20512058
-------
@@ -2092,7 +2099,9 @@ def get_reference_table(
20922099
query_args = dict(query) if query else {}
20932100
if limit is not None:
20942101
query_args["limit"] = limit
2095-
return get_ogc_data(args=query_args, output_id=output_id, service=collection)
2102+
return get_ogc_data(
2103+
args=query_args, output_id=output_id, service=collection, max_rows=max_rows
2104+
)
20962105

20972106

20982107
def get_codes(code_service: CODE_SERVICES) -> pd.DataFrame:

0 commit comments

Comments
 (0)