Skip to content

Commit 2613792

Browse files
thodson-usgsclaude
andcommitted
Simplify get_nearest_continuous; add NEWS entry
Split the helper's body into four private functions so the top-level flow reads as a short recipe:

- ``_check_nearest_kwargs``: rejects kwargs the helper owns (``time``/``filter``/``filter_lang``) and validates ``on_tie``.
- ``_build_window_or_filter``: builds the CQL ``OR``-chain of bracketed time windows, one per target.
- ``_pick_nearest_row``: maps a window to its nearest row, with the three tie-resolution branches isolated.
- ``_empty_nearest_result``: builds an empty frame with a ``target_time`` column, used wherever no match lands.

Drops the nested ``for site → for target → mask → tie-branch`` loop in favor of a flat list comprehension + walrus against the new helper.

Fixes a fragile ``pd.to_datetime(list(targets), utc=True)`` (a numpy ``datetime64`` array would round-trip through ``list`` as tz-stripped scalars) — now passes the input directly to ``pd.to_datetime`` and wraps in ``pd.DatetimeIndex``.

Swaps ``df = df.copy(); df["time"] = ...`` for ``df.assign(time=...)`` to avoid the full-frame copy.

Also NEWS.md: add a short entry describing the new helper.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 8aff522 commit 2613792

2 files changed

Lines changed: 98 additions & 56 deletions

File tree

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
**04/23/2026:** Added `waterdata.get_nearest_continuous(targets, ...)` — for each of N target timestamps, fetches the single continuous observation closest to that timestamp in one HTTP round-trip (auto-chunked when the resulting CQL filter is long, via the facility added in #238). The helper is designed for workflows that pair many discrete-measurement timestamps with surrounding instantaneous data, which the OGC `time` parameter can't express since it only accepts one instant or one interval per request. Ties at window midpoints are resolved per a configurable `on_tie` ∈ {`"first"`, `"last"`, `"mean"`}; the default `window="PT7M30S"` matches a 15-minute continuous gauge.
2+
13
**04/22/2026:** Highlights since the `v1.1.0` release (2025-11-26), which shipped the `waterdata` module:
24

35
- Added `get_channel` for channel-measurement data (#218) and `get_stats_por` / `get_stats_date_range` for period-of-record and daily statistics (#207).

dataretrieval/waterdata/api.py

Lines changed: 96 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -560,86 +560,126 @@ def get_nearest_continuous(
560560
... on_tie="mean",
561561
... )
562562
"""
563-
for forbidden in ("time", "filter", "filter_lang"):
564-
if forbidden in kwargs:
565-
raise TypeError(
566-
f"get_nearest_continuous constructs its own {forbidden!r}; "
567-
"do not pass it directly"
568-
)
569-
if on_tie not in ("first", "last", "mean"):
570-
raise ValueError(f"on_tie must be 'first', 'last', or 'mean'; got {on_tie!r}")
571-
572-
targets = pd.to_datetime(list(targets), utc=True)
563+
_check_nearest_kwargs(kwargs, on_tie)
564+
targets = pd.DatetimeIndex(pd.to_datetime(targets, utc=True))
573565
window_td = pd.Timedelta(window)
574566

575567
if len(targets) == 0:
576-
# Nothing to ask about — return an empty frame shaped like a real
577-
# ``get_continuous`` response (via a trivially-empty time range).
568+
# Issue a trivial-range request so the caller still receives a
569+
# real ``BaseMetadata``; return an empty frame with the same
570+
# shape a real response would have.
578571
df, md = get_continuous(
579572
monitoring_location_id=monitoring_location_id,
580573
parameter_code=parameter_code,
581574
time="1900-01-01T00:00:00Z/1900-01-01T00:00:00Z",
582575
**kwargs,
583576
)
584-
return df.iloc[0:0], md
585-
586-
filter_expr = " OR ".join(
587-
f"(time >= '{(t - window_td).strftime('%Y-%m-%dT%H:%M:%SZ')}' "
588-
f"AND time <= '{(t + window_td).strftime('%Y-%m-%dT%H:%M:%SZ')}')"
589-
for t in targets
590-
)
577+
return _empty_nearest_result(df), md
591578

579+
filter_expr = _build_window_or_filter(targets, window_td)
592580
df, md = get_continuous(
593581
monitoring_location_id=monitoring_location_id,
594582
parameter_code=parameter_code,
595583
filter=filter_expr,
596584
filter_lang="cql-text",
597585
**kwargs,
598586
)
599-
600587
if df.empty:
601-
return df, md
588+
return _empty_nearest_result(df), md
602589

603-
df = df.copy()
604-
df["time"] = pd.to_datetime(df["time"], utc=True)
590+
df = df.assign(time=pd.to_datetime(df["time"], utc=True))
591+
site_groups = (
592+
df.groupby("monitoring_location_id", sort=False)
593+
if "monitoring_location_id" in df.columns
594+
else [(None, df)]
595+
)
605596

606-
if "monitoring_location_id" in df.columns:
607-
site_groups = list(df.groupby("monitoring_location_id", sort=False))
608-
else:
609-
site_groups = [(None, df)]
597+
selected = [
598+
row
599+
for _, site_df in site_groups
600+
for target in targets
601+
if (row := _pick_nearest_row(site_df, target, window_td, on_tie)) is not None
602+
]
603+
if not selected:
604+
return _empty_nearest_result(df), md
605+
return pd.DataFrame(selected).reset_index(drop=True), md
606+
607+
608+
_VALID_ON_TIE = ("first", "last", "mean")
610609

611-
selected = []
612-
for _, site_df in site_groups:
613-
for target in targets:
614-
mask = (site_df["time"] >= target - window_td) & (
615-
site_df["time"] <= target + window_td
610+
611+
def _check_nearest_kwargs(kwargs: dict, on_tie: str) -> None:
612+
"""Reject kwargs the helper owns; validate ``on_tie``."""
613+
for forbidden in ("time", "filter", "filter_lang"):
614+
if forbidden in kwargs:
615+
raise TypeError(
616+
f"get_nearest_continuous constructs its own {forbidden!r}; "
617+
"do not pass it directly"
616618
)
617-
window_df = site_df[mask]
618-
if window_df.empty:
619-
continue
620-
deltas = (window_df["time"] - target).abs()
621-
candidates = window_df[deltas == deltas.min()].sort_values("time")
622-
623-
if len(candidates) == 1 or on_tie == "first":
624-
row = candidates.iloc[0].copy()
625-
elif on_tie == "last":
626-
row = candidates.iloc[-1].copy()
627-
else: # "mean"
628-
row = candidates.iloc[0].copy()
629-
for col in candidates.select_dtypes("number").columns:
630-
row[col] = candidates[col].mean()
631-
row["time"] = target
632-
633-
row["target_time"] = target
634-
selected.append(row)
619+
if on_tie not in _VALID_ON_TIE:
620+
raise ValueError(f"on_tie must be one of {_VALID_ON_TIE}; got {on_tie!r}")
635621

636-
if not selected:
637-
empty = df.iloc[0:0].copy()
638-
empty["target_time"] = pd.Series(dtype="datetime64[ns, UTC]")
639-
return empty, md
640622

641-
result = pd.DataFrame(selected).reset_index(drop=True)
642-
return result, md
623+
def _build_window_or_filter(targets: pd.DatetimeIndex, window_td: pd.Timedelta) -> str:
624+
"""Build the CQL OR-chain of ``time >= ... AND time <= ...`` windows.
625+
626+
``get_continuous`` auto-chunks the result if the full URL would
627+
exceed the server's length limit, so this is always safe to build
628+
as one string even for many targets.
629+
"""
630+
return " OR ".join(
631+
f"(time >= '{(t - window_td).strftime('%Y-%m-%dT%H:%M:%SZ')}' "
632+
f"AND time <= '{(t + window_td).strftime('%Y-%m-%dT%H:%M:%SZ')}')"
633+
for t in targets
634+
)
635+
636+
637+
def _pick_nearest_row(
638+
site_df: pd.DataFrame,
639+
target: pd.Timestamp,
640+
window_td: pd.Timedelta,
641+
on_tie: str,
642+
) -> pd.Series | None:
643+
"""Return the single row within ``window_td`` of ``target``, or ``None``.
644+
645+
Resolves ties (two rows equidistant from ``target``) per ``on_tie``.
646+
The returned row carries a ``target_time`` column identifying which
647+
target it was selected for.
648+
"""
649+
in_window = site_df[
650+
(site_df["time"] >= target - window_td)
651+
& (site_df["time"] <= target + window_td)
652+
]
653+
if in_window.empty:
654+
return None
655+
deltas = (in_window["time"] - target).abs()
656+
candidates = in_window[deltas == deltas.min()].sort_values("time")
657+
658+
if len(candidates) == 1 or on_tie == "first":
659+
row = candidates.iloc[0].copy()
660+
elif on_tie == "last":
661+
row = candidates.iloc[-1].copy()
662+
else: # "mean" — synthesize a row whose numeric cols are averaged and
663+
# whose ``time`` is the target (no real observation sits at the midpoint).
664+
row = candidates.iloc[0].copy()
665+
for col in candidates.select_dtypes("number").columns:
666+
row[col] = candidates[col].mean()
667+
row["time"] = target
668+
669+
row["target_time"] = target
670+
return row
671+
672+
673+
def _empty_nearest_result(template: pd.DataFrame | None = None) -> pd.DataFrame:
674+
"""Empty frame with a ``target_time`` column, for no-match cases.
675+
676+
When ``template`` is provided, preserve its columns/dtypes so the
677+
returned frame matches the shape of a real ``get_continuous``
678+
response.
679+
"""
680+
base = pd.DataFrame() if template is None else template.iloc[0:0].copy()
681+
base["target_time"] = pd.Series(dtype="datetime64[ns, UTC]")
682+
return base
643683

644684

645685
def get_monitoring_locations(

0 commit comments

Comments
 (0)