fix(waterdata.xarray): resolve remaining review findings

thodson-usgs · claude · thodson-usgs · commit 961be13da3e0 · 2026-06-05T09:56:17.000-05:00
- samples now surface station longitude/latitude (mapped from
  Location_Longitude/Location_Latitude; _point_coords reads explicit lon/lat
  columns in addition to an OGC geometry column)
- metadata cache: a single large pull is no longer subject to within-batch
  FIFO eviction (the call's result is built from the freshly-parsed entries),
  and sites with no metadata are no longer negatively cached, so a later
  call retries them
- dense variable naming is deterministic and unambiguous: a bare name
  (e.g. discharge) is used only when unique; same-named series are all
  disambiguated by cell method / statistic / parameter code
- dense multi-unit label is deterministic (sorted) instead of row-order dependent
- row_size is int64 (was int32) to avoid overflow / cumsum truncation
- select_series rejects descriptor coords as keys (lon/lat float-equality
  footgun) and can match a null instance key

64 offline tests pass; live samples lon/lat + dense naming verified.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py
@@ -232,7 +232,27 @@ def _lonlat(geom):
 
 
 def _point_coords(df, site):
-    """lon/lat dicts keyed by site from point geometry, or None."""
+    """lon/lat dicts keyed by site, or None.
+
+    Reads either a ``geometry`` column (the time-series getters' OGC response) or
+    explicit ``longitude`` / ``latitude`` columns (the Samples profile, mapped via
+    :data:`_SAMPLES_RENAME`) -- so every service surfaces station coordinates.
+    """
+    if {"longitude", "latitude"}.issubset(df.columns):
+        geo = df.dropna(subset=["longitude", "latitude"]).drop_duplicates(site)
+        if geo.empty:
+            return None
+        lon, lat = {}, {}
+        for site_id, x, y in zip(
+            geo[site].to_numpy(),
+            geo["longitude"].to_numpy(),
+            geo["latitude"].to_numpy(),
+        ):
+            try:
+                lon[site_id], lat[site_id] = float(x), float(y)
+            except (TypeError, ValueError):
+                continue
+        return (lon, lat) if lon else None
     if "geometry" not in df.columns:
         return None
     geo = df.dropna(subset=["geometry"]).drop_duplicates(site)
@@ -408,8 +428,9 @@ def lookup(self, site_ids):
         """
         sites = sorted({str(s) for s in site_ids if _pd.notna(s)})
         # Racy read of the keys is fine: a concurrent miss just re-fetches (the
-        # fetch is idempotent); only the writes in _ingest take the lock.
+        # fetch is idempotent); only the writes in _store take the lock.
         todo = [s for s in sites if s not in self._entries]
+        fresh: dict[str, dict] = {}
         if todo:
             try:
                 meta, _ = self._getter(monitoring_location_id=todo)
@@ -420,12 +441,17 @@ def lookup(self, site_ids):
                     stacklevel=2,
                 )
             else:
-                self._ingest(meta, todo)
+                fresh = self._parse(meta, todo)
+                self._store(fresh)
         param_meta: dict[str, dict] = {}
         site_meta: dict[str, dict] = {}
         with self._lock:
             for s in sites:
-                entry = self._entries.get(s, {})
+                # Prefer this call's freshly-parsed entry over the cache: the
+                # bounded cache may have already evicted just-fetched sites when a
+                # single pull's ``todo`` exceeds maxsize, but the current call
+                # must still see every site it fetched.
+                entry = fresh.get(s) or self._entries.get(s, {})
                 param_meta.update(entry.get("params", {}))
                 if entry.get("site"):
                     site_meta[s] = entry["site"]
@@ -439,12 +465,8 @@ def clear(self):
     def __len__(self):
         return len(self._entries)
 
-    def _ingest(self, meta, todo):
-        """Parse ``meta`` into per-site entries, then merge + evict under lock.
-
-        The parsing runs lock-free on a local dict; only the (cheap) merge into
-        the shared cache and the FIFO eviction past ``maxsize`` hold the lock.
-        """
+    def _parse(self, meta, todo):
+        """Parse ``meta`` into per-site ``{params, site}`` entries (lock-free)."""
         fresh = {s: {"params": {}, "site": {}} for s in todo}
         if not meta.empty:
             name_cols = [c for c in _NAME_DESCRIPTORS if c in meta.columns]
@@ -470,8 +492,20 @@ def _ingest(self, meta, todo):
                     }
                     if desc:
                         fresh[site]["site"] = desc
+        return fresh
+
+    def _store(self, fresh):
+        """Merge non-empty entries into the bounded cache (FIFO eviction).
+
+        Sites that came back with no metadata are *not* cached, so a later call
+        retries them rather than being stuck with a sticky empty result; the
+        current call still sees them via the freshly-parsed ``fresh`` dict.
+        """
+        keep = {s: e for s, e in fresh.items() if e["params"] or e["site"]}
+        if not keep:
+            return
         with self._lock:
-            self._entries.update(fresh)
+            self._entries.update(keep)
             while len(self._entries) > self._maxsize:
                 self._entries.pop(next(iter(self._entries)))
 
@@ -524,14 +558,24 @@ def select_series(ds, **keys):
             "so select by name instead, e.g. "
             "ds[variable].sel(monitoring_location_id=...)."
         )
-    inst_coords = [c for c in ds.coords if ds[c].dims == ("timeseries",)]
+    # Selectable keys are the series *identity* coordinates only -- exclude the
+    # per-series descriptors (lon/lat are a float-equality footgun; unit/HUC/state
+    # are not series identifiers).
+    descriptors = {"longitude", "latitude", "unit_of_measure", *_SITE_DESCRIPTORS}
+    inst_coords = [
+        c for c in ds.coords if ds[c].dims == ("timeseries",) and c not in descriptors
+    ]
     mask = _np.ones(ds.sizes["timeseries"], dtype=bool)
     for key, value in keys.items():
         if key not in inst_coords:
             raise KeyError(
-                f"{key!r} is not a per-series coordinate; choose from {inst_coords}."
+                f"{key!r} is not a per-series identity coordinate; choose from "
+                f"{inst_coords}."
             )
-        mask &= ds[key].to_numpy() == value
+        arr = ds[key].to_numpy()
+        # NaN never equals anything, so match a missing instance key (e.g. a
+        # characteristic with no sample fraction) by null-ness instead.
+        mask &= _pd.isna(arr) if _is_missing(value) else (arr == value)
     matches = _np.flatnonzero(mask)
     if matches.size == 0:
         raise KeyError(f"no time series matches {keys}.")
@@ -563,6 +607,10 @@ def select_series(ds, **keys):
     "Result_SampleFraction": "sample_fraction",
     "Result_ResultDetectionCondition": "detection_condition",
     "Result_MeasureStatusIdentifier": "status",
+    # Samples carry position as explicit columns (no OGC ``geometry``); map them
+    # to the canonical names so _point_coords surfaces station lon/lat.
+    "Location_Longitude": "longitude",
+    "Location_Latitude": "latitude",
 }
 _CANONICAL_COORD_ATTRS = {
     "parameter_code": {"long_name": "USGS parameter code"},
@@ -786,7 +834,9 @@ def _assemble(self, work, inst_cols, ancillary, has_unit):
         )
         data_vars = {
             "value": ("obs", work["value"].to_numpy()),
-            "row_size": ("timeseries", row_size.to_numpy().astype("int32")),
+            # int64 (not int32): a single long, high-frequency series can exceed
+            # 2^31 observations, and the select_series cumsum must not overflow.
+            "row_size": ("timeseries", row_size.to_numpy().astype("int64")),
         }
         for c in ancillary:
             data_vars[c] = ("obs", work[c].to_numpy())
@@ -899,16 +949,25 @@ def _build_series(self, work, group_cols, ancillary, has_unit):
 
     def _variable_datasets(self, work, group_cols, ancillary, has_unit):
         """One pivoted ``(site, time)`` Dataset per (parameter, statistic)."""
-        datasets, used = [], set()
+        # First pass: gather each group's identity and base name, so naming can
+        # see the whole set (a bare name is only used when it is unambiguous).
+        specs = []
         for _, group in work.groupby(group_cols, dropna=False):
             pcode = _first_present(group, "parameter_code")
             stat = _first_present(group, "statistic_id")
-            group_units = group["unit_of_measure"].dropna().unique() if has_unit else ()
-            unit = group_units[0] if len(group_units) else None
             desc = self.series_meta.get(str(pcode), {}) if pcode is not None else {}
-
-            name = self._variable_name(desc, pcode, stat, used)
-            used.add(name)
+            base = _slug(_none_if_nan(desc.get("parameter_name")) or pcode or "value")
+            specs.append((group, pcode, stat, desc, base))
+        names = self._disambiguate([s[4] for s in specs], [(s[1], s[2]) for s in specs])
+
+        datasets = []
+        for (group, pcode, stat, desc, _base), name in zip(specs, names):
+            # Sort the units so the chosen label is deterministic across pulls
+            # (values are not converted either way; see the multi-unit warning).
+            group_units = (
+                sorted(group["unit_of_measure"].dropna().unique()) if has_unit else []
+            )
+            unit = group_units[0] if group_units else None
 
             if len(group_units) > 1:
                 # One variable can carry only one ``units`` attr; surface the
@@ -951,15 +1010,34 @@ def _variable_datasets(self, work, group_cols, ancillary, has_unit):
         return datasets
 
     @staticmethod
-    def _variable_name(desc, pcode, stat, used):
-        """A unique slug for a variable; disambiguate same-parameter series."""
-        name = _slug(_none_if_nan(desc.get("parameter_name")) or pcode or "value")
-        if name in used:  # same parameter, different statistic -> distinct var
-            op = CF_CELL_METHODS.get(str(stat)) or (str(stat) if stat else None)
-            name = f"{name}_{_slug(op)}" if op else name
-        while name in used:
-            name += "_x"
-        return name
+    def _disambiguate(bases, keys):
+        """Map per-group base slugs to unique, deterministic variable names.
+
+        ``keys[i]`` is the group's ``(parameter_code, statistic_id)``. A base used
+        by exactly one group stays bare (e.g. ``discharge``); a base shared by
+        several groups is disambiguated for *all* of them -- by the statistic's
+        cell-method operator (``discharge_maximum`` / ``discharge_mean``), falling
+        back to the statistic id then the parameter code -- so a bare name never
+        silently refers to an arbitrary one of several same-named series.
+        """
+        counts: dict[str, int] = {}
+        for b in bases:
+            counts[b] = counts.get(b, 0) + 1
+        names, used = [], set()
+        for base, (pcode, stat) in zip(bases, keys):
+            if counts[base] == 1:
+                name = base
+            else:
+                op = CF_CELL_METHODS.get(str(stat)) if stat is not None else None
+                suffix = op or (str(stat) if stat is not None else None)
+                name = f"{base}_{_slug(suffix)}" if suffix else base
+                if name == base or name in used:  # statistic didn't separate them
+                    name = f"{base}_{_slug(pcode)}" if pcode is not None else base
+            while name in used:
+                name += "_x"
+            used.add(name)
+            names.append(name)
+        return names
 
 
 class _StatsBuilder(_DatasetBuilder):
diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py
@@ -655,6 +655,41 @@ def test_select_series_returns_time_indexed_single_series():
     assert s["value"].sel(time="2024-06-01").item() == 100
 
 
+def test_dense_same_parameter_two_statistics_no_bare_name():
+    # 00060 under both 00001 (max) and 00003 (mean): the bare 'discharge' name is
+    # ambiguous, so BOTH variables are disambiguated by their cell method -- no
+    # order-dependent bare 'discharge' that silently means one of them.
+    mx = _daily_frame(values=(500,), times=("2024-06-01",))
+    mx["statistic_id"] = "00001"
+    mn = _daily_frame(values=(100,), times=("2024-06-01",))
+    ds = wdx._build_dense(
+        pd.concat([mx, mn]), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    assert "discharge" not in ds.data_vars  # no bare (ambiguous) name
+    assert {"discharge_maximum", "discharge_mean"} <= set(ds.data_vars)
+    assert ds["discharge_maximum"].attrs["cell_methods"] == "time: maximum"
+    assert ds["discharge_mean"].attrs["cell_methods"] == "time: mean"
+
+
+def test_dense_single_statistic_keeps_bare_name():
+    # The common single-statistic case keeps the clean bare name.
+    ds = wdx._build_dense(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    assert "discharge" in ds.data_vars
+
+
+def test_select_series_matches_nan_instance_key():
+    # An instance whose key is null (samples with no sample_fraction) must be
+    # selectable by passing None, since `== NaN` never matches.
+    df = _samples_frame()
+    df["Result_SampleFraction"] = None
+    ds = _samples_ds(df)
+    s = wdx.select_series(ds, characteristic="Temperature, water", sample_fraction=None)
+    assert set(s.sizes) == {"time"}
+    assert "value" in s.data_vars
+
+
 def test_select_series_ambiguous_raises():
     # selecting by site alone matches both instances -> ask for more keys
     ds = _two_instance_ragged()
@@ -670,8 +705,11 @@ def test_select_series_no_match_raises():
 
 def test_select_series_unknown_key_raises():
     ds = _two_instance_ragged()
-    with pytest.raises(KeyError, match="not a per-series coordinate"):
+    with pytest.raises(KeyError, match="not a per-series identity coordinate"):
         wdx.select_series(ds, bogus="x")
+    # descriptor coords (lon/lat/unit/HUC/state) are not selectable identity keys
+    with pytest.raises(KeyError, match="not a per-series identity coordinate"):
+        wdx.select_series(ds, unit_of_measure="ft^3/s")
 
 
 def test_select_series_on_dense_raises_helpful_error():
@@ -760,6 +798,20 @@ def _samples_ds(frame):
     )
 
 
+def test_samples_surface_lonlat_from_location_columns():
+    # Samples carry position as Location_Latitude/Location_Longitude (no OGC
+    # geometry); the dataset must still get numeric longitude/latitude coords.
+    frame = _samples_frame()
+    frame["Location_Longitude"] = [-90.44]
+    frame["Location_Latitude"] = [43.19]
+    ds = _samples_ds(frame)
+    assert "longitude" in ds.coords and "latitude" in ds.coords
+    assert ds["longitude"].dtype.kind == "f"
+    assert float(ds["longitude"].values[0]) == -90.44
+    assert float(ds["latitude"].values[0]) == 43.19
+    assert ds["longitude"].attrs["units"] == "degrees_east"
+
+
 def test_build_samples_single_characteristic():
     ds = _samples_ds(_samples_frame())
     assert set(ds.sizes) == {"obs", "timeseries"}
@@ -988,3 +1040,57 @@ def fake(monitoring_location_id):
     wdx._FIELD_CACHE._entries["Y"] = {"params": {}, "site": {}}
     wdx.clear_metadata_cache()
     assert len(wdx._TS_CACHE) == 0 and len(wdx._FIELD_CACHE) == 0
+
+
+def test_metadata_missing_site_is_not_negatively_cached():
+    # A site the metadata endpoint returns nothing for must NOT be cached as an
+    # empty entry (which would never be retried); a later call re-fetches it.
+    calls = []
+
+    def fake(monitoring_location_id):
+        calls.append(list(monitoring_location_id))
+        # respond only for S1, never for S2
+        rows = [
+            {
+                "monitoring_location_id": s,
+                "parameter_code": "00060",
+                "parameter_name": s,
+            }
+            for s in monitoring_location_id
+            if s == "S1"
+        ]
+        return pd.DataFrame(rows), SimpleNamespace(url=None)
+
+    cache = wdx._MetadataCache(fake)
+    cache.lookup(["S1", "S2"])
+    cache.lookup(["S1", "S2"])
+    # S1 cached (hit, not re-fetched); S2 never cached, so it is re-requested.
+    assert calls[0] == ["S1", "S2"]
+    assert calls[1] == ["S2"]  # only the still-uncached S2
+    assert "S1" in cache._entries and "S2" not in cache._entries
+
+
+def test_metadata_lookup_survives_within_batch_eviction():
+    # A single pull whose site count exceeds maxsize must still return metadata
+    # for every requested site, even though the bounded cache can't hold them all.
+    sites = ["S0", "S1", "S2", "S3", "S4"]
+
+    def fake(monitoring_location_id):
+        rows = [
+            {
+                "monitoring_location_id": s,
+                "parameter_code": f"p{s}",  # distinct per site
+                "parameter_name": f"name-{s}",
+                "hydrologic_unit_code": f"huc-{s}",
+            }
+            for s in monitoring_location_id
+        ]
+        return pd.DataFrame(rows), SimpleNamespace(url=None)
+
+    cache = wdx._MetadataCache(fake, maxsize=2)
+    param_meta, site_meta = cache.lookup(sites)
+    # every requested site's metadata is in the result even though the bounded
+    # cache evicted most of the just-fetched batch.
+    assert {f"p{s}" for s in sites} <= set(param_meta)
+    assert set(site_meta) == set(sites)
+    assert len(cache) <= 2  # cache stayed bounded