DOI-USGS
diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py
Lines changed: 94 additions & 106 deletions
@@ -17,9 +17,12 @@
 * dataset-level attributes carry ``Conventions``, provenance, and the
   request URL.
 
-The wrappers call the underlying getter with ``include_hash=True`` so the
-join key survives, perform the metadata lookup, then drop the opaque hash
-columns from the user-facing result.
+The wrappers call the underlying getter with its default, hash-free output
+-- so the large per-record UUID column is never fetched or materialized --
+and derive the CF attributes directly from the surviving columns
+(``unit_of_measure`` -> ``units``, ``statistic_id`` -> ``cell_methods``,
+``parameter_code`` -> ``standard_name``). Only the human-readable parameter
+name comes from a small, cached metadata lookup keyed by ``parameter_code``.
 
 This module requires the optional ``xarray`` dependency::
 
@@ -79,15 +82,16 @@
     "%": "percent",
 }
 
-# computation_identifier -> the operator in a CF ``cell_methods`` string.
-_CELL_METHOD = {
-    "Mean": "mean",
-    "Sum": "sum",
-    "Maximum": "maximum",
-    "Max At Event Time": "maximum",
-    "Minimum": "minimum",
-    "Median": "median",
-    "Instantaneous": "point",
+# USGS statistic_id -> the operator in a CF ``cell_methods`` string. Read
+# straight from the values frame, so no metadata round-trip is needed to
+# classify the aggregation.
+_STATISTIC_CELL_METHOD = {
+    "00001": "maximum",
+    "00002": "minimum",
+    "00003": "mean",
+    "00006": "sum",
+    "00008": "median",
+    "00011": "point",  # instantaneous
 }
 
 # USGS 5-digit parameter code -> CF standard_name. Deliberately conservative;
@@ -109,62 +113,43 @@
 _TS_META_CACHE: dict[str, dict[str, dict]] = {}
 _FIELD_META_CACHE: dict[str, dict[str, dict]] = {}
 
-_TS_DESCRIPTORS = (
-    "parameter_code",
-    "parameter_name",
-    "parameter_description",
-    "unit_of_measure",
-    "statistic_id",
-    "computation_period_identifier",
-    "computation_identifier",
-)
-_FIELD_DESCRIPTORS = ("parameter_code", "parameter_name", "parameter_description")
+# Only the human-readable name is sourced from the metadata endpoint; units,
+# statistic, and parameter code all come from the values frame itself.
+_NAME_DESCRIPTORS = ("parameter_name", "parameter_description")
 
 
-def _lookup(site_ids, cache, getter, id_col, descriptors):
-    """Fetch and cache metadata descriptors keyed by the series id column.
+def _lookup(site_ids, cache, getter):
+    """Fetch and cache ``{parameter_code: {name descriptors}}`` for sites.
 
-    Returns a flat ``{series_id: {descriptor: value}}`` dict covering every
-    requested site. One network call per not-yet-cached batch of sites; the
-    cache is keyed by site so repeated getter calls reuse it.
+    Keyed by ``parameter_code`` (stable and 1:1 with the parameter name), so
+    the lookup needs no hash id. One metadata call per not-yet-cached batch
+    of sites; the cache is keyed by site so repeated getter calls reuse it.
     """
     sites = sorted({str(s) for s in site_ids if _pd.notna(s)})
     todo = [s for s in sites if s not in cache]
     if todo:
-        meta, _ = getter(monitoring_location_id=todo, include_hash=True)
+        meta, _ = getter(monitoring_location_id=todo)
         for s in todo:
             cache[s] = {}
-        if not meta.empty:
-            cols = [c for c in descriptors if c in meta.columns]
+        if not meta.empty and "parameter_code" in meta.columns:
+            cols = [c for c in _NAME_DESCRIPTORS if c in meta.columns]
             for _, row in meta.iterrows():
                 site = row.get("monitoring_location_id")
-                sid = row.get(id_col)
-                if site in cache and _pd.notna(sid):
-                    cache[site][sid] = {c: row.get(c) for c in cols}
+                pcode = row.get("parameter_code")
+                if site in cache and _pd.notna(pcode):
+                    cache[site][str(pcode)] = {c: row.get(c) for c in cols}
     out: dict[str, dict] = {}
     for s in sites:
         out.update(cache.get(s, {}))
     return out
 
 
 def _timeseries_metadata(site_ids):
-    return _lookup(
-        site_ids,
-        _TS_META_CACHE,
-        _api.get_time_series_metadata,
-        "time_series_id",
-        _TS_DESCRIPTORS,
-    )
+    return _lookup(site_ids, _TS_META_CACHE, _api.get_time_series_metadata)
 
 
 def _field_metadata(site_ids):
-    return _lookup(
-        site_ids,
-        _FIELD_META_CACHE,
-        _api.get_field_measurements_metadata,
-        "field_series_id",
-        _FIELD_DESCRIPTORS,
-    )
+    return _lookup(site_ids, _FIELD_META_CACHE, _api.get_field_measurements_metadata)
 
 
 # --- helpers ---------------------------------------------------------------
@@ -182,20 +167,26 @@ def _first(series):
     return nonnull.iloc[0] if len(nonnull) else None
 
 
-def _var_attrs(desc, group, *, default_cell_method, pcode, ancillary, name):
-    """Build the CF attribute dict for one data variable."""
+def _var_attrs(desc, *, unit, pcode, stat, default_cell_method, ancillary, name):
+    """Build the CF attribute dict for one data variable.
+
+    ``unit``, ``pcode`` and ``stat`` are read from the values frame; ``desc``
+    supplies only the human-readable name from the metadata lookup.
+    """
     attrs: dict[str, str] = {}
     long_name = desc.get("parameter_description") or desc.get("parameter_name")
     if long_name and _pd.notna(long_name):
         attrs["long_name"] = str(long_name)
 
-    unit = desc.get("unit_of_measure")
-    if (unit is None or _pd.isna(unit)) and "unit_of_measure" in group:
-        unit = _first(group["unit_of_measure"])
     if unit is not None and _pd.notna(unit):
         attrs["units"] = _UDUNITS.get(str(unit), str(unit))
 
-    op = _CELL_METHOD.get(desc.get("computation_identifier"), default_cell_method)
+    op = (
+        _STATISTIC_CELL_METHOD.get(str(stat))
+        if stat is not None and _pd.notna(stat)
+        else None
+    )
+    op = op or default_cell_method
     if op:
         attrs["cell_methods"] = f"time: {op}"
 
@@ -205,9 +196,6 @@ def _var_attrs(desc, group, *, default_cell_method, pcode, ancillary, name):
             attrs["standard_name"] = sn
         attrs["usgs_parameter_code"] = str(pcode)
 
-    stat = desc.get("statistic_id")
-    if (stat is None or _pd.isna(stat)) and "statistic_id" in group:
-        stat = _first(group["statistic_id"])
     if stat is not None and _pd.notna(stat):
         attrs["usgs_statistic_id"] = str(stat)
 
@@ -254,69 +242,77 @@ def _point_coords(df, inst):
     return lon, lat
 
 
+_INSTANCE = "monitoring_location_id"
+
+
 def _build_timeseries(
     df,
     base_meta,
     *,
     service,
     series_meta,
-    key_col="time_series_id",
     group_cols=("parameter_code", "statistic_id"),
     default_cell_method=None,
 ):
-    """Long values frame -> CF timeSeries Dataset (one var per parameter)."""
+    """Hash-free values frame -> CF timeSeries Dataset (one var per parameter).
+
+    The frame carries no hash columns (the wrappers fetch the default lean
+    output); every CF attribute is derived from ``parameter_code`` /
+    ``statistic_id`` / ``unit_of_measure`` already present, plus the
+    human-readable name from ``series_meta`` keyed by ``parameter_code``.
+    ``time`` and ``monitoring_location_id`` become the coordinates.
+    """
     if df is None or len(df) == 0:
         return _empty_dataset(service, base_meta)
 
-    df = df.copy()
-    # Normalize to naive-UTC so xarray can store datetime64 (it has no tz dtype).
-    times = _pd.to_datetime(df["time"], errors="coerce", utc=True)
-    df["time"] = times.dt.tz_localize(None)
-    df["value"] = _pd.to_numeric(df["value"], errors="coerce")
     group_cols = [c for c in group_cols if c in df.columns]
-
-    # Instance (DSG) dimension: the site, unless two series collide on
-    # (site, parameter, time) -- then fall back to the unambiguous series id.
-    inst = "monitoring_location_id"
-    if key_col in df.columns and df.duplicated(group_cols + [inst, "time"]).any():
-        inst = key_col
-        _warnings.warn(
-            "multiple time series share a (site, parameter); using "
-            f"'{key_col}' as the instance dimension instead of "
-            "'monitoring_location_id'.",
-            stacklevel=3,
-        )
+    ancillary = [c for c in _ANCILLARY if c in df.columns]
+    has_unit = "unit_of_measure" in df.columns
+    # Slim to just the columns we convert, so the heavy frame (and any columns
+    # we ignore) is not copied wholesale.
+    cols = [_INSTANCE, "time", "value", *group_cols, *ancillary]
+    if has_unit:
+        cols.append("unit_of_measure")
+    work = df.loc[:, list(dict.fromkeys(cols))].copy()
+    # Normalize to naive-UTC so xarray can store datetime64 (it has no tz dtype).
+    work["time"] = _pd.to_datetime(
+        work["time"], errors="coerce", utc=True
+    ).dt.tz_localize(None)
+    work["value"] = _pd.to_numeric(work["value"], errors="coerce")
 
     datasets, used = [], set()
-    for _, group in df.groupby(group_cols, dropna=False):
-        sid = _first(group[key_col]) if key_col in group else None
-        desc = series_meta.get(sid, {}) if sid is not None else {}
-        pcode = desc.get("parameter_code") or (
-            _first(group["parameter_code"]) if "parameter_code" in group else None
-        )
+    for _, group in work.groupby(group_cols, dropna=False):
+        pcode = _first(group["parameter_code"]) if "parameter_code" in group else None
+        stat = _first(group["statistic_id"]) if "statistic_id" in group else None
+        unit = _first(group["unit_of_measure"]) if has_unit else None
+        desc = series_meta.get(str(pcode), {}) if pcode is not None else {}
 
         name = _slug(desc.get("parameter_name") or pcode)
-        if name in used:  # disambiguate same-parameter, different-statistic vars
-            comp = desc.get("computation_identifier") or _first(
-                group.get("statistic_id", _pd.Series(dtype=object))
-            )
-            name = f"{name}_{_slug(comp)}" if comp else name
+        if name in used:  # same parameter, different statistic -> distinct var
+            op = _STATISTIC_CELL_METHOD.get(str(stat)) or (str(stat) if stat else None)
+            name = f"{name}_{_slug(op)}" if op else name
         while name in used:
             name += "_x"
         used.add(name)
 
-        ancillary = [c for c in _ANCILLARY if c in group.columns]
-        sub = group.set_index([inst, "time"])[["value", *ancillary]]
+        sub = group.set_index([_INSTANCE, "time"])[["value", *ancillary]]
         if not sub.index.is_unique:
+            _warnings.warn(
+                f"'{name}' has multiple values per (site, time) -- two series "
+                "share this (site, parameter, statistic); keeping the first. "
+                "Filter the query to separate them.",
+                stacklevel=3,
+            )
             sub = sub[~sub.index.duplicated(keep="first")]
         ds_g = sub.to_xarray().rename(
             {"value": name, **{c: f"{name}_{c}" for c in ancillary}}
         )
         ds_g[name].attrs = _var_attrs(
             desc,
-            group,
-            default_cell_method=default_cell_method,
+            unit=unit,
             pcode=pcode,
+            stat=stat,
+            default_cell_method=default_cell_method,
             ancillary=ancillary,
             name=name,
         )
@@ -327,26 +323,20 @@ def _build_timeseries(
     ds = _xr.merge(datasets, combine_attrs="drop_conflicts", join="outer")
     ds.attrs = _dataset_attrs(service, base_meta)
     ds["time"].attrs.setdefault("standard_name", "time")
-    if inst in ds.coords:
-        ds[inst].attrs.setdefault("cf_role", "timeseries_id")
+    if _INSTANCE in ds.coords:
+        ds[_INSTANCE].attrs.setdefault("cf_role", "timeseries_id")
 
-    coords = _point_coords(df, inst)
+    coords = _point_coords(df, _INSTANCE)
     if coords is not None:
         lon, lat = coords
-        order = list(ds[inst].values)
+        order = list(ds[_INSTANCE].values)
         ds = ds.assign_coords(
-            longitude=(inst, [lon.get(k) for k in order]),
-            latitude=(inst, [lat.get(k) for k in order]),
+            longitude=(_INSTANCE, [lon.get(k) for k in order]),
+            latitude=(_INSTANCE, [lat.get(k) for k in order]),
         )
         ds["longitude"].attrs = {"standard_name": "longitude", "units": "degrees_east"}
         ds["latitude"].attrs = {"standard_name": "latitude", "units": "degrees_north"}
 
-    # Surface the human-readable site as a coordinate when the id is the dim.
-    if inst == key_col and "monitoring_location_id" in df.columns:
-        sites = df.drop_duplicates(key_col).set_index(key_col)["monitoring_location_id"]
-        ds = ds.assign_coords(
-            monitoring_location_id=(inst, sites.reindex(ds[inst].values).values)
-        )
     return ds
 
 
@@ -390,7 +380,8 @@ def _xr_doc(func):
 def _timeseries_wrapper(func, *, service, default_cell_method=None):
     @_wraps(func)
     def wrapper(*args, **kwargs):
-        kwargs.setdefault("include_hash", True)
+        # Default (hash-free) fetch: the per-record UUID is never requested or
+        # materialized; CF attributes come from the surviving columns.
         df, base_meta = func(*args, **kwargs)
         return _build_timeseries(
             df,
@@ -407,14 +398,12 @@ def wrapper(*args, **kwargs):
 def _field_wrapper(func, *, service):
     @_wraps(func)
     def wrapper(*args, **kwargs):
-        kwargs.setdefault("include_hash", True)
         df, base_meta = func(*args, **kwargs)
         return _build_timeseries(
             df,
             base_meta,
             service=service,
             series_meta=_field_metadata(_sites(df)),
-            key_col="field_measurements_series_id",
             group_cols=("parameter_code",),
             default_cell_method="point",
         )
@@ -426,7 +415,6 @@ def wrapper(*args, **kwargs):
 def _stats_wrapper(func, *, service):
     @_wraps(func)
     def wrapper(*args, **kwargs):
-        kwargs.setdefault("include_hash", True)
         df, base_meta = func(*args, **kwargs)
         return _build_stats(df, base_meta, service)