feat(waterdata): extend hash-ID drop to get_stats_por / get_stats_date_range

thodson-usgs · claude · thodson-usgs · commit 5f95a1d10da6 · 2026-05-17T17:19:44.000-05:00
The OGC ``get_*`` functions in the prior commit drop hash columns
through ``get_ogc_data``. The statistics services (which return JSON
through ``get_stats_data`` rather than OGC features) bypassed that
path, so ``get_stats_por`` and ``get_stats_date_range`` were still
returning ``computation_id`` (UUID) and ``parent_time_series_id``
(hex hash) by default.

This commit:
- Adds ``computation_id`` to ``_HASH_ID_COLUMNS``
  (``parent_time_series_id`` was already there).
- Plumbs ``include_hash_ids: bool = False`` through ``get_stats_data``,
  ``get_stats_por``, and ``get_stats_date_range``.
- Drops the hash columns at the end of ``get_stats_data``, after
  ``_expand_percentiles`` (which still needs ``computation_id`` as a
  join key while it explodes the percentile lists into rows).
- Updates ``test_get_stats_por_expanded_false`` /
  ``test_get_stats_date_range`` to reflect the new column count and
  adds ``test_get_stats_por_include_hash_ids`` documenting the opt-in.

Discovered while running a live-API sweep across every public
waterdata ``get_*`` function: the OGC services already passed, but the
stats services still leaked hash IDs. This commit closes that gap.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
@@ -2563,6 +2563,7 @@ def get_stats_por(
     site_type_name: str | Iterable[str] | None = None,
     parameter_code: str | Iterable[str] | None = None,
     expand_percentiles: bool = True,
+    include_hash_ids: bool = False,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
     """Get day-of-year and month-of-year water data statistics from the
     USGS Water Data API.
@@ -2641,6 +2642,13 @@ def get_stats_por(
         argument will return both the "values" column, containing the list
         of percentile threshold values, and a "value" column, containing
         the singular summary value for the other statistics.
+    include_hash_ids : boolean, optional
+        If False (default), the per-computation UUID (``computation_id``)
+        and the upstream time-series hex hash (``parent_time_series_id``)
+        are dropped from the returned DataFrame. Stable identifiers
+        (``monitoring_location_id``, ``parameter_code``, the time keys)
+        are kept. Set to True to restore the legacy behavior of
+        including every column.
 
     Examples
     --------
@@ -2665,10 +2673,13 @@ def get_stats_por(
         ... )
     """
     # Build argument dictionary, omitting None values
-    params = _get_args(locals(), exclude={"expand_percentiles"})
+    params = _get_args(locals(), exclude={"expand_percentiles", "include_hash_ids"})
 
     return get_stats_data(
-        args=params, service="observationNormals", expand_percentiles=expand_percentiles
+        args=params,
+        service="observationNormals",
+        expand_percentiles=expand_percentiles,
+        include_hash_ids=include_hash_ids,
     )
 
 
@@ -2687,6 +2698,7 @@ def get_stats_date_range(
     site_type_name: str | Iterable[str] | None = None,
     parameter_code: str | Iterable[str] | None = None,
     expand_percentiles: bool = True,
+    include_hash_ids: bool = False,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
     """Get monthly and annual water data statistics from the USGS Water Data API.
     This service (called the "observationIntervals" endpoint on api.waterdata.usgs.gov)
@@ -2769,6 +2781,13 @@ def get_stats_date_range(
         argument will return both the "values" column, containing the list
         of percentile threshold values, and a "value" column, containing
         the singular summary value for the other statistics.
+    include_hash_ids : boolean, optional
+        If False (default), the per-computation UUID (``computation_id``)
+        and the upstream time-series hex hash (``parent_time_series_id``)
+        are dropped from the returned DataFrame. Stable identifiers
+        (``monitoring_location_id``, ``parameter_code``, the time keys)
+        are kept. Set to True to restore the legacy behavior of
+        including every column.
 
     Examples
     --------
@@ -2794,12 +2813,13 @@ def get_stats_date_range(
         ... )
     """
     # Build argument dictionary, omitting None values
-    params = _get_args(locals(), exclude={"expand_percentiles"})
+    params = _get_args(locals(), exclude={"expand_percentiles", "include_hash_ids"})
 
     return get_stats_data(
         args=params,
         service="observationIntervals",
         expand_percentiles=expand_percentiles,
+        include_hash_ids=include_hash_ids,
     )
 
 
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
@@ -187,6 +187,9 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s
         "parent_time_series_id",
         "field_visit_id",
         "field_measurements_series_id",
+        # ``get_stats_*`` (statistics service) output — per-computation
+        # UUID; ``parent_time_series_id`` is already listed above.
+        "computation_id",
     }
 )
 
@@ -1223,6 +1226,7 @@ def get_stats_data(
     service: str,
     expand_percentiles: bool,
     client: requests.Session | None = None,
+    include_hash_ids: bool = False,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
     """
     Retrieves statistical data from a specified endpoint and returns it
@@ -1244,6 +1248,13 @@ def get_stats_data(
         each percentile gets its own row in the returned dataframe. If
         True and user requests a computation_type other than
         percentiles, a percentile column is still returned.
+    include_hash_ids : bool, optional
+        If False (default), the per-computation UUID (``computation_id``)
+        and the upstream time-series hex hash (``parent_time_series_id``)
+        are dropped from the returned DataFrame. These IDs are not
+        stable across record refreshes; ``computation_id`` is used as a
+        join key internally during percentile expansion and only
+        removed after that step completes.
 
     Returns
     -------
@@ -1320,6 +1331,16 @@ def get_stats_data(
         if expand_percentiles:
             dfs = _expand_percentiles(dfs)
 
+        # Drop hash-valued ID columns at the end (after
+        # ``_expand_percentiles``, which still needs ``computation_id``
+        # as a merge key while it explodes the percentile lists into
+        # rows). Stable identifiers (``monitoring_location_id``,
+        # ``parameter_code``, ``time_of_year``, …) are kept.
+        if not include_hash_ids:
+            drop_cols = [col for col in dfs.columns if col in _HASH_ID_COLUMNS]
+            if drop_cols:
+                dfs = dfs.drop(columns=drop_cols)
+
         return dfs, BaseMetadata(initial_response)
     finally:
         if close_client:
diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
@@ -618,13 +618,31 @@ def test_get_stats_por_expanded_false():
         computation_type=["minimum", "percentile"],
     )
     assert df.shape[0] == 4
-    assert df.shape[1] == 20  # if geopandas installed, 21 columns if not
+    # Default: hash IDs (computation_id, parent_time_series_id) dropped → 18 cols.
+    assert df.shape[1] == 18
+    assert "computation_id" not in df.columns
+    assert "parent_time_series_id" not in df.columns
     assert "percentile" not in df.columns
     assert "percentiles" in df.columns
     assert type(df["percentiles"][2]) is list
     assert df.loc[~df["percentiles"].isna(), "value"].isnull().all()
 
 
+def test_get_stats_por_include_hash_ids():
+    """``include_hash_ids=True`` preserves the per-computation UUID
+    and the upstream time-series hex hash that ``get_stats_*`` used
+    to return unconditionally."""
+    df, _ = get_stats_por(
+        monitoring_location_id="USGS-12451000",
+        parameter_code="00060",
+        start_date="01-01",
+        end_date="01-01",
+        include_hash_ids=True,
+    )
+    assert "computation_id" in df.columns
+    assert "parent_time_series_id" in df.columns
+
+
 def test_get_stats_date_range():
     df, _ = get_stats_date_range(
         monitoring_location_id="USGS-12451000",
@@ -635,7 +653,10 @@ def test_get_stats_date_range():
     )
 
     assert df.shape[0] == 3
-    assert df.shape[1] == 20  # if geopandas installed, 21 columns if not
+    # Default: hash IDs (computation_id, parent_time_series_id) dropped → 18 cols.
+    assert df.shape[1] == 18
+    assert "computation_id" not in df.columns
+    assert "parent_time_series_id" not in df.columns
     assert "interval_type" in df.columns
     assert "percentile" in df.columns
     assert df["interval_type"].isin(["month", "calendar_year", "water_year"]).all()