Skip to content

Commit 5f95a1d

Browse files
thodson-usgs and claude committed
feat(waterdata): extend hash-ID drop to get_stats_por / get_stats_date_range
The OGC ``get_*`` functions in the prior commit drop hash columns through ``get_ogc_data``. The statistics services (which return JSON through ``get_stats_data`` rather than OGC features) bypassed that path, so ``get_stats_por`` and ``get_stats_date_range`` were still returning ``computation_id`` (UUID) and ``parent_time_series_id`` (hex hash) by default.

This commit:

- Adds ``computation_id`` to ``_HASH_ID_COLUMNS`` (``parent_time_series_id`` was already there).
- Plumbs ``include_hash_ids: bool = False`` through ``get_stats_data``, ``get_stats_por``, and ``get_stats_date_range``.
- Drops the hash columns at the end of ``get_stats_data``, after ``_expand_percentiles`` (which still needs ``computation_id`` as a join key while it explodes the percentile lists into rows).
- Updates ``test_get_stats_por_expanded_false`` / ``test_get_stats_date_range`` to reflect the new column count and adds ``test_get_stats_por_include_hash_ids`` documenting the opt-in.

Discovered while running a live-API sweep across every public waterdata ``get_*`` function — the OGC services now pass, the stats ones used to leak, and this commit closes that gap.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 21b95eb commit 5f95a1d

3 files changed

Lines changed: 67 additions & 5 deletions

File tree

dataretrieval/waterdata/api.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2563,6 +2563,7 @@ def get_stats_por(
25632563
site_type_name: str | Iterable[str] | None = None,
25642564
parameter_code: str | Iterable[str] | None = None,
25652565
expand_percentiles: bool = True,
2566+
include_hash_ids: bool = False,
25662567
) -> tuple[pd.DataFrame, BaseMetadata]:
25672568
"""Get day-of-year and month-of-year water data statistics from the
25682569
USGS Water Data API.
@@ -2641,6 +2642,13 @@ def get_stats_por(
26412642
argument will return both the "values" column, containing the list
26422643
of percentile threshold values, and a "value" column, containing
26432644
the singular summary value for the other statistics.
2645+
include_hash_ids : boolean, optional
2646+
If False (default), the per-computation UUID (``computation_id``)
2647+
and the upstream time-series hex hash (``parent_time_series_id``)
2648+
are dropped from the returned DataFrame. Stable identifiers
2649+
(``monitoring_location_id``, ``parameter_code``, the time keys)
2650+
are kept. Set to True to restore the legacy behavior of
2651+
including every column.
26442652
26452653
Examples
26462654
--------
@@ -2665,10 +2673,13 @@ def get_stats_por(
26652673
... )
26662674
"""
26672675
# Build argument dictionary, omitting None values
2668-
params = _get_args(locals(), exclude={"expand_percentiles"})
2676+
params = _get_args(locals(), exclude={"expand_percentiles", "include_hash_ids"})
26692677

26702678
return get_stats_data(
2671-
args=params, service="observationNormals", expand_percentiles=expand_percentiles
2679+
args=params,
2680+
service="observationNormals",
2681+
expand_percentiles=expand_percentiles,
2682+
include_hash_ids=include_hash_ids,
26722683
)
26732684

26742685

@@ -2687,6 +2698,7 @@ def get_stats_date_range(
26872698
site_type_name: str | Iterable[str] | None = None,
26882699
parameter_code: str | Iterable[str] | None = None,
26892700
expand_percentiles: bool = True,
2701+
include_hash_ids: bool = False,
26902702
) -> tuple[pd.DataFrame, BaseMetadata]:
26912703
"""Get monthly and annual water data statistics from the USGS Water Data API.
26922704
This service (called the "observationIntervals" endpoint on api.waterdata.usgs.gov)
@@ -2769,6 +2781,13 @@ def get_stats_date_range(
27692781
argument will return both the "values" column, containing the list
27702782
of percentile threshold values, and a "value" column, containing
27712783
the singular summary value for the other statistics.
2784+
include_hash_ids : boolean, optional
2785+
If False (default), the per-computation UUID (``computation_id``)
2786+
and the upstream time-series hex hash (``parent_time_series_id``)
2787+
are dropped from the returned DataFrame. Stable identifiers
2788+
(``monitoring_location_id``, ``parameter_code``, the time keys)
2789+
are kept. Set to True to restore the legacy behavior of
2790+
including every column.
27722791
27732792
Examples
27742793
--------
@@ -2794,12 +2813,13 @@ def get_stats_date_range(
27942813
... )
27952814
"""
27962815
# Build argument dictionary, omitting None values
2797-
params = _get_args(locals(), exclude={"expand_percentiles"})
2816+
params = _get_args(locals(), exclude={"expand_percentiles", "include_hash_ids"})
27982817

27992818
return get_stats_data(
28002819
args=params,
28012820
service="observationIntervals",
28022821
expand_percentiles=expand_percentiles,
2822+
include_hash_ids=include_hash_ids,
28032823
)
28042824

28052825

dataretrieval/waterdata/utils.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,9 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s
187187
"parent_time_series_id",
188188
"field_visit_id",
189189
"field_measurements_series_id",
190+
# ``get_stats_*`` (statistics service) output — per-computation
191+
# UUID; ``parent_time_series_id`` is already listed above.
192+
"computation_id",
190193
}
191194
)
192195

@@ -1223,6 +1226,7 @@ def get_stats_data(
12231226
service: str,
12241227
expand_percentiles: bool,
12251228
client: requests.Session | None = None,
1229+
include_hash_ids: bool = False,
12261230
) -> tuple[pd.DataFrame, BaseMetadata]:
12271231
"""
12281232
Retrieves statistical data from a specified endpoint and returns it
@@ -1244,6 +1248,13 @@ def get_stats_data(
12441248
each percentile gets its own row in the returned dataframe. If
12451249
True and user requests a computation_type other than
12461250
percentiles, a percentile column is still returned.
1251+
include_hash_ids : bool, optional
1252+
If False (default), the per-computation UUID (``computation_id``)
1253+
and the upstream time-series hex hash (``parent_time_series_id``)
1254+
are dropped from the returned DataFrame. These IDs are not
1255+
stable across record refreshes; ``computation_id`` is used as a
1256+
join key internally during percentile expansion and only
1257+
removed after that step completes.
12471258
12481259
Returns
12491260
-------
@@ -1320,6 +1331,16 @@ def get_stats_data(
13201331
if expand_percentiles:
13211332
dfs = _expand_percentiles(dfs)
13221333

1334+
# Drop hash-valued ID columns at the end (after
1335+
# ``_expand_percentiles``, which still needs ``computation_id``
1336+
# as a merge key while it explodes the percentile lists into
1337+
# rows). Stable identifiers (``monitoring_location_id``,
1338+
# ``parameter_code``, ``time_of_year``, …) are kept.
1339+
if not include_hash_ids:
1340+
drop_cols = [col for col in dfs.columns if col in _HASH_ID_COLUMNS]
1341+
if drop_cols:
1342+
dfs = dfs.drop(columns=drop_cols)
1343+
13231344
return dfs, BaseMetadata(initial_response)
13241345
finally:
13251346
if close_client:

tests/waterdata_test.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -618,13 +618,31 @@ def test_get_stats_por_expanded_false():
618618
computation_type=["minimum", "percentile"],
619619
)
620620
assert df.shape[0] == 4
621-
assert df.shape[1] == 20 # if geopandas installed, 21 columns if not
621+
# Default: hash IDs (computation_id, parent_time_series_id) dropped → 18 cols.
622+
assert df.shape[1] == 18
623+
assert "computation_id" not in df.columns
624+
assert "parent_time_series_id" not in df.columns
622625
assert "percentile" not in df.columns
623626
assert "percentiles" in df.columns
624627
assert type(df["percentiles"][2]) is list
625628
assert df.loc[~df["percentiles"].isna(), "value"].isnull().all()
626629

627630

631+
def test_get_stats_por_include_hash_ids():
632+
"""``include_hash_ids=True`` preserves the per-computation UUID
633+
and the upstream time-series hex hash that ``get_stats_*`` used
634+
to return unconditionally."""
635+
df, _ = get_stats_por(
636+
monitoring_location_id="USGS-12451000",
637+
parameter_code="00060",
638+
start_date="01-01",
639+
end_date="01-01",
640+
include_hash_ids=True,
641+
)
642+
assert "computation_id" in df.columns
643+
assert "parent_time_series_id" in df.columns
644+
645+
628646
def test_get_stats_date_range():
629647
df, _ = get_stats_date_range(
630648
monitoring_location_id="USGS-12451000",
@@ -635,7 +653,10 @@ def test_get_stats_date_range():
635653
)
636654

637655
assert df.shape[0] == 3
638-
assert df.shape[1] == 20 # if geopandas installed, 21 columns if not
656+
# Default: hash IDs (computation_id, parent_time_series_id) dropped → 18 cols.
657+
assert df.shape[1] == 18
658+
assert "computation_id" not in df.columns
659+
assert "parent_time_series_id" not in df.columns
639660
assert "interval_type" in df.columns
640661
assert "percentile" in df.columns
641662
assert df["interval_type"].isin(["month", "calendar_year", "water_year"]).all()

0 commit comments

Comments
 (0)