Skip to content

Commit f1755e7

Browse files
thodson-usgs and claude committed
refactor(waterdata): drop the pandas-path hash-dropping; keep it in xarray
Scope this PR to the xarray module. The earlier work added an include_hash flag and a hash-valued-ID drop (plus a server-side queryables whitelist) to the plain DataFrame getters; revert that public-API surface so api.py / utils.py and their tests match main, and the getters again return every column.

The xarray datasets stay hash-free on their own: the timeseries/samples builders surface only the columns they convert (so per-record UUIDs and per-series join keys never appear), and _build_stats now drops the stats service's computation_id / parent_time_series_id explicitly since its flat conversion keeps every column. _fetch still swallows a stray include_hash kwarg so passing it to an xarray wrapper stays harmless.

Adds tests pinning the hash-free guarantee for the stats and ragged paths.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent ba74979 commit f1755e7

6 files changed

Lines changed: 90 additions & 612 deletions

File tree

dataretrieval/waterdata/api.py

Lines changed: 4 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
SAMPLES_URL,
3434
_check_profiles,
3535
_default_headers,
36-
_drop_hash_columns,
3736
_get_args,
3837
get_ogc_data,
3938
get_stats_data,
@@ -62,7 +61,6 @@ def get_daily(
6261
filter: str | None = None,
6362
filter_lang: FILTER_LANG | None = None,
6463
convert_type: bool = True,
65-
include_hash: bool = False,
6664
) -> tuple[pd.DataFrame, BaseMetadata]:
6765
"""Daily data provide one data value to represent water conditions for the
6866
day.
@@ -195,9 +193,6 @@ def get_daily(
195193
and the lexicographic-comparison pitfall.
196194
convert_type : boolean, optional
197195
If True, converts columns to appropriate types.
198-
include_hash : boolean, optional
199-
If False (default), drop the opaque hash-valued ID columns. Set True to
200-
keep the secondary hashes (e.g. ``time_series_id``) that join to metadata.
201196
202197
Returns
203198
-------
@@ -281,7 +276,6 @@ def get_continuous(
281276
filter: str | None = None,
282277
filter_lang: FILTER_LANG | None = None,
283278
convert_type: bool = True,
284-
include_hash: bool = False,
285279
) -> tuple[pd.DataFrame, BaseMetadata]:
286280
"""
287281
Continuous data provide instantaneous water conditions.
@@ -409,9 +403,6 @@ def get_continuous(
409403
convert_type : boolean, optional
410404
If True, the function will convert the data to dates and qualifier to
411405
string vector
412-
include_hash : boolean, optional
413-
If False (default), drop the opaque hash-valued ID columns. Set True to
414-
keep the secondary hashes (e.g. ``time_series_id``) that join to metadata.
415406
416407
Returns
417408
-------
@@ -505,7 +496,6 @@ def get_monitoring_locations(
505496
filter: str | None = None,
506497
filter_lang: FILTER_LANG | None = None,
507498
convert_type: bool = True,
508-
include_hash: bool = False,
509499
) -> tuple[pd.DataFrame, BaseMetadata]:
510500
"""Location information is basic information about the monitoring location
511501
including the name, identifier, agency responsible for data collection, and
@@ -721,9 +711,6 @@ def get_monitoring_locations(
721711
and the lexicographic-comparison pitfall.
722712
convert_type : boolean, optional
723713
If True, converts columns to appropriate types.
724-
include_hash : boolean, optional
725-
If False (default), drop the opaque hash-valued ID columns. Set True to
726-
keep the secondary hashes (e.g. ``time_series_id``) that join to metadata.
727714
728715
Returns
729716
-------
@@ -787,7 +774,6 @@ def get_time_series_metadata(
787774
filter: str | None = None,
788775
filter_lang: FILTER_LANG | None = None,
789776
convert_type: bool = True,
790-
include_hash: bool = False,
791777
) -> tuple[pd.DataFrame, BaseMetadata]:
792778
"""Daily data and continuous measurements are grouped into time series,
793779
which represent a collection of observations of a single parameter,
@@ -948,9 +934,6 @@ def get_time_series_metadata(
948934
and the lexicographic-comparison pitfall.
949935
convert_type : boolean, optional
950936
If True, converts columns to appropriate types.
951-
include_hash : boolean, optional
952-
If False (default), drop the opaque hash-valued ID columns. Set True to
953-
keep the secondary hashes (e.g. ``time_series_id``) that join to metadata.
954937
955938
Returns
956939
-------
@@ -1048,7 +1031,6 @@ def get_combined_metadata(
10481031
filter: str | None = None,
10491032
filter_lang: FILTER_LANG | None = None,
10501033
convert_type: bool = True,
1051-
include_hash: bool = False,
10521034
) -> tuple[pd.DataFrame, BaseMetadata]:
10531035
"""Get combined monitoring-location and time-series metadata.
10541036
@@ -1149,9 +1131,6 @@ def get_combined_metadata(
11491131
and the lexicographic-comparison pitfall.
11501132
convert_type : boolean, optional
11511133
If True, converts columns to appropriate types.
1152-
include_hash : boolean, optional
1153-
If False (default), drop the opaque hash-valued ID columns. Set True to
1154-
keep the secondary hashes (e.g. ``time_series_id``) that join to metadata.
11551134
11561135
Returns
11571136
-------
@@ -1240,7 +1219,6 @@ def get_latest_continuous(
12401219
filter: str | None = None,
12411220
filter_lang: FILTER_LANG | None = None,
12421221
convert_type: bool = True,
1243-
include_hash: bool = False,
12441222
) -> tuple[pd.DataFrame, BaseMetadata]:
12451223
"""This endpoint provides the most recent observation for each time series
12461224
of continuous data. Continuous data are collected via automated sensors
@@ -1370,9 +1348,6 @@ def get_latest_continuous(
13701348
and the lexicographic-comparison pitfall.
13711349
convert_type : boolean, optional
13721350
If True, converts columns to appropriate types.
1373-
include_hash : boolean, optional
1374-
If False (default), drop the opaque hash-valued ID columns. Set True to
1375-
keep the secondary hashes (e.g. ``time_series_id``) that join to metadata.
13761351
13771352
Returns
13781353
-------
@@ -1439,7 +1414,6 @@ def get_latest_daily(
14391414
filter: str | None = None,
14401415
filter_lang: FILTER_LANG | None = None,
14411416
convert_type: bool = True,
1442-
include_hash: bool = False,
14431417
) -> tuple[pd.DataFrame, BaseMetadata]:
14441418
"""Daily data provide one data value to represent water conditions for the
14451419
day.
@@ -1571,9 +1545,6 @@ def get_latest_daily(
15711545
and the lexicographic-comparison pitfall.
15721546
convert_type : boolean, optional
15731547
If True, converts columns to appropriate types.
1574-
include_hash : boolean, optional
1575-
If False (default), drop the opaque hash-valued ID columns. Set True to
1576-
keep the secondary hashes (e.g. ``time_series_id``) that join to metadata.
15771548
15781549
Returns
15791550
-------
@@ -1641,7 +1612,6 @@ def get_field_measurements(
16411612
filter: str | None = None,
16421613
filter_lang: FILTER_LANG | None = None,
16431614
convert_type: bool = True,
1644-
include_hash: bool = False,
16451615
) -> tuple[pd.DataFrame, BaseMetadata]:
16461616
"""Field measurements are physically measured values collected during a
16471617
visit to the monitoring location. Field measurements consist of measurements
@@ -1763,9 +1733,6 @@ def get_field_measurements(
17631733
and the lexicographic-comparison pitfall.
17641734
convert_type : boolean, optional
17651735
If True, converts columns to appropriate types.
1766-
include_hash : boolean, optional
1767-
If False (default), drop the opaque hash-valued ID columns. Set True to
1768-
keep the secondary hashes (e.g. ``time_series_id``) that join to metadata.
17691736
17701737
Returns
17711738
-------
@@ -1829,7 +1796,6 @@ def get_field_measurements_metadata(
18291796
filter: str | None = None,
18301797
filter_lang: FILTER_LANG | None = None,
18311798
convert_type: bool = True,
1832-
include_hash: bool = False,
18331799
) -> tuple[pd.DataFrame, BaseMetadata]:
18341800
"""Get field-measurement metadata: one row per (location, parameter) series.
18351801
@@ -1885,9 +1851,6 @@ def get_field_measurements_metadata(
18851851
and the lexicographic-comparison pitfall.
18861852
convert_type : boolean, optional
18871853
If True, converts columns to appropriate types.
1888-
include_hash : boolean, optional
1889-
If False (default), drop the opaque hash-valued ID columns. Set True to
1890-
keep the secondary hashes (e.g. ``time_series_id``) that join to metadata.
18911854
18921855
Returns
18931856
-------
@@ -1954,7 +1917,6 @@ def get_peaks(
19541917
filter: str | None = None,
19551918
filter_lang: FILTER_LANG | None = None,
19561919
convert_type: bool = True,
1957-
include_hash: bool = False,
19581920
) -> tuple[pd.DataFrame, BaseMetadata]:
19591921
"""Get the annual peak streamflow / stage record for a monitoring location.
19601922
@@ -2013,9 +1975,6 @@ def get_peaks(
20131975
and the lexicographic-comparison pitfall.
20141976
convert_type : boolean, optional
20151977
If True, converts columns to appropriate types.
2016-
include_hash : boolean, optional
2017-
If False (default), drop the opaque hash-valued ID columns. Set True to
2018-
keep the secondary hashes (e.g. ``time_series_id``) that join to metadata.
20191978
20201979
Returns
20211980
-------
@@ -2193,7 +2152,6 @@ def get_samples(
21932152
pointLocationWithinMiles: float | None = None,
21942153
projectIdentifier: str | Iterable[str] | None = None,
21952154
recordIdentifierUserSupplied: str | Iterable[str] | None = None,
2196-
include_hash: bool = False,
21972155
) -> tuple[pd.DataFrame, BaseMetadata]:
21982156
"""Search Samples database for USGS water quality data.
21992157
This is a wrapper function for the Samples database API. All potential
@@ -2324,9 +2282,6 @@ def get_samples(
23242282
recordIdentifierUserSupplied : string or iterable of strings, optional
23252283
Internal AQS record identifier that returns 1 entry. Only available
23262284
for the "results" service.
2327-
include_hash : boolean, optional
2328-
If False (default), drop the opaque per-activity / per-result UUID columns
2329-
(``Activity_ActivityIdentifier``, ``Result_MeasureIdentifier``).
23302285
23312286
Returns
23322287
-------
@@ -2376,7 +2331,7 @@ def get_samples(
23762331
_check_profiles(service, profile)
23772332

23782333
# Build argument dictionary, omitting None values
2379-
params = _get_args(locals(), exclude={"ssl_check", "profile", "include_hash"})
2334+
params = _get_args(locals(), exclude={"ssl_check", "profile"})
23802335

23812336
params.update({"mimeType": "text/csv"})
23822337

@@ -2399,7 +2354,6 @@ def get_samples(
23992354

24002355
df = pd.read_csv(StringIO(response.text), delimiter=",")
24012356
df = _attach_datetime_columns(df)
2402-
df = _drop_hash_columns(df, include_hash)
24032357

24042358
return df, BaseMetadata(response)
24052359

@@ -2492,7 +2446,6 @@ def get_stats_por(
24922446
site_type_name: str | Iterable[str] | None = None,
24932447
parameter_code: str | Iterable[str] | None = None,
24942448
expand_percentiles: bool = True,
2495-
include_hash: bool = False,
24962449
) -> tuple[pd.DataFrame, BaseMetadata]:
24972450
"""Get day-of-year and month-of-year water data statistics from the
24982451
USGS Water Data API.
@@ -2571,9 +2524,6 @@ def get_stats_por(
25712524
argument will return both the "values" column, containing the list
25722525
of percentile threshold values, and a "value" column, containing
25732526
the singular summary value for the other statistics.
2574-
include_hash : boolean, optional
2575-
If False (default), drop the hash columns (``computation_id``,
2576-
``parent_time_series_id``); set True to keep them for joining to metadata.
25772527
25782528
Examples
25792529
--------
@@ -2598,13 +2548,10 @@ def get_stats_por(
25982548
... )
25992549
"""
26002550
# Build argument dictionary, omitting None values
2601-
params = _get_args(locals(), exclude={"expand_percentiles", "include_hash"})
2551+
params = _get_args(locals(), exclude={"expand_percentiles"})
26022552

26032553
return get_stats_data(
2604-
args=params,
2605-
service="observationNormals",
2606-
expand_percentiles=expand_percentiles,
2607-
include_hash=include_hash,
2554+
args=params, service="observationNormals", expand_percentiles=expand_percentiles
26082555
)
26092556

26102557

@@ -2623,7 +2570,6 @@ def get_stats_date_range(
26232570
site_type_name: str | Iterable[str] | None = None,
26242571
parameter_code: str | Iterable[str] | None = None,
26252572
expand_percentiles: bool = True,
2626-
include_hash: bool = False,
26272573
) -> tuple[pd.DataFrame, BaseMetadata]:
26282574
"""Get monthly and annual water data statistics from the USGS Water Data API.
26292575
This service (called the "observationIntervals" endpoint on api.waterdata.usgs.gov)
@@ -2706,9 +2652,6 @@ def get_stats_date_range(
27062652
argument will return both the "values" column, containing the list
27072653
of percentile threshold values, and a "value" column, containing
27082654
the singular summary value for the other statistics.
2709-
include_hash : boolean, optional
2710-
If False (default), drop the hash columns (``computation_id``,
2711-
``parent_time_series_id``); set True to keep them for joining to metadata.
27122655
27132656
Examples
27142657
--------
@@ -2734,13 +2677,12 @@ def get_stats_date_range(
27342677
... )
27352678
"""
27362679
# Build argument dictionary, omitting None values
2737-
params = _get_args(locals(), exclude={"expand_percentiles", "include_hash"})
2680+
params = _get_args(locals(), exclude={"expand_percentiles"})
27382681

27392682
return get_stats_data(
27402683
args=params,
27412684
service="observationIntervals",
27422685
expand_percentiles=expand_percentiles,
2743-
include_hash=include_hash,
27442686
)
27452687

27462688

@@ -2776,7 +2718,6 @@ def get_channel(
27762718
filter: str | None = None,
27772719
filter_lang: FILTER_LANG | None = None,
27782720
convert_type: bool = True,
2779-
include_hash: bool = False,
27802721
) -> tuple[pd.DataFrame, BaseMetadata]:
27812722
"""
27822723
Channel measurements taken as part of streamflow field measurements.
@@ -2891,9 +2832,6 @@ def get_channel(
28912832
convert_type : boolean, optional
28922833
If True, the function will convert the data to dates and qualifier to
28932834
string vector
2894-
include_hash : boolean, optional
2895-
If False (default), drop the opaque hash-valued ID columns. Set True to
2896-
keep the secondary hashes (e.g. ``time_series_id``) that join to metadata.
28972835
28982836
Returns
28992837
-------

0 commit comments

Comments
 (0)