Skip to content

Commit 1865aa3

Browse files
committed
break up function into two, add in further unnesting
1 parent f7ee053 commit 1865aa3

4 files changed

Lines changed: 169 additions & 25 deletions

File tree

dataretrieval/waterdata/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,13 @@
1515
get_codes,
1616
get_continuous,
1717
get_daily,
18+
get_date_range_stats,
1819
get_field_measurements,
1920
get_latest_continuous,
2021
get_latest_daily,
2122
get_monitoring_locations,
23+
get_por_stats,
2224
get_samples,
23-
get_statistics,
2425
get_time_series_metadata,
2526
)
2627
from .types import (
@@ -34,12 +35,13 @@
3435
"get_codes",
3536
"get_continuous",
3637
"get_daily",
38+
"get_date_range_stats",
3739
"get_field_measurements",
3840
"get_latest_continuous",
3941
"get_latest_daily",
4042
"get_monitoring_locations",
43+
"get_por_stats",
4144
"get_samples",
42-
"get_statistics",
4345
"get_time_series_metadata",
4446
"_check_profiles",
4547
"CODE_SERVICES",

dataretrieval/waterdata/api.py

Lines changed: 117 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
PROFILE_LOOKUP,
2020
PROFILES,
2121
SERVICES,
22-
STATISTICS_SERVICES,
2322
)
2423
from dataretrieval.waterdata.utils import (
2524
SAMPLES_URL,
@@ -1646,8 +1645,7 @@ def get_samples(
16461645

16471646
return df, BaseMetadata(response)
16481647

1649-
def get_statistics(
1650-
service: STATISTICS_SERVICES = "observationNormals",
1648+
def get_por_stats(
16511649
approval_status: Optional[str] = None,
16521650
computation_type: Optional[Union[str, list[str]]] = None,
16531651
country_code: Optional[Union[str, list[str]]] = None,
@@ -1661,6 +1659,7 @@ def get_statistics(
16611659
site_type_code: Optional[Union[str, list[str]]] = None,
16621660
site_type_name: Optional[Union[str, list[str]]] = None,
16631661
parameter_code: Optional[Union[str, list[str]]] = None,
1662+
expand_percentiles: bool = True
16641663
) -> Tuple[pd.DataFrame, BaseMetadata]:
16651664
"""Get water data statistics from the USGS Water Data API.
16661665
This service provides endpoints for access to computations on the
@@ -1697,15 +1696,9 @@ def get_statistics(
16971696
the two-digit state code and YYY is the three-digit county code.
16981697
API defaults to "US:42:103" (Pennsylvania, Pike County).
16991698
start_date: string or datetime, optional
1700-
Start date for the query. Its format depends upon the service:
1701-
for "observationNormals", it is in the month-day format (MM-DD),
1702-
for "observationIntervals", it is in the year-month-day format
1703-
(YYYY-MM-DD).
1699+
Start day for the query in the month-day format (MM-DD).
17041700
end_date: string or datetime, optional
1705-
End date for the query. Its format depends upon the service:
1706-
for "observationNormals", it is in the month-day format (MM-DD),
1707-
for "observationIntervals", it is in the year-month-day format
1708-
(YYYY-MM-DD).
1701+
End day for the query in the month-day format (MM-DD).
17091702
monitoring_location_id : string or list of strings, optional
17101703
A unique identifier representing a single monitoring location. This
17111704
corresponds to the id field in the monitoring-locations endpoint.
@@ -1731,22 +1724,129 @@ def get_statistics(
17311724
measured and the units of measure. A complete list of parameter codes
17321725
and associated groupings can be found at
17331726
https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
1727+
expand_percentiles : boolean
1728+
Percentile data for a given day of year or month of year by default
1729+
are returned from the service as lists of string values and percentile
1730+
thresholds in the "values" and "percentiles" columns, respectively.
1731+
When `expand_percentiles` is set to True (default), each value and
1732+
percentile threshold specific to a computation id are returned as
1733+
individual rows in the dataframe. Missing percentile values expressed
1734+
as 'nan' in the list of string values are removed from the dataframe
1735+
to save space.
17341736
"""
1735-
valid_services = get_args(STATISTICS_SERVICES)
1736-
if service not in valid_services:
1737-
raise ValueError(
1738-
f"Invalid service: '{service}'. Valid options are: {valid_services}."
1737+
params = {
1738+
k: v
1739+
for k, v in locals().items()
1740+
if k not in ["expand_percentiles"] and v is not None
1741+
}
1742+
1743+
return get_stats_data(
1744+
args=params,
1745+
service="observationNormals",
1746+
expand_percentiles=expand_percentiles
17391747
)
1748+
1749+
def get_date_range_stats(
    approval_status: Optional[str] = None,
    computation_type: Optional[Union[str, list[str]]] = None,
    country_code: Optional[Union[str, list[str]]] = None,
    state_code: Optional[Union[str, list[str]]] = None,
    county_code: Optional[Union[str, list[str]]] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    monitoring_location_id: Optional[Union[str, list[str]]] = None,
    page_size: int = 1000,
    parent_timeseries_id: Optional[Union[str, list[str]]] = None,
    site_type_code: Optional[Union[str, list[str]]] = None,
    site_type_name: Optional[Union[str, list[str]]] = None,
    parameter_code: Optional[Union[str, list[str]]] = None,
    expand_percentiles: bool = True,
) -> Tuple[pd.DataFrame, BaseMetadata]:
    """Get date-range water data statistics from the USGS Water Data API.

    This function queries the "observationIntervals" statistics service,
    which returns monthly and annual statistics matching your query. The
    statistics services provide access to computations on the historical
    record regarding water conditions, including minimum, maximum, mean,
    median, and percentiles. For more information regarding the calculation
    of statistics and other details, please visit the Statistics
    documentation page: https://waterdata.usgs.gov/statistics-documentation/.

    Note: This API is under active beta development and subject to
    change. Improved handling of significant figures will be
    addressed in a future release.

    Parameters
    ----------
    approval_status: string, optional
        Whether to include approved and/or provisional observations.
        At this time, only approved observations are returned.
    computation_type: string or list of strings, optional
        Desired statistical computation method. Available values are:
        arithmetic_mean, maximum, median, minimum, percentile.
    country_code: string or list of strings, optional
        Country query parameter. API defaults to "US".
    state_code: string or list of strings, optional
        State query parameter. Takes the format "US:XX", where XX is
        the two-digit state code. API defaults to "US:42" (Pennsylvania).
    county_code: string or list of strings, optional
        County query parameter. Takes the format "US:XX:YYY", where XX is
        the two-digit state code and YYY is the three-digit county code.
        API defaults to "US:42:103" (Pennsylvania, Pike County).
    start_date: string, optional
        Start date for the query in the year-month-day format
        (YYYY-MM-DD).
    end_date: string, optional
        End date for the query in the year-month-day format
        (YYYY-MM-DD).
    monitoring_location_id : string or list of strings, optional
        A unique identifier representing a single monitoring location. This
        corresponds to the id field in the monitoring-locations endpoint.
        Monitoring location IDs are created by combining the agency code of the
        agency responsible for the monitoring location (e.g. USGS) with the ID
        number of the monitoring location (e.g. 02238500), separated by a hyphen
        (e.g. USGS-02238500).
    page_size : int, optional
        The number of results to return per page, where one result represents a
        monitoring location. The default is 1000.
    parent_timeseries_id: string or list of strings, optional
        Returns statistics tied to a particular database entry.
    site_type_code: string or list of strings, optional
        Site type code query parameter. You can see a list of valid site type codes here:
        https://api.waterdata.usgs.gov/ogcapi/v0/collections/site-types/items.
        Example: "GW" (Groundwater site)
    site_type_name: string or list of strings, optional
        Site type name query parameter. You can see a list of valid site type names here:
        https://api.waterdata.usgs.gov/ogcapi/v0/collections/site-types/items.
        Example: "Well"
    parameter_code : string or list of strings, optional
        Parameter codes are 5-digit codes used to identify the constituent
        measured and the units of measure. A complete list of parameter codes
        and associated groupings can be found at
        https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
    expand_percentiles : boolean
        Percentile data are by default returned from the service as lists of
        string values and percentile thresholds in the "values" and
        "percentiles" columns, respectively. When `expand_percentiles` is set
        to True (default), each value and percentile threshold specific to a
        computation id are returned as individual rows in the dataframe.
        Missing percentile values expressed as 'nan' in the list of string
        values are removed from the dataframe to save space.

    Returns
    -------
    Tuple[pd.DataFrame, BaseMetadata]
        The dataframe of statistics and the metadata object, exactly as
        returned by ``get_stats_data``.
    """
    # NOTE: locals() must be captured before any other local name is bound in
    # this function, so that it contains exactly the call arguments. Arguments
    # left as None are omitted so the API applies its own defaults, and
    # expand_percentiles is a client-side option rather than a query parameter.
    params = {
        k: v
        for k, v in locals().items()
        if k not in ["expand_percentiles"] and v is not None
    }

    return get_stats_data(
        args=params,
        service="observationIntervals",
        expand_percentiles=expand_percentiles,
    )
17511851

17521852

dataretrieval/waterdata/types.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,6 @@
1111
"states",
1212
]
1313

14-
STATISTICS_SERVICES = Literal[
15-
"observationNormals",
16-
"observationIntervals"
17-
]
18-
1914
SERVICES = Literal[
2015
"activities",
2116
"locations",

dataretrieval/waterdata/utils.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import json
22
import logging
3-
import warnings
43
import os
54
import re
65
from datetime import datetime
@@ -873,9 +872,53 @@ def _handle_stats_nesting(
873872
return df.merge(dat, on='monitoring_location_id', how='left')
874873

875874

875+
def _expand_percentiles(df: pd.DataFrame) -> pd.DataFrame:
876+
"""
877+
Takes percentile value and thresholds columns containing lists
878+
of values and turns each list element into its own row in the
879+
original dataframe. 'nan's are removed from the dataframe.
880+
881+
Parameters
882+
----------
883+
df : pd.DataFrame
884+
The dataframe returned from using one of the statistics services.
885+
886+
Returns
887+
-------
888+
pd.DataFrame
889+
A DataFrame containing the flattened percentile data.
890+
"""
891+
if len(df) > 0 and "percentile" in df['computation'].unique():
892+
893+
# Explode percentile lists into rows called "value" and "percentile"
894+
percentiles = df.loc[df['computation'] == "percentile"]
895+
percentiles_explode = percentiles[['computation_id', 'values', 'percentiles']].explode(['values', 'percentiles'], ignore_index=True)
896+
percentiles_explode = percentiles_explode.loc[percentiles_explode['values']!="nan"]
897+
percentiles_explode['value'] = pd.to_numeric(percentiles_explode['values'])
898+
percentiles_explode['percentile'] = pd.to_numeric(percentiles_explode['percentiles'])
899+
percentiles_explode = percentiles_explode.drop(columns=['values', 'percentiles'])
900+
901+
# Merge exploded values back to other metadata/geometry
902+
percentiles = percentiles.drop(columns=['values', 'percentiles', 'value']).merge(percentiles_explode, on='computation_id', how='left')
903+
904+
# Concatenate back to original
905+
dfs = pd.concat([df.loc[df['computation'] != "percentile"], percentiles]).drop(columns=['values', 'percentiles'])
906+
907+
# Move percentile column
908+
cols = dfs.columns.tolist()
909+
cols.remove("percentile")
910+
col_index = cols.index("value") + 1
911+
cols.insert(col_index, "percentile")
912+
913+
return dfs[cols]
914+
915+
else:
916+
return df
917+
876918
def get_stats_data(
877919
args: Dict[str, Any],
878920
service: str,
921+
expand_percentiles: bool,
879922
client: Optional[requests.Session] = None,
880923
) -> Tuple[pd.DataFrame, BaseMetadata]:
881924
"""
@@ -955,6 +998,10 @@ def get_stats_data(
955998
logger.error("Request incomplete. %s", error_text)
956999
logger.warning("Request failed for URL: %s. Data download interrupted.", resp.url)
9571000
next_token = None
1001+
1002+
if expand_percentiles:
1003+
dfs = _expand_percentiles(dfs)
1004+
9581005
return dfs, BaseMetadata(initial_response)
9591006
finally:
9601007
if close_client:

0 commit comments

Comments
 (0)