Skip to content

Commit c067597

Browse files
committed
initial code for stats service
1 parent 7298d3b commit c067597

3 files changed

Lines changed: 264 additions & 1 deletion

File tree

dataretrieval/waterdata/api.py

Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,13 @@
1919
PROFILE_LOOKUP,
2020
PROFILES,
2121
SERVICES,
22+
STATISTICS_SERVICES,
23+
)
24+
from dataretrieval.waterdata.utils import (
25+
SAMPLES_URL,
26+
get_ogc_data,
27+
get_stats_data
2228
)
23-
from dataretrieval.waterdata.utils import SAMPLES_URL, get_ogc_data
2429

2530
# Set up logger for this module
2631
logger = logging.getLogger(__name__)
@@ -1641,6 +1646,109 @@ def get_samples(
16411646

16421647
return df, BaseMetadata(response)
16431648

1649+
def get_statistics(
    service: STATISTICS_SERVICES = "observationNormals",
    approval_status: Optional[str] = None,
    computation_type: Optional[str] = None,
    country_code: Optional[str] = None,
    state_code: Optional[str] = None,
    county_code: Optional[str] = None,
    start_date: Optional[Union[str, datetime]] = None,
    end_date: Optional[Union[str, datetime]] = None,
    monitoring_location_id: Optional[str] = None,
    page_size: int = 1000,
    parent_timeseries_id: Optional[str] = None,
    site_type_code: Optional[str] = None,
    site_type_name: Optional[str] = None,
    parameter_code: Optional[str] = None,
) -> Tuple[pd.DataFrame, BaseMetadata]:
    """Get water data statistics from the USGS Water Data API.

    This service provides endpoints for access to computations on the
    historical record regarding water conditions, including minimum, maximum,
    mean, median, and percentiles for day of year, month, month-year, and
    water/calendar years. For more information regarding the calculation of
    statistics and other details, please visit the Statistics documentation
    page: https://waterdata.usgs.gov/statistics-documentation/.

    Note: This API is under active beta development and subject to
    change. Improved handling of significant figures will be
    addressed in a future release.

    Parameters
    ----------
    service : string
        One of the following options: "observationNormals" or
        "observationIntervals". "observationNormals" returns day-of-year
        and month-of-year statistics matching your query, while
        "observationIntervals" returns monthly and annual statistics
        matching your query. The default is "observationNormals".
    approval_status : string, optional
        Whether to include approved and/or provisional observations.
        At this time, only approved observations are returned.
    computation_type : string, optional
        Desired statistical computation method. Available values are:
        arithmetic_mean, maximum, median, minimum, percentile.
    country_code : string, optional
        Country query parameter. API defaults to "US".
    state_code : string, optional
        State query parameter. Takes the format "US:XX", where XX is
        the two-digit state code. API defaults to "US:42" (Pennsylvania).
    county_code : string, optional
        County query parameter. Takes the format "US:XX:YYY", where XX is
        the two-digit state code and YYY is the three-digit county code.
        API defaults to "US:42:103" (Pennsylvania, Pike County).
    start_date : string or datetime, optional
        Start date for the query. Its format depends upon the service:
        for "observationNormals", it is in the month-day format (MM-DD),
        for "observationIntervals", it is in the year-month-day format
        (YYYY-MM-DD). Datetime-like values are converted to the format
        required by the selected service; strings are sent unchanged.
    end_date : string or datetime, optional
        End date for the query. Its format depends upon the service:
        for "observationNormals", it is in the month-day format (MM-DD),
        for "observationIntervals", it is in the year-month-day format
        (YYYY-MM-DD). Datetime-like values are converted to the format
        required by the selected service; strings are sent unchanged.
    monitoring_location_id : string or list of strings, optional
        A unique identifier representing a single monitoring location. This
        corresponds to the id field in the monitoring-locations endpoint.
        Monitoring location IDs are created by combining the agency code of
        the agency responsible for the monitoring location (e.g. USGS) with
        the ID number of the monitoring location (e.g. 02238500), separated
        by a hyphen (e.g. USGS-02238500).
    page_size : int, optional
        The number of results to return per page, where one result
        represents a monitoring location. The default is 1000.
    parent_timeseries_id : string, optional
        Returns statistics tied to a particular database entry.
    site_type_code : string, optional
        Site type code query parameter. You can see a list of valid site
        type codes here:
        https://api.waterdata.usgs.gov/ogcapi/v0/collections/site-types/items.
        Example: "GW" (Groundwater site)
    site_type_name : string, optional
        Site type name query parameter. You can see a list of valid site
        type names here:
        https://api.waterdata.usgs.gov/ogcapi/v0/collections/site-types/items.
        Example: "Well"
    parameter_code : string or list of strings, optional
        Parameter codes are 5-digit codes used to identify the constituent
        measured and the units of measure. A complete list of parameter
        codes and associated groupings can be found at
        https://help.waterdata.usgs.gov/codes-and-parameters/parameters.

    Returns
    -------
    df : ``pandas.DataFrame``
        The statistics returned by ``get_stats_data`` for the query.
    md : :obj:`dataretrieval.utils.Metadata`
        Metadata object built from the initial API response.

    Raises
    ------
    ValueError
        If ``service`` is not one of the supported statistics services.
    """
    valid_services = get_args(STATISTICS_SERVICES)
    if service not in valid_services:
        raise ValueError(
            f"Invalid service: '{service}'. Valid options are: {valid_services}."
        )

    # The query is assembled explicitly rather than from ``locals()`` so that
    # helper variables of this function (e.g. ``valid_services``) can never
    # leak into the request as bogus query parameters.
    # NOTE(review): the response payload names this field
    # ``parent_time_series_id``; confirm that the API accepts the
    # ``parent_timeseries_id`` spelling as a query parameter.
    candidates = {
        "approval_status": approval_status,
        "computation_type": computation_type,
        "country_code": country_code,
        "state_code": state_code,
        "county_code": county_code,
        "start_date": start_date,
        "end_date": end_date,
        "monitoring_location_id": monitoring_location_id,
        "page_size": page_size,
        "parent_timeseries_id": parent_timeseries_id,
        "site_type_code": site_type_code,
        "site_type_name": site_type_name,
        "parameter_code": parameter_code,
    }

    # Date format documented for each service (see start_date/end_date above).
    if service == "observationNormals":
        date_format = "%m-%d"
    else:
        date_format = "%Y-%m-%d"

    params = {}
    for key, value in candidates.items():
        if value is None:
            continue
        # Datetime-like objects would otherwise be stringified with a time
        # component, which matches neither documented format.
        if key in ("start_date", "end_date") and hasattr(value, "strftime"):
            value = value.strftime(date_format)
        params[key] = value

    # NOTE(review): ``geopd`` is passed explicitly because ``get_stats_data``
    # declares it as a required argument. False selects the plain-pandas
    # code path; switch to the package's geopandas availability flag if
    # GeoDataFrame output is wanted here.
    return get_stats_data(
        args=params,
        service=service,
        geopd=False,
    )
1751+
16441752

16451753
def _check_profiles(
16461754
service: SERVICES,

dataretrieval/waterdata/types.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@
1111
"states",
1212
]
1313

14+
# Endpoints exposed by the statistics API: "observationNormals" serves
# day-of-year and month-of-year statistics, "observationIntervals" serves
# monthly and annual statistics.
STATISTICS_SERVICES = Literal["observationNormals", "observationIntervals"]
18+
1419
SERVICES = Literal[
1520
"activities",
1621
"locations",

dataretrieval/waterdata/utils.py

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
OGC_API_VERSION = "v0"
2828
OGC_API_URL = f"{BASE_URL}/ogcapi/{OGC_API_VERSION}"
2929
SAMPLES_URL = f"{BASE_URL}/samples-data"
30+
STATISTICS_API_VERSION = "v0"
31+
STATISTICS_API_URL = f"{BASE_URL}/statistics/{STATISTICS_API_VERSION}"
3032

3133

3234
def _switch_arg_id(ls: Dict[str, Any], id_name: str, service: str):
@@ -820,4 +822,152 @@ def get_ogc_data(
820822
metadata = BaseMetadata(response)
821823
return return_list, metadata
822824

825+
def _stats_page_to_frame(body: Any, geopd: bool) -> pd.DataFrame:
    """Flatten one parsed statistics response page into a DataFrame.

    Parameters
    ----------
    body : Any
        The decoded JSON body of one response page. Anything that is not a
        dict with a non-empty "features" list yields an empty DataFrame.
    geopd : bool
        If True, build the feature-level frame with geopandas; otherwise
        flatten the features with ``pandas.json_normalize``.

    Returns
    -------
    pd.DataFrame
        One row per statistic value, with the feature-level columns joined
        on ``monitoring_location_id``.
    """
    if not isinstance(body, dict):
        return pd.DataFrame()
    features = body.get("features") or []
    if not features:
        return pd.DataFrame()

    if geopd:
        frame = gpd.GeoDataFrame.from_features(features)
    else:
        frame = pd.json_normalize(features)
        # json_normalize prefixes nested keys ("properties.<name>"). Strip
        # that prefix so property columns are named as in the geopandas
        # branch and the merge key below actually exists.
        prefix = "properties."
        frame = frame.rename(
            columns=lambda name: (
                name[len(prefix):] if name.startswith(prefix) else name
            )
        )

    # The nested "data" payload is expanded separately below, so drop its
    # raw form from the feature-level frame.
    nested = [
        name
        for name in frame.columns
        if name == "data" or str(name).startswith("data.")
    ]
    frame = frame.drop(columns=nested)

    # Unnest features -> properties -> data -> values, keeping the metadata
    # needed to join back onto the feature-level frame. Geometry is not
    # requested here: a geometry path cannot be resolved along this record
    # path and would only produce an all-NaN column.
    values = pd.json_normalize(
        body,
        record_path=["features", "properties", "data", "values"],
        meta=[
            ["features", "properties", "monitoring_location_id"],
            ["features", "properties", "data", "parameter_code"],
            ["features", "properties", "data", "unit_of_measure"],
            ["features", "properties", "data", "parent_time_series_id"],
        ],
        meta_prefix="",
        errors="ignore",
    )
    values.columns = values.columns.str.split(".").str[-1]

    return frame.merge(values, on="monitoring_location_id", how="left")


def get_stats_data(
    args: Dict[str, Any],
    service: str,
    geopd: bool = False,
    client: Optional[requests.Session] = None,
) -> Tuple[pd.DataFrame, BaseMetadata]:
    """
    Retrieve statistics from a water data endpoint as a DataFrame plus metadata.

    The function issues a GET request against the statistics service,
    follows the ``next`` link of each response page until it is absent or
    empty, flattens every page, and concatenates the results.

    Parameters
    ----------
    args : Dict[str, Any]
        Dictionary of request arguments for the statistics service.
    service : str
        The statistics service type (e.g., "observationNormals",
        "observationIntervals").
    geopd : bool, optional
        If True, returns a GeoDataFrame built with geopandas; otherwise
        returns a pandas DataFrame. Defaults to False.
    client : requests.Session, optional
        Session used to send the requests. When omitted, a session is
        created for this call and closed before returning.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the retrieved and processed statistical data.
        It is empty when the service returns no features.
    BaseMetadata
        A metadata object built from the initial response.

    Raises
    ------
    Exception
        If the initial request does not return HTTP status 200. Failures on
        subsequent pages only emit a warning and the pages already
        downloaded are returned.
    """

    url = f"{STATISTICS_API_URL}/{service}"

    if not geopd:
        logger.info(
            "Geopandas not installed. Geometries will be flattened into pandas DataFrames."
        )

    request = requests.Request(
        method="GET",
        url=url,
        headers=_default_headers(),
        params=args,
    )
    req = request.prepare()
    logger.info("Request: %s", req.url)

    # Only close the session if it was created here; a caller-supplied
    # session stays under the caller's control.
    close_client = client is None
    client = client or requests.Session()

    try:
        resp = client.send(req)
        if resp.status_code != 200:
            raise Exception(_error_body(resp))

        # Metadata always describes the initial request, not later pages.
        initial_response = resp

        # Follow-up pages reuse the method and headers of the first request.
        method = req.method.upper()
        headers = dict(req.headers)

        body = resp.json()
        if body is None:
            return pd.DataFrame(), BaseMetadata(initial_response)

        pages = [_stats_page_to_frame(body, geopd)]

        # NOTE(review): "next" is treated as a complete URL for the
        # following page; confirm that the API does not return a bare
        # continuation token here.
        curr_url = body.get("next") if isinstance(body, dict) else None

        while curr_url:
            try:
                resp = client.request(
                    method,
                    curr_url,
                    headers=headers,
                )
                if resp.status_code != 200:
                    raise Exception(_error_body(resp))

                body = resp.json()
                if body is None:
                    # Nothing more to read; keep what was already fetched.
                    break

                pages.append(_stats_page_to_frame(body, geopd))
                curr_url = body.get("next") if isinstance(body, dict) else None
            except Exception as exc:
                # Best effort: report the failure and return the pages that
                # were downloaded successfully.
                error_text = str(exc)
                warnings.warn(f"{error_text}. Data request incomplete.")
                logger.error("Request incomplete. %s", error_text)
                logger.warning(
                    "Request failed for URL: %s. Data download interrupted.",
                    curr_url,
                )
                curr_url = None

        # Concatenate once at the end instead of growing a frame per page.
        non_empty = [page for page in pages if not page.empty]
        if non_empty:
            dfs = pd.concat(non_empty, ignore_index=True)
        else:
            dfs = pd.DataFrame()

        return dfs, BaseMetadata(initial_response)
    finally:
        if close_client:
            client.close()
972+
823973

0 commit comments

Comments
 (0)