break up function into two, add in further unnesting

ehinman · ehinman · commit 1865aa35bf1f · 2026-01-08T16:42:16.000-06:00
diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py
@@ -15,12 +15,13 @@
     get_codes,
     get_continuous,
     get_daily,
+    get_date_range_stats,
     get_field_measurements,
     get_latest_continuous,
     get_latest_daily,
     get_monitoring_locations,
+    get_por_stats,
     get_samples,
-    get_statistics,
     get_time_series_metadata,
 )
 from .types import (
@@ -34,12 +35,13 @@
     "get_codes",
     "get_continuous",
     "get_daily",
+    "get_date_range_stats",
     "get_field_measurements",
     "get_latest_continuous",
     "get_latest_daily",
     "get_monitoring_locations",
+    "get_por_stats",
     "get_samples",
-    "get_statistics",
     "get_time_series_metadata",
     "_check_profiles",
     "CODE_SERVICES",
diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
@@ -19,7 +19,6 @@
     PROFILE_LOOKUP,
     PROFILES,
     SERVICES,
-    STATISTICS_SERVICES,
 )
 from dataretrieval.waterdata.utils import (
     SAMPLES_URL,
@@ -1646,8 +1645,7 @@ def get_samples(
 
     return df, BaseMetadata(response)
 
-def get_statistics(
-        service: STATISTICS_SERVICES = "observationNormals",
+def get_por_stats(
         approval_status: Optional[str] = None,
         computation_type: Optional[Union[str, list[str]]] = None,
         country_code: Optional[Union[str, list[str]]] = None,
@@ -1661,6 +1659,7 @@ def get_statistics(
         site_type_code: Optional[Union[str, list[str]]] = None,
         site_type_name: Optional[Union[str, list[str]]] = None,
         parameter_code: Optional[Union[str, list[str]]] = None,
+        expand_percentiles: bool = True
         ) -> Tuple[pd.DataFrame, BaseMetadata]:
     """Get water data statistics from the USGS Water Data API.
     This service provides endpoints for access to computations on the
@@ -1697,15 +1696,9 @@ def get_statistics(
         the two-digit state code and YYY is the three-digit county code.
         API defaults to "US:42:103" (Pennsylvania, Pike County).
     start_date: string or datetime, optional
-        Start date for the query. Its format depends upon the service:
-        for "observationNormals", it is in the month-day format (MM-DD),
-        for "observationIntervals", it is in the year-month-day format
-        (YYYY-MM-DD).
+        Start day for the query in the month-day format (MM-DD).
     end_date: string or datetime, optional
-        End date for the query. Its format depends upon the service:
-        for "observationNormals", it is in the month-day format (MM-DD),
-        for "observationIntervals", it is in the year-month-day format
-        (YYYY-MM-DD).
+        End day for the query in the month-day format (MM-DD).
     monitoring_location_id : string or list of strings, optional
         A unique identifier representing a single monitoring location. This
         corresponds to the id field in the monitoring-locations endpoint.
@@ -1731,22 +1724,129 @@ def get_statistics(
         measured and the units of measure. A complete list of parameter codes
         and associated groupings can be found at
         https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
+    expand_percentiles : boolean, optional
+        Percentile data for a given day of year or month of year are, by
+        default, returned from the service as lists of string values and
+        percentile thresholds in the "values" and "percentiles" columns,
+        respectively. When `expand_percentiles` is set to True (default),
+        each value and percentile threshold specific to a computation id
+        is returned as an individual row in the dataframe. Missing
+        percentile values, expressed as 'nan' in the list of string
+        values, are removed from the dataframe to save space.
     """
-    valid_services = get_args(STATISTICS_SERVICES)
-    if service not in valid_services:
-        raise ValueError(
-            f"Invalid service: '{service}'. Valid options are: {valid_services}."
+    params = {
+        k: v
+        for k, v in locals().items()
+        if k not in ["expand_percentiles"] and v is not None
+    }
+    
+    return get_stats_data(
+        args=params,
+        service="observationNormals",
+        expand_percentiles=expand_percentiles
         )
+
+def get_date_range_stats(
+        approval_status: Optional[str] = None,
+        computation_type: Optional[Union[str, list[str]]] = None,
+        country_code: Optional[Union[str, list[str]]] = None,
+        state_code: Optional[Union[str, list[str]]] = None,
+        county_code: Optional[Union[str, list[str]]] = None,
+        start_date: Optional[str] = None,
+        end_date: Optional[str] = None,
+        monitoring_location_id: Optional[Union[str, list[str]]] = None,
+        page_size: int = 1000,
+        parent_timeseries_id: Optional[Union[str, list[str]]] = None,
+        site_type_code: Optional[Union[str, list[str]]] = None,
+        site_type_name: Optional[Union[str, list[str]]] = None,
+        parameter_code: Optional[Union[str, list[str]]] = None,
+        expand_percentiles: bool = True
+        ) -> Tuple[pd.DataFrame, BaseMetadata]:
+    """Get water data statistics from the USGS Water Data API.
+    This service provides endpoints for access to computations on the
+    historical record regarding water conditions, including minimum, maximum,
+    mean, median, and percentiles for day of year, month, month-year, and 
+    water/calendar years. For more information regarding the calculation of
+    statistics and other details, please visit the Statistics documentation
+    page: https://waterdata.usgs.gov/statistics-documentation/.
     
+    This function queries the "observationIntervals" service, which
+    returns monthly and annual statistics matching your query. Use
+    `get_por_stats` for day-of-year and month-of-year statistics
+    (the "observationNormals" service).
+
+    Note: This API is under active beta development and subject to
+    change. Improved handling of significant figures will be
+    addressed in a future release.
+
+    Parameters
+    ----------
+    approval_status: string, optional
+        Whether to include approved and/or provisional observations.
+        At this time, only approved observations are returned.
+    computation_type: string, optional
+        Desired statistical computation method. Available values are:
+        arithmetic_mean, maximum, median, minimum, percentile.
+    country_code: string, optional
+        Country query parameter. API defaults to "US".
+    state_code: string, optional
+        State query parameter. Takes the format "US:XX", where XX is
+        the two-digit state code. API defaults to "US:42" (Pennsylvania).
+    county_code: string, optional
+        County query parameter. Takes the format "US:XX:YYY", where XX is
+        the two-digit state code and YYY is the three-digit county code.
+        API defaults to "US:42:103" (Pennsylvania, Pike County).
+    start_date: string or datetime, optional
+        Start date for the query in the year-month-day format
+        (YYYY-MM-DD).
+    end_date: string or datetime, optional
+        End date for the query in the year-month-day format
+        (YYYY-MM-DD).
+    monitoring_location_id : string or list of strings, optional
+        A unique identifier representing a single monitoring location. This
+        corresponds to the id field in the monitoring-locations endpoint.
+        Monitoring location IDs are created by combining the agency code of the
+        agency responsible for the monitoring location (e.g. USGS) with the ID
+        number of the monitoring location (e.g. 02238500), separated by a hyphen
+        (e.g. USGS-02238500).
+    page_size : int, optional
+        The number of results to return per page, where one result represents a
+        monitoring location. The default is 1000.
+    parent_timeseries_id: string, optional
+        The parent_timeseries_id returns statistics tied to a particular database entry.
+    site_type_code: string, optional
+        Site type code query parameter. You can see a list of valid site type codes here:
+        https://api.waterdata.usgs.gov/ogcapi/v0/collections/site-types/items.
+        Example: "GW" (Groundwater site)
+    site_type_name: string, optional
+        Site type name query parameter. You can see a list of valid site type names here:
+        https://api.waterdata.usgs.gov/ogcapi/v0/collections/site-types/items.
+        Example: "Well"
+    parameter_code : string or list of strings, optional
+        Parameter codes are 5-digit codes used to identify the constituent
+        measured and the units of measure. A complete list of parameter codes
+        and associated groupings can be found at
+        https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
+    expand_percentiles : boolean, optional
+        Percentile data for a given month or year are, by default,
+        returned from the service as lists of string values and percentile
+        thresholds in the "values" and "percentiles" columns, respectively.
+        When `expand_percentiles` is set to True (default), each value and
+        percentile threshold specific to a computation id is returned as
+        an individual row in the dataframe. Missing percentile values,
+        expressed as 'nan' in the list of string values, are removed from
+        the dataframe to save space.
+    """
     params = {
         k: v
         for k, v in locals().items()
-        if k not in ["service", "valid_services"] and v is not None
+        if k not in ["expand_percentiles"] and v is not None
     }
     
     return get_stats_data(
         args=params,
-        service=service
+        service="observationIntervals",
+        expand_percentiles=expand_percentiles
         )
 
 
diff --git a/dataretrieval/waterdata/types.py b/dataretrieval/waterdata/types.py
@@ -11,11 +11,6 @@
     "states",
 ]
 
-STATISTICS_SERVICES = Literal[
-    "observationNormals",
-    "observationIntervals"
-]
-
 SERVICES = Literal[
     "activities",
     "locations",
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
@@ -1,6 +1,5 @@
 import json
 import logging
-import warnings
 import os
 import re
 from datetime import datetime
@@ -873,9 +872,53 @@ def _handle_stats_nesting(
     return df.merge(dat, on='monitoring_location_id', how='left')
 
 
+def _expand_percentiles(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Takes the percentile "values" and "percentiles" columns, which contain
+    lists, and turns each list element into its own row of the original
+    dataframe. Entries whose value is the string "nan" are dropped.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The dataframe returned from using one of the statistics services.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame containing the flattened percentile data.
+    """
+    if len(df) > 0 and "percentile" in df['computation'].unique():
+
+        # Explode percentile lists into one row per element, stored in "value" and "percentile" columns
+        percentiles = df.loc[df['computation'] == "percentile"]
+        percentiles_explode = percentiles[['computation_id', 'values', 'percentiles']].explode(['values', 'percentiles'], ignore_index=True)
+        percentiles_explode = percentiles_explode.loc[percentiles_explode['values']!="nan"]
+        percentiles_explode['value'] = pd.to_numeric(percentiles_explode['values'])
+        percentiles_explode['percentile'] = pd.to_numeric(percentiles_explode['percentiles'])
+        percentiles_explode = percentiles_explode.drop(columns=['values', 'percentiles'])
+        
+        # Merge exploded values back to other metadata/geometry
+        percentiles = percentiles.drop(columns=['values', 'percentiles', 'value']).merge(percentiles_explode, on='computation_id', how='left')
+        
+        # Concatenate back to original
+        dfs = pd.concat([df.loc[df['computation'] != "percentile"], percentiles]).drop(columns=['values', 'percentiles'])
+
+        # Move percentile column
+        cols = dfs.columns.tolist()
+        cols.remove("percentile")
+        col_index = cols.index("value") + 1
+        cols.insert(col_index, "percentile")
+
+        return dfs[cols]
+    
+    else:
+        return df
+
 def get_stats_data(
     args: Dict[str, Any],
     service: str,
+    expand_percentiles: bool,
     client: Optional[requests.Session] = None,
     ) -> Tuple[pd.DataFrame, BaseMetadata]:
     """
@@ -955,6 +998,10 @@ def get_stats_data(
                 logger.error("Request incomplete. %s", error_text)
                 logger.warning("Request failed for URL: %s. Data download interrupted.", resp.url)
                 next_token = None
+
+        if expand_percentiles:
+            dfs = _expand_percentiles(dfs)
+
         return dfs, BaseMetadata(initial_response)
     finally:
         if close_client: