From fb5a79ace283957a329900b1b2050651c7ffbdf4 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 1 Jun 2026 14:43:05 -0400 Subject: [PATCH] docs: copy-edit docstrings/comments and fix doc/code accuracy gaps A review pass over the package's docstrings and inline comments for spelling, grammar, clarity, and accuracy against the implementation. Spelling & grammar: - Typos ("Times series" -> "Time series", "datetme", "conatining", "indeces" -> "indices", "ESPSG" -> "EPSG", "NDAP" -> "NADP") plus articles, subject-verb agreement, duplicated words, and punctuation. codespell-clean ("gage" is the intentional USGS spelling). R-language cruft -> Python: - bbox `c(xmin,ymin,xmax,ymax)` -> `[xmin, ymin, xmax, ymax]`; "(numeric) vector"/"a vector of requested columns" -> list wording; `limit : numeric` -> `int`; `thresholds : numeric` -> `number`. Accuracy (docs that disagreed with the code): - Broken `Returns` cross-references to a nonexistent `dataretrieval.utils.Metadata` -> the real classes (`nwis.NWIS_Metadata`, `wqp.WQP_Metadata`, `utils.BaseMetadata`). - get_monitoring_locations: three params typed `float` -> string/iterable. - nwis.query_waterdata / query_waterservices: `service` examples listed values the functions reject -> the accepted sets; removed a duplicated `ssl_check` block and a `bBox` documented as a HUC. - waterdata/utils: _switch_properties_id docstring described a dict (copied from _switch_arg_id); _handle_stats_nesting / _combine_chunk_frames notes corrected to match the code. - get_time_series_metadata: replaced the stale `properties` list (copied from get_daily) with the collection's real queryables. - Added missing pieces: `nadp` module deprecation note; `Raises` on `utils.query` and `streamstats.get_watershed` (plus its `format` param and dual return type); `get_channel` params; `get_stats_por` / `get_stats_date_range` `Returns` sections. - Dropped bogus `Returns` blocks from `__init__` methods (they return None). 
Code fixes surfaced by the review: - Removed the non-functional `time` parameter from `get_monitoring_locations` and `get_time_series_metadata`. Their OGC collections have no `time` queryable (verified via the collections' queryables and a live 400), so `time=...` only produced an opaque server error; with no `**kwargs`, removal now yields a clear `TypeError`. The observation getters' `time` is valid and untouched. - Fixed a malformed URL in the nwis seriesCatalogOutput deprecation warning (`waterdata.usgs.gov.nwis/qwdata` -> `waterdata.usgs.gov/nwis/qwdata`). ruff check + ruff format clean; mypy --strict clean; tests pass; live-API behavior verified. Co-Authored-By: Claude Opus 4.8 (1M context) --- dataretrieval/__init__.py | 3 +- dataretrieval/codes/states.py | 7 +- dataretrieval/nadp.py | 21 ++- dataretrieval/nldi.py | 8 +- dataretrieval/nwis.py | 72 ++++----- dataretrieval/streamstats.py | 27 +++- dataretrieval/utils.py | 23 +-- dataretrieval/waterdata/_progress.py | 2 +- dataretrieval/waterdata/api.py | 221 ++++++++++++++++----------- dataretrieval/waterdata/chunking.py | 16 +- dataretrieval/waterdata/nearest.py | 13 +- dataretrieval/waterdata/utils.py | 60 +++++--- dataretrieval/wqp.py | 34 ++--- 13 files changed, 289 insertions(+), 218 deletions(-) diff --git a/dataretrieval/__init__.py b/dataretrieval/__init__.py index cca267f8..29b184f7 100644 --- a/dataretrieval/__init__.py +++ b/dataretrieval/__init__.py @@ -11,7 +11,8 @@ df, meta = nwis.get_dv(sites="05427718") Available service modules: ``waterdata``, ``wqp`` (Water Quality Portal), -``nldi``, ``samples``, ``streamstats``, ``nadp``, and the deprecated ``nwis``. +``nldi``, ``samples``, ``streamstats``, and the deprecated ``nwis`` and +``nadp``. ``nldi`` requires geopandas (``pip install dataretrieval[nldi]``) and is imported on demand: ``from dataretrieval import nldi``. 
diff --git a/dataretrieval/codes/states.py b/dataretrieval/codes/states.py index e3d83f0f..5d761736 100644 --- a/dataretrieval/codes/states.py +++ b/dataretrieval/codes/states.py @@ -1,4 +1,9 @@ -"""List of 2-digit state codes with commented full names.""" +"""State code lookups keyed by full state name. + +``state_codes`` maps each state name to its two-letter postal abbreviation +(e.g. ``"Alabama": "al"``); ``fips_codes`` maps it to its two-digit FIPS +code (e.g. ``"Alabama": "01"``). +""" state_codes = { "Alabama": "al", diff --git a/dataretrieval/nadp.py b/dataretrieval/nadp.py index d6b26381..a02b6671 100644 --- a/dataretrieval/nadp.py +++ b/dataretrieval/nadp.py @@ -1,11 +1,18 @@ """ Tools for retrieving data from the National Atmospheric Deposition Program -(NADP) including the National Trends Network (NTN), the Mercury Deposition -Network (MDN). +(NADP), including the National Trends Network (NTN) and the Mercury +Deposition Network (MDN). + +.. warning:: + + The ``nadp`` module is deprecated and will be removed from + ``dataretrieval`` on or after 2026-11-01. NADP is not a USGS data + source; please retrieve NADP data directly from + https://nadp.slh.wisc.edu/. National Trends Network ----------------------- -The NTN provides long-term records of precipitation chemistry across the +The NTN provides long-term records of precipitation chemistry across the United States. See https://nadp.slh.wisc.edu/ntn for more info. Mercury Deposition Network @@ -89,7 +96,7 @@ def tif(self) -> bytes: def get_annual_MDN_map(measurement_type: str, year: str, path: str) -> str: - """Download a MDN map from NDAP. + """Download an MDN map from NADP. This function looks for a zip file containing gridded information at: https://nadp.slh.wisc.edu/maps-data/mdn-gradient-maps/. @@ -143,7 +150,7 @@ def get_annual_NTN_map( year: str | None = None, path: str = ".", ) -> str: - """Download a NTN map from NDAP. + """Download an NTN map from NADP.
This function looks for a zip file containing gridded information at: https://nadp.slh.wisc.edu/maps-data/ntn-gradient-maps/. @@ -158,11 +165,11 @@ def get_annual_NTN_map( Parameters ---------- - measurement : string - The measured constituent to return. measurement_type : string The type of measurement, 'conc', 'dep', or 'Precip', which represent concentration, deposition, or precipitation respectively. + measurement : string + The measured constituent to return. year : string Year as a string 'YYYY' path : string diff --git a/dataretrieval/nldi.py b/dataretrieval/nldi.py index a03aa1e6..cb4e2488 100644 --- a/dataretrieval/nldi.py +++ b/dataretrieval/nldi.py @@ -135,7 +135,7 @@ def get_basin( as_json: bool = False, ) -> gpd.GeoDataFrame | dict[str, Any]: """Gets the aggregated basin for the specified feature in WGS84 lat/lon - as GeoDataFrame or as JSON conatining a polygon geometry. + as GeoDataFrame or as JSON containing a polygon geometry. Parameters ---------- @@ -143,7 +143,7 @@ def get_basin( feature_id: string, identifier of the feature simplified: bool, simplified, default is True split_catchment: bool, split catchment, default is False - as_json: bool, return basin as JSON is set to True, otherwise return + as_json: bool, return basin as JSON if set to True, otherwise return as GeoDataFrame, default is False Returns @@ -299,8 +299,8 @@ def get_features( return gdf -# TODO: This function can cause timeout error for some data sources -# - may be we shouldn't provide this function? +# TODO: This function can cause a timeout error for some data sources +# - maybe we shouldn't provide this function? def get_features_by_data_source(data_source: str) -> gpd.GeoDataFrame: """Gets all features found for the specified data source as points in WGS84 lat/long coordinates as GeoDataFrame containing a point geometry. 
diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index fafd0a08..756d95f7 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -251,8 +251,8 @@ def get_discharge_peaks( Returns ------- df: ``pandas.DataFrame`` - Times series data from the NWIS JSON - md: :obj:`dataretrieval.utils.Metadata` + Time series data from the NWIS JSON + md: :obj:`dataretrieval.nwis.NWIS_Metadata` A custom metadata object Examples @@ -331,7 +331,7 @@ def get_stats( ------- df: ``pandas.DataFrame`` Statistics data from the statistics service - md: :obj:`dataretrieval.utils.Metadata` + md: :obj:`dataretrieval.nwis.NWIS_Metadata` A custom metadata object .. todo:: @@ -372,7 +372,7 @@ def query_waterdata( Parameters ---------- service: string - Name of the service to query: 'site', 'stats', etc. + Name of the service to query: 'peaks' or 'ratings'. ssl_check: bool, optional If True, check SSL certificates, if False, do not check SSL, default is True @@ -424,10 +424,7 @@ def query_waterservices( Parameters ---------- service: string - Name of the service to query: 'site', 'stats', etc. - ssl_check: bool, optional - If True, check SSL certificates, if False, do not check SSL, - default is True + Name of the service to query: 'dv', 'iv', 'site', or 'stat'. ssl_check: bool, optional If True, check SSL certificates, if False, do not check SSL, default is True @@ -437,7 +434,9 @@ def query_waterservices( Keyword Arguments ---------------- bBox: string - 7-digit Hydrologic Unit Code (HUC) + Bounding box of decimal latitude and longitude values, given as + west longitude, south latitude, east longitude, north latitude, + separated by commas startDT: string Start date (e.g., '2017-12-31') endDT: string @@ -484,7 +483,7 @@ def get_dv( """ Get daily values data from NWIS and return it as a ``pandas.DataFrame``. - .. note: + .. note:: If no start or end date are provided, only the most recent record is returned. 
@@ -511,8 +510,8 @@ def get_dv( Returns ------- df: ``pandas.DataFrame`` - Times series data from the NWIS JSON - md: :obj:`dataretrieval.utils.Metadata` + Time series data from the NWIS JSON + md: :obj:`dataretrieval.nwis.NWIS_Metadata` A custom metadata object Examples @@ -580,7 +579,7 @@ def get_info( A contiguous range of decimal latitude and longitude, starting with the west longitude, then the south latitude, then the east longitude, and then the north latitude with each value separated by a comma. The - product of the range of latitude range and longitude cannot exceed 25 + product of the range of latitude and longitude cannot exceed 25 degrees. Whole or decimal degrees must be specified, up to six digits of precision. Minutes and seconds are not allowed. countyCd: string or list of strings @@ -625,7 +624,7 @@ def get_info( ------- df: ``pandas.DataFrame`` Site data from the NWIS web service - md: :obj:`dataretrieval.utils.Metadata` + md: :obj:`dataretrieval.nwis.NWIS_Metadata` A custom metadata object Examples @@ -645,7 +644,7 @@ def get_info( ( "WARNING: Starting in March 2024, the NWIS qw data endpoint is " "retiring and no longer receives updates. For more information, " - "refer to https://waterdata.usgs.gov.nwis/qwdata and " + "refer to https://waterdata.usgs.gov/nwis/qwdata and " "https://doi-usgs.github.io/dataRetrieval/articles/Status.html " "or email CompTools@usgs.gov." 
), @@ -701,8 +700,8 @@ def get_iv( Returns ------- df: ``pandas.DataFrame`` - Times series data from the NWIS JSON - md: :obj:`dataretrieval.utils.Metadata` + Time series data from the NWIS JSON + md: :obj:`dataretrieval.nwis.NWIS_Metadata` A custom metadata object Examples @@ -734,7 +733,7 @@ def get_iv( def get_pmcodes(**kwargs: Any) -> NoReturn: - """Defunct: use ``get_reference_table(collection='parameter-codes')``.""" + """Defunct: use ``waterdata.get_reference_table(collection='parameter-codes')``.""" raise NameError( "`nwis.get_pmcodes` has been replaced " "with `get_reference_table(collection='parameter-codes')`." @@ -762,7 +761,7 @@ def get_ratings( Parameters ---------- site: string, optional, default is None - USGS site number. This is usually an 8 digit number as a string. + USGS site number. This is usually an 8 digit number as a string. If the nwis parameter site_no is supplied, it will overwrite the site parameter file_type: string, default is "base" @@ -773,11 +772,11 @@ def get_ratings( **kwargs: optional If supplied, will be used as query parameters - Return - ------ + Returns + ------- df: ``pandas.DataFrame`` Formatted requested data - md: :obj:`dataretrieval.utils.Metadata` + md: :obj:`dataretrieval.nwis.NWIS_Metadata` A custom metadata object Examples @@ -819,11 +818,11 @@ def what_sites( **kwargs: optional Accepts the same parameters as :obj:`dataretrieval.nwis.get_info` - Return - ------ + Returns + ------- df: ``pandas.DataFrame`` Formatted requested data - md: :obj:`dataretrieval.utils.Metadata` + md: :obj:`dataretrieval.nwis.NWIS_Metadata` A custom metadata object Examples @@ -870,7 +869,7 @@ def get_record( Parameters ---------- sites: string or list of strings, optional, default is None - List or comma delimited string of site. + List or comma delimited string of sites. 
start: string, optional, default is None Starting date of record (YYYY-MM-DD) end: string, optional, default is None @@ -882,7 +881,7 @@ def get_record( If True, return data in wide format with multiple samples per row and one row per time, default is True datetime_index : bool, optional - If True, create a datetime index. default is True + If True, create a datetime index. Default is True state: string, optional, default is None full name, abbreviation or id service: string, default is 'iv' @@ -893,7 +892,7 @@ def get_record( - 'peaks': discharge peaks - 'gwlevels': (defunct) use `waterdata.get_continuous`, `waterdata.get_daily`, or `waterdata.get_field_measurements` - - 'pmcodes': (defunct) use `get_reference_table` + - 'pmcodes': (defunct) use `waterdata.get_reference_table` - 'water_use': (defunct) no replacement available - 'ratings': get rating table - 'stat': get statistics @@ -1021,9 +1020,7 @@ def _read_json(json: dict[str, Any]) -> pd.DataFrame: Returns ------- df: ``pandas.DataFrame`` - Times series data from the NWIS JSON - md: :obj:`dataretrieval.utils.Metadata` - A custom metadata object + Time series data from the NWIS JSON """ all_site_dfs = [] @@ -1033,7 +1030,7 @@ def _read_json(json: dict[str, Any]) -> pd.DataFrame: ] # create a list of indexes for each change in site no - # for example, [0, 21, 22] would be the first and last indeces + # for example, [0, 21, 22] would be the first and last indices index_list = [0] index_list.extend( [i + 1 for i, (a, b) in enumerate(zip(site_list[:-1], site_list[1:])) if a != b] @@ -1127,7 +1124,7 @@ class NWIS_Metadata(BaseMetadata): ---------- url : str Response url - query_time: datetme.timedelta + query_time: datetime.timedelta Response elapsed time header: httpx.Headers Response headers @@ -1152,11 +1149,6 @@ def __init__(self, response: httpx.Response, **parameters: Any) -> None: parameters: unpacked dictionary Unpacked dictionary of the parameters supplied in the request - Returns - ------- - md: 
:obj:`dataretrieval.nwis.NWIS_Metadata` - A ``dataretrieval`` custom :obj:`dataretrieval.nwis.NWIS_Metadata` object. - """ super().__init__(response) @@ -1177,8 +1169,8 @@ def site_info(self) -> tuple[pd.DataFrame, BaseMetadata] | None: ``huc``, ``countyCd`` or ``bBox`` (``site_no`` is preferred over ``sites`` if both are present); ``None`` otherwise. - Return - ------ + Returns + ------- df: ``pandas.DataFrame`` Formatted requested data from calling `nwis.what_sites` md: :obj:`dataretrieval.nwis.NWIS_Metadata` diff --git a/dataretrieval/streamstats.py b/dataretrieval/streamstats.py index 039f292b..d17aa7df 100644 --- a/dataretrieval/streamstats.py +++ b/dataretrieval/streamstats.py @@ -1,5 +1,5 @@ """ -This module is a wrapper for the streamstats API (`streamstats documentation`_). +This module is a wrapper for the StreamStats API (`streamstats documentation`_). .. _streamstats documentation: https://streamstats.usgs.gov/streamstatsservices/#/ @@ -16,7 +16,7 @@ def download_workspace(workspaceID: str, format: str = "") -> httpx.Response: - """Function to download streamstats workspace. + """Function to download a StreamStats workspace. Parameters ---------- @@ -60,7 +60,7 @@ def get_sample_watershed() -> Watershed: ------- Watershed: :obj:`dataretrieval.streamstats.Watershed` Custom object that contains the watershed information as extracted - from the streamstats JSON object. + from the StreamStats JSON object. """ return cast( @@ -82,7 +82,7 @@ def get_watershed( ) -> httpx.Response | Watershed: """Get watershed object based on location - **Streamstats documentation:** + **StreamStats documentation:** Returns a watershed object. The request configuration will determine the overall request response. However all returns will return a watershed object with at least the workspaceid. The workspace id is the id to the @@ -102,7 +102,7 @@ def get_watershed( ylocation: float Y location of the most downstream point of desired study area. 
crs: integer, string, optional - ESPSG spatial reference code, default is 4326 + EPSG spatial reference code, default is 4326 includeparameters: bool, optional Boolean flag to include parameters in response. includeflowtypes: bool, string, optional @@ -113,12 +113,23 @@ def get_watershed( simplify: bool, optional Boolean flag controlling whether or not to simplify the returned result. + format: string, optional + Controls the return type, default is 'geojson'. 'geojson' returns + the raw ``httpx.Response``; 'object' parses the response into a + :obj:`dataretrieval.streamstats.Watershed`. 'shape' is not + implemented and raises ``NotImplementedError``. Returns ------- - Watershed: :obj:`dataretrieval.streamstats.Watershed` - Custom object that contains the watershed information as extracted - from the streamstats JSON object. + r: ``httpx.Response`` or :obj:`dataretrieval.streamstats.Watershed` + The raw response when ``format='geojson'`` (the default), or a + custom ``Watershed`` object containing the watershed information + extracted from the StreamStats JSON when ``format='object'``. + + Raises + ------ + NotImplementedError + If ``format='shape'``, which is not yet implemented. """ payload: dict[str, str | int | float | bool] = { diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index f9766ee6..154f2a4d 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -217,7 +217,7 @@ class BaseMetadata: ---------- url : str Response url - query_time: datetme.timedelta + query_time: datetime.timedelta Response elapsed time header: httpx.Headers Response headers @@ -229,13 +229,8 @@ def __init__(self, response: httpx.Response) -> None: Parameters ---------- - response: Response - Response object from httpx module - - Returns - ------- - md: :obj:`dataretrieval.utils.BaseMetadata` - A ``dataretrieval`` custom :obj:`dataretrieval.utils.BaseMetadata` object. + response: ``httpx.Response`` + Response object from the ``httpx`` module. 
""" @@ -312,8 +307,16 @@ def query( Returns ------- - string: query response + response: ``httpx.Response`` The response from the API query ``httpx.get`` function call. + + Raises + ------ + ValueError + If the service returns a 400, 404, 414, or 5xx status code, or if + ``httpx`` rejects the URL client-side (e.g. it is too long). + NoSitesError + If the response indicates that no sites or data matched the query. """ for key, value in payload.items(): @@ -359,7 +362,7 @@ def query( class NoSitesError(Exception): - """Custom error class used when selection criteria returns no sites/data.""" + """Custom error class used when selection criteria return no sites/data.""" def __init__(self, url: httpx.URL) -> None: self.url = url diff --git a/dataretrieval/waterdata/_progress.py b/dataretrieval/waterdata/_progress.py index ce94effb..0e4963cd 100644 --- a/dataretrieval/waterdata/_progress.py +++ b/dataretrieval/waterdata/_progress.py @@ -18,7 +18,7 @@ :func:`progress_context` to activate one and :func:`current` to reach it. By default the line is shown for interactive use — an interactive terminal or a -Jupyter/IPython kernel (like ``tqdm``) — while redirected logs and CI stay clean. +Jupyter/IPython kernel, like ``tqdm`` — while redirected logs and CI stay clean. ``API_USGS_PROGRESS`` forces it on (``1``/``true``) or off (``0``/``false``). """ diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 3144bf80..5c2a4657 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -105,7 +105,7 @@ def get_daily( A complete list of codes and their descriptions can be found at https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html. properties : string or iterable of strings, optional - A vector of requested columns to be returned from the query. + A list of requested columns to be returned from the query. 
Available options are: geometry, id, time_series_id, monitoring_location_id, parameter_code, statistic_id, time, value, unit_of_measure, approval_status, qualifier, last_modified @@ -144,7 +144,7 @@ def get_daily( last_modified : string, optional The last time a record was refreshed in our database. This may happen due to regular operational processes and does not necessarily indicate - anything about the measurement has changed. You can query this field + that anything about the measurement has changed. You can query this field using date-times or intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals may be bounded or half-bounded (double-dots at start or end). @@ -187,14 +187,14 @@ def get_daily( selected. The bounding box is provided as four or six numbers, depending on whether the coordinate reference system includes a vertical axis (height or depth). Coordinates are assumed to be in crs 4326. The - expected format is a numeric vector structured: c(xmin,ymin,xmax,ymax). - Another way to think of it is c(Western-most longitude, Southern-most - latitude, Eastern-most longitude, Northern-most latitude). - limit : numeric, optional + expected format is ``[xmin, ymin, xmax, ymax]``, i.e. + ``[Western-most longitude, Southern-most latitude, Eastern-most + longitude, Northern-most latitude]``. + limit : int, optional The optional limit parameter is used to control the subset of the selected features that should be returned in each page. The maximum allowable limit is 50000. It may be beneficial to set this number lower - if your internet connection is spotty. The default (NA) will set the + if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. 
filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / @@ -208,7 +208,7 @@ def get_daily( ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. - md: :obj:`dataretrieval.utils.Metadata` + md: :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object Examples @@ -326,7 +326,7 @@ def get_continuous( descriptions can be found at https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html. properties : string or iterable of strings, optional - A vector of requested columns to be returned from the query. + A list of requested columns to be returned from the query. Available options are: geometry, id, time_series_id, monitoring_location_id, parameter_code, statistic_id, time, value, unit_of_measure, approval_status, qualifier, last_modified @@ -365,7 +365,7 @@ def get_continuous( last_modified : string, optional The last time a record was refreshed in our database. This may happen due to regular operational processes and does not necessarily indicate - anything about the measurement has changed. You can query this field + that anything about the measurement has changed. You can query this field using date-times or intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals may be bounded or half-bounded (double-dots at start or end). @@ -398,11 +398,11 @@ def get_continuous( * Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours - limit : numeric, optional + limit : int, optional The optional limit parameter is used to control the subset of the selected features that should be returned in each page. The maximum allowable limit is 10000. It may be beneficial to set this number lower - if your internet connection is spotty. The default (NA) will set the + if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. 
filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / @@ -410,14 +410,13 @@ def get_continuous( :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking, and the lexicographic-comparison pitfall. convert_type : boolean, optional - If True, the function will convert the data to dates and qualifier to - string vector + If True, converts columns to appropriate types. Returns ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. - md: :obj:`dataretrieval.utils.Metadata` + md: :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object Examples @@ -498,7 +497,6 @@ def get_monitoring_locations( depth_source_code: str | Iterable[str] | None = None, properties: str | Iterable[str] | None = None, skip_geometry: bool | None = None, - time: str | Iterable[str] | None = None, bbox: list[float] | None = None, limit: int | None = None, filter: str | None = None, @@ -536,7 +534,7 @@ def get_monitoring_locations( For well information this can be a district-assigned local number. district_code : string or iterable of strings, optional The Water Science Centers (WSCs) across the United States use the FIPS - state code as the district code. In some case, monitoring locations and + state code as the district code. In some cases, monitoring locations and samples may be managed by a water science center that is adjacent to the state in which the monitoring location actually resides. For example a monitoring location may have a district code of 30 which translates to @@ -572,7 +570,6 @@ def get_monitoring_locations( county equivalent in which the monitoring location is located. site_type_code : string or iterable of strings, optional A code describing the hydrologic setting of the monitoring location. - Example: "US:15:001" (United States: Hawaii, Hawaii County) site_type : string or iterable of strings, optional A description of the hydrologic setting of the monitoring location. 
hydrologic_unit_code : string or iterable of strings, optional @@ -599,12 +596,12 @@ def get_monitoring_locations( entered as one-half of the contour interval. altitude_method_code : string or iterable of strings, optional Codes representing the method used to measure altitude. - altitude_method_name : float, optional - The name of the the method used to measure altitude. - vertical_datum : float, optional + altitude_method_name : string or iterable of strings, optional + The name of the method used to measure altitude. + vertical_datum : string or iterable of strings, optional The datum used to determine altitude and vertical position at the - monitoring location.' - vertical_datum_name : float, optional + monitoring location. + vertical_datum_name : string or iterable of strings, optional The datum used to determine altitude and vertical position at the monitoring location. horizontal_positional_accuracy_code : string or iterable of strings, optional @@ -633,7 +630,7 @@ def get_monitoring_locations( if the contributing area is different from the total drainage area. This situation can occur when part of the drainage area consists of very porous soil or depressions that either allow all runoff to enter the - groundwater or traps the water in ponds so that rainfall does not + groundwater or trap the water in ponds so that rainfall does not contribute to runoff. A transbasin diversion can also affect the total drainage area. time_zone_abbreviation : string or iterable of strings, optional @@ -678,7 +675,7 @@ def get_monitoring_locations( codes `_ is available. properties : string or iterable of strings, optional - A vector of requested columns to be returned from the query. Available + A list of requested columns to be returned from the query. 
Available options are: geometry, id, agency_code, agency_name, monitoring_location_number, monitoring_location_name, district_code, country_code, country_name, state_code, state_name, county_code, @@ -698,14 +695,14 @@ def get_monitoring_locations( selected. The bounding box is provided as four or six numbers, depending on whether the coordinate reference system includes a vertical axis (height or depth). Coordinates are assumed to be in crs 4326. The - expected format is a numeric vector structured: c(xmin,ymin,xmax,ymax). - Another way to think of it is c(Western-most longitude, Southern-most - latitude, Eastern-most longitude, Northern-most latitude). - limit : numeric, optional + expected format is ``[xmin, ymin, xmax, ymax]``, i.e. + ``[Western-most longitude, Southern-most latitude, Eastern-most + longitude, Northern-most latitude]``. + limit : int, optional The optional limit parameter is used to control the subset of the selected features that should be returned in each page. The maximum allowable limit is 50000. It may be beneficial to set this number lower - if your internet connection is spotty. The default (NA) will set the + if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. skip_geometry : boolean, optional This option can be used to skip response geometries for each feature. @@ -724,7 +721,7 @@ def get_monitoring_locations( ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. 
- md: :obj:`dataretrieval.utils.Metadata` + md: :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object Examples @@ -775,7 +772,6 @@ def get_time_series_metadata( time_series_id: str | Iterable[str] | None = None, web_description: str | Iterable[str] | None = None, skip_geometry: bool | None = None, - time: str | Iterable[str] | None = None, bbox: list[float] | None = None, limit: int | None = None, filter: str | None = None, @@ -806,10 +802,13 @@ def get_time_series_metadata( parameter_name : string or iterable of strings, optional A human-understandable name corresponding to parameter_code. properties : string or iterable of strings, optional - A vector of requested columns to be returned from the query. - Available options are: geometry, id, time_series_id, - monitoring_location_id, parameter_code, statistic_id, time, value, - unit_of_measure, approval_status, qualifier, last_modified + A list of requested columns to be returned from the query. + Available options are: begin, begin_utc, computation_identifier, + computation_period_identifier, end, end_utc, geometry, + hydrologic_unit_code, id, last_modified, monitoring_location_id, + parameter_code, parameter_description, parameter_name, + parent_time_series_id, primary, state_name, statistic_id, + sublocation_identifier, thresholds, unit_of_measure, web_description statistic_id : string or iterable of strings, optional A code corresponding to the statistic an observation represents. Example codes include 00001 (max), 00002 (min), and 00003 (mean). @@ -829,7 +828,7 @@ def get_time_series_metadata( last_modified : string, optional The last time a record was refreshed in our database. This may happen due to regular operational processes and does not necessarily indicate - anything about the measurement has changed. You can query this field + that anything about the measurement has changed. You can query this field using date-times or intervals, adhering to RFC 3339, or using ISO 8601 duration objects. 
Intervals may be bounded or half-bounded (double-dots at start or end). Only features that have a last_modified that @@ -881,7 +880,7 @@ def get_time_series_metadata( You can query this field using date-times or intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals may be bounded or half-bounded (double-dots at start or end). Only - features that have a end that intersects the value of datetime are + features that have an end that intersects the value of datetime are selected. Examples: @@ -900,7 +899,7 @@ def get_time_series_metadata( computation_identifier : string or iterable of strings, optional Indicates whether the data from this time series represent a specific statistical computation. - thresholds : numeric or list of numbers, optional + thresholds : number or list of numbers, optional Thresholds represent known numeric limits for a time series, for example the historic maximum value for a parameter or a level below which a sensor is non-operative. These thresholds are sometimes used to @@ -925,10 +924,10 @@ def get_time_series_metadata( selected. The bounding box is provided as four or six numbers, depending on whether the coordinate reference system includes a vertical axis (height or depth). Coordinates are assumed to be in crs 4326. The - expected format is a numeric vector structured: c(xmin,ymin,xmax,ymax). - Another way to think of it is c(Western-most longitude, Southern-most - latitude, Eastern-most longitude, Northern-most latitude). - limit : numeric, optional + expected format is ``[xmin, ymin, xmax, ymax]``, i.e. + ``[Western-most longitude, Southern-most latitude, Eastern-most + longitude, Northern-most latitude]``. + limit : int, optional The optional limit parameter is used to control the subset of the selected features that should be returned in each page. The maximum allowable limit is 50000. 
It may be beneficial to set this number lower @@ -946,7 +945,7 @@ def get_time_series_metadata( ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. - md: :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object Examples @@ -1087,7 +1086,7 @@ def get_combined_metadata( computation_identifier : string or iterable of strings, optional Indicates whether the data from this time series represent a specific statistical computation. - thresholds : numeric or list of numbers, optional + thresholds : number or list of numbers, optional Numeric limits known for a time series (e.g. historic maximum, below-which-the-sensor-is-non-operative). sublocation_identifier : string or iterable of strings, optional @@ -1125,7 +1124,7 @@ def get_combined_metadata( Only features whose geometry intersects the bounding box are selected. Format: ``[xmin, ymin, xmax, ymax]`` in CRS 4326 (longitude/latitude, west-south-east-north). - limit : numeric, optional + limit : int, optional Page size; the maximum allowable value is 50000. Default (``None``) requests the maximum allowable limit. filter, filter_lang : optional @@ -1140,7 +1139,7 @@ def get_combined_metadata( ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. - md : :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object pertaining to the query. Examples @@ -1232,7 +1231,7 @@ def get_latest_continuous( of data may be delayed if the monitoring location does not have the capacity to automatically transmit data. Continuous data are described by parameter name and parameter code. These data might also be referred to as "instantaneous - values" or "IV" + values" or "IV".
Parameters ---------- @@ -1254,7 +1253,7 @@ def get_latest_continuous( A complete list of codes and their descriptions can be found at https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html. properties : string or iterable of strings, optional - A vector of requested columns to be returned from the query. Available + A list of requested columns to be returned from the query. Available options are: geometry, id, time_series_id, monitoring_location_id, parameter_code, statistic_id, time, value, unit_of_measure, approval_status, qualifier, last_modified @@ -1293,7 +1292,7 @@ def get_latest_continuous( last_modified : string, optional The last time a record was refreshed in our database. This may happen due to regular operational processes and does not necessarily indicate - anything about the measurement has changed. You can query this field + that anything about the measurement has changed. You can query this field using date-times or intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals may be bounded or half-bounded (double-dots at start or end). Only features that have a last_modified that @@ -1335,10 +1334,10 @@ def get_latest_continuous( selected. The bounding box is provided as four or six numbers, depending on whether the coordinate reference system includes a vertical axis (height or depth). Coordinates are assumed to be in crs 4326. The - expected format is a numeric vector structured: c(xmin,ymin,xmax,ymax). - Another way to think of it is c(Western-most longitude, Southern-most - latitude, Eastern-most longitude, Northern-most latitude). - limit : numeric, optional + expected format is ``[xmin, ymin, xmax, ymax]``, i.e. + ``[Western-most longitude, Southern-most latitude, Eastern-most + longitude, Northern-most latitude]``. + limit : int, optional The optional limit parameter is used to control the subset of the selected features that should be returned in each page. The maximum allowable limit is 50000. 
It may be beneficial to set this number lower @@ -1356,7 +1355,7 @@ def get_latest_continuous( ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. - md: :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object Examples @@ -1450,7 +1449,7 @@ def get_latest_daily( A complete list of codes and their descriptions can be found at https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html. properties : string or iterable of strings, optional - A vector of requested columns to be returned from the query. Available + A list of requested columns to be returned from the query. Available options are: geometry, id, time_series_id, monitoring_location_id, parameter_code, statistic_id, time, value, unit_of_measure, approval_status, qualifier, last_modified @@ -1489,7 +1488,7 @@ def get_latest_daily( last_modified : string, optional The last time a record was refreshed in our database. This may happen due to regular operational processes and does not necessarily indicate - anything about the measurement has changed. You can query this field + that anything about the measurement has changed. You can query this field using date-times or intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals may be bounded or half-bounded (double-dots at start or end). Only features that have a last_modified that @@ -1531,10 +1530,10 @@ def get_latest_daily( selected. The bounding box is provided as four or six numbers, depending on whether the coordinate reference system includes a vertical axis (height or depth). Coordinates are assumed to be in crs 4326. The - expected format is a numeric vector structured: c(xmin,ymin,xmax,ymax). - Another way to think of it is c(Western-most longitude, Southern-most - latitude, Eastern-most longitude, Northern-most latitude). - limit : numeric, optional + expected format is ``[xmin, ymin, xmax, ymax]``, i.e.
+ ``[Western-most longitude, Southern-most latitude, Eastern-most + longitude, Northern-most latitude]``. + limit : int, optional The optional limit parameter is used to control the subset of the selected features that should be returned in each page. The maximum allowable limit is 50000. It may be beneficial to set this number lower @@ -1552,7 +1551,7 @@ def get_latest_daily( ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. - md: :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object Examples @@ -1639,7 +1638,7 @@ def get_field_measurements( A short code corresponding to the observing procedure for the field measurement. properties : string or iterable of strings, optional - A vector of requested columns to be returned from the query. See the + A list of requested columns to be returned from the query. See the field-measurements schema in the OpenAPI reference for the available columns (e.g. geometry, id, monitoring_location_id, parameter_code, value, unit_of_measure, approval_status, qualifier, last_modified): @@ -1671,7 +1670,7 @@ def get_field_measurements( last_modified : string, optional The last time a record was refreshed in our database. This may happen due to regular operational processes and does not necessarily indicate - anything about the measurement has changed. You can query this field + that anything about the measurement has changed. You can query this field using date-times or intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals may be bounded or half-bounded (double-dots at start or end). Only features that have a last_modified that @@ -1719,10 +1718,10 @@ def get_field_measurements( selected. The bounding box is provided as four or six numbers, depending on whether the coordinate reference system includes a vertical axis (height or depth). Coordinates are assumed to be in crs 4326.
The - expected format is a numeric vector structured: c(xmin,ymin,xmax,ymax). - Another way to think of it is c(Western-most longitude, Southern-most - latitude, Eastern-most longitude, Northern-most latitude). - limit : numeric, optional + expected format is ``[xmin, ymin, xmax, ymax]``, i.e. + ``[Western-most longitude, Southern-most latitude, Eastern-most + longitude, Northern-most latitude]``. + limit : int, optional The optional limit parameter is used to control the subset of the selected features that should be returned in each page. The maximum allowable limit is 50000. It may be beneficial to set this number lower @@ -1740,7 +1739,7 @@ def get_field_measurements( ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. - md: :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object Examples @@ -1840,7 +1839,7 @@ def get_field_measurements_metadata( Only features whose geometry intersects the bounding box are selected. Format: ``[xmin, ymin, xmax, ymax]`` in CRS 4326 (longitude / latitude, west-south-east-north). - limit : numeric, optional + limit : int, optional Page size; the maximum allowable value is 50000. Default (``None``) requests the maximum allowable limit. filter, filter_lang : optional @@ -1855,7 +1854,7 @@ def get_field_measurements_metadata( ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. - md : :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object pertaining to the query. Examples @@ -1963,7 +1962,7 @@ def get_peaks( Only features whose geometry intersects the bounding box are selected. Format: ``[xmin, ymin, xmax, ymax]`` in CRS 4326 (longitude / latitude, west-south-east-north). - limit : numeric, optional + limit : int, optional Page size; the maximum allowable value is 50000.
Default (``None``) requests the maximum allowable limit. filter, filter_lang : optional @@ -1978,7 +1977,7 @@ def get_peaks( ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. - md : :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object pertaining to the query. Examples @@ -2035,14 +2034,14 @@ def get_reference_table( "hydrologic-unit-codes", "medium-codes", "national-aquifer-codes", "parameter-codes", "reliability-codes", "site-types", "states", "statistic-codes", "topographic-codes", "time-zone-codes" - limit : numeric, optional + limit : int, optional The optional limit parameter is used to control the subset of the selected features that should be returned in each page. The maximum allowable limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. query: dictionary, optional - The optional args parameter can be used to pass a dictionary of + The optional query parameter can be used to pass a dictionary of query parameters to the collection API call. max_rows : int, optional Cap the total number of rows returned, stopping pagination early @@ -2060,7 +2059,7 @@ def get_reference_table( separated by underscores (e.g. the "medium-codes" reference table has a column called "medium_code", which contains all possible medium code values). - md: :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object including the URL request and query time.
Examples @@ -2107,7 +2106,7 @@ def get_codes(code_service: CODE_SERVICES) -> tuple[pd.DataFrame, BaseMetadata]: Parameters ---------- code_service : string - One of the following options: "states", "counties", "countries" + One of the following options: "states", "counties", "countries", "sitetype", "samplemedia", "characteristicgroup", "characteristics", or "observedproperty" @@ -2253,12 +2252,12 @@ def get_samples( characteristicUserSupplied : string or iterable of strings, optional A user supplied characteristic name describing one or more results. boundingBox: list of four floats, optional - Filters on the the associated monitoring location's point location + Filters on the associated monitoring location's point location by checking if it is located within the specified geographic area. The logic is inclusive, i.e. it will include locations that overlap with the edge of the bounding box. Values are separated by commas, expressed in decimal degrees, NAD83, and longitudes west of Greenwich - are negative. The format is a string consisting of: + are negative. The format is a list consisting of: * Western-most longitude * Southern-most latitude @@ -2327,7 +2326,7 @@ def get_samples( timezone abbreviation is not recognized resolve to ``NaT``. Rows are sorted by ``Activity_StartDateTime`` when present (the API's default order is unstable). - md : :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.utils.BaseMetadata` Custom ``dataretrieval`` metadata object pertaining to the query. Examples @@ -2411,7 +2410,7 @@ def get_samples_summary( ------- df : ``pandas.DataFrame`` Formatted data returned from the API query. - md : :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.utils.BaseMetadata` Custom ``dataretrieval`` metadata object pertaining to the query. Examples @@ -2501,7 +2500,7 @@ def get_stats_por( monitoring location. The default is 1000. 
parent_time_series_id: string, optional The parent_time_series_id returns statistics tied to a - particular datbase entry. + particular database entry. site_type_code: string, optional Site type code query parameter. A list of valid site type codes is available at: @@ -2533,6 +2532,13 @@ def get_stats_por( of percentile threshold values, and a "value" column, containing the singular summary value for the other statistics. + Returns + ------- + df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` + Formatted data returned from the API query. + md : :obj:`dataretrieval.utils.BaseMetadata` + A custom metadata object. + Examples -------- .. code:: @@ -2626,7 +2632,7 @@ def get_stats_date_range( monitoring location. The default is 1000. parent_time_series_id: string, optional The parent_time_series_id returns statistics tied to a - particular datbase entry. + particular database entry. site_type_code: string, optional Site type code query parameter. You can see a list of valid site type codes here: @@ -2661,6 +2667,13 @@ def get_stats_date_range( of percentile threshold values, and a "value" column, containing the singular summary value for the other statistics. + Returns + ------- + df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` + Formatted data returned from the API query. + md : :obj:`dataretrieval.utils.BaseMetadata` + A custom metadata object. + Examples -------- .. code:: @@ -2799,6 +2812,23 @@ def get_channel( The longitudinal velocity description. measurement_type : string or iterable of strings, optional The type of channel measurement. + last_modified : string, optional + The last time a record was refreshed in our database. This may happen + due to regular operational processes and does not necessarily indicate + that anything about the measurement has changed. You can query this field + using date-times or intervals, adhering to RFC 3339, or using ISO 8601 + duration objects. 
Intervals may be bounded or half-bounded (double-dots + at start or end). Only features that have a last_modified that + intersects the value of datetime are selected. + Examples: + + * A date-time: "2018-02-12T23:20:50Z" + * A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z" + * Half-bounded intervals: "2018-02-12T00:00:00Z/.." or + "../2018-03-18T12:31:12Z" + * Duration objects: "P1M" for data from the past month or + "PT36H" for the last 36 hours + skip_geometry : boolean, optional This option can be used to skip response geometries for each feature. The returning object will be a data frame with no spatial information. @@ -2807,7 +2837,7 @@ def get_channel( channel_measurement_type : string or iterable of strings, optional The channel measurement type. properties : string or iterable of strings, optional - A vector of requested columns to be returned from the query. Available + A list of requested columns to be returned from the query. Available options are: geometry, channel_measurements_id, monitoring_location_id, field_visit_id, measurement_number, time, channel_name, channel_flow, channel_flow_unit, channel_width, channel_width_unit, channel_area, @@ -2815,22 +2845,35 @@ def get_channel( channel_location_distance, channel_location_distance_unit, channel_stability, channel_material, channel_evenness, horizontal_velocity_description, vertical_velocity_description, longitudinal_velocity_description, - measurement_type, last_modified, channel_measurement_type. The default (NA) will - return all columns of the data. + measurement_type, last_modified, channel_measurement_type. The default + (None) will return all columns of the data. + bbox : list of numbers, optional + Only features that have a geometry that intersects the bounding box are + selected. The bounding box is provided as four or six numbers, + depending on whether the coordinate reference system includes a vertical + axis (height or depth). Coordinates are assumed to be in crs 4326. 
The + expected format is ``[xmin, ymin, xmax, ymax]``, i.e. + ``[Western-most longitude, Southern-most latitude, Eastern-most + longitude, Northern-most latitude]``. + limit : int, optional + The optional limit parameter is used to control the subset of the + selected features that should be returned in each page. The maximum + allowable limit is 50000. It may be beneficial to set this number lower + if your internet connection is spotty. The default (None) will set the + limit to the maximum allowable limit for the service. filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking, and the lexicographic-comparison pitfall. convert_type : boolean, optional - If True, the function will convert the data to dates and qualifier to - string vector + If True, converts columns to appropriate types. Returns ------- df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. - md: :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object Examples diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index 538a5883..9a5e20f5 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -1235,12 +1235,12 @@ def _combine_chunk_frames(frames: list[pd.DataFrame]) -> pd.DataFrame: Notes ----- - ``_get_resp_data`` returns a plain ``pd.DataFrame()`` on empty - responses; concatenating it with real ``GeoDataFrame``s downgrades - the result to plain ``DataFrame`` and strips geometry/CRS, so - empties are dropped first. Dedup on the pre-rename feature ``id`` - keeps overlapping user OR-clauses from producing duplicate rows - across chunks.
+ An empty chunk can be a plain ``pd.DataFrame()`` (no geopandas); + concatenating it with real ``GeoDataFrame``s downgrades the result + to plain ``DataFrame`` and strips geometry/CRS, so empties are + dropped first. Dedup on the pre-rename feature ``id`` keeps + overlapping user OR-clauses from producing duplicate rows across + chunks. Dedup is restricted to rows whose ``id`` is non-null. ``pandas`` treats NaN==NaN as a duplicate for ``drop_duplicates``, so a @@ -1254,8 +1254,8 @@ def _combine_chunk_frames(frames: list[pd.DataFrame]) -> pd.DataFrame: # input even when every chunk is empty — ``_get_resp_data`` # returns ``gpd.GeoDataFrame()`` on empty geopd responses, and # returning a plain ``pd.DataFrame()`` here would downgrade - # the type a downstream ``pd.concat([result, geo_page])`` to a - # plain DataFrame and strip geometry/CRS. + # the type in a downstream ``pd.concat([result, geo_page])`` to + # a plain DataFrame and strip geometry/CRS. return frames[0] if frames else pd.DataFrame() if len(non_empty) == 1: # Single-completed-chunk fast path. Return a copy so callers diff --git a/dataretrieval/waterdata/nearest.py b/dataretrieval/waterdata/nearest.py index 39a80332..42a10764 100644 --- a/dataretrieval/waterdata/nearest.py +++ b/dataretrieval/waterdata/nearest.py @@ -61,14 +61,14 @@ def get_nearest_continuous( Must be small enough that every target's window captures roughly one observation at the service cadence. The default - matches a 15-minute continuous gauge; widen (e.g. + matches a 15-minute continuous gage; widen (e.g. ``"PT15M"``) for irregular cadences or resilience to data gaps. on_tie : {"first", "last", "mean"}, default ``"first"`` How to resolve ties when two observations are exactly equidistant from a target (which happens when the target falls at the midpoint between grid points — e.g. target ``10:22:30`` for a 15-minute - gauge). + gage). - ``"first"``: keep the earlier observation. - ``"last"``: keep the later observation. 
@@ -97,9 +97,10 @@ def get_nearest_continuous( *Window sizing and ties.* When ``window`` is exactly half the service cadence, most targets' windows contain a single observation and ``on_tie`` is moot. Ties arise only when a target sits exactly at the - window edge — rare in practice but possible. Setting ``window`` to a - full cadence (or larger) guarantees at least one observation per - target in steady state at the cost of more bytes per response. + midpoint between two grid observations — rare in practice but possible. + Setting ``window`` to a full cadence (or larger) guarantees at least one + observation per target in steady state at the cost of more bytes per + response. *Why windowed CQL rather than sort+limit.* The API's advertised ``sortby`` parameter would make this a one-liner per target (``filter`` @@ -128,7 +129,7 @@ def get_nearest_continuous( ... parameter_code="00060", ... ) - >>> # Widen the window for an irregular-cadence gauge + >>> # Widen the window for an irregular-cadence gage >>> df, md = waterdata.get_nearest_continuous( ... targets, ... monitoring_location_id="USGS-02238500", diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 5581d086..5c98967c 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -103,9 +103,10 @@ def _switch_arg_id(ls: dict[str, Any], id_name: str, service: str) -> dict[str, Switch argument id from its package-specific identifier to the standardized "id" key that the API recognizes. - Sets the "id" key in the provided dictionary `ls` - with the value from either the service name or the expected id column name. - If neither key exists, "id" will be set to None. + If `ls` does not already have an "id" key, sets it from either the + service-derived id key or the expected id column name. If neither key + exists, "id" is left unset. The original service-specific id keys are + removed regardless. 
Parameters ---------- @@ -148,11 +149,12 @@ def _switch_properties_id( ) -> list[str]: """ Switch properties id from its package-specific identifier to the - standardized "id" key that the API recognizes. + standardized "id" name that the API recognizes. - Sets the "id" key in the provided dictionary `ls` with the value from either - the service name or the expected id column name. If neither key exists, "id" - will be set to None. + Replaces any service-specific id name in `properties` with "id", + normalizes remaining hyphens to underscores, and drops the "geometry" + and service-id entries. Returns an empty list when `properties` is empty + or None. Parameters ---------- @@ -167,7 +169,7 @@ def _switch_properties_id( Returns ------- List[str] - The modified list with the "id" key set appropriately. + The modified list with id names standardized to "id". Examples -------- @@ -883,17 +885,20 @@ def _get_resp_data( gpd.GeoDataFrame or pd.DataFrame A ``GeoDataFrame`` when ``geopd`` is True; otherwise a plain ``DataFrame`` carrying the feature properties plus an ``id`` - column and a ``geometry`` column (coordinates list) where the - response includes them. Returns an empty ``DataFrame`` when no - features are returned. + column (always present, possibly all-None) and a ``geometry`` + column (coordinates list) when at least one feature includes + geometry. Returns an empty ``DataFrame`` when no features are + returned. Notes ----- The non-geopandas branch builds the frame directly from each feature's ``properties`` dict, plus the top-level ``id`` and - ``geometry.coordinates`` columns — but adds the ``id`` and - ``geometry`` columns only when at least one feature actually - carries them. 
This skips the GeoJSON envelope entirely, so + ``geometry.coordinates`` columns — the ``id`` column is always + added (so the downstream rename to the service-specific output id + works even on an all-None id), while the ``geometry`` column is + added only when at least one feature carries geometry. This skips + the GeoJSON envelope entirely, so newly-added Feature-level fields (e.g. ``geometry.type`` after USGS migrated to full GeoJSON geometry objects) can't leak into the result frame; no reactive drop-list needs maintenance every @@ -1255,8 +1260,8 @@ def _deal_with_empty( If `return_list` is empty, determines the column names to use: - If `properties` is not provided or contains only NaN values, - retrieves schema properties from the specified service. - - Otherwise, uses the provided `properties` list as column names. + retrieves schema properties from the specified service. + - Otherwise, uses the provided `properties` list as column names. Parameters ---------- @@ -1587,8 +1592,10 @@ def _handle_stats_nesting( :func:`_get_resp_data`: it builds the per-feature outer frame directly from each feature's ``properties`` (minus the nested ``data`` field, which is unrolled separately below via the - ``record_path`` json_normalize), then adds ``id`` and ``geometry`` - only when present. Skipping the GeoJSON envelope keeps newly-added + ``record_path`` json_normalize), then adds ``geometry`` only when + present. Unlike :func:`_get_resp_data`, no top-level ``id`` column + is added — stats features don't carry one, so this matches the + geopandas branch. Skipping the GeoJSON envelope keeps newly-added fields like ``geometry.type`` from leaking into the result. """ if body is None: @@ -1649,9 +1656,9 @@ def _expand_percentiles(df: pd.DataFrame) -> pd.DataFrame: """ Takes percentile value and thresholds columns containing lists of values and turns each list element into its own row in the - original dataframe. 'nan's are removed from the dataframe. 
If + original dataframe. Exploded ``'nan'`` values are dropped. If no percentile data exist, it adds a percentile column and - populates column with percentile assigned to min, max, and + populates it with the percentile assigned to min, max, and median. Parameters @@ -1767,7 +1774,7 @@ def get_stats_data( expand_percentiles : bool Determines whether the percentiles column is expanded so that each percentile gets its own row in the returned dataframe. If - True and user requests a computation_type other than + True and the user requests a computation_type other than percentiles, a percentile column is still returned. client : httpx.AsyncClient, optional Caller-borrowed async client. ``None`` (default) opens a @@ -2003,13 +2010,18 @@ def _get_args( - ``properties`` is materialized to ``list[str]`` (a bare string gets wrapped in a single-element list so downstream ``",".join(properties)`` doesn't iterate per character). - - Any other ``Iterable[str]`` that isn't in ``_NO_NORMALIZE_PARAMS`` + - A non-string iterable in ``_NO_NORMALIZE_PARAMS`` (numeric params + such as ``water_year``, ``bbox``, ``thresholds``) is materialized + to a ``list`` with its element types preserved (no string + normalization), so the GET comma-join and the chunker — which test + ``list``/``tuple`` — handle it instead of ``str()``-ing the whole + array. + - Any other ``Iterable[str]`` (i.e. not in ``_NO_NORMALIZE_PARAMS``) is materialized to ``list[str]`` via :func:`_normalize_str_iterable` so downstream code that branches on ``isinstance(v, (list, tuple))`` works for ``pandas.Series``, ``numpy.ndarray``, generators, etc. - - Scalars, strings, and ``_NO_NORMALIZE_PARAMS`` values pass through - unchanged. + - Scalars and strings pass through unchanged. 
Parameters ---------- diff --git a/dataretrieval/wqp.py b/dataretrieval/wqp.py index 1ca0098a..45d1b206 100644 --- a/dataretrieval/wqp.py +++ b/dataretrieval/wqp.py @@ -91,8 +91,9 @@ def get_results( dataProfile : string, optional Specifies the data fields returned by the query. WQX3.0 profiles include 'fullPhysChem', 'narrow', and 'basicPhysChem'. - Legacy profiles include 'resultPhysChem','biological', and - 'narrowResult'. Default is 'fullPhysChem'. + Legacy profiles include 'resultPhysChem', 'biological', and + 'narrowResult'. For WQX3.0 queries (``legacy=False``), defaults to + 'fullPhysChem'; legacy queries have no default profile. siteid : string Monitoring location identified by agency code, a hyphen, and identification number (Example: "USGS-05586100"). @@ -136,7 +137,7 @@ def get_results( columns are preserved; unrecognized timezone codes yield ``NaT``. Rows are sorted by ``ActivityStartDateTime`` (or ``Activity_StartDateTime`` for WQX3 responses) when present. - md : :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.wqp.WQP_Metadata` Custom ``dataretrieval`` metadata object pertaining to the query. Examples @@ -215,7 +216,7 @@ def what_sites( ------- df : ``pandas.DataFrame`` Formatted data returned from the API query. - md : :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.wqp.WQP_Metadata` Custom metadata object pertaining to the query. Examples @@ -268,7 +269,7 @@ def what_organizations( ------- df : ``pandas.DataFrame`` Formatted data returned from the API query. - md : :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.wqp.WQP_Metadata` Custom metadata object pertaining to the query. Examples @@ -319,7 +320,7 @@ def what_projects( ------- df : ``pandas.DataFrame`` Formatted data returned from the API query. - md : :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.wqp.WQP_Metadata` Custom metadata object pertaining to the query. 
Examples @@ -370,7 +371,7 @@ def what_activities( ------- df : ``pandas.DataFrame`` Formatted data returned from the API query. - md : :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.wqp.WQP_Metadata` Custom metadata object pertaining to the query. Examples @@ -435,7 +436,7 @@ def what_detection_limits( ------- df : ``pandas.DataFrame`` Formatted data returned from the API query. - md : :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.wqp.WQP_Metadata` Custom metadata object pertaining to the query. Examples @@ -492,7 +493,7 @@ def what_habitat_metrics( ------- df : ``pandas.DataFrame`` Formatted data returned from the API query. - md : :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.wqp.WQP_Metadata` Custom metadata object pertaining to the query. Examples @@ -543,7 +544,7 @@ def what_project_weights( ------- df : ``pandas.DataFrame`` Formatted data returned from the API query. - md : :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.wqp.WQP_Metadata` Custom metadata object pertaining to the query. Examples @@ -599,7 +600,7 @@ def what_activity_metrics( ------- df : ``pandas.DataFrame`` Formatted data returned from the API query. - md : :obj:`dataretrieval.utils.Metadata` + md : :obj:`dataretrieval.wqp.WQP_Metadata` Custom metadata object pertaining to the query. Examples @@ -662,7 +663,7 @@ class WQP_Metadata(BaseMetadata): ---------- url : str Response url - query_time : datetme.timedelta + query_time : datetime.timedelta Response elapsed time header : httpx.Headers Response headers @@ -678,17 +679,12 @@ def __init__(self, response: httpx.Response, **parameters: Any) -> None: Parameters ---------- - response : Response - Response object from httpx module + response : ``httpx.Response`` + Response object from the ``httpx`` module. 
parameters : dict Unpacked dictionary of the parameters supplied in the request - Returns - ------- - md : :obj:`dataretrieval.wqp.WQP_Metadata` - A ``dataretrieval`` custom :obj:`dataretrieval.wqp.WQP_Metadata` object. - """ super().__init__(response)