|
| 1 | +"""Parser for the USGS RDB tab-separated text format. |
| 2 | +
|
| 3 | +RDB (Relational DataBase) is the text format used by NWIS web services |
| 4 | +and by the Water Data STAC catalog's rating-curve assets. Every RDB |
| 5 | +file has the same shape: |
| 6 | +
|
| 7 | +- One or more ``#``-prefixed comment lines carrying provenance metadata |
| 8 | + (data source, retrieval timestamp, station name, parameter codes, etc.). |
| 9 | +- A tab-separated header row naming each column. |
| 10 | +- A second tab-separated row giving column format specs (e.g. ``5s 15s``); |
| 11 | + it is informational only and skipped during parsing. |
| 12 | +- Tab-separated data rows. |
| 13 | +
|
| 14 | +This module exposes the parsing primitives that both ``dataretrieval.nwis`` |
| 15 | +and ``dataretrieval.waterdata.ratings`` use. Callers layer their own |
| 16 | +post-processing (NWIS-specific datetime indexing, ratings-specific |
| 17 | +``df.attrs`` provenance, etc.) on top of the raw frame. |
| 18 | +""" |
| 19 | + |
| 20 | +from __future__ import annotations |
| 21 | + |
| 22 | +from io import StringIO |
| 23 | + |
| 24 | +import pandas as pd |
| 25 | + |
| 26 | + |
| 27 | +def read_rdb(text: str, dtypes: dict[str, type] | None = None) -> pd.DataFrame: |
| 28 | + """Parse an RDB text response into a ``pandas.DataFrame``. |
| 29 | +
|
| 30 | + Parameters |
| 31 | + ---------- |
| 32 | + text : str |
| 33 | + The RDB text response from a USGS web service. |
| 34 | + dtypes : dict[str, type] or None, optional |
| 35 | + Optional column-name to dtype hints, forwarded to |
| 36 | + ``pandas.read_csv``. Unknown column names are silently ignored, so |
| 37 | + callers may safely pass a dict of all columns they might be |
| 38 | + interested in. |
| 39 | +
|
| 40 | + Returns |
| 41 | + ------- |
| 42 | + pandas.DataFrame |
| 43 | + The parsed data. An RDB consisting only of comment lines (e.g. a |
| 44 | + "no sites found" response) returns an empty DataFrame rather than |
| 45 | + raising. |
| 46 | +
|
| 47 | + Raises |
| 48 | + ------ |
| 49 | + ValueError |
| 50 | + If the response body looks like HTML, which usually means the |
| 51 | + service has been moved, is degraded, or returned an error page. |
| 52 | + """ |
| 53 | + if "<html>" in text.lower() or "<!doctype html>" in text.lower(): |
| 54 | + raise ValueError( |
| 55 | + "Received HTML response instead of RDB. This often indicates " |
| 56 | + "that the service has been moved or is currently unavailable." |
| 57 | + ) |
| 58 | + |
| 59 | + lines = text.splitlines() |
| 60 | + header_idx = next( |
| 61 | + (i for i, line in enumerate(lines) if not line.startswith("#")), |
| 62 | + len(lines), |
| 63 | + ) |
| 64 | + if header_idx == len(lines): |
| 65 | + # All lines are comments — a legitimate empty result. |
| 66 | + return pd.DataFrame() |
| 67 | + |
| 68 | + fields = [f.replace(",", "").strip() for f in lines[header_idx].split("\t")] |
| 69 | + fields = [f for f in fields if f] |
| 70 | + |
| 71 | + return pd.read_csv( |
| 72 | + StringIO(text), |
| 73 | + delimiter="\t", |
| 74 | + skiprows=header_idx + 2, # +1 for header, +1 for the format-spec row |
| 75 | + names=fields, |
| 76 | + na_values="NaN", |
| 77 | + dtype=dtypes, |
| 78 | + ) |
| 79 | + |
| 80 | + |
def extract_rdb_comment(text: str) -> list[str]:
    """Collect every ``#``-prefixed line of an RDB response, in file order.

    Lines are returned verbatim: the leading ``#`` and any whitespace that
    follows it are kept, matching what R's ``dataRetrieval`` returns from
    ``comment(df)``. These lines hold the provenance metadata that parsing
    otherwise discards -- data source, retrieval timestamp, parameter
    codes, and, for ratings, the rating id and last-shifted timestamp.

    Parameters
    ----------
    text : str
        The RDB text to scan.

    Returns
    -------
    list[str]
        The lines of ``text`` whose first character is ``#``; empty when
        there are none.
    """
    comment_lines: list[str] = []
    for candidate in text.splitlines():
        # Only a ``#`` in the very first column marks a comment line.
        if candidate[:1] == "#":
            comment_lines.append(candidate)
    return comment_lines
0 commit comments