get_ratings: surface RDB header as df.attrs and document the nwis dep

thodson-usgs · claude · thodson-usgs · commit 630cc8c2d93d · 2026-05-06T11:18:49.000-05:00
Two non-functional follow-ups suggested during review of #269:

(1) Document the cross-module reach into nwis._read_rdb. Rating files use the same USGS RDB shape as NWIS responses, so the parser is already reusable as-is — no refactor of the legacy nwis module is needed. Added a comment at the import site explaining why the private import is intentional and what to watch for if _read_rdb ever moves.

(2) Surface the RDB #-prefixed header block. Each parsed rating frame now carries provenance in df.attrs:

- df.attrs["comment"]: the list of "#"-prefixed header lines (rating id, parameter, expansion type, last-shifted timestamp, warnings, etc.).
- df.attrs["url"]: the asset URL it was fetched from.

R's read_waterdata_ratings exposes the comment block via comment(df); pandas's standard `attrs` dict is the Python equivalent. Done in ratings.py only — does not touch nwis.

A live spot-check against api.waterdata.usgs.gov on USGS-01104475 exsa shows the 31-line USGS header survives intact (gauge name, parameter code, rating expansion, etc.). One new unit test pins the behavior.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/ratings.py b/dataretrieval/waterdata/ratings.py
@@ -24,6 +24,11 @@
 import pandas as pd
 import requests
 
+# Rating files use the same USGS RDB shape as NWIS responses (comment
+# block prefixed with ``#``, header row, format-spec row, then tab-separated
+# data), so we reuse the parser already in ``nwis``. ``_read_rdb`` is private;
+# importing it (not copying it) means a move or rename fails loudly here and
+# any change to its contract reaches us directly rather than via a stale copy.
 from dataretrieval.nwis import _read_rdb
 
 from .utils import BASE_URL, _default_headers, _format_api_dates
@@ -84,6 +89,18 @@ def _search(
     return response.json().get("features", [])
 
 
+def _extract_rdb_comment(rdb: str) -> list[str]:
+    """Return every ``#``-prefixed line of ``rdb``, in order, prefix intact.
+
+    In a rating file these lines carry per-rating metadata — rating id,
+    parameter description, expansion type, last-shifted timestamp, etc.
+    R's ``read_waterdata_ratings`` exposes this via ``comment(df)``; we
+    attach it to ``df.attrs["comment"]`` so callers can inspect or log
+    provenance. Every matching line is kept, wherever it occurs in ``rdb``.
+    """
+    return [line for line in rdb.splitlines() if line.startswith("#")]
+
+
 def _download_and_parse(
     feature: dict[str, Any],
     file_path: str,
@@ -100,7 +117,10 @@ def _download_and_parse(
     with open(target, "w") as f:
         f.write(response.text)
 
-    return _read_rdb(response.text)
+    df = _read_rdb(response.text)
+    df.attrs["comment"] = _extract_rdb_comment(response.text)
+    df.attrs["url"] = url
+    return df
 
 
 def get_ratings(
@@ -168,8 +188,12 @@ def get_ratings(
     dict[str, pandas.DataFrame] or list[dict]
         When ``download_and_parse=True`` (the default), a dict keyed by
         feature ID (e.g. ``"USGS-01104475.exsa.rdb"``) mapping to a parsed
-        ``DataFrame``. When ``download_and_parse=False``, the raw list of
-        STAC feature dicts as returned by the search endpoint.
+        ``DataFrame``. Each frame carries provenance in
+        ``df.attrs["comment"]`` (the RDB ``#``-prefixed header lines, like
+        rating id, parameter, last-shifted timestamp) and
+        ``df.attrs["url"]`` (the asset URL it was fetched from). When
+        ``download_and_parse=False``, the raw list of STAC feature dicts
+        as returned by the search endpoint.
 
     Raises
     ------
diff --git a/tests/waterdata_ratings_test.py b/tests/waterdata_ratings_test.py
@@ -100,6 +100,31 @@ def test_get_ratings_mocked_search_and_download(requests_mock, tmp_path):
     assert "monitoring_location_id IN ('USGS-01104475')" in qs["filter"][0]
 
 
+def test_get_ratings_attaches_rdb_comment_and_url(requests_mock, tmp_path):
+    """Each parsed frame should carry its RDB header + source URL in df.attrs."""
+    requests_mock.get(  # stub the STAC search endpoint
+        "https://api.waterdata.usgs.gov/stac/v0/search",
+        json=_stub_search_response(),
+    )
+    asset_url = (
+        "https://api.waterdata.usgs.gov/stac-files/ratings/USGS.01104475.exsa.rdb"
+    )
+    requests_mock.get(asset_url, text=_SAMPLE_RDB)  # stub the asset download
+
+    out = get_ratings(
+        monitoring_location_id="USGS-01104475",
+        file_type="exsa",
+        file_path=str(tmp_path),
+    )
+    df = out["USGS-01104475.exsa.rdb"]
+    # Relies on _SAMPLE_RDB containing exactly these two `#`-prefixed lines.
+    assert df.attrs["comment"] == [
+        "# header line one",
+        "# header line two",
+    ]
+    assert df.attrs["url"] == asset_url
+
+
 def test_get_ratings_download_and_parse_false_returns_features(requests_mock):
     requests_mock.get(
         "https://api.waterdata.usgs.gov/stac/v0/search",