test(nwis): mock the live NWIS tests; drop the flaky_api marker

thodson-usgs · claude · thodson-usgs · commit ece2b54ec0b9 · 2026-06-24T12:39:04.000-05:00
The legacy NWIS getters hit waterservices.usgs.gov, so several tests made live calls and flaked CI on transient outages (the recurring nwis/site connect timeouts). NWIS is deprecated (removal ~2027), so rather than maintain live coverage, convert these tests to offline fixtures:

- nwis_test.py: mock TestMetaData (site_info → what_sites → /site), TestSiteseriesCatalogOutput (/site, with a captured seriesCatalogOutput fixture vs. the basic one), and TestTZ (/site + /iv). Remove the pure live-smoke test_nwis_service_live (nothing to assert once offline). Drop the now-unneeded module-level flaky_api marker.
- utils_test.py: mock Test_query — a 414 → URLTooLong via the nwis.get_iv getter path (DOI-USGS#64), and the User-Agent header check. Drop flaky_api.
- Add tests/data/nwis_site_seriescatalog.txt (a small captured seriesCatalogOutput=True RDB response).

Both test modules are now fully offline (45 + 36 tests, ~1s, no network), preserving the DOI-USGS#34 / DOI-USGS#60 / DOI-USGS#73 regression intent without the flakes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01Sjb14HkwuCydKSKMsaXsgd
diff --git a/tests/data/nwis_site_seriescatalog.txt b/tests/data/nwis_site_seriescatalog.txt
@@ -0,0 +1,46 @@
+#
+#
+# US Geological Survey
+# retrieved: 2026-06-24 13:30:23 -04:00	(caas01)
+#
+# The Site File stores location and general information about groundwater,
+# surface water, and meteorological sites
+# for sites in USA.
+#
+# File-format description:  http://help.waterdata.usgs.gov/faq/about-tab-delimited-output
+# Automated-retrieval info: http://waterservices.usgs.gov/rest/Site-Service.html
+#
+# Contact:   gs-w_support_nwisweb@usgs.gov
+#
+# The following selected fields are included in this output:
+#
+#  agency_cd       -- Agency
+#  site_no         -- Site identification number
+#  station_nm      -- Site name
+#  site_tp_cd      -- Site type
+#  dec_lat_va      -- Decimal latitude
+#  dec_long_va     -- Decimal longitude
+#  coord_acy_cd    -- Latitude-longitude accuracy
+#  dec_coord_datum_cd -- Decimal Latitude-longitude datum
+#  alt_va          -- Altitude of Gage/land surface
+#  alt_acy_va      -- Altitude accuracy
+#  alt_datum_cd    -- Altitude datum
+#  huc_cd          -- Hydrologic unit code
+#  data_type_cd    -- Data type
+#  parm_cd         -- Parameter code
+#  stat_cd         -- Statistical code
+#  ts_id           -- Internal timeseries ID
+#  loc_web_ds      -- Additional measurement description
+#  medium_grp_cd   -- Medium group code
+#  parm_grp_cd     -- Parameter group code
+#  srs_id          -- SRS ID
+#  access_cd       -- Access code
+#  begin_date      -- Begin date
+#  end_date        -- End date
+#  count_nu        -- Record count
+#
+agency_cd	site_no	station_nm	site_tp_cd	dec_lat_va	dec_long_va	coord_acy_cd	dec_coord_datum_cd	alt_va	alt_acy_va	alt_datum_cd	huc_cd	data_type_cd	parm_cd	stat_cd	ts_id	loc_web_ds	medium_grp_cd	parm_grp_cd	srs_id	access_cd	begin_date	end_date	count_nu
+5s	15s	50s	7s	16s	16s	1s	10s	8s	3s	10s	16s	2s	5s	5s	5n	30s	3s	3s	5n	4n	20d	20d	5n
+USGS	01491000	CHOPTANK RIVER NEAR GREENSBORO, MD	ST	38.99719444	-75.7858056	S	NAD83	 2.73	 .1	NAVD88	02060005	ad			0		wat		0	0	2006	2025	20
+USGS	01491000	CHOPTANK RIVER NEAR GREENSBORO, MD	ST	38.99719444	-75.7858056	S	NAD83	 2.73	 .1	NAVD88	02060005	dv	00010	00001	68074	[Discontinued]	wat		1645597	0	1988-10-01	2012-05-09	894
+USGS	01491000	CHOPTANK RIVER NEAR GREENSBORO, MD	ST	38.99719444	-75.7858056	S	NAD83	 2.73	 .1	NAVD88	02060005	dv	00010	00001	327630		wat		1645597	0	2023-04-21	2026-06-23	1155
diff --git a/tests/nwis_test.py b/tests/nwis_test.py
@@ -9,7 +9,6 @@
 import pandas as pd
 import pytest
 
-from dataretrieval.exceptions import DataRetrievalError
 from dataretrieval.nwis import (
     NWIS_Metadata,
     _read_rdb,
@@ -24,17 +23,16 @@
     preformat_peaks_response,
     what_sites,
 )
-from tests.conftest import flaky_api
 
 START_DATE = "2018-01-24"
 END_DATE = "2018-01-25"
 
 DATETIME_COL = "datetime"
 SITENO_COL = "site_no"
 
-# Several tests in this module hit the live NWIS services, so retry a transient
-# upstream failure rather than failing CI (see ``conftest.flaky_api``).
-pytestmark = flaky_api
+# Legacy NWIS endpoints these tests mock — this module makes no live calls.
+_SITE_RE = re.compile(r"^https://waterservices\.usgs\.gov/nwis/site(\?.*)?$")
+_IV_RE = re.compile(r"^https://waterservices\.usgs\.gov/nwis/iv(\?.*)?$")
 
 
 def _load_mock_json(file_name):
@@ -44,6 +42,16 @@ def _load_mock_json(file_name):
         return json.load(f)
 
 
+def _load_fixture(file_name):
+    """Read a raw fixture file (e.g. an RDB response) from tests/data."""
+    return (Path(__file__).parent / "data" / file_name).read_text(encoding="utf-8")
+
+
+def _mock_site(httpx_mock, fixture="waterservices_site.txt"):
+    """Mock the legacy NWIS ``site`` endpoint with an RDB fixture."""
+    httpx_mock.add_response(method="GET", url=_SITE_RE, text=_load_fixture(fixture))
+
+
 def _test_iv_service(httpx_mock):
     """Mocked test of instantaneous value service"""
     start = START_DATE
@@ -73,39 +81,6 @@ def test_iv_service_answer(httpx_mock):
     ], f"iv service returned incorrect index: {df.index.names}"
 
 
-def test_nwis_service_live():
-    """Live sanity check of NWIS service, tolerant of transient NWIS outages."""
-    site = "01491000"
-    try:
-        # Minimal query: just most recent record
-        get_iv(sites=site)
-    except (DataRetrievalError, ValueError) as e:
-        # Catch known transient service failures: a typed DataRetrievalError
-        # (e.g. ServiceUnavailable on a 5xx, a RuntimeError) or a legacy ValueError
-        error_text = str(e)
-        if any(
-            err in error_text
-            for err in [
-                "500",
-                "502",
-                "503",
-                "Service Unavailable",
-                "Received HTML response instead of JSON",
-            ]
-        ):
-            pytest.skip(
-                f"Service is currently unavailable (transient NWIS outage): {e}"
-            )
-        raise
-    except Exception as e:
-        # Fallback for other potential transient network issues
-        if "Expecting value" in str(e) or "JSON" in str(e):
-            pytest.skip(
-                f"Service returned invalid response (likely transient outage): {e}"
-            )
-        raise
-
-
 def test_preformat_peaks_response():
     # make a data frame with a "peak_dt" datetime column
     # it will have some nan and none values
@@ -119,14 +94,6 @@ def test_preformat_peaks_response():
     assert df["datetime"].isna().sum() == 0
 
 
-# tests using real queries to USGS webservices
-# these specific queries represent some edge-cases and the tests to address
-# incomplete date-time information
-
-
-# Removed defunct gwlevels tests.
-
-
 class TestDeprecationWarnings:
     """Verify per-function DeprecationWarning fires with the right replacement.
 
@@ -257,78 +224,80 @@ def test_get_record_defunct_service_water_use(self):
 
 
 class TestTZ:
-    """Tests relating to GitHub Issue #60."""
+    """Tests relating to GitHub Issue #60 — merging IV results across sites
+    yields a proper datetime index. Mocked against fixture responses."""
 
-    @pytest.fixture(scope="class")
-    def sites(self):
-        # Fetch once per class, at test time (not at collection) so a transient
-        # upstream failure is retried by the module ``flaky`` marker instead of
-        # aborting collection — a class-body call cannot be reran.
-        sites, _ = what_sites(stateCd="MD")
-        return sites
+    def _mock(self, httpx_mock):
+        _mock_site(httpx_mock)
+        httpx_mock.add_response(
+            method="GET", url=_IV_RE, json=_load_mock_json("nwis_iv_mock.json")
+        )
 
-    def test_multiple_tz_01(self, sites):
-        """Test based on GitHub Issue #60 - error merging different time zones."""
-        # this test fails before issue #60 is fixed
+    def test_multiple_tz_01(self, httpx_mock):
+        """Issue #60 - merging IV across sites yields a datetime index."""
+        self._mock(httpx_mock)
+        sites, _ = what_sites(stateCd="MD")
         iv, _ = get_iv(sites=sites.site_no.values[:25].tolist())
-        # assert that the datetime column exists
         assert "datetime" in iv.index.names
-        # assert that it is a datetime type
         assert isinstance(iv.index[0][1], datetime.datetime)
 
-    def test_multiple_tz_02(self, sites):
-        """Test based on GitHub Issue #60 - confirm behavior for same tz."""
-        # this test passes before issue #60 is fixed
+    def test_multiple_tz_02(self, httpx_mock):
+        """Issue #60 - the same-tz path also yields a datetime index."""
+        self._mock(httpx_mock)
+        sites, _ = what_sites(stateCd="MD")
         iv, _ = get_iv(sites=sites.site_no.values[:20].tolist())
-        # assert that the datetime column exists
         assert "datetime" in iv.index.names
-        # assert that it is a datetime type
         assert isinstance(iv.index[0][1], datetime.datetime)
 
 
 class TestSiteseriesCatalogOutput:
-    """Tests relating to GitHub Issue #34."""
+    """Tests relating to GitHub Issue #34 — ``seriesCatalogOutput`` adds the
+    data-inventory columns (begin_date / end_date / count_nu). Mocked against
+    fixture responses (the chosen fixture, not the request param, decides which
+    columns come back)."""
 
-    def test_seriesCatalogOutput_get_record(self):
-        """Test setting seriesCatalogOutput to true with get_record."""
+    _SERIESCATALOG = "nwis_site_seriescatalog.txt"
+
+    def test_seriesCatalogOutput_get_record(self, httpx_mock):
+        """seriesCatalogOutput=True with get_record exposes inventory columns."""
+        _mock_site(httpx_mock, self._SERIESCATALOG)
         data = get_record(
             huc="20", parameterCd="00060", service="site", seriesCatalogOutput="True"
         )
-        # assert that expected data columns are present
         assert "begin_date" in data.columns
         assert "end_date" in data.columns
         assert "count_nu" in data.columns
 
-    def test_seriesCatalogOutput_get_info(self):
-        """Test setting seriesCatalogOutput to true with get_info."""
+    def test_seriesCatalogOutput_get_info(self, httpx_mock):
+        """seriesCatalogOutput=TRUE with get_info exposes inventory columns."""
+        _mock_site(httpx_mock, self._SERIESCATALOG)
         data, _ = get_info(huc="20", parameterCd="00060", seriesCatalogOutput="TRUE")
-        # assert that expected data columns are present
         assert "begin_date" in data.columns
         assert "end_date" in data.columns
         assert "count_nu" in data.columns
 
-    def test_seriesCatalogOutput_bool(self):
-        """Test setting seriesCatalogOutput with a boolean."""
+    def test_seriesCatalogOutput_bool(self, httpx_mock):
+        """A boolean seriesCatalogOutput is accepted and exposes inventory cols."""
+        _mock_site(httpx_mock, self._SERIESCATALOG)
         data, _ = get_info(huc="20", parameterCd="00060", seriesCatalogOutput=True)
-        # assert that expected data columns are present
         assert "begin_date" in data.columns
         assert "end_date" in data.columns
         assert "count_nu" in data.columns
 
-    def test_expandedrdb_get_record(self):
-        """Test default expanded_rdb format with get_record."""
+    def test_expandedrdb_get_record(self, httpx_mock):
+        """The default expanded-rdb format omits the inventory columns."""
+        _mock_site(httpx_mock)
         data = get_record(
             huc="20", parameterCd="00060", service="site", seriesCatalogOutput="False"
         )
-        # assert that seriesCatalogOutput columns are not present
         assert "begin_date" not in data.columns
         assert "end_date" not in data.columns
         assert "count_nu" not in data.columns
 
-    def test_expandedrdb_get_info(self):
-        """Test default expanded_rdb format with get_info."""
+    def test_expandedrdb_get_info(self, httpx_mock):
+        """get_info default omits the inventory columns."""
+        _mock_site(httpx_mock)
         data, _ = get_info(huc="20", parameterCd="00060")
-        # assert that seriesCatalogOutput columns are not present
         assert "begin_date" not in data.columns
         assert "end_date" not in data.columns
         assert "count_nu" not in data.columns
@@ -353,67 +322,46 @@ def test_empty_timeseries(httpx_mock):
 
 
 class TestMetaData:
-    """Tests of NWIS metadata setting,
-
-    Notes
-    -----
+    """Tests of NWIS metadata setting (originally GitHub Issue #73).
 
-    - Originally based on GitHub Issue #73.
-    - Modified to expose site_info as a property, not a callable.
+    ``site_info`` is a property that lazily re-queries ``what_sites``; mocked
+    here against the ``site`` endpoint so it is exercised offline.
     """
 
-    def test_set_metadata_info_site(self):
-        """Test metadata info is set when site parameter is supplied."""
-        # mock the query response
-        response = mock.MagicMock()
-        # make metadata call
-        md = NWIS_Metadata(response, sites="01491000")
-        # assert that site_info is implemented
+    def test_set_metadata_info_site(self, httpx_mock):
+        """site_info is populated when ``sites`` is supplied."""
+        _mock_site(httpx_mock)
+        md = NWIS_Metadata(mock.MagicMock(), sites="01491000")
         assert md.site_info
 
-    def test_set_metadata_info_site_no(self):
-        """Test metadata info is set when site_no parameter is supplied."""
-        # mock the query response
-        response = mock.MagicMock()
-        # make metadata call
-        md = NWIS_Metadata(response, site_no="01491000")
-        # assert that site_info is implemented
+    def test_set_metadata_info_site_no(self, httpx_mock):
+        """site_info is populated when ``site_no`` is supplied."""
+        _mock_site(httpx_mock)
+        md = NWIS_Metadata(mock.MagicMock(), site_no="01491000")
         assert md.site_info
 
-    def test_set_metadata_info_stateCd(self):
-        """Test metadata info is set when stateCd parameter is supplied."""
-        # mock the query response
-        response = mock.MagicMock()
-        # make metadata call
-        md = NWIS_Metadata(response, stateCd="RI")
-        # assert that site_info is implemented
+    def test_set_metadata_info_stateCd(self, httpx_mock):
+        """site_info is populated when ``stateCd`` is supplied."""
+        _mock_site(httpx_mock)
+        md = NWIS_Metadata(mock.MagicMock(), stateCd="RI")
         assert md.site_info
 
-    def test_set_metadata_info_huc(self):
-        """Test metadata info is set when huc parameter is supplied."""
-        # mock the query response
-        response = mock.MagicMock()
-        # make metadata call
-        md = NWIS_Metadata(response, huc="01")
-        # assert that site_info is implemented
+    def test_set_metadata_info_huc(self, httpx_mock):
+        """site_info is populated when ``huc`` is supplied."""
+        _mock_site(httpx_mock)
+        md = NWIS_Metadata(mock.MagicMock(), huc="01")
         assert md.site_info
 
-    def test_set_metadata_info_bbox(self):
-        """Test metadata info is set when bbox parameter is supplied."""
-        # mock the query response
-        response = mock.MagicMock()
-        # make metadata call
-        md = NWIS_Metadata(response, bBox="-92.8,44.2,-88.9,46.0")
-        # assert that site_info is implemented
+    def test_set_metadata_info_bbox(self, httpx_mock):
+        """site_info is populated when ``bBox`` is supplied."""
+        _mock_site(httpx_mock)
+        md = NWIS_Metadata(mock.MagicMock(), bBox="-92.8,44.2,-88.9,46.0")
         assert md.site_info
 
-    def test_set_metadata_info_countyCd(self):
-        """Test metadata info is set when countyCd parameter is supplied."""
-        # mock the query response
-        response = mock.MagicMock()
-        # make metadata call
-        md = NWIS_Metadata(response, countyCd="01001")
-        # assert that site_info is implemented
+    def test_set_metadata_info_countyCd(self, httpx_mock):
+        """site_info is populated when ``countyCd`` is supplied."""
+        _mock_site(httpx_mock)
+        md = NWIS_Metadata(mock.MagicMock(), countyCd="01001")
         assert md.site_info
 
 
diff --git a/tests/utils_test.py b/tests/utils_test.py