chore(typing): set up mypy and fix the type errors it surfaces

thodson-usgs · claude · thodson-usgs · commit 9d5fbd744202 · 2026-05-31T21:20:20.000-04:00
The package ships a ``py.typed`` marker (advertising itself as typed to
downstream users) but nothing type-checked it. Add mypy and get a clean run.

Setup:
  - [tool.mypy] in pyproject.toml: a lenient first-pass config
    (ignore_missing_imports, target python_version 3.9), scoped to the
    dataretrieval package.
  - mypy<2 added to the [test] extra (<2 so it can still target 3.9).
  - a type-check job in the CI workflow, parallel to the ruff lint job.

Fixes (mypy went from 78 errors to 0 on the tracked package):
  - HTTPX_DEFAULTS annotated dict[str, Any] so **-splatting it into
    httpx.get / httpx.AsyncClient type-checks -- cleared ~55 errors across
    7 call sites at once.
  - utils.py gains `from __future__ import annotations`: mypy (targeting
    3.9) caught that the new `str | None` annotations there would be a
    runtime error on 3.9, because this module -- unlike the rest of the
    package -- lacked the future import.
  - BaseMetadata.comment annotated `str | None` (was inferred `None`, which
    rejected every subclass that assigns a comment string).
  - _format_api_dates: accept Sequence[str | None] (covariant) so a
    list[str] caller type-checks, and build the formatted list with an
    early return so the final join sees list[str].
  - _as_str_list: delegate to _normalize_str_iterable then wrap, so the
    declared return type list[str] | None holds.
  - _next_req_url: declare next_host / cur_host as `str | None`.
  - ratings._search: build the query dict in a non-Optional local before
    aliasing it to the loop's `params` (which toggles to None per page).
  - nldi: drop the bool->str / Literal->str variable reuse; guard the basin
    branch so feature_source / feature_id are non-None before get_basin.
  - chunking: narrow the optional filter before _is_chunkable; fix a stale
    `# type: ignore` error code.

The fixes are annotations and type-narrowing guards. The only runtime-visible
change is that nldi.search() now raises a clear ValueError up front when a
basin search is missing feature_source/feature_id, where the same condition
previously raised deeper inside get_basin. 259 tests pass across the affected
suites.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -26,6 +26,22 @@ jobs:
           ruff check . --output-format=github
           ruff format --check .
 
+  type-check:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - name: Set up Python 3.13
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.13"
+          cache: "pip"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[test]
+      - name: Type-check with mypy
+        run: mypy
+
   test:
     needs: lint
     runs-on: ${{ matrix.os }}
diff --git a/dataretrieval/nldi.py b/dataretrieval/nldi.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from json import JSONDecodeError
-from typing import Literal
+from typing import Literal, cast
 
 from dataretrieval.utils import query
 
@@ -162,9 +162,12 @@ def get_basin(
         raise ValueError("feature_id is required")
 
     url = f"{NLDI_API_BASE_URL}/{feature_source}/{feature_id}/basin"
-    simplified = str(simplified).lower()
-    split_catchment = str(split_catchment).lower()
-    query_params = {"simplified": simplified, "splitCatchment": split_catchment}
+    simplified_str = str(simplified).lower()
+    split_catchment_str = str(split_catchment).lower()
+    query_params = {
+        "simplified": simplified_str,
+        "splitCatchment": split_catchment_str,
+    }
     err_msg = (
         f"Error getting basin for feature source '{feature_source}' and "
         f"feature_id '{feature_id}'"
@@ -408,7 +411,7 @@ def search(
     if (lat is None) != (long is None):
         raise ValueError("Both lat and long are required")
 
-    find = find.lower()
+    find = cast(Literal["basin", "flowlines", "features"], find.lower())
     if find not in ("basin", "flowlines", "features"):
         raise ValueError(
             f"Invalid value for find: {find} - allowed values are:"
@@ -428,6 +431,10 @@ def search(
         return get_features(lat=lat, long=long, as_json=True)
 
     if find == "basin":
+        if feature_source is None or feature_id is None:
+            raise ValueError(
+                "feature_source and feature_id are required to find a basin"
+            )
         return get_basin(
             feature_source=feature_source, feature_id=feature_id, as_json=True
         )
diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py
@@ -2,16 +2,22 @@
 Useful utilities for data munging.
 """
 
+from __future__ import annotations
+
 import warnings
 from collections.abc import Iterable
+from typing import Any
 
 import httpx
 import pandas as pd
 
 import dataretrieval
 from dataretrieval.codes import tz
 
-HTTPX_DEFAULTS = {
+# Typed as ``dict[str, Any]`` (not the inferred ``dict[str, object]``) so that
+# splatting it as ``**HTTPX_DEFAULTS`` into ``httpx.get`` / ``httpx.AsyncClient``
+# type-checks: the values are a heterogeneous bag of httpx keyword arguments.
+HTTPX_DEFAULTS: dict[str, Any] = {
     "follow_redirects": True,
     "timeout": httpx.Timeout(60.0, connect=10.0),
 }
@@ -190,6 +196,7 @@ def _attach_datetime_columns(df: pd.DataFrame) -> pd.DataFrame:
         # Concat in one shot — per-column assignment on a wide CSV-derived
         # frame triggers pandas' fragmentation PerformanceWarning.
         df = pd.concat([df, pd.DataFrame(new_columns, index=df.index)], axis=1)
+    sort_key: str | None
     if "Activity_StartDateTime" in df.columns:
         sort_key = "Activity_StartDateTime"
     elif "ActivityStartDateTime" in df.columns:
@@ -234,7 +241,7 @@ def __init__(self, response) -> None:
         self.url = str(response.url)
         self.query_time = response.elapsed
         self.header = response.headers
-        self.comment = None
+        self.comment: str | None = None
 
         # # not sure what statistic_info is
         # self.statistic_info = None
diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py
@@ -681,7 +681,7 @@ def _set_response_url(response: httpx.Response, url: str | httpx.URL) -> None:
     same ``.request``.
     """
     try:
-        response.url = url  # type: ignore[misc]
+        response.url = url  # type: ignore[misc, assignment]
     except AttributeError:
         target = httpx.URL(str(url))
         try:
@@ -800,7 +800,7 @@ def _extract_axes(args: dict[str, Any]) -> list[_Axis]:
             axes.append(_Axis(arg_key=key, atoms=tuple(value), joiner=_LIST_SEP))
 
     filter_expr = args.get("filter")
-    if _is_chunkable(filter_expr, args.get("filter_lang")):
+    if filter_expr is not None and _is_chunkable(filter_expr, args.get("filter_lang")):
         _check_numeric_filter_pitfall(filter_expr)
         clauses = _split_top_level_or(filter_expr)
         if len(clauses) >= 2:
diff --git a/dataretrieval/waterdata/ratings.py b/dataretrieval/waterdata/ratings.py
@@ -246,15 +246,18 @@ def _search(
     STAC ``next`` link is followed until exhausted so a result set larger than
     one page isn't silently truncated.
     """
-    params: dict[str, Any] | None = {"limit": min(limit, 10000)}
+    query_params: dict[str, Any] = {"limit": min(limit, 10000)}
     if filter_str is not None:
-        params["filter"] = filter_str
+        query_params["filter"] = filter_str
     if time_str is not None:
-        params["datetime"] = time_str
+        query_params["datetime"] = time_str
     if bbox is not None:
-        params["bbox"] = ",".join(map(str, bbox))
+        query_params["bbox"] = ",".join(map(str, bbox))
 
     url: str | None = f"{STAC_URL}/search"
+    # ``params`` is sent only on the first request; each STAC ``next`` link
+    # already carries the query, so it is reset to None inside the loop.
+    params: dict[str, Any] | None = query_params
     features: list[dict[str, Any]] = []
     while url is not None:
         response = httpx.get(
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
@@ -14,6 +14,7 @@
     Iterable,
     Iterator,
     Mapping,
+    Sequence,
 )
 from contextlib import asynccontextmanager, contextmanager
 from contextvars import ContextVar
@@ -247,7 +248,7 @@ def _format_one(dt, *, date: bool, local_tz) -> str | None:
 
 
 def _format_api_dates(
-    datetime_input: str | list[str | None] | None, date: bool = False
+    datetime_input: str | Sequence[str | None] | None, date: bool = False
 ) -> str | None:
     """
     Formats date or datetime input(s) for use with an API.
@@ -330,11 +331,12 @@ def _format_api_dates(
     # element invalidates the range. Resolve the local tz only now — after the
     # all-NA / duration / interval guards above have had their chance to return.
     local_timezone = datetime.now().astimezone().tzinfo
-    formatted = [
-        _format_one(dt, date=date, local_tz=local_timezone) for dt in datetime_input
-    ]
-    if any(f is None for f in formatted):
-        return None
+    formatted: list[str] = []
+    for dt in datetime_input:
+        one = _format_one(dt, date=date, local_tz=local_timezone)
+        if one is None:
+            return None
+        formatted.append(one)
     return "/".join(formatted)
 
 
@@ -823,6 +825,8 @@ def _next_req_url(
         # body might supply. Guarded against mock-shaped ``resp.url``
         # attributes (tests sometimes set strings or ``MagicMock``)
         # by falling open when host extraction isn't reliable.
+        next_host: str | None
+        cur_host: str | None
         try:
             next_host = httpx.URL(href).host
             resp_url = (
@@ -1915,11 +1919,10 @@ def _as_str_list(
     ``",".join(...)`` doesn't iterate it character-by-character — and
     materializes any other iterable via :func:`_normalize_str_iterable`.
     """
-    return (
-        [value]
-        if isinstance(value, str)
-        else _normalize_str_iterable(value, param_name)
-    )
+    normalized = _normalize_str_iterable(value, param_name)
+    if isinstance(normalized, str):
+        return [normalized]
+    return normalized
 
 
 def _check_monitoring_location_id(
diff --git a/pyproject.toml b/pyproject.toml
@@ -39,6 +39,7 @@ test = [
   "coverage",
   "pytest-httpx",
   "ruff",
+  "mypy<2",  # <2 so it can still target Python 3.9 (the project's floor)
 ]
 doc = [
   "docutils<0.22",
@@ -102,3 +103,13 @@ skip-magic-trailing-comma = false
 line-ending = "auto"
 docstring-code-format = true
 docstring-code-line-length = 72
+
+[tool.mypy]
+# First-pass type checking, kept lenient so it can be adopted incrementally on a
+# large, largely-unannotated scientific codebase: ``ignore_missing_imports``
+# treats untyped third-party libraries (geopandas, anyio, ...) as ``Any`` rather
+# than erroring, and unannotated function bodies are not checked by default.
+# Tightening (e.g. ``disallow_untyped_defs``) can follow once annotations are in.
+python_version = "3.9"  # the project's minimum supported version
+files = ["dataretrieval"]
+ignore_missing_imports = true