Skip to content

Commit 788bda1

Browse files
thodson-usgs and claude committed
feat(wqp): accept tsv mimeType and reject xlsx with a clear error
`_check_kwargs` now accepts `mimeType=tsv` alongside `csv`, defaults a missing mimeType to csv, and raises a clear NotImplementedError for `xlsx` that points at the csv/tsv options.

`_read_wqp_csv` gained a `delimiter` argument (tab for tsv, comma otherwise, selected by the new `_wqp_delimiter` helper) so tsv responses parse correctly while still preserving leading zeros on code columns; `get_results` and the shared `_what` helper pass the mimeType-derived delimiter.

Re-authored onto main's `_what`/`_read_wqp_csv` structure (it predated the DOI-USGS#320 what_* consolidation, the DOI-USGS#311 leading-zero fix, and the httpx migration); the branch's stale `requests_mock` tests are replaced with offline unit tests for the new behavior.

Addresses DOI-USGS#162 (tsv support; xlsx now fails with a clear, actionable message rather than silently).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01Sjb14HkwuCydKSKMsaXsgd
1 parent 4daf771 commit 788bda1

2 files changed

Lines changed: 64 additions & 9 deletions

File tree

dataretrieval/wqp.py

Lines changed: 23 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -52,19 +52,29 @@ def _is_code_column(name: str) -> bool:
5252
)
5353

5454

55-
def _read_wqp_csv(text: str, delimiter: str = ",") -> DataFrame:
    """Read a WQP CSV/TSV, forcing code/identifier columns to ``str``.

    WQP returns codes with significant leading zeros — HUCs, parameter codes
    (``USGSpcode``), FIPS state/county codes. A bare ``read_csv`` infers those
    as int/float and silently drops the zeros (``"00060"`` -> ``60``, HUC8
    ``"07090002"`` -> ``7090002``). Read the header first, then re-read with
    ``dtype=str`` for every column that :func:`_is_code_column` flags, so the
    zeros survive.

    ``text`` is the raw delimited response body. ``delimiter`` selects comma
    (CSV, the default) vs tab (TSV); see :func:`_wqp_delimiter`. Columns that
    are not flagged as code columns keep pandas' normal dtype inference.
    """
    # Pass 1: header only (``nrows=0``) so the column names are known before
    # any value is type-inferred.
    columns = pd.read_csv(StringIO(text), delimiter=delimiter, nrows=0).columns
    str_cols = {col: str for col in columns if _is_code_column(col)}
    # Pass 2: full parse with the code columns pinned to ``str``.
    return pd.read_csv(
        StringIO(text), delimiter=delimiter, low_memory=False, dtype=str_cols
    )
73+
74+
75+
def _wqp_delimiter(kwargs: dict[str, Any]) -> str:
76+
"""Field delimiter for the requested ``mimeType``: tab for ``tsv``, else comma."""
77+
return "\t" if kwargs.get("mimeType") == "tsv" else ","
6878

6979

7080
def get_results(
@@ -181,7 +191,7 @@ def get_results(
181191

182192
response = query(url, kwargs, delimiter=";", ssl_check=ssl_check)
183193

184-
df = _read_wqp_csv(response.text)
194+
df = _read_wqp_csv(response.text, _wqp_delimiter(kwargs))
185195
df = _attach_datetime_columns(df)
186196
return df, WQP_Metadata(response, **kwargs)
187197

@@ -209,7 +219,7 @@ def _what(
209219
url = _legacy_only_url(service, legacy=legacy)
210220

211221
response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)
212-
df = _read_wqp_csv(response.text)
222+
df = _read_wqp_csv(response.text, _wqp_delimiter(kwargs))
213223
return df, WQP_Metadata(response, **kwargs)
214224

215225

@@ -690,9 +700,13 @@ def _check_kwargs(kwargs: dict[str, Any]) -> dict[str, Any]:
690700
mimetype = kwargs.get("mimeType")
691701
if mimetype == "geojson":
692702
raise NotImplementedError("GeoJSON not yet supported. Set 'mimeType=csv'.")
693-
elif mimetype != "csv" and mimetype is not None:
694-
raise ValueError("Invalid mimeType. Set 'mimeType=csv'.")
695-
else:
703+
elif mimetype == "xlsx":
704+
raise NotImplementedError(
705+
"Excel format not yet supported. Set 'mimeType=csv' or 'mimeType=tsv'."
706+
)
707+
elif mimetype not in ("csv", "tsv", None):
708+
raise ValueError("Invalid mimeType. Supported options: 'csv', 'tsv'.")
709+
elif mimetype is None:
696710
kwargs["mimeType"] = "csv"
697711

698712
return kwargs

tests/wqp_test.py

Lines changed: 41 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,37 @@ def test_read_wqp_csv_preserves_leading_zero_codes():
5353
assert df["ResultMeasureValue"].iloc[0] == 1.5
5454

5555

56+
def test_read_wqp_csv_tsv_delimiter_preserves_codes():
    """``mimeType=tsv`` responses are parsed as tab-delimited via
    ``_read_wqp_csv``'s ``delimiter`` while still preserving leading zeros on
    code columns."""
    from dataretrieval.wqp import _read_wqp_csv

    # One header row and one data row; both code values carry leading zeros.
    tsv = (
        "Location_HUCEightDigitCode\tUSGSpcode\tResultMeasureValue\n"
        "07090002\t00060\t1.5\n"
    )
    df = _read_wqp_csv(tsv, delimiter="\t")
    # Three separate columns prove the tab was used as the separator.
    assert list(df.columns) == [
        "Location_HUCEightDigitCode",
        "USGSpcode",
        "ResultMeasureValue",
    ]
    assert df["Location_HUCEightDigitCode"].iloc[0] == "07090002"
    assert df["USGSpcode"].iloc[0] == "00060"
    # The measurement column is still parsed as a number, not a string.
    assert df["ResultMeasureValue"].iloc[0] == 1.5
75+
76+
77+
def test_wqp_delimiter_selects_tab_for_tsv():
    """``_wqp_delimiter`` maps ``mimeType=tsv`` to a tab and everything else
    (including a missing mimeType) to a comma."""
    from dataretrieval.wqp import _wqp_delimiter

    assert _wqp_delimiter({"mimeType": "tsv"}) == "\t"
    assert _wqp_delimiter({"mimeType": "csv"}) == ","
    # No mimeType key at all falls back to the comma default.
    assert _wqp_delimiter({}) == ","
85+
86+
5687
def test_get_results(httpx_mock):
5788
"""Tests water quality portal ratings query"""
5889
request_url = (
@@ -155,6 +186,16 @@ def test_check_kwargs():
155186
kwargs = _check_kwargs(kwargs)
156187

157188

189+
def test_check_kwargs_mimetype_csv_tsv_xlsx():
    """csv/tsv are accepted as-is, a missing mimeType defaults to csv, and
    xlsx raises a clear NotImplementedError pointing at the csv/tsv options."""
    assert _check_kwargs({"mimeType": "csv"})["mimeType"] == "csv"
    assert _check_kwargs({"mimeType": "tsv"})["mimeType"] == "tsv"
    # An absent mimeType is filled in with the csv default.
    assert _check_kwargs({})["mimeType"] == "csv"
    # ``match`` pins the Excel-specific message, not just the exception type.
    with pytest.raises(NotImplementedError, match="Excel"):
        _check_kwargs({"mimeType": "xlsx"})
197+
198+
158199
def test_get_results_wqx3_preserves_user_dataProfile(httpx_mock):
159200
"""A valid user-supplied WQX3.0 profile must not be overwritten.
160201

0 commit comments

Comments
 (0)