Skip to content

Commit 316af70

Browse files
thodson-usgs and claude authored
fix(wqp): preserve leading zeros on code columns (HUCs, parameter codes, FIPS) (#311)
The nine WQP getters read responses with a bare `pd.read_csv(StringIO(text), delimiter=",", low_memory=False)`, which infers code columns as int/float and silently drops their significant leading zeros: a USGS parameter code "00060" became 60, HUC8 "07090002" became 7090002. (R dataRetrieval reads these as character.) Add a `_read_wqp_csv` helper that reads the header, then re-reads with `dtype=str` for any column whose name is a code/identifier (ends with "code", or contains "identifier"/"huc"/"fips") — covering both the legacy and WQX3.0 column schemas — while leaving value columns numeric. All nine read sites use it. Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent d3779de commit 316af70

2 files changed

Lines changed: 51 additions & 9 deletions

File tree

dataretrieval/wqp.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,32 @@
4040
]
4141

4242

43+
def _is_code_column(name: str) -> bool:
44+
"""True if a WQP column name denotes a code/identifier whose leading zeros
45+
are significant and must be preserved as ``str`` (HUCs, parameter codes,
46+
FIPS codes): the name ends with "code" or contains "identifier"/"huc"/"fips".
47+
"""
48+
lname = name.lower()
49+
return lname.endswith("code") or any(
50+
token in lname for token in ("identifier", "huc", "fips")
51+
)
52+
53+
54+
def _read_wqp_csv(text: str) -> DataFrame:
    """Parse a WQP CSV payload, keeping code/identifier columns as ``str``.

    WQP returns codes with significant leading zeros (HUCs, parameter codes
    such as ``USGSpcode``, FIPS state/county codes). Plain ``read_csv`` type
    inference turns those into int/float and silently drops the zeros
    (``"00060"`` -> ``60``, HUC8 ``"07090002"`` -> ``7090002``). To avoid
    that, the text is parsed twice: once for the header only, then in full
    with ``dtype=str`` pinned on every column :func:`_is_code_column` flags.
    All other columns keep pandas' normal type inference.

    Parameters
    ----------
    text : str
        Comma-delimited CSV content, header row first.

    Returns
    -------
    DataFrame
        The parsed table, with code/identifier columns read as strings.
    """
    # Pass 1: header only (no data rows) to discover the column names.
    header = pd.read_csv(StringIO(text), delimiter=",", nrows=0)

    # Collect a ``str`` dtype override for each code/identifier column.
    dtype_overrides = {}
    for label in header.columns:
        if _is_code_column(label):
            dtype_overrides[label] = str

    # Pass 2: full parse with the overrides applied.
    return pd.read_csv(
        StringIO(text),
        delimiter=",",
        low_memory=False,
        dtype=dtype_overrides,
    )
67+
68+
4369
def get_results(
4470
ssl_check=True,
4571
legacy=True,
@@ -153,7 +179,7 @@ def get_results(
153179

154180
response = query(url, kwargs, delimiter=";", ssl_check=ssl_check)
155181

156-
df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
182+
df = _read_wqp_csv(response.text)
157183
df = _attach_datetime_columns(df)
158184
return df, WQP_Metadata(response, **kwargs)
159185

@@ -208,7 +234,7 @@ def what_sites(
208234

209235
response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)
210236

211-
df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
237+
df = _read_wqp_csv(response.text)
212238

213239
return df, WQP_Metadata(response, **kwargs)
214240

@@ -259,7 +285,7 @@ def what_organizations(
259285

260286
response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)
261287

262-
df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
288+
df = _read_wqp_csv(response.text)
263289

264290
return df, WQP_Metadata(response, **kwargs)
265291

@@ -306,7 +332,7 @@ def what_projects(ssl_check=True, legacy=True, **kwargs):
306332

307333
response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)
308334

309-
df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
335+
df = _read_wqp_csv(response.text)
310336

311337
return df, WQP_Metadata(response, **kwargs)
312338

@@ -370,7 +396,7 @@ def what_activities(
370396

371397
response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)
372398

373-
df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
399+
df = _read_wqp_csv(response.text)
374400

375401
return df, WQP_Metadata(response, **kwargs)
376402

@@ -428,7 +454,7 @@ def what_detection_limits(
428454

429455
response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)
430456

431-
df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
457+
df = _read_wqp_csv(response.text)
432458

433459
return df, WQP_Metadata(response, **kwargs)
434460

@@ -479,7 +505,7 @@ def what_habitat_metrics(
479505

480506
response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)
481507

482-
df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
508+
df = _read_wqp_csv(response.text)
483509

484510
return df, WQP_Metadata(response, **kwargs)
485511

@@ -531,7 +557,7 @@ def what_project_weights(ssl_check=True, legacy=True, **kwargs):
531557

532558
response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)
533559

534-
df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
560+
df = _read_wqp_csv(response.text)
535561

536562
return df, WQP_Metadata(response, **kwargs)
537563

@@ -583,7 +609,7 @@ def what_activity_metrics(ssl_check=True, legacy=True, **kwargs):
583609

584610
response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)
585611

586-
df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
612+
df = _read_wqp_csv(response.text)
587613

588614
return df, WQP_Metadata(response, **kwargs)
589615

tests/wqp_test.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,22 @@
1919
)
2020

2121

22+
def test_read_wqp_csv_preserves_leading_zero_codes():
    """Regression test for leading zeros in WQP code columns.

    A bare ``read_csv`` inferred code columns (HUCs, parameter codes, FIPS)
    as int/float and dropped their significant leading zeros (``"00060"`` ->
    ``60``). ``_read_wqp_csv`` must return code/identifier columns as ``str``
    while value columns stay numeric.
    """
    from dataretrieval.wqp import _read_wqp_csv

    payload = (
        "Location_HUCEightDigitCode,USGSpcode,ResultMeasureValue\n07090002,00060,1.5\n"
    )
    frame = _read_wqp_csv(payload)

    # Expected first-row value per column: codes as text, measurement as float.
    expected_first_row = {
        "Location_HUCEightDigitCode": "07090002",
        "USGSpcode": "00060",
        "ResultMeasureValue": 1.5,
    }
    for column, value in expected_first_row.items():
        assert frame[column].iloc[0] == value
36+
37+
2238
def test_get_results(httpx_mock):
2339
"""Tests water quality portal ratings query"""
2440
request_url = (

0 commit comments

Comments
 (0)