Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 35 additions & 9 deletions dataretrieval/wqp.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,32 @@
]


def _is_code_column(name: str) -> bool:
"""True if a WQP column name denotes a code/identifier whose leading zeros
are significant and must be preserved as ``str`` (HUCs, parameter codes,
FIPS codes): the name ends with "code" or contains "identifier"/"huc"/"fips".
"""
lname = name.lower()
return lname.endswith("code") or any(
token in lname for token in ("identifier", "huc", "fips")
)


def _read_wqp_csv(text: str) -> DataFrame:
    """Read a WQP CSV, forcing code/identifier columns to ``str``.

    WQP returns codes with significant leading zeros — HUCs, parameter codes
    (``USGSpcode``), FIPS state/county codes. A bare ``read_csv`` infers those
    as int/float and silently drops the zeros (``"00060"`` -> ``60``, HUC8
    ``"07090002"`` -> ``7090002``). The header is parsed first to learn the
    column names, then the whole payload is parsed with ``dtype=str`` for
    every column that :func:`_is_code_column` flags, so the zeros survive.
    Columns that are not flagged keep pandas' normal type inference.

    :param text: Comma-delimited CSV text whose first row is the header.
    :return: The parsed table, with flagged columns read as ``str``.
    """
    # A single buffer serves both passes; rewinding it avoids building a
    # second StringIO copy of a potentially large response body.
    buffer = StringIO(text)

    # Pass 1: header only (nrows=0) to discover the column names.
    columns = pd.read_csv(buffer, delimiter=",", nrows=0).columns
    str_cols = {col: str for col in columns if _is_code_column(col)}

    # Pass 2: full parse from the start with the per-column dtype overrides.
    buffer.seek(0)
    return pd.read_csv(buffer, delimiter=",", low_memory=False, dtype=str_cols)


def get_results(
ssl_check=True,
legacy=True,
Expand Down Expand Up @@ -153,7 +179,7 @@ def get_results(

response = query(url, kwargs, delimiter=";", ssl_check=ssl_check)

df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
df = _read_wqp_csv(response.text)
df = _attach_datetime_columns(df)
return df, WQP_Metadata(response)

Expand Down Expand Up @@ -208,7 +234,7 @@ def what_sites(

response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)

df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
df = _read_wqp_csv(response.text)

return df, WQP_Metadata(response)

Expand Down Expand Up @@ -259,7 +285,7 @@ def what_organizations(

response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)

df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
df = _read_wqp_csv(response.text)

return df, WQP_Metadata(response)

Expand Down Expand Up @@ -306,7 +332,7 @@ def what_projects(ssl_check=True, legacy=True, **kwargs):

response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)

df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
df = _read_wqp_csv(response.text)

return df, WQP_Metadata(response)

Expand Down Expand Up @@ -370,7 +396,7 @@ def what_activities(

response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)

df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
df = _read_wqp_csv(response.text)

return df, WQP_Metadata(response)

Expand Down Expand Up @@ -428,7 +454,7 @@ def what_detection_limits(

response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)

df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
df = _read_wqp_csv(response.text)

return df, WQP_Metadata(response)

Expand Down Expand Up @@ -479,7 +505,7 @@ def what_habitat_metrics(

response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)

df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
df = _read_wqp_csv(response.text)

return df, WQP_Metadata(response)

Expand Down Expand Up @@ -531,7 +557,7 @@ def what_project_weights(ssl_check=True, legacy=True, **kwargs):

response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)

df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
df = _read_wqp_csv(response.text)

return df, WQP_Metadata(response)

Expand Down Expand Up @@ -583,7 +609,7 @@ def what_activity_metrics(ssl_check=True, legacy=True, **kwargs):

response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check)

df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
df = _read_wqp_csv(response.text)

return df, WQP_Metadata(response)

Expand Down
16 changes: 16 additions & 0 deletions tests/wqp_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,22 @@
)


def test_read_wqp_csv_preserves_leading_zero_codes():
    """Regression: WQP code columns (HUCs, parameter codes, FIPS) carry
    significant leading zeros; a bare ``read_csv`` inferred them as int/float
    and dropped the zeros (``"00060"`` -> ``60``). ``_read_wqp_csv`` reads
    code/identifier columns as ``str`` while leaving value columns numeric."""
    from dataretrieval.wqp import _read_wqp_csv

    # One column per code family named in the docstring (HUC, parameter code,
    # FIPS-style state code) plus a plain numeric value column.
    csv = (
        "Location_HUCEightDigitCode,USGSpcode,StateCode,ResultMeasureValue\n"
        "07090002,00060,05,1.5\n"
    )
    df = _read_wqp_csv(csv)

    # Code columns keep their leading zeros as strings.
    assert df["Location_HUCEightDigitCode"].iloc[0] == "07090002"
    assert df["USGSpcode"].iloc[0] == "00060"
    assert df["StateCode"].iloc[0] == "05"
    # Value columns are still inferred as numeric.
    assert df["ResultMeasureValue"].iloc[0] == 1.5


def test_get_results(httpx_mock):
"""Tests water quality portal ratings query"""
request_url = (
Expand Down
Loading