diff --git a/dataretrieval/wqp.py b/dataretrieval/wqp.py index e41235a9..592ca984 100644 --- a/dataretrieval/wqp.py +++ b/dataretrieval/wqp.py @@ -40,6 +40,32 @@ ] +def _is_code_column(name: str) -> bool: + """True if a WQP column name denotes a code/identifier whose leading zeros + are significant and must be preserved as ``str`` (HUCs, parameter codes, + FIPS codes): the name ends with "code" or contains "identifier"/"huc"/"fips". + """ + lname = name.lower() + return lname.endswith("code") or any( + token in lname for token in ("identifier", "huc", "fips") + ) + + +def _read_wqp_csv(text: str) -> DataFrame: + """Read a WQP CSV, forcing code/identifier columns to ``str``. + + WQP returns codes with significant leading zeros — HUCs, parameter codes + (``USGSpcode``), FIPS state/county codes. A bare ``read_csv`` infers those + as int/float and silently drops the zeros (``"00060"`` -> ``60``, HUC8 + ``"07090002"`` -> ``7090002``). Read the header first, then re-read with + ``dtype=str`` for every column that :func:`_is_code_column` flags, so the + zeros survive. 
+ """ + columns = pd.read_csv(StringIO(text), delimiter=",", nrows=0).columns + str_cols = {col: str for col in columns if _is_code_column(col)} + return pd.read_csv(StringIO(text), delimiter=",", low_memory=False, dtype=str_cols) + + def get_results( ssl_check=True, legacy=True, @@ -153,7 +179,7 @@ def get_results( response = query(url, kwargs, delimiter=";", ssl_check=ssl_check) - df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) + df = _read_wqp_csv(response.text) df = _attach_datetime_columns(df) return df, WQP_Metadata(response) @@ -208,7 +234,7 @@ def what_sites( response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check) - df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) + df = _read_wqp_csv(response.text) return df, WQP_Metadata(response) @@ -259,7 +285,7 @@ def what_organizations( response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check) - df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) + df = _read_wqp_csv(response.text) return df, WQP_Metadata(response) @@ -306,7 +332,7 @@ def what_projects(ssl_check=True, legacy=True, **kwargs): response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check) - df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) + df = _read_wqp_csv(response.text) return df, WQP_Metadata(response) @@ -370,7 +396,7 @@ def what_activities( response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check) - df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) + df = _read_wqp_csv(response.text) return df, WQP_Metadata(response) @@ -428,7 +454,7 @@ def what_detection_limits( response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check) - df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) + df = _read_wqp_csv(response.text) return df, WQP_Metadata(response) @@ -479,7 +505,7 @@ def what_habitat_metrics( response = query(url, payload=kwargs, 
def test_read_wqp_csv_preserves_leading_zero_codes():
    """Regression test for leading zeros in WQP code columns.

    A bare ``read_csv`` inferred code columns (HUCs, parameter codes, FIPS)
    as int/float and dropped their leading zeros (``"00060"`` -> ``60``).
    ``_read_wqp_csv`` must read code/identifier columns as ``str`` while
    leaving value columns numeric.
    """
    from dataretrieval.wqp import _read_wqp_csv

    # One header row plus one data row whose codes start with zeros.
    payload = (
        "Location_HUCEightDigitCode,USGSpcode,ResultMeasureValue\n"
        "07090002,00060,1.5\n"
    )
    frame = _read_wqp_csv(payload)

    # Code columns come back as the original text; the value stays a number.
    expected_first_row = {
        "Location_HUCEightDigitCode": "07090002",
        "USGSpcode": "00060",
        "ResultMeasureValue": 1.5,
    }
    for column, value in expected_first_row.items():
        assert frame[column].iloc[0] == value