Skip to content

Commit 8aae7b4

Browse files
thodson-usgs and claude committed
Parse Date/Time/TimeZone triplets in samples and WQP responses
Add a shared utils.attach_datetime_columns helper that scans a CSV-derived DataFrame for <prefix>Date / <prefix>Time / <prefix>TimeZone triplets and appends a derived <prefix>DateTime UTC column for each one, leaving the original triplet columns intact. Recognizes both the WQX3 / Samples naming (Activity_StartDate, Activity_StartTime, Activity_StartTimeZone) and the legacy WQP naming (ActivityStartDate, ActivityStartTime/Time, ActivityStartTime/TimeZoneCode). Mirrors R dataRetrieval's create_dateTime. Wired into waterdata.get_samples and wqp.get_results. Closes #266. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 18f831f commit 8aae7b4

7 files changed

Lines changed: 241 additions & 8 deletions

File tree

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
**05/07/2026:** `waterdata.get_samples()` and `wqp.get_results()` now append a derived `<prefix>DateTime` UTC column for every Date/Time/TimeZone triplet in the response (e.g. `Activity_StartDate` + `Activity_StartTime` + `Activity_StartTimeZone` → `Activity_StartDateTime`). Both the WQX3 (`<X>Date`/`<X>Time`/`<X>TimeZone`) and legacy WQP (`<X>Date`/`<X>Time/Time`/`<X>Time/TimeZoneCode`) shapes are recognized; abbreviations like EST/EDT/CST/PST resolve to a UTC `Timestamp`, unknown codes resolve to `NaT`, and the original triplet columns are preserved. Mirrors R's `create_dateTime` behavior. Closes #266.
2+
13
**05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**.
24

35
**05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`.

dataretrieval/utils.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,98 @@ def format_datetime(df, date_field, time_field, tz_field):
9494
return df
9595

9696

97+
def _swap_date_suffix(replacement):
    """Return a renamer that swaps a column's trailing ``Date`` for *replacement*.

    The returned callable drops the last ``len("Date")`` characters of the
    name it is given and appends *replacement*.
    """

    def rename(date_col):
        return date_col[: -len("Date")] + replacement

    return rename


# Column-naming schemes for Date/Time/TimeZone triplets found in WQP and
# Samples CSV responses. Every entry records the suffix that marks the date
# column and two callables mapping a date column name to the names of its
# companion time and timezone columns.
_DATETIME_TRIPLET_PATTERNS = tuple(
    {
        "date_suffix": "Date",
        "time_from_date": _swap_date_suffix(time_suffix),
        "tz_from_date": _swap_date_suffix(tz_suffix),
    }
    for time_suffix, tz_suffix in (
        # WQX3 / Samples: Activity_StartDate, Activity_StartTime,
        # Activity_StartTimeZone
        ("Time", "TimeZone"),
        # Legacy WQP: <X>Date, <X>Time/Time, <X>Time/TimeZoneCode
        ("Time/Time", "Time/TimeZoneCode"),
    )
)
114+
115+
116+
def _build_utc_datetime(date_series, time_series, tz_series):
    """Merge date, time and timezone-abbreviation columns into one UTC Series.

    Each timezone abbreviation is translated through the ``tz`` lookup. A row
    becomes ``NaT`` when its date, time or timezone is missing, or when the
    abbreviation has no entry in ``tz``. None of the inputs is modified.
    """
    utc_offsets = tz_series.map(tz)

    # Flag every row that lacks a component (or whose abbreviation did not
    # map) so it is blanked out below and parses to NaT.
    unusable = (
        date_series.isna() | time_series.isna() | tz_series.isna() | utc_offsets.isna()
    )

    date_text = date_series.astype("string")
    time_text = time_series.astype("string")
    offset_text = utc_offsets.astype("string")
    stamp_text = date_text + " " + time_text + " " + offset_text
    stamp_text = stamp_text.mask(unusable)

    return pd.to_datetime(stamp_text, format="mixed", utc=True, errors="coerce")
137+
138+
139+
def attach_datetime_columns(df):
    """Add ``<prefix>DateTime`` UTC columns for any Date/Time/TimeZone triplets.

    Detects two naming patterns that appear in USGS Samples and Water Quality
    Portal CSV responses:

    * **WQX3** — ``<prefix>Date``, ``<prefix>Time``, ``<prefix>TimeZone``
    * **Legacy WQP** — ``<prefix>Date``, ``<prefix>Time/Time``,
      ``<prefix>Time/TimeZoneCode``

    For every triplet present, a new ``<prefix>DateTime`` column is appended,
    built by ``_build_utc_datetime``. The original Date/Time/TimeZone columns
    are left intact, and an existing ``<prefix>DateTime`` column is never
    overwritten. Column labels that are not strings (for example integer
    labels) are skipped rather than raising.

    Parameters
    ----------
    df : ``pandas.DataFrame``
        DataFrame returned from a Samples or WQP CSV endpoint.

    Returns
    -------
    df : ``pandas.DataFrame``
        The input frame itself when no triplet is found; otherwise a new
        frame with the derived ``<prefix>DateTime`` columns appended. Callers
        should always use the returned value.
    """
    date_suffix = "Date"
    columns = set(df.columns)
    new_columns = {}
    for col in df.columns:
        # Only string labels can name a triplet; a non-string label has no
        # ``endswith`` and would otherwise raise AttributeError.
        if not isinstance(col, str) or not col.endswith(date_suffix):
            continue
        prefix = col[: -len(date_suffix)]
        for pattern in _DATETIME_TRIPLET_PATTERNS:
            time_col = pattern["time_from_date"](col)
            tz_col = pattern["tz_from_date"](col)
            if time_col not in columns or tz_col not in columns:
                continue
            target = prefix + "DateTime"
            # Never clobber a DateTime column that already exists or was
            # already derived for this prefix.
            if target not in columns and target not in new_columns:
                new_columns[target] = _build_utc_datetime(
                    df[col], df[time_col], df[tz_col]
                )
            # The first naming pattern that matches wins for this date column.
            break
    if not new_columns:
        return df
    # Concat in one shot — appending columns one-by-one to a wide CSV-derived
    # frame triggers pandas' fragmentation PerformanceWarning.
    return pd.concat([df, pd.DataFrame(new_columns, index=df.index)], axis=1)
187+
188+
97189
class BaseMetadata:
98190
"""Base class for metadata.
99191

dataretrieval/waterdata/api.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import requests
1717
from requests.models import PreparedRequest
1818

19-
from dataretrieval.utils import BaseMetadata, to_str
19+
from dataretrieval.utils import BaseMetadata, attach_datetime_columns, to_str
2020
from dataretrieval.waterdata.filters import FILTER_LANG
2121
from dataretrieval.waterdata.types import (
2222
CODE_SERVICES,
@@ -2266,7 +2266,13 @@ def get_samples(
22662266
Returns
22672267
-------
22682268
df : ``pandas.DataFrame``
2269-
Formatted data returned from the API query.
2269+
Formatted data returned from the API query. For each
2270+
``<prefix>Date`` / ``<prefix>Time`` / ``<prefix>TimeZone`` triplet in
2271+
the response (e.g. ``Activity_StartDate``, ``Activity_StartTime``,
2272+
``Activity_StartTimeZone``), an additional ``<prefix>DateTime`` column
2273+
is appended holding a UTC ``Timestamp`` derived from the three. The
2274+
original Date/Time/TimeZone columns are left intact; rows whose
2275+
timezone abbreviation is not recognized resolve to ``NaT``.
22702276
md : :obj:`dataretrieval.utils.Metadata`
22712277
Custom ``dataretrieval`` metadata object pertaining to the query.
22722278
@@ -2323,6 +2329,7 @@ def get_samples(
23232329
response.raise_for_status()
23242330

23252331
df = pd.read_csv(StringIO(response.text), delimiter=",")
2332+
df = attach_datetime_columns(df)
23262333

23272334
return df, BaseMetadata(response)
23282335

dataretrieval/wqp.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
import pandas as pd
1919

20-
from .utils import BaseMetadata, query
20+
from .utils import BaseMetadata, attach_datetime_columns, query
2121

2222
if TYPE_CHECKING:
2323
from pandas import DataFrame
@@ -101,7 +101,12 @@ def get_results(
101101
Returns
102102
-------
103103
df : ``pandas.DataFrame``
104-
Formatted data returned from the API query.
104+
Formatted data returned from the API query. For each
105+
``<prefix>Date`` / ``<prefix>Time`` / ``<prefix>TimeZone`` triplet in
106+
the response (legacy WQP uses ``<prefix>Time/Time`` and
107+
``<prefix>Time/TimeZoneCode``), an additional ``<prefix>DateTime``
108+
column is appended holding a UTC ``Timestamp``. Original triplet
109+
columns are preserved; unrecognized timezone codes yield ``NaT``.
105110
md : :obj:`dataretrieval.utils.Metadata`
106111
Custom ``dataretrieval`` metadata object pertaining to the query.
107112
@@ -147,6 +152,7 @@ def get_results(
147152
response = query(url, kwargs, delimiter=";", ssl_check=ssl_check)
148153

149154
df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
155+
df = attach_datetime_columns(df)
150156
return df, WQP_Metadata(response)
151157

152158

tests/utils_test.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,110 @@ def test_to_str_custom_delimiter(self):
9797

9898
def test_to_str_non_iterable(self):
    """A non-iterable argument (here an int) makes ``to_str`` return None."""
    converted = utils.to_str(123)
    assert converted is None
100+
101+
102+
class Test_attach_datetime_columns:
    """Checks for ``utils.attach_datetime_columns``, which appends a UTC
    ``<prefix>DateTime`` column for each Date/Time/TimeZone triplet found in
    Samples and WQP CSV frames."""

    def test_wqx3_triplet_resolves_to_utc(self):
        """A WQX3 / Samples triplet (Activity_Start*) yields a UTC column."""
        source = pd.DataFrame(
            {
                "Activity_StartDate": ["2024-01-09", "2024-02-15"],
                "Activity_StartTime": ["10:00:00", "14:30:00"],
                "Activity_StartTimeZone": ["PST", "EST"],
            }
        )
        result = utils.attach_datetime_columns(source)
        assert "Activity_StartDateTime" in result.columns
        derived = result["Activity_StartDateTime"]
        # PST is UTC-8, so 10:00 PST is 18:00 UTC.
        expected_pst = pd.Timestamp("2024-01-09 18:00:00", tz="UTC")
        assert derived.iloc[0] == expected_pst
        # EST is UTC-5, so 14:30 EST is 19:30 UTC.
        expected_est = pd.Timestamp("2024-02-15 19:30:00", tz="UTC")
        assert derived.iloc[1] == expected_est
        # The source triplet columns survive untouched.
        assert result["Activity_StartTimeZone"].tolist() == ["PST", "EST"]

    def test_legacy_wqp_triplet_resolves_to_utc(self):
        """The slash-separated legacy WQP naming is recognized as well."""
        source = pd.DataFrame(
            {
                "ActivityStartDate": ["2024-01-09"],
                "ActivityStartTime/Time": ["10:00:00"],
                "ActivityStartTime/TimeZoneCode": ["PST"],
            }
        )
        result = utils.attach_datetime_columns(source)
        assert "ActivityStartDateTime" in result.columns
        expected = pd.Timestamp("2024-01-09 18:00:00", tz="UTC")
        assert result["ActivityStartDateTime"].iloc[0] == expected

    def test_unknown_timezone_is_NaT(self):
        """An unrecognized timezone code gives NaT instead of an error."""
        source = pd.DataFrame(
            {
                "Activity_StartDate": ["2024-01-09"],
                "Activity_StartTime": ["10:00:00"],
                "Activity_StartTimeZone": ["BOGUS"],
            }
        )
        result = utils.attach_datetime_columns(source)
        assert result["Activity_StartDateTime"].isna().all()

    def test_missing_time_or_tz_is_NaT(self):
        """A row lacking a time is NaT while complete rows still resolve."""
        source = pd.DataFrame(
            {
                "Activity_StartDate": ["2024-01-09", "2024-02-15"],
                "Activity_StartTime": ["10:00:00", None],
                "Activity_StartTimeZone": ["PST", "EST"],
            }
        )
        result = utils.attach_datetime_columns(source)
        derived = result["Activity_StartDateTime"]
        expected = pd.Timestamp("2024-01-09 18:00:00", tz="UTC")
        assert derived.iloc[0] == expected
        assert pd.isna(derived.iloc[1])

    def test_existing_datetime_column_not_overwritten(self):
        """A pre-existing <prefix>DateTime column keeps its values."""
        source = pd.DataFrame(
            {
                "Activity_StartDate": ["2024-01-09"],
                "Activity_StartTime": ["10:00:00"],
                "Activity_StartTimeZone": ["PST"],
                "Activity_StartDateTime": ["preexisting"],
            }
        )
        result = utils.attach_datetime_columns(source)
        assert result["Activity_StartDateTime"].tolist() == ["preexisting"]

    def test_multiple_triplets_handled(self):
        """Every triplet in the frame receives its own DateTime column."""
        source = pd.DataFrame(
            {
                "Activity_StartDate": ["2024-01-09"],
                "Activity_StartTime": ["10:00:00"],
                "Activity_StartTimeZone": ["PST"],
                "LabInfo_AnalysisStartDate": ["2024-01-10"],
                "LabInfo_AnalysisStartTime": ["09:00:00"],
                "LabInfo_AnalysisStartTimeZone": ["EST"],
            }
        )
        result = utils.attach_datetime_columns(source)
        assert "Activity_StartDateTime" in result.columns
        assert "LabInfo_AnalysisStartDateTime" in result.columns

    def test_lone_date_column_left_alone(self):
        """A Date column with no Time/TimeZone partners adds nothing."""
        source = pd.DataFrame({"LastChangeDate": ["2024-01-09"]})
        result = utils.attach_datetime_columns(source)
        assert "LastChangeDateTime" not in result.columns
        assert result.columns.tolist() == ["LastChangeDate"]

tests/waterdata_test.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import datetime
22
import sys
33

4+
import pandas as pd
45
import pytest
56
from pandas import DataFrame
67

@@ -54,11 +55,20 @@ def test_mock_get_samples(requests_mock):
5455
monitoringLocationIdentifier="USGS-05406500",
5556
)
5657
assert type(df) is DataFrame
57-
assert df.size == 12127
58+
# 67 rows × 181 source columns + 6 derived <prefix>DateTime columns
59+
assert df.shape == (67, 187)
5860
assert md.url == request_url
5961
assert isinstance(md.query_time, datetime.timedelta)
6062
assert md.header == {"mock_header": "value"}
6163
assert md.comment is None
64+
# The Activity start triplet is parsed into a UTC Timestamp column.
65+
assert "Activity_StartDateTime" in df.columns
66+
# Row 0 is "2023-08-22 08:50:00 CDT" → 13:50 UTC.
67+
assert df["Activity_StartDateTime"].iloc[0] == pd.Timestamp(
68+
"2023-08-22 13:50:00", tz="UTC"
69+
)
70+
# Original triplet columns are preserved.
71+
assert df["Activity_StartTimeZone"].iloc[0] == "CDT"
6272

6373

6474
def test_mock_get_samples_summary(requests_mock):
@@ -127,7 +137,8 @@ def test_samples_activity():
127137
monitoringLocationIdentifier="USGS-06719505",
128138
)
129139
assert len(df) > 0
130-
assert len(df.columns) == 95
140+
# 95 columns from the API plus 2 derived <prefix>DateTime columns.
141+
assert len(df.columns) == 97
131142
assert "Location_HUCTwelveDigitCode" in df.columns
132143

133144

tests/wqp_test.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,15 @@ def test_get_results(requests_mock):
3333
startDateHi="09-30-2011",
3434
)
3535
assert type(df) is DataFrame
36-
assert df.size == 315
36+
# 5 rows × 63 source columns + 2 derived <prefix>DateTime columns
37+
assert df.shape == (5, 65)
3738
assert md.url == request_url
3839
assert isinstance(md.query_time, datetime.timedelta)
3940
assert md.header == {"mock_header": "value"}
4041
assert md.comment is None
42+
# Legacy WQP triplets (slash-separated) are parsed into UTC.
43+
assert "ActivityStartDateTime" in df.columns
44+
assert df["ActivityStartDateTime"].notna().all()
4145

4246

4347
def test_get_results_WQX3(requests_mock):
@@ -58,11 +62,15 @@ def test_get_results_WQX3(requests_mock):
5862
startDateHi="09-30-2011",
5963
)
6064
assert type(df) is DataFrame
61-
assert df.size == 900
65+
# 5 rows × 180 source columns + 6 derived <prefix>DateTime columns
66+
assert df.shape == (5, 186)
6267
assert md.url == request_url
6368
assert isinstance(md.query_time, datetime.timedelta)
6469
assert md.header == {"mock_header": "value"}
6570
assert md.comment is None
71+
# WQX3 WQP triplets are parsed into UTC.
72+
assert "Activity_StartDateTime" in df.columns
73+
assert df["Activity_StartDateTime"].notna().all()
6674

6775

6876
def test_what_sites(requests_mock):

0 commit comments

Comments
 (0)