Parse Date/Time/TimeZone triplets in samples and WQP responses

thodson-usgs · claude · thodson-usgs · commit 8aae7b41532c · 2026-05-07T07:03:50.000-05:00
Add a shared utils.attach_datetime_columns helper that scans a CSV-derived DataFrame for <prefix>Date / <prefix>Time / <prefix>TimeZone triplets and appends a derived <prefix>DateTime UTC column for each one, leaving the original triplet columns intact. Recognizes both the WQX3 / Samples naming (Activity_StartDate, Activity_StartTime, Activity_StartTimeZone) and the legacy WQP naming (ActivityStartDate, ActivityStartTime/Time, ActivityStartTime/TimeZoneCode). Mirrors R dataRetrieval's create_dateTime. Wired into waterdata.get_samples and wqp.get_results. Closes #266. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,5 @@
+**05/07/2026:** `waterdata.get_samples()` and `wqp.get_results()` now append a derived `<prefix>DateTime` UTC column for every Date/Time/TimeZone triplet in the response (e.g. `Activity_StartDate` + `Activity_StartTime` + `Activity_StartTimeZone` → `Activity_StartDateTime`). Both the WQX3 (`<X>Date`/`<X>Time`/`<X>TimeZone`) and legacy WQP (`<X>Date`/`<X>Time/Time`/`<X>Time/TimeZoneCode`) shapes are recognized; abbreviations like EST/EDT/CST/PST resolve to a UTC `Timestamp`, unknown codes resolve to `NaT`, and the original triplet columns are preserved. Mirrors R's `create_dateTime` behavior. Closes #266.
+
 **05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**.
 
 **05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`.
diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py
@@ -94,6 +94,98 @@ def format_datetime(df, date_field, time_field, tz_field):
     return df
 
 
+# Triplet patterns we recognize in WQP and Samples CSV responses. Each entry
+# defines how to derive the time/timezone column names from a date column, and
+# the suffix to strip when forming the new <prefix>DateTime column name.
+_DATETIME_TRIPLET_PATTERNS = (
+    # WQX3 / Samples: Activity_StartDate, Activity_StartTime, Activity_StartTimeZone
+    {
+        "date_suffix": "Date",
+        "time_from_date": lambda d: d[: -len("Date")] + "Time",
+        "tz_from_date": lambda d: d[: -len("Date")] + "TimeZone",
+    },
+    # Legacy WQP: <X>Date, <X>Time/Time, <X>Time/TimeZoneCode
+    {
+        "date_suffix": "Date",
+        "time_from_date": lambda d: d[: -len("Date")] + "Time/Time",
+        "tz_from_date": lambda d: d[: -len("Date")] + "Time/TimeZoneCode",
+    },
+)
+
+
+def _build_utc_datetime(date_series, time_series, tz_series):
+    """Combine date + time + tz-abbreviation columns into a UTC pandas Series.
+
+    Unknown timezone codes (and rows missing any of the three values) yield
+    ``NaT``. The input columns are not mutated.
+    """
+    offsets = tz_series.map(tz)
+    combined = (
+        date_series.astype("string")
+        + " "
+        + time_series.astype("string")
+        + " "
+        + offsets.astype("string")
+    )
+    # A missing input propagates <NA> through the "string"-dtype concatenation;
+    # mask those rows explicitly so pd.to_datetime returns NaT rather than guessing.
+    invalid = (
+        date_series.isna() | time_series.isna() | tz_series.isna() | offsets.isna()
+    )
+    combined = combined.mask(invalid)
+    return pd.to_datetime(combined, format="mixed", utc=True, errors="coerce")
+
+
+def attach_datetime_columns(df):
+    """Add ``<prefix>DateTime`` UTC columns for any Date/Time/TimeZone triplets.
+
+    Detects two naming patterns that appear in USGS Samples and Water Quality
+    Portal CSV responses:
+
+    * **WQX3** — ``<prefix>Date``, ``<prefix>Time``, ``<prefix>TimeZone``
+    * **Legacy WQP** — ``<prefix>Date``, ``<prefix>Time/Time``,
+      ``<prefix>Time/TimeZoneCode``
+
+    For every triplet present, a new ``<prefix>DateTime`` column is appended
+    holding a UTC ``Timestamp`` (offsets resolved via
+    :data:`dataretrieval.codes.tz`). The original Date/Time/TimeZone columns
+    are left intact, and an existing ``<prefix>DateTime`` column is never
+    overwritten.
+
+    Parameters
+    ----------
+    df : ``pandas.DataFrame``
+        DataFrame returned from a Samples or WQP CSV endpoint.
+
+    Returns
+    -------
+    df : ``pandas.DataFrame``
+        A DataFrame with any derivable ``<prefix>DateTime`` columns appended.
+        Callers should use the returned value (the helper may concatenate
+        rather than mutate in place).
+    """
+    columns = set(df.columns)
+    new_columns = {}
+    for col in df.columns:
+        if not col.endswith("Date"):
+            continue
+        for pattern in _DATETIME_TRIPLET_PATTERNS:
+            time_col = pattern["time_from_date"](col)
+            tz_col = pattern["tz_from_date"](col)
+            if time_col not in columns or tz_col not in columns:
+                continue
+            target = col[: -len("Date")] + "DateTime"
+            if target in columns or target in new_columns:
+                break
+            new_columns[target] = _build_utc_datetime(df[col], df[time_col], df[tz_col])
+            break
+    if not new_columns:
+        return df
+    # Concat in one shot — appending columns one-by-one to a wide CSV-derived
+    # frame triggers pandas' fragmentation PerformanceWarning.
+    return pd.concat([df, pd.DataFrame(new_columns, index=df.index)], axis=1)
+
+
 class BaseMetadata:
     """Base class for metadata.
 
diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
@@ -16,7 +16,7 @@
 import requests
 from requests.models import PreparedRequest
 
-from dataretrieval.utils import BaseMetadata, to_str
+from dataretrieval.utils import BaseMetadata, attach_datetime_columns, to_str
 from dataretrieval.waterdata.filters import FILTER_LANG
 from dataretrieval.waterdata.types import (
     CODE_SERVICES,
@@ -2266,7 +2266,13 @@ def get_samples(
     Returns
     -------
     df : ``pandas.DataFrame``
-        Formatted data returned from the API query.
+        Formatted data returned from the API query. For each
+        ``<prefix>Date`` / ``<prefix>Time`` / ``<prefix>TimeZone`` triplet in
+        the response (e.g. ``Activity_StartDate``, ``Activity_StartTime``,
+        ``Activity_StartTimeZone``), an additional ``<prefix>DateTime`` column
+        is appended holding a UTC ``Timestamp`` derived from the three. The
+        original Date/Time/TimeZone columns are left intact; rows whose
+        timezone abbreviation is not recognized resolve to ``NaT``.
     md : :obj:`dataretrieval.utils.Metadata`
         Custom ``dataretrieval`` metadata object pertaining to the query.
 
@@ -2323,6 +2329,7 @@ def get_samples(
     response.raise_for_status()
 
     df = pd.read_csv(StringIO(response.text), delimiter=",")
+    df = attach_datetime_columns(df)
 
     return df, BaseMetadata(response)
 
diff --git a/dataretrieval/wqp.py b/dataretrieval/wqp.py
@@ -17,7 +17,7 @@
 
 import pandas as pd
 
-from .utils import BaseMetadata, query
+from .utils import BaseMetadata, attach_datetime_columns, query
 
 if TYPE_CHECKING:
     from pandas import DataFrame
@@ -101,7 +101,12 @@ def get_results(
     Returns
     -------
     df : ``pandas.DataFrame``
-        Formatted data returned from the API query.
+        Formatted data returned from the API query. For each
+        ``<prefix>Date`` / ``<prefix>Time`` / ``<prefix>TimeZone`` triplet in
+        the response (legacy WQP uses ``<prefix>Time/Time`` and
+        ``<prefix>Time/TimeZoneCode``), an additional ``<prefix>DateTime``
+        column is appended holding a UTC ``Timestamp``. Original triplet
+        columns are preserved; unrecognized timezone codes yield ``NaT``.
     md : :obj:`dataretrieval.utils.Metadata`
         Custom ``dataretrieval`` metadata object pertaining to the query.
 
@@ -147,6 +152,7 @@ def get_results(
     response = query(url, kwargs, delimiter=";", ssl_check=ssl_check)
 
     df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
+    df = attach_datetime_columns(df)
     return df, WQP_Metadata(response)
 
 
diff --git a/tests/utils_test.py b/tests/utils_test.py
@@ -97,3 +97,110 @@ def test_to_str_custom_delimiter(self):
 
     def test_to_str_non_iterable(self):
         assert utils.to_str(123) is None
+
+
+class Test_attach_datetime_columns:
+    """Tests of attach_datetime_columns, which derives <prefix>DateTime UTC
+    columns from Date/Time/TimeZone triplets in Samples and WQP CSVs."""
+
+    def test_wqx3_triplet_resolves_to_utc(self):
+        """The Samples / WQX3 pattern (Activity_Start*) is detected and the
+        resulting DateTime is converted to UTC."""
+        df = pd.DataFrame(
+            {
+                "Activity_StartDate": ["2024-01-09", "2024-02-15"],
+                "Activity_StartTime": ["10:00:00", "14:30:00"],
+                "Activity_StartTimeZone": ["PST", "EST"],
+            }
+        )
+        df = utils.attach_datetime_columns(df)
+        assert "Activity_StartDateTime" in df.columns
+        # PST is UTC-8 → 10:00 PST is 18:00 UTC
+        assert df["Activity_StartDateTime"][0] == pd.Timestamp(
+            "2024-01-09 18:00:00", tz="UTC"
+        )
+        # EST is UTC-5 → 14:30 EST is 19:30 UTC
+        assert df["Activity_StartDateTime"][1] == pd.Timestamp(
+            "2024-02-15 19:30:00", tz="UTC"
+        )
+        # Original columns are preserved
+        assert df["Activity_StartTimeZone"].tolist() == ["PST", "EST"]
+
+    def test_legacy_wqp_triplet_resolves_to_utc(self):
+        """The legacy WQP pattern (slash-separated time/tz columns) is also
+        detected."""
+        df = pd.DataFrame(
+            {
+                "ActivityStartDate": ["2024-01-09"],
+                "ActivityStartTime/Time": ["10:00:00"],
+                "ActivityStartTime/TimeZoneCode": ["PST"],
+            }
+        )
+        df = utils.attach_datetime_columns(df)
+        assert "ActivityStartDateTime" in df.columns
+        assert df["ActivityStartDateTime"][0] == pd.Timestamp(
+            "2024-01-09 18:00:00", tz="UTC"
+        )
+
+    def test_unknown_timezone_is_NaT(self):
+        """Unknown timezone codes resolve to NaT rather than raising."""
+        df = pd.DataFrame(
+            {
+                "Activity_StartDate": ["2024-01-09"],
+                "Activity_StartTime": ["10:00:00"],
+                "Activity_StartTimeZone": ["BOGUS"],
+            }
+        )
+        df = utils.attach_datetime_columns(df)
+        assert df["Activity_StartDateTime"].isna().all()
+
+    def test_missing_time_or_tz_is_NaT(self):
+        """Rows with a missing time or tz produce NaT but don't poison others."""
+        df = pd.DataFrame(
+            {
+                "Activity_StartDate": ["2024-01-09", "2024-02-15"],
+                "Activity_StartTime": ["10:00:00", None],
+                "Activity_StartTimeZone": ["PST", "EST"],
+            }
+        )
+        df = utils.attach_datetime_columns(df)
+        assert df["Activity_StartDateTime"][0] == pd.Timestamp(
+            "2024-01-09 18:00:00", tz="UTC"
+        )
+        assert pd.isna(df["Activity_StartDateTime"][1])
+
+    def test_existing_datetime_column_not_overwritten(self):
+        """An existing <prefix>DateTime column is left alone."""
+        df = pd.DataFrame(
+            {
+                "Activity_StartDate": ["2024-01-09"],
+                "Activity_StartTime": ["10:00:00"],
+                "Activity_StartTimeZone": ["PST"],
+                "Activity_StartDateTime": ["preexisting"],
+            }
+        )
+        df = utils.attach_datetime_columns(df)
+        assert df["Activity_StartDateTime"].tolist() == ["preexisting"]
+
+    def test_multiple_triplets_handled(self):
+        """All Date/Time/TimeZone triplets in the frame get DateTime columns."""
+        df = pd.DataFrame(
+            {
+                "Activity_StartDate": ["2024-01-09"],
+                "Activity_StartTime": ["10:00:00"],
+                "Activity_StartTimeZone": ["PST"],
+                "LabInfo_AnalysisStartDate": ["2024-01-10"],
+                "LabInfo_AnalysisStartTime": ["09:00:00"],
+                "LabInfo_AnalysisStartTimeZone": ["EST"],
+            }
+        )
+        df = utils.attach_datetime_columns(df)
+        assert "Activity_StartDateTime" in df.columns
+        assert "LabInfo_AnalysisStartDateTime" in df.columns
+
+    def test_lone_date_column_left_alone(self):
+        """A Date column without matching Time/TimeZone columns is ignored."""
+        df = pd.DataFrame({"LastChangeDate": ["2024-01-09"]})
+        df = utils.attach_datetime_columns(df)
+        assert "LastChangeDateTime" not in df.columns
+        assert list(df.columns) == ["LastChangeDate"]
diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
@@ -1,6 +1,7 @@
 import datetime
 import sys
 
+import pandas as pd
 import pytest
 from pandas import DataFrame
 
@@ -54,11 +55,20 @@ def test_mock_get_samples(requests_mock):
         monitoringLocationIdentifier="USGS-05406500",
     )
     assert type(df) is DataFrame
-    assert df.size == 12127
+    # 67 rows × (181 source columns + 6 derived <prefix>DateTime columns)
+    assert df.shape == (67, 187)
     assert md.url == request_url
     assert isinstance(md.query_time, datetime.timedelta)
     assert md.header == {"mock_header": "value"}
     assert md.comment is None
+    # The Activity start triplet is parsed into a UTC Timestamp column.
+    assert "Activity_StartDateTime" in df.columns
+    # Row 0 is "2023-08-22 08:50:00 CDT" → 13:50 UTC.
+    assert df["Activity_StartDateTime"].iloc[0] == pd.Timestamp(
+        "2023-08-22 13:50:00", tz="UTC"
+    )
+    # Original triplet columns are preserved.
+    assert df["Activity_StartTimeZone"].iloc[0] == "CDT"
 
 
 def test_mock_get_samples_summary(requests_mock):
@@ -127,7 +137,8 @@ def test_samples_activity():
         monitoringLocationIdentifier="USGS-06719505",
     )
     assert len(df) > 0
-    assert len(df.columns) == 95
+    # 95 columns from the API plus 2 derived <prefix>DateTime columns.
+    assert len(df.columns) == 97
     assert "Location_HUCTwelveDigitCode" in df.columns
 
 
diff --git a/tests/wqp_test.py b/tests/wqp_test.py
@@ -33,11 +33,15 @@ def test_get_results(requests_mock):
         startDateHi="09-30-2011",
     )
     assert type(df) is DataFrame
-    assert df.size == 315
+    # 5 rows × (63 source columns + 2 derived <prefix>DateTime columns)
+    assert df.shape == (5, 65)
     assert md.url == request_url
     assert isinstance(md.query_time, datetime.timedelta)
     assert md.header == {"mock_header": "value"}
     assert md.comment is None
+    # Legacy WQP triplets (slash-separated) are parsed into UTC.
+    assert "ActivityStartDateTime" in df.columns
+    assert df["ActivityStartDateTime"].notna().all()
 
 
 def test_get_results_WQX3(requests_mock):
@@ -58,11 +62,15 @@ def test_get_results_WQX3(requests_mock):
         startDateHi="09-30-2011",
     )
     assert type(df) is DataFrame
-    assert df.size == 900
+    # 5 rows × (180 source columns + 6 derived <prefix>DateTime columns)
+    assert df.shape == (5, 186)
     assert md.url == request_url
     assert isinstance(md.query_time, datetime.timedelta)
     assert md.header == {"mock_header": "value"}
     assert md.comment is None
+    # WQX3 WQP triplets are parsed into UTC.
+    assert "Activity_StartDateTime" in df.columns
+    assert df["Activity_StartDateTime"].notna().all()
 
 
 def test_what_sites(requests_mock):

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	+05/07/2026: `waterdata.get_samples()` and `wqp.get_results()` now append a derived `<prefix>DateTime` UTC column for every Date/Time/TimeZone triplet in the response (e.g. `Activity_StartDate` + `Activity_StartTime` + `Activity_StartTimeZone` → `Activity_StartDateTime`). Both the WQX3 (`<X>Date`/`<X>Time`/`<X>TimeZone`) and legacy WQP (`<X>Date`/`<X>Time/Time`/`<X>Time/TimeZoneCode`) shapes are recognized; abbreviations like EST/EDT/CST/PST resolve to a UTC `Timestamp`, unknown codes resolve to `NaT`, and the original triplet columns are preserved. Mirrors R's `create_dateTime` behavior. Closes #266.
	`2`	`+`
`1`	`3`	05/06/2026: Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after 2027-05-06.
`2`	`4`
`3`	`5`	05/06/2026: Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`.