Skip to content

Commit 85e1cc1

Browse files
thodson-usgsclaude
andcommitted
fix(waterdata.xarray): handle NaT times and mixed units in the converter
Two correctness fixes in _build_timeseries from review: - NaT time: a row whose time fails to parse (errors="coerce" -> NaT) could not index the array and was silently swallowed by to_xarray (and an all-NaT group crashed). Drop such rows explicitly with a warning, and return an empty Dataset if nothing parseable remains -- no more silent data loss. - Mixed units: unit_of_measure was collapsed with _first over a (parameter_code, statistic_id) group that can span multiple units (e.g. the same parameter reported in different units across sites). A variable can carry only one units attr, so warn when a group spans multiple units instead of silently labeling every value with the first. Adds tests for the bad-time drop, all-bad-time empty result, and the mixed-unit warning. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 04f7b70 commit 85e1cc1

2 files changed

Lines changed: 65 additions & 1 deletion

File tree

dataretrieval/waterdata/xarray.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,11 +242,26 @@ def _build_timeseries(
242242
).dt.tz_localize(None)
243243
work["value"] = _pd.to_numeric(work["value"], errors="coerce")
244244

245+
# A NaT time can't index the array and would be silently swallowed by
246+
# ``to_xarray`` (or crash an all-NaT group). Drop such rows explicitly and
247+
# say so, rather than losing observations without a trace.
248+
n_before = len(work)
249+
work = work[work["time"].notna()]
250+
dropped = n_before - len(work)
251+
if dropped:
252+
_warnings.warn(
253+
f"dropped {dropped} row(s) with an unparseable or missing time.",
254+
stacklevel=3,
255+
)
256+
if work.empty:
257+
return _empty_dataset(service, base_meta)
258+
245259
datasets, used = [], set()
246260
for _, group in work.groupby(group_cols, dropna=False):
247261
pcode = _first(group["parameter_code"]) if "parameter_code" in group else None
248262
stat = _first(group["statistic_id"]) if "statistic_id" in group else None
249-
unit = _first(group["unit_of_measure"]) if has_unit else None
263+
group_units = group["unit_of_measure"].dropna().unique() if has_unit else ()
264+
unit = group_units[0] if len(group_units) else None
250265
desc = series_meta.get(str(pcode), {}) if pcode is not None else {}
251266

252267
name = _slug(desc.get("parameter_name") or pcode)
@@ -257,6 +272,16 @@ def _build_timeseries(
257272
name += "_x"
258273
used.add(name)
259274

275+
if len(group_units) > 1:
276+
# One variable can carry only one ``units`` attr; surface the
277+
# mix instead of silently labeling every value with the first.
278+
_warnings.warn(
279+
f"'{name}' spans multiple units {list(group_units)}; labeling "
280+
f"with '{unit}'. Filter by site/parameter to avoid mixing "
281+
"units in one variable.",
282+
stacklevel=3,
283+
)
284+
260285
sub = group.set_index([_INSTANCE, "time"])[["value", *ancillary]]
261286
if not sub.index.is_unique:
262287
_warnings.warn(

tests/waterdata_xarray_test.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,3 +216,42 @@ def spy(func, args, kwargs):
216216
# the wrapper hands include_hash to _fetch; _fetch is what drops it
217217
# (asserted in test_fetch_strips_include_hash)
218218
assert seen[0].get("include_hash") is True
219+
220+
221+
def test_unparseable_time_dropped_with_warning():
222+
# A bad/missing time must be dropped explicitly (with a warning), not
223+
# silently swallowed by to_xarray.
224+
df = _daily_frame(
225+
values=(100, 110, 120),
226+
times=("2024-06-01", "not-a-date", "2024-06-03"),
227+
)
228+
with pytest.warns(UserWarning, match="unparseable or missing time"):
229+
ds = wdx._build_timeseries(
230+
df, _meta(), service="daily", series_meta=_DISCHARGE_META
231+
)
232+
assert ds.sizes["time"] == 2 # the bad-time row is gone, the good ones stay
233+
assert 110 not in ds["discharge"].values # the dropped value did not survive
234+
235+
236+
def test_all_unparseable_time_returns_empty_dataset():
237+
df = _daily_frame(values=(1, 2), times=("bad-a", "bad-b"))
238+
with pytest.warns(UserWarning, match="unparseable or missing time"):
239+
ds = wdx._build_timeseries(
240+
df, _meta(), service="daily", series_meta=_DISCHARGE_META
241+
)
242+
assert list(ds.data_vars) == []
243+
assert ds.attrs["Conventions"] == "CF-1.11"
244+
245+
246+
def test_mixed_units_in_one_variable_warns():
247+
# Same (parameter, statistic) across two sites but different units -> one
248+
# variable can hold only one units attr; warn instead of silently mislabeling.
249+
a = _daily_frame(site="USGS-1", values=(100,), times=("2024-06-01",))
250+
b = _daily_frame(site="USGS-2", values=(3,), times=("2024-06-01",))
251+
b["unit_of_measure"] = "m3 s-1"
252+
with pytest.warns(UserWarning, match="spans multiple units"):
253+
ds = wdx._build_timeseries(
254+
pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META
255+
)
256+
assert ds.sizes["monitoring_location_id"] == 2
257+
assert ds["discharge"].attrs["units"] == "ft3 s-1" # first unit (ft^3/s)

0 commit comments

Comments
 (0)