Skip to content

Commit 1e77158

Browse files
thodson-usgsclaude
andcommitted
feat(waterdata.xarray): add to_awkward() ragged -> awkward view
Convert a ragged (dense=False) Dataset to a one-record-per-series awkward.Array: row_size is awkward's offsets and obs is its flat content, so it is a near-zero-copy re-view -- each series carries its scalar identity metadata plus jagged time/value/flag fields, no NaN fill, with per-series ops vectorized across the whole collection (e.g. ak.mean(arr.value, axis=1)). awkward is NOT a dependency: to_awkward lazy-imports it and raises an informative ModuleNotFoundError ("pip install awkward") when absent. Object columns route through ak.from_iter (NaN -> missing) so flags become a clean option[string]; numeric/datetime content stays numpy. Adds offline tests (importorskip awkward; the missing-dep error is tested by simulating the absent import) and a demo note. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 961be13 commit 1e77158

3 files changed

Lines changed: 128 additions & 0 deletions

File tree

dataretrieval/waterdata/xarray.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@
8484
__all__ = [
8585
"clear_metadata_cache",
8686
"select_series",
87+
"to_awkward",
8788
"get_continuous",
8889
"get_daily",
8990
"get_field_measurements",
@@ -593,6 +594,63 @@ def select_series(ds, **keys):
593594
return series.drop_vars("row_size", errors="ignore")
594595

595596

597+
def to_awkward(ds):
598+
"""Convert a ragged (``dense=False``) Dataset to a per-series ``awkward.Array``.
599+
600+
The CF contiguous-ragged layout (``row_size`` offsets + a flat ``obs``
601+
dimension) is structurally identical to awkward's jagged ``ListOffsetArray``,
602+
so this is a near-zero-copy re-view: each timeseries instance becomes one
603+
record carrying its per-series identity/metadata (scalar fields such as
604+
``monitoring_location_id`` / ``parameter_code`` / ``longitude``) plus its
605+
observations as variable-length jagged fields (``time`` / ``value`` / flags).
606+
No NaN fill, each series on its own time axis -- per-series operations then
607+
vectorize across every series at once, e.g. ``ak.mean(arr.value, axis=1)``::
608+
609+
ds = wdx.get_daily(..., dense=False)
610+
arr = wdx.to_awkward(ds)
611+
ak.mean(arr.value, axis=1) # per-series means
612+
arr[arr.parameter_code == "00060"] # filter series by metadata
613+
614+
``awkward`` is an optional dependency that is *not* installed with
615+
``dataretrieval``; install it separately (``pip install awkward``).
616+
"""
617+
try:
618+
import awkward as ak
619+
except ModuleNotFoundError as exc: # pragma: no cover - exercised only sans awkward
620+
raise ModuleNotFoundError(
621+
"to_awkward requires the optional 'awkward' dependency, which is not "
622+
"installed with dataretrieval. Install it with: pip install awkward"
623+
) from exc
624+
if "row_size" not in ds.variables or "obs" not in ds.dims:
625+
raise ValueError(
626+
"to_awkward expects a ragged Dataset (from dense=False); the default "
627+
"dense Dataset is already a (monitoring_location_id, time) grid."
628+
)
629+
counts = ds["row_size"].to_numpy()
630+
631+
def _content(values):
632+
# awkward rejects numpy object dtype; route string/None columns through
633+
# from_iter (normalizing NaN -> missing), and keep numeric/datetime
634+
# content as numpy -- that part is the zero-copy re-view.
635+
if values.dtype == object:
636+
return ak.from_iter([_none_if_nan(v) for v in values.tolist()])
637+
return values
638+
639+
# Per-series (timeseries-dim) coords -> scalar record fields; obs-dim
640+
# variables/coords -> jagged fields (unflattened by row_size). ``row_size``
641+
# itself is the offsets, already encoded in the jagged structure.
642+
record = {}
643+
for name in (*ds.data_vars, *ds.coords):
644+
if name == "row_size":
645+
continue
646+
da = ds[name]
647+
if da.dims == ("timeseries",):
648+
record[name] = _content(da.to_numpy())
649+
elif da.dims == ("obs",):
650+
record[name] = ak.unflatten(_content(da.to_numpy()), counts)
651+
return ak.zip(record, depth_limit=1)
652+
653+
596654
# === column schemas ========================================================
597655

598656
# Water-quality samples (Samples DB / WQX) speak a different column vocabulary

demos/waterdata_xarray_demo.ipynb

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,29 @@
271271
"regroup/decode the whole dataset for you."
272272
]
273273
},
274+
{
275+
"cell_type": "markdown",
276+
"metadata": {},
277+
"source": [
278+
"### The whole collection at once: `to_awkward`\n",
279+
"\n",
280+
"For analysis across *all* series at once, convert the ragged dataset to an\n",
281+
"[awkward](https://awkward-array.org/) array. The contiguous-ragged layout\n",
282+
"(`row_size` offsets + a flat `obs` axis) *is* awkward's jagged layout, so this is\n",
283+
"a near-zero-copy re-view: each series becomes one record (its metadata as scalar\n",
284+
"fields, its observations jagged), and per-series operations vectorize with no NaN\n",
285+
"fill. `awkward` is an optional dependency that is *not* installed with\n",
286+
"`dataretrieval` (`pip install awkward`):\n",
287+
"\n",
288+
"```python\n",
289+
"import awkward as ak\n",
290+
"\n",
291+
"arr = wdx.to_awkward(ragged) # one record per series\n",
292+
"ak.mean(arr.value, axis=1) # per-series means, all at once\n",
293+
"arr[arr.parameter_code == \"00060\"] # filter series by metadata\n",
294+
"```"
295+
]
296+
},
274297
{
275298
"cell_type": "markdown",
276299
"metadata": {},

tests/waterdata_xarray_test.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -722,6 +722,53 @@ def test_select_series_on_dense_raises_helpful_error():
722722
wdx.select_series(dense, monitoring_location_id="USGS-1")
723723

724724

725+
def test_to_awkward_missing_dependency_raises_informative(monkeypatch):
726+
# awkward is NOT a dependency; calling to_awkward without it must raise a
727+
# clear, actionable error rather than a bare ImportError. (Simulated so the
728+
# test holds whether or not awkward happens to be installed.)
729+
import builtins
730+
731+
real_import = builtins.__import__
732+
733+
def fake_import(name, *args, **kwargs):
734+
if name == "awkward":
735+
raise ModuleNotFoundError("No module named 'awkward'")
736+
return real_import(name, *args, **kwargs)
737+
738+
monkeypatch.setattr(builtins, "__import__", fake_import)
739+
ds = wdx._build_ragged(
740+
_daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
741+
)
742+
with pytest.raises(ModuleNotFoundError, match="pip install awkward"):
743+
wdx.to_awkward(ds)
744+
745+
746+
def test_to_awkward_converts_ragged_to_jagged_records():
747+
ak = pytest.importorskip("awkward")
748+
ds = _two_instance_ragged() # two series at USGS-1: 00060 and 00010
749+
arr = wdx.to_awkward(ds)
750+
assert len(arr) == ds.sizes["timeseries"] # one record per series
751+
# scalar identity fields + jagged observation fields
752+
assert {"monitoring_location_id", "parameter_code", "value", "time"} <= set(
753+
arr.fields
754+
)
755+
# faithful: per-series lengths == row_size, total obs preserved, no fill
756+
assert ak.num(arr.value).tolist() == ds["row_size"].values.tolist()
757+
assert int(ak.sum(ak.num(arr.value))) == ds.sizes["obs"]
758+
# per-series reductions vectorize across all series at once
759+
means = ak.mean(arr.value, axis=1)
760+
assert len(means) == len(arr)
761+
762+
763+
def test_to_awkward_on_dense_raises():
764+
pytest.importorskip("awkward")
765+
dense = wdx._build_dense(
766+
_daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
767+
)
768+
with pytest.raises(ValueError, match="expects a ragged Dataset"):
769+
wdx.to_awkward(dense)
770+
771+
725772
# --- ragged opt-out wiring --------------------------------------------------
726773

727774

0 commit comments

Comments
 (0)