feat(waterdata.xarray): add to_awkward() ragged -> awkward view

thodson-usgs · claude · thodson-usgs · commit 1e771581b96b · 2026-06-05T10:21:25.000-05:00
Convert a ragged (dense=False) Dataset to a one-record-per-series
awkward.Array: row_size gives awkward's list counts and obs is its flat content, so
it is a near-zero-copy re-view -- each series carries its scalar identity
metadata plus jagged time/value/flag fields, no NaN fill, with per-series ops
vectorized across the whole collection (e.g. ak.mean(arr.value, axis=1)).

awkward is NOT a dependency: to_awkward lazy-imports it and raises an
informative ModuleNotFoundError ("pip install awkward") when absent. Object
columns route through ak.from_iter (NaN -> missing) so flags become a clean
option[string]; numeric/datetime content stays numpy.

Adds offline tests (importorskip awkward; the missing-dep error is tested by
simulating the absent import) and a demo note.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py
@@ -84,6 +84,7 @@
 __all__ = [
     "clear_metadata_cache",
     "select_series",
+    "to_awkward",
     "get_continuous",
     "get_daily",
     "get_field_measurements",
@@ -593,6 +594,63 @@ def select_series(ds, **keys):
     return series.drop_vars("row_size", errors="ignore")
 
 
+def to_awkward(ds):
+    """Convert a ragged (``dense=False``) Dataset to a per-series ``awkward.Array``.
+
+    The CF contiguous-ragged layout (``row_size`` counts + a flat ``obs``
+    dimension) maps directly onto awkward's jagged ``ListOffsetArray``,
+    so this is a near-zero-copy re-view: each timeseries instance becomes one
+    record carrying its per-series identity/metadata (scalar fields such as
+    ``monitoring_location_id`` / ``parameter_code`` / ``longitude``) plus its
+    observations as variable-length jagged fields (``time`` / ``value`` / flags).
+    No NaN fill, each series on its own time axis -- per-series operations then
+    vectorize across every series at once, e.g. ``ak.mean(arr.value, axis=1)``::
+
+        ds = wdx.get_daily(..., dense=False)
+        arr = wdx.to_awkward(ds)
+        ak.mean(arr.value, axis=1)  # per-series means
+        arr[arr.parameter_code == "00060"]  # filter series by metadata
+
+    ``awkward`` is an optional dependency that is *not* installed with
+    ``dataretrieval``; install it separately (``pip install awkward``).
+    """
+    try:
+        import awkward as ak
+    except ModuleNotFoundError as exc:  # pragma: no cover - exercised only sans awkward
+        raise ModuleNotFoundError(
+            "to_awkward requires the optional 'awkward' dependency, which is not "
+            "installed with dataretrieval. Install it with:  pip install awkward"
+        ) from exc
+    if "row_size" not in ds.variables or "obs" not in ds.dims:
+        raise ValueError(
+            "to_awkward expects a ragged Dataset (from dense=False); the default "
+            "dense Dataset is already a (monitoring_location_id, time) grid."
+        )
+    counts = ds["row_size"].to_numpy()
+
+    def _content(values):
+        # awkward rejects numpy object dtype; route string/None columns through
+        # from_iter (normalizing NaN -> missing), and keep numeric/datetime
+        # content as numpy -- that part is the zero-copy re-view.
+        if values.dtype == object:
+            return ak.from_iter([_none_if_nan(v) for v in values.tolist()])
+        return values
+
+    # Exactly-("timeseries",) variables/coords -> scalar record fields;
+    # exactly-("obs",) ones -> jagged fields split by the row_size counts. Any
+    # other dims (and row_size itself) are omitted from the records.
+    record = {}
+    for name in (*ds.data_vars, *ds.coords):
+        if name == "row_size":
+            continue
+        da = ds[name]
+        if da.dims == ("timeseries",):
+            record[name] = _content(da.to_numpy())
+        elif da.dims == ("obs",):
+            record[name] = ak.unflatten(_content(da.to_numpy()), counts)
+    return ak.zip(record, depth_limit=1)  # no broadcasting: one record per series
+
+
 # === column schemas ========================================================
 
 # Water-quality samples (Samples DB / WQX) speak a different column vocabulary
diff --git a/demos/waterdata_xarray_demo.ipynb b/demos/waterdata_xarray_demo.ipynb
@@ -271,6 +271,29 @@
     "regroup/decode the whole dataset for you."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### The whole collection at once: `to_awkward`\n",
+    "\n",
+    "For analysis across *all* series at once, convert the ragged dataset to an\n",
+    "[awkward](https://awkward-array.org/) array. The contiguous-ragged layout\n",
+    "(`row_size` offsets + a flat `obs` axis) *is* awkward's jagged layout, so this is\n",
+    "a near-zero-copy re-view: each series becomes one record (its metadata as scalar\n",
+    "fields, its observations jagged), and per-series operations vectorize with no NaN\n",
+    "fill. `awkward` is an optional dependency that is *not* installed with\n",
+    "`dataretrieval` (`pip install awkward`):\n",
+    "\n",
+    "```python\n",
+    "import awkward as ak\n",
+    "\n",
+    "arr = wdx.to_awkward(ragged)        # one record per series\n",
+    "ak.mean(arr.value, axis=1)          # per-series means, all at once\n",
+    "arr[arr.parameter_code == \"00060\"]  # filter series by metadata\n",
+    "```"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py
@@ -722,6 +722,53 @@ def test_select_series_on_dense_raises_helpful_error():
         wdx.select_series(dense, monitoring_location_id="USGS-1")
 
 
+def test_to_awkward_missing_dependency_raises_informative(monkeypatch):
+    # awkward is NOT a dependency; calling to_awkward without it must raise a
+    # clear, actionable error rather than a bare ImportError. (Simulated so the
+    # test holds whether or not awkward happens to be installed.)
+    import builtins
+
+    real_import = builtins.__import__
+
+    def fake_import(name, *args, **kwargs):
+        if name == "awkward":
+            raise ModuleNotFoundError("No module named 'awkward'")
+        return real_import(name, *args, **kwargs)  # every other import is untouched
+
+    monkeypatch.setattr(builtins, "__import__", fake_import)
+    ds = wdx._build_ragged(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    with pytest.raises(ModuleNotFoundError, match="pip install awkward"):
+        wdx.to_awkward(ds)
+
+
+def test_to_awkward_converts_ragged_to_jagged_records():
+    ak = pytest.importorskip("awkward")
+    ds = _two_instance_ragged()  # two series at USGS-1: 00060 and 00010
+    arr = wdx.to_awkward(ds)
+    assert len(arr) == ds.sizes["timeseries"]  # one record per series
+    # scalar identity fields + jagged observation fields
+    assert {"monitoring_location_id", "parameter_code", "value", "time"} <= set(
+        arr.fields
+    )
+    # faithful: per-series lengths == row_size, total obs preserved, no fill
+    assert ak.num(arr.value).tolist() == ds["row_size"].values.tolist()
+    assert int(ak.sum(ak.num(arr.value))) == ds.sizes["obs"]
+    # per-series reduction yields one value per series (length-only check)
+    means = ak.mean(arr.value, axis=1)
+    assert len(means) == len(arr)
+
+
+def test_to_awkward_on_dense_raises():
+    pytest.importorskip("awkward")  # to_awkward imports awkward before validating ds
+    dense = wdx._build_dense(
+        _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META
+    )
+    with pytest.raises(ValueError, match="expects a ragged Dataset"):
+        wdx.to_awkward(dense)
+
+
 # --- ragged opt-out wiring --------------------------------------------------