feat: update CSV to DSS conversion to use '15min' resampling and improve path handling

dwr-psandhu · Copilot · dwr-psandhu · commit 80a95d86ed6a · 2026-04-28T21:04:55.000-07:00
Co-authored-by: Copilot <copilot@github.com>
diff --git a/README.md b/README.md
@@ -138,10 +138,10 @@ pydsm csv-to-dss CSV_FILE DSS_FILE [OPTIONS]
 | `--unit` | `UNK` | Physical unit string |
 | `--period_type` | `INST-VAL` | Period type (e.g. `INST-VAL`, `PER-AVER`) |
 | `--multiplier` | `1.0` | Scale factor applied to all values |
-| `--resample_to` | `15T` | Pandas resample frequency (e.g. `15T`, `1H`, `1D`) |
+| `--resample_to` | `15min` | Pandas resample frequency (e.g. `15min`, `1h`, `1D`) |
 
 ```bash
-pydsm csv-to-dss observed_ec.csv observed_ec.dss --bpart RSAC075 --unit uS/cm --resample_to 15T
+pydsm csv-to-dss observed_ec.csv observed_ec.dss --cpart EC --unit uS/cm --resample_to 15min
 ```
 
 ---
diff --git a/pydsm/analysis/dssutils.py b/pydsm/analysis/dssutils.py
@@ -347,20 +347,32 @@ def csv_to_dss(
     unit="UNK",
     period_type="INST-VAL",
     multiplier=1.0,
-    resample_to="15T",
+    resample_to="15min",
 ):
+    """Convert a CSV file to a DSS file.
+
+    Column names in the CSV header are used as the B-part of each DSS path.
+    The index column (default 0) is parsed as the datetime index.
+    D and E parts are inferred from the time series.
+    """
     df = pd.read_csv(csv_file, index_col=index_col, parse_dates=True)
     df = df * multiplier
     df = df.resample(resample_to).mean()
-    for c in df.columns:
-        with pyhecdss.DSSFile(dss_file, create_new=True) as f:
-            for c in tqdm.tqdm(df.columns):
-                bpart = c
-                ts = df[c]
-                epart = ts.index.freqstr
-                pathname = f"/{apart}/{bpart}/{cpart}///{fpart}/"
-                print("Writing to ", pathname)
-                f.write_rts(pathname, ts, unit, period_type)
+    with pyhecdss.DSSFile(dss_file, create_new=True) as f:
+        for c in tqdm.tqdm(df.columns):
+            ts = df[c].dropna()
+            if ts.empty:
+                print(f"Skipping {c!r} — all values are NaN")
+                continue
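+            # dropna() can leave the index without a freq attribute; try to
+            # restore it so write_rts can determine the E-part from the index
+            # frequency.
+            # NOTE(review): pd.infer_freq needs at least 3 timestamps (it raises
+            # ValueError otherwise) and returns None for an irregular index, so
+            # a 2-row or gapped series is not covered by the guard below.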
+            if ts.index.freq is None and len(ts) > 1:
+                inferred = pd.infer_freq(ts.index)
+                if inferred is not None:
+                    ts.index.freq = pd.tseries.frequencies.to_offset(inferred)
+            pathname = f"/{apart}/{c}/{cpart}///{fpart}/"
+            print("Writing to ", pathname)
+            f.write_rts(pathname, ts, unit, period_type)
     print("Done")
 
 
diff --git a/pydsm/cli.py b/pydsm/cli.py
@@ -282,7 +282,7 @@ def pretty_print_input(input_file, output_file=None):
 @click.argument("dss_file", type=click.Path(exists=False))
 @click.option("--index_col", default=0, help="Column to use as index")
 @click.option("--apart", default="A", help="A part of the DSS path")
-@click.option("--bpart", default="F", help="B part of the DSS path")
+@click.option("--cpart", default="FLOW", help="C part (variable/parameter) of the DSS path. B parts are read from the CSV column headers.")
 @click.option("--fpart", default="F", help="F part of the DSS path")
 @click.option("--unit", default="UNK", help="Unit of the data")
 @click.option(
@@ -296,30 +296,32 @@ def pretty_print_input(input_file, output_file=None):
 )
 @click.option(
     "--resample_to",
-    default="15T",
-    help="Resample frequency for the time series (e.g., '15T' for 15 minutes)",
+    default="15min",
+    help="Resample frequency for the time series (e.g., '15min' for 15 minutes)",
 )
 def csv_to_dss(
     csv_file,
     dss_file,
     index_col=0,
     apart="A",
-    bpart="F",
+    cpart="FLOW",
     fpart="F",
     unit="UNK",
     period_type="INST-VAL",
     multiplier=1.0,
-    resample_to="15T",
+    resample_to="15min",
 ):
     """
     Convert a CSV file to a DSS file.
+
+    Column headers in the CSV are used as the B part of each DSS path.
     """
     dssutils.csv_to_dss(
         csv_file,
         dss_file,
         index_col,
         apart,
-        bpart,
+        cpart,
         fpart,
         unit,
         period_type,
diff --git a/tests/test_csv_to_dss.py b/tests/test_csv_to_dss.py
@@ -0,0 +1,174 @@
+"""Tests for dssutils.csv_to_dss — CSV header → DSS B-part conversion."""
+
+import textwrap
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+import pyhecdss
+from pydsm.analysis.dssutils import csv_to_dss
+
+DATA_DIR = Path(__file__).parent / "data"
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _read_dss_paths(dss_file):
+    """Return the list of DSS pathnames present in *dss_file*."""
+    with pyhecdss.DSSFile(str(dss_file)) as f:
+        return f.get_pathnames(f.read_catalog())
+
+
+def _read_dss_ts(dss_file, pathname):
+    """Read a single time series from *dss_file* and return a Series."""
+    with pyhecdss.DSSFile(str(dss_file)) as f:
+        ts, _unit, _period_type = f.read_rts(pathname)
+    if isinstance(ts, pd.DataFrame):
+        ts = ts.iloc[:, 0]
+    if isinstance(ts.index, pd.PeriodIndex):
+        ts.index = ts.index.to_timestamp()
+    return ts
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+@pytest.fixture
+def daily_csv(tmp_path):
+    """A minimal CSV with three columns and a daily DatetimeIndex."""
+    content = textwrap.dedent("""\
+        ,flow_sac,flow_sjr,ec_bdl
+        2020-01-01,100.0,50.0,200.0
+        2020-01-02,110.0,55.0,210.0
+        2020-01-03,120.0,60.0,220.0
+        2020-01-04,130.0,65.0,230.0
+        2020-01-05,140.0,70.0,240.0
+    """)
+    p = tmp_path / "test_input.csv"
+    p.write_text(content)
+    return p
+
+
+@pytest.fixture
+def csv_with_nans(tmp_path):
+    """CSV where one column starts with a NaN; only that value should be dropped."""
+    content = textwrap.dedent("""\
+        ,flow_a,flow_b
+        2020-01-01,100.0,
+        2020-01-02,110.0,55.0
+        2020-01-03,120.0,60.0
+    """)
+    p = tmp_path / "test_nans.csv"
+    p.write_text(content)
+    return p
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+class TestCsvToDssColumnHeaders:
+    """Column names become B-parts in the DSS path."""
+
+    def test_paths_match_column_names(self, daily_csv, tmp_path):
+        dss = tmp_path / "out.dss"
+        csv_to_dss(str(daily_csv), str(dss), resample_to="1D")
+        paths = _read_dss_paths(dss)
+        bparts = [p.split("/")[2].lower() for p in paths]
+        assert "flow_sac" in bparts
+        assert "flow_sjr" in bparts
+        assert "ec_bdl" in bparts
+
+    def test_number_of_paths(self, daily_csv, tmp_path):
+        dss = tmp_path / "out.dss"
+        csv_to_dss(str(daily_csv), str(dss), resample_to="1D")
+        paths = _read_dss_paths(dss)
+        assert len(paths) == 3
+
+    def test_apart_used(self, daily_csv, tmp_path):
+        dss = tmp_path / "out.dss"
+        csv_to_dss(str(daily_csv), str(dss), apart="MYAPART", resample_to="1D")
+        paths = _read_dss_paths(dss)
+        assert all(p.split("/")[1].upper() == "MYAPART" for p in paths)
+
+    def test_cpart_used(self, daily_csv, tmp_path):
+        dss = tmp_path / "out.dss"
+        csv_to_dss(str(daily_csv), str(dss), cpart="FLOW", resample_to="1D")
+        paths = _read_dss_paths(dss)
+        assert all(p.split("/")[3].upper() == "FLOW" for p in paths)
+
+    def test_fpart_used(self, daily_csv, tmp_path):
+        dss = tmp_path / "out.dss"
+        csv_to_dss(str(daily_csv), str(dss), fpart="VER1", resample_to="1D")
+        paths = _read_dss_paths(dss)
+        assert all(p.split("/")[6].upper() == "VER1" for p in paths)
+
+
+class TestCsvToDssValues:
+    """Data values are written correctly."""
+
+    def test_values_match_input(self, daily_csv, tmp_path):
+        dss = tmp_path / "out.dss"
+        csv_to_dss(str(daily_csv), str(dss), resample_to="1D")
+        paths = _read_dss_paths(dss)
+        sac_path = next(p for p in paths if "/FLOW_SAC/" in p)
+        ts = _read_dss_ts(dss, sac_path)
+        assert pytest.approx(ts.iloc[0], rel=1e-4) == 100.0
+        assert pytest.approx(ts.iloc[-1], rel=1e-4) == 140.0
+
+    def test_multiplier_applied(self, daily_csv, tmp_path):
+        dss = tmp_path / "out.dss"
+        csv_to_dss(str(daily_csv), str(dss), multiplier=2.0, resample_to="1D")
+        paths = _read_dss_paths(dss)
+        sac_path = next(p for p in paths if "/FLOW_SAC/" in p)
+        ts = _read_dss_ts(dss, sac_path)
+        assert pytest.approx(ts.iloc[0], rel=1e-4) == 200.0
+
+    def test_nan_rows_dropped(self, csv_with_nans, tmp_path):
+        """NaN values must not be written (dropna() before write_rts)."""
+        dss = tmp_path / "out.dss"
+        csv_to_dss(str(csv_with_nans), str(dss), resample_to="1D")
+        paths = _read_dss_paths(dss)
+        flow_b_path = next((p for p in paths if "/FLOW_B/" in p), None)
+        assert flow_b_path is not None
+        ts = _read_dss_ts(dss, flow_b_path)
+        assert ts.isna().sum() == 0
+
+    def test_all_nan_column_skipped(self, tmp_path):
+        """A column that is entirely NaN must be skipped (no DSS path written)."""
+        content = textwrap.dedent("""\
+            ,flow_a,all_nan
+            2020-01-01,100.0,
+            2020-01-02,110.0,
+            2020-01-03,120.0,
+        """)
+        csv_file = tmp_path / "all_nan.csv"
+        csv_file.write_text(content)
+        dss = tmp_path / "all_nan.dss"
+        csv_to_dss(str(csv_file), str(dss), resample_to="1D")
+        paths = _read_dss_paths(dss)
+        bparts = [p.split("/")[2].lower() for p in paths]
+        assert "flow_a" in bparts
+        assert "all_nan" not in bparts
+
+
+class TestCsvToDssSingleColumn:
+    """A single-column CSV writes one path whose B-part equals the column name."""
+
+    def test_single_column(self, tmp_path):
+        content = textwrap.dedent("""\
+            datetime,my_station
+            2021-06-01,500.0
+            2021-06-02,510.0
+        """)
+        csv_file = tmp_path / "single.csv"
+        csv_file.write_text(content)
+        dss = tmp_path / "single.dss"
+        csv_to_dss(str(csv_file), str(dss), resample_to="1D")
+        paths = _read_dss_paths(dss)
+        assert len(paths) == 1
+        assert paths[0].split("/")[2].lower() == "my_station"