fix: validate CTD numeric fields before sending to echosounder

beatfactor · beatfactor · commit 19cd9212a1d8 · 2026-04-13T17:50:01.000+02:00
- Coerce temperature/salinity/pressure/depth/sound_speed to float
  before including in ctdOutput payload
- Skip corrupt values (e.g. '1.!60') instead of forwarding as strings
- Add ADCP file patterns to watch_patterns config
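- README updates for CTD monitoring and config docs
- Add `backfill_minutes` config (default 0): on startup, only queue
  existing files modified within that window
- Skip AD2CP velocity processing with a warning when the oceanstream
  velocity reader cannot be imported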
diff --git a/Dockerfile.Linux.arm64 b/Dockerfile.Linux.arm64
@@ -10,6 +10,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 # Install oceanstream from git
 ARG OCEANSTREAM_BRANCH=main
 ARG OCEANSTREAM_REPO=https://github.com/OceanStreamIO/oceanstream-cli.git
+ARG OCEANSTREAM_CACHEBUST=1
 RUN git clone --depth 1 -b ${OCEANSTREAM_BRANCH} ${OCEANSTREAM_REPO} /tmp/oceanstream && \
     pip install --no-cache-dir "/tmp/oceanstream[geotrack,adcp]" && \
     rm -rf /tmp/oceanstream
diff --git a/README.md b/README.md
@@ -20,6 +20,7 @@ Sensor data (TCP/UDP stream, file drop, IoT Edge message)
 | Sea-Bird CNV | `.cnv` | pandas | Processed CTD with header metadata |
 | Sea-Bird HEX | `.hex` + `.hdr` + `.XMLCON` | seabirdscientific + gsw | Raw CTD frequency → T/C/P/S/depth |
 | RDI ADCP | `.raw` | dolfyn | Beam→earth transform, ensemble averaging, u/v/w velocities |
+| Nortek AD2CP | `.ad2cp` | oceanstream | Echosounder Sv (volume backscatter) and/or velocity data |
 | tar.gz | `.tar.gz`, `.tgz` | tarfile | Extracts and processes contained files |
 
 ## Quick Start
@@ -84,20 +85,89 @@ Configuration is driven by IoT Hub module twin desired properties. All `EdgeConf
 
 ## Configuration
 
-All config flows through the `EdgeConfig` dataclass in `config.py`:
+All config flows through the `EdgeConfig` dataclass in `config.py`. In IoT Edge mode, every field is readable/writable via module twin desired properties (changes apply live). In standalone mode, fields come from environment variables and CLI args.
 
-| Field | Default | Description |
-|-------|---------|-------------|
-| `input_mode` | `both` | `stream`, `file`, or `both` |
-| `stream_format` | `auto` | `nmea`, `csv`, `hex`, or `auto` |
-| `stream_port` | `9100` | TCP/UDP listen port |
-| `watch_dir` | `/data/sensor` | Directory to watch for new files |
-| `batch_interval_seconds` | `60` | Stream batch flush interval |
-| `telemetry_downsample_seconds` | `30` | Min interval between D2C messages |
-| `storage_backend` | `azure-blob-edge` | `azure-blob-edge` or `local` |
-| `output_base_path` | `/app/processed` | Output root directory |
+### Input Mode
 
-See `config.py` for the full list. Standalone mode uses env vars and CLI args; IoT Edge mode uses twin desired properties.
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `input_mode` | `stream` \| `file` \| `both` | `both` | Which ingest sources to activate |
+
+### Network Stream
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `stream_protocol` | `tcp` \| `udp` \| `auto` | `auto` | Network protocol for stream listener |
+| `stream_host` | string | `0.0.0.0` | Bind address (server mode) or remote host (client mode) |
+| `stream_port` | int | `9100` | Listen port (server) or connect port (client) |
+| `stream_format` | `nmea` \| `csv` \| `hex` \| `auto` | `auto` | Expected data format on the stream |
+| `stream_connect_mode` | `server` \| `client` | `server` | TCP server (listen) or client (connect to remote) |
+
+### File Watcher
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `watch_dir` | string | `/data/sensor` | Directory to watch for new files |
+| `watch_patterns` | string | `*.csv,*.txt,*.hex,*.cnv,*.raw,*.ad2cp,*.tar.gz` | Comma-separated glob patterns |
+| `watch_polling` | bool | `false` | Use polling instead of inotify (required for SMB/NFS mounts) |
+| `watch_poll_interval` | int | `2` | Seconds between polls when `watch_polling` is true |
+| `backfill_minutes` | int | `0` | On startup, queue files modified within the last N minutes. `0` = skip all existing files (only process new arrivals). Set to e.g. `60` to reprocess the last hour of data after a restart |
+
+### Batching
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `batch_interval_seconds` | int | `60` | Stream batch flush interval (seconds) |
+| `batch_max_records` | int | `1000` | Stream batch flush when this many records buffered |
+
+### Metadata
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `campaign_id` | string | `""` | Campaign identifier — used as blob container name and output path prefix |
+| `platform_id` | string | `""` | Platform/vessel identifier |
+| `platform_name` | string | `""` | Human-readable platform name |
+| `provider` | string | `auto` | Oceanstream data provider for enrichment (`auto`, `generic`, or a named provider) |
+
+### Telemetry
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `telemetry_interval_seconds` | int | `300` | Periodic telemetry summary interval |
+| `telemetry_send_records` | bool | `true` | Send individual sensor records to IoT Hub |
+| `telemetry_send_summaries` | bool | `true` | Send periodic summary messages to IoT Hub |
+| `telemetry_downsample_seconds` | int | `30` | Minimum interval between D2C telemetry messages |
+
+### CTD File Monitor
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `ctd_enabled` | bool | `false` | Enable polling a CTD file for latest readings |
+| `ctd_file_path` | string | `/mnt/ctd/latest_ctd.csv` | Path to the CTD CSV file (updated in place by logger) |
+| `ctd_poll_interval_seconds` | int | `30` | How often to read the CTD file |
+| `ctd_observatory` | string | `munkholmen` | Observatory name for CTD provider enrichment |
+
+### Storage
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `storage_backend` | `azure-blob-edge` \| `local` | `azure-blob-edge` | Output storage backend |
+| `output_base_path` | string | `/app/processed` | Root path for local storage backend |
+| `processed_container` | string | `sensordata` | Subfolder within the campaign container for processed output |
+
+### Logging
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `log_level` | string | `INFO` | Logging level (`DEBUG`, `INFO`, `WARNING`, `ERROR`) |
+
+### Startup Behaviour
+
+By default (`backfill_minutes: 0`), the module does **not** process existing files when it starts. It only processes data that arrives after startup: new files detected by the file watcher, records from the stream listener, and IoT Edge messages. This prevents reprocessing the entire dataset on every container restart.
+
+To backfill recent data after a restart, set `backfill_minutes` to the desired window (e.g. `60` for the last hour). Only files whose modification time falls within that window are queued. This is useful when the module was down briefly and you want to catch up on missed files.
+
+See `config.py` for implementation details.
 
 ## Connecting Sensors
 
@@ -203,18 +273,23 @@ Test data is stored in Azure Blob Storage (`sensorstream-test` container) and do
 
 **Output**: D2C telemetry to IoT Hub (rate-limited by `telemetry_downsample_seconds`), GeoParquet + metadata JSON to blob storage.
 
-**Twin**: All config fields are readable/writable via module twin. Changes are applied live without restart.
+**Twin**: All config fields are readable/writable via module twin desired properties. Changes are applied live without restart. Twin property names map 1:1 to config field names, except `Log_Level` → `log_level`.
+
+**Backfill on restart**: By default the module skips existing files. Set `backfill_minutes` in the twin to process recent files after a restart (e.g. `60` for the last hour).
 
 ## Environment Variables
 
 | Variable | Default | Description |
 |----------|---------|-------------|
-| `AZURE_CONNECTION_STRING` | — | Storage connection string (tests, E2E) |
+| `AZURE_STORAGE_CONNECTION_STRING` | — | Azure Blob Storage connection string (edge blob or cloud) |
 | `STORAGE_BACKEND` | `azure-blob-edge` | `local` for standalone |
 | `OUTPUT_BASE_PATH` | `/app/processed` | Output directory |
+| `PROCESSED_CONTAINER_NAME` | `sensordata` | Subfolder name within campaign container |
 | `WATCH_DIR` | `/data/sensor` | File watch directory |
+| `STREAM_HOST` | `0.0.0.0` | Stream listener bind address |
 | `STREAM_PORT` | `9100` | Stream listener port |
-| `CAMPAIGN_ID` | — | Campaign identifier |
+| `CAMPAIGN_ID` | — | Campaign identifier for output partitioning |
+| `PLATFORM_ID` | — | Platform/vessel identifier |
 | `LOG_LEVEL` | `INFO` | Logging level |
 
 ## Docker
diff --git a/config.py b/config.py
@@ -100,6 +100,7 @@ def _parse_dict(val: Any) -> Optional[Dict[str, Any]]:
     "telemetry_downsample_seconds",
     "watch_poll_interval",
     "ctd_poll_interval_seconds",
+    "backfill_minutes",
 }
 
 _FLOAT_FIELDS: set[str] = set()
@@ -126,6 +127,7 @@ class EdgeConfig:
     watch_patterns: str = "*.csv,*.txt,*.hex,*.cnv,*.raw,*.ad2cp,*.tar.gz"
     watch_polling: bool = False
     watch_poll_interval: int = 2
+    backfill_minutes: int = 0
 
     # --- Batching ---
     batch_interval_seconds: int = 60
diff --git a/exports/telemetry.py b/exports/telemetry.py
@@ -76,3 +76,24 @@ def send_record_telemetry(
     }
 
     send_to_hub(client, data=payload, output_name="output1")
+
+    # Forward CTD readings on a dedicated output for module-to-module routing
+    source = record.get("source", "")
+    if source in ("ctd_latest", "oceanlab_munkholmen") and any(
+        k in record for k in ("temperature", "salinity", "pressure")
+    ):
+        ctd_payload: dict = {
+            "type": "ctd_environment",
+            "time": record.get("time"),
+            "latitude": record.get("latitude"),
+            "longitude": record.get("longitude"),
+        }
+        # Coerce numeric CTD fields — skip corrupt values (e.g. "1.!60")
+        for key in ("temperature", "salinity", "pressure", "depth", "sound_speed"):
+            val = record.get(key)
+            if val is not None:
+                try:
+                    ctd_payload[key] = float(val)
+                except (ValueError, TypeError):
+                    pass  # omit corrupt value
+        send_to_hub(client, data=ctd_payload, output_name="ctdOutput")
diff --git a/ingest/adapter.py b/ingest/adapter.py
@@ -467,9 +467,16 @@ def parse_ad2cp_file(
                 "AD2CP %s has no echosounder data — trying velocity reader",
                 raw_path.name,
             )
-            from oceanstream.adcp.processor import (
-                process_ad2cp_velocity_file as _os_process_velocity,
-            )
+            try:
+                from oceanstream.adcp.processor import (
+                    process_ad2cp_velocity_file as _os_process_velocity,
+                )
+            except ImportError:
+                logger.warning(
+                    "AD2CP %s: velocity reader not yet available — skipping",
+                    raw_path.name,
+                )
+                return pd.DataFrame()
 
             try:
                 return _os_process_velocity(raw_path)
diff --git a/ingest/file_watcher.py b/ingest/file_watcher.py
@@ -93,25 +93,44 @@ async def stop(self) -> None:
         logger.info("File watcher stopped")
 
     async def scan_existing(self) -> int:
-        """Scan watch_dir for existing files and queue them.
+        """Scan watch_dir for recently modified files and queue them.
+
+        Only files modified within the last ``config.backfill_minutes`` are
+        queued.  If ``backfill_minutes`` is 0 (the default), no existing
+        files are queued — only newly arriving files trigger processing.
 
         Returns the number of files queued.
         """
+        backfill = self.config.backfill_minutes
+        if backfill <= 0:
+            return 0
+
         watch_path = Path(self.config.watch_dir)
         if not watch_path.exists():
             return 0
 
+        import time as _time
+        cutoff = _time.time() - backfill * 60
         patterns = [p.strip() for p in self.config.watch_patterns.split(",")]
         count = 0
+        skipped = 0
         for pattern in patterns:
             for file_path in sorted(watch_path.rglob(pattern)):
                 if file_path.is_file() and str(file_path) not in self._seen:
                     self._seen.add(str(file_path))
+                    if file_path.stat().st_mtime < cutoff:
+                        skipped += 1
+                        continue
                     await self.queue.put(("file", str(file_path)))
                     count += 1
 
+        if skipped:
+            logger.info(
+                "Skipped %d files older than %d min from %s",
+                skipped, backfill, watch_path,
+            )
         if count:
-            logger.info("Queued %d existing files from %s", count, watch_path)
+            logger.info("Queued %d recent files from %s", count, watch_path)
         return count
 
     async def _on_new_file(self, file_path: str, patterns: list[str]) -> None: