|
2 | 2 |
|
3 | 3 | Groups raw files by UTC date and concatenates them for daily processing, |
4 | 4 | enabling efficient denoising and MVBS/NASC computation over 24-hour periods. |
| 5 | +
|
| 6 | +Includes utilities for: |
| 7 | +- Pulse-category splitting (short_pulse / long_pulse) based on frequency |
| 8 | + combinations in processed Zarr stores. |
| 9 | +- Time-window batch grouping for multi-day concatenation windows. |
5 | 10 | """ |
6 | 11 |
|
7 | 12 | from __future__ import annotations |
8 | 13 |
|
9 | 14 | import logging |
10 | 15 | import re |
11 | 16 | from collections import defaultdict |
12 | | -from datetime import datetime |
| 17 | +from datetime import datetime, timedelta |
13 | 18 | from pathlib import Path |
14 | 19 | from typing import TYPE_CHECKING, Optional |
15 | 20 |
|
@@ -394,4 +399,139 @@ def merge_location_data(ds: "xr.Dataset", location_data: list[dict]) -> "xr.Data |
394 | 399 | merged = merged.reset_coords("time", drop=True) |
395 | 400 |
|
396 | 401 | return merged |
397 | | - return dt |
| 402 | + |
| 403 | + |
| 404 | +# ============================================================================ |
| 405 | +# Pulse-category splitting utilities |
| 406 | +# ============================================================================ |
| 407 | + |
# Pulse categories recognised for Saildrone EK80 data.
#
# Each entry maps a category name to a small settings dict.  Its "freq_key"
# is the comma-joined list of ascending frequency_nominal values, each
# written with one decimal place (e.g. "38000.0,200000.0"), that identifies
# the category.  A "freq_key" of None marks the catch-all category.
PULSE_CATEGORY_CONFIG: dict[str, dict[str, Optional[str]]] = {
    name: {"freq_key": freq_key}
    for name, freq_key in (
        ("short_pulse", "38000.0,200000.0"),
        ("long_pulse", "38000.0"),
        ("exported_ds", None),  # catch-all
    )
}
| 415 | + |
| 416 | + |
def detect_pulse_category(ds: "xr.Dataset") -> str:
    """Classify a Sv dataset into a pulse category.

    The unique frequency values of the dataset are sorted, formatted with
    one decimal place and comma-joined; the resulting string is compared
    with the ``"freq_key"`` of every entry in ``PULSE_CATEGORY_CONFIG``.
    With the Saildrone EK80 configuration this yields:

    - ``"short_pulse"`` → 38 kHz + 200 kHz (dual-frequency, short CW pulse)
    - ``"long_pulse"``  → 38 kHz only (single-frequency, long CW pulse)
    - ``"exported_ds"`` → anything else

    Frequencies are read from ``frequency_nominal`` when the dataset has
    it, otherwise from the values of the ``channel`` dimension.  A dataset
    with neither, or whose values cannot be converted to floats, is
    classified as ``"exported_ds"`` instead of raising.

    Args:
        ds: Sv xarray.Dataset with a ``frequency_nominal`` coordinate or
            variable.

    Returns:
        The name of the matching category; with the current configuration
        one of ``"short_pulse"``, ``"long_pulse"``, or ``"exported_ds"``.
    """
    import numpy as np

    if "frequency_nominal" in ds:
        raw_values = ds["frequency_nominal"].values
    elif "channel" in ds.dims:
        raw_values = ds["channel"].values
    else:
        return "exported_ds"

    try:
        # np.unique flattens its input and returns the values sorted, so no
        # separate sort is needed.
        freqs = np.unique(np.asarray(raw_values).astype(float))
    except (TypeError, ValueError):
        # Non-numeric values (presumably descriptive channel labels) carry
        # no usable frequency, so this is the "anything else" case.
        return "exported_ds"

    freq_str = ",".join(f"{f:.1f}" for f in freqs)

    # Exact matches win regardless of where the catch-all entry sits in the
    # configuration; the first catch-all (freq_key None) is only a fallback.
    catch_all: Optional[str] = None
    for category, cfg in PULSE_CATEGORY_CONFIG.items():
        freq_key = cfg["freq_key"]
        if freq_key is None:
            if catch_all is None:
                catch_all = category
        elif freq_key == freq_str:
            return category

    return catch_all if catch_all is not None else "exported_ds"
| 454 | + |
| 455 | + |
def group_by_pulse_category(
    paths: list[Path],
) -> dict[str, list[Path]]:
    """Group Zarr store paths by pulse category.

    Each store is opened with ``xarray.open_zarr``, classified with
    ``detect_pulse_category`` and closed again before the next store is
    opened, so no store handles accumulate while a long list is processed.

    A store that cannot be opened or classified is not dropped: a warning
    (including the traceback) is logged and the path is assigned to
    ``"exported_ds"``.

    Args:
        paths: List of Sv Zarr store paths.

    Returns:
        ``{category: [path, ...]}`` mapping.  Only categories that received
        at least one path are present; within a category the paths keep
        their input order.  An empty input yields an empty dict.
    """
    if not paths:
        # Nothing to classify; skip the xarray import entirely.
        return {}

    import xarray as xr

    groups: dict[str, list[Path]] = defaultdict(list)
    for p in paths:
        try:
            # The context manager closes the dataset once it is classified.
            with xr.open_zarr(p) as ds:
                cat = detect_pulse_category(ds)
        except Exception:
            # Deliberate best-effort: one unreadable store must not abort
            # the whole grouping, but the reason is kept in the log.
            logger.warning(
                "Could not classify %s, assigning to exported_ds",
                p,
                exc_info=True,
            )
            cat = "exported_ds"
        groups[cat].append(p)
    return dict(groups)
| 482 | + |
| 483 | + |
| 484 | +# ============================================================================ |
| 485 | +# Time-window batch grouping |
| 486 | +# ============================================================================ |
| 487 | + |
def batch_key(
    ts: datetime,
    window_days: int = 1,
) -> str:
    """Return a filename-safe key that anchors *ts* to a fixed time window.

    Only the calendar date of *ts* (its year, month and day) is used; the
    time of day and any timezone information are ignored.

    Multi-day windows lie on a fixed grid of consecutive, non-overlapping
    ``window_days``-long spans counted from 0001-01-01, so every timestamp
    falling in the same span gets the same key.  The window is therefore
    not centred on *ts*.

    Args:
        ts: Timestamp (usually a file start time).
        window_days: Width of the batching window in days.  Values of 1 or
            less are treated as a single-day window.

    Returns:
        ``"YYYY-MM-DD"`` for single-day windows, or
        ``"YYYY-MM-DD_to_YYYY-MM-DD"`` (first and last day, inclusive) for
        multi-day windows.

    Examples:
        >>> batch_key(datetime(2023, 8, 10), 1)
        '2023-08-10'
        >>> batch_key(datetime(2023, 8, 10), 3)
        '2023-08-10_to_2023-08-12'
        >>> batch_key(datetime(2023, 8, 12, 23, 59), 3)
        '2023-08-10_to_2023-08-12'
        >>> batch_key(datetime(2023, 8, 10), 7)
        '2023-08-07_to_2023-08-13'
    """
    # Midnight of the timestamp's calendar day.
    anchor = datetime(ts.year, ts.month, ts.day)

    if window_days <= 1:
        return f"{anchor:%Y-%m-%d}"

    # Floor to the start of the fixed window containing the day: count whole
    # days since datetime.min (0001-01-01) and step back by the remainder.
    days_into_window = (anchor - datetime.min).days % window_days
    window_start = anchor - timedelta(days=days_into_window)
    window_end = window_start + timedelta(days=window_days - 1)
    return f"{window_start:%Y-%m-%d}_to_{window_end:%Y-%m-%d}"
| 517 | + |
| 518 | + |
def group_by_time_window(
    files: list[tuple[Path, datetime]],
    window_days: int = 1,
) -> dict[str, list[Path]]:
    """Bucket files by the time window their start time falls into.

    The bucket label of each file is ``batch_key(start_time, window_days)``.

    Args:
        files: List of ``(path, start_time)`` tuples.
        window_days: Width of each batch window in days.

    Returns:
        ``{batch_key_str: [path, ...]}`` mapping whose keys are in ascending
        string order; paths inside a bucket keep their input order.
    """
    buckets: dict[str, list[Path]] = {}
    for entry in files:
        file_path, start_time = entry
        label = batch_key(start_time, window_days)
        buckets.setdefault(label, []).append(file_path)

    # Rebuild the dict so that iteration follows the sorted labels.
    return {label: buckets[label] for label in sorted(buckets)}
0 commit comments