feat(monitor): per-node nvidia-smi GPU performance monitoring

Aryan · Aryan · commit 8ddd99b06859 · 2026-05-28T10:40:08.000-07:00
Squashed rebase of NVIDIA/srt-slurm PR NVIDIA#35 (kdhruv/gweperf_integration) onto current main (which now includes default_bash_preamble, added since PR NVIDIA#35 was opened on 2026-04-13). Original PR NVIDIA#35 had three commits; their net effect is collapsed here into one because the second commit replaced the first's gweperf integration with a built-in poller. Adds: - src/srtctl/monitor/perfmon.py (new) - nvidia-smi polling, with per-node perf_samples_<node>.csv + perf_summary_<node>.json output. - MonitoringConfig in src/srtctl/core/schema.py (new) - {enabled, sample_interval}, exposed as the top-level SrtConfig.monitoring field. - _start_perf_monitor / _stop_perf_monitor in BenchmarkStageMixin (new) - one process per worker node, started before the benchmark and stopped via SIGINT with a 30s grace period. - tests/test_monitoring.py (new) - 19 tests, all passing upstream. Consumed by SemiAnalysisAI/InferenceX#1574 via the pinned ref SemiAnalysisAI/srt-slurm@feat/inferencex-perfmon. This fork pin will be reverted to NVIDIA/srt-slurm@main once PR NVIDIA#35 merges upstream.
diff --git a/src/srtctl/cli/mixins/benchmark_stage.py b/src/srtctl/cli/mixins/benchmark_stage.py
@@ -9,6 +9,8 @@
 
 import logging
 import shlex
+import signal
+import subprocess
 import threading
 import time
 from pathlib import Path
@@ -159,17 +161,95 @@ def run_benchmark(
 
         logger.info("Running %s benchmark", runner.name)
 
+        # Start perf monitoring on all worker nodes (non-fatal if it fails)
+        perf_procs = self._start_perf_monitor()
+
         # Run the benchmark script
         benchmark_log = self.runtime.log_dir / "benchmark.out"
         exit_code = self._run_benchmark_script(runner, benchmark_log, stop_event)
 
+        # Stop monitoring regardless of benchmark outcome
+        self._stop_perf_monitor(perf_procs)
+
         if exit_code != 0:
             logger.error("Benchmark failed with exit code %d", exit_code)
         else:
             logger.info("Benchmark completed successfully")
 
         return exit_code
 
+    def _start_perf_monitor(self) -> list[tuple[str, "subprocess.Popen"]]:
+        """Launch the GPU perf monitor on every worker node.
+
+        Returns an empty list without launching anything when
+        ``self.config.monitoring`` is unset or disabled, or when the runtime
+        lists no worker nodes. A node whose launch raises is logged as a
+        warning and skipped, so a monitoring problem never aborts the benchmark.
+
+        Returns:
+            ``(node, process)`` pairs, one for each node whose launch call
+            returned without raising.
+        """
+        monitoring = self.config.monitoring
+        if monitoring is None or not monitoring.enabled:
+            return []
+
+        targets = list(self.runtime.nodes.worker)
+        if not targets:
+            logger.warning("No worker nodes to monitor")
+            return []
+
+        # Expose the perfmon script shipped in this package at a fixed path
+        # inside the container by adding it to a copy of the runtime mounts.
+        script_on_host = Path(__file__).parents[2] / "monitor" / "perfmon.py"
+        container_mounts = dict(self.runtime.container_mounts)
+        container_mounts[script_on_host] = Path("/tmp/srt_perfmon.py")
+
+        started: list[tuple[str, subprocess.Popen]] = []
+        for node in targets:
+            # Output file names carry the node name so per-node results never collide.
+            launch_cmd = ["python3", "/tmp/srt_perfmon.py"]
+            launch_cmd += ["--output-csv", f"/logs/perf_samples_{node}.csv"]
+            launch_cmd += ["--output-json", f"/logs/perf_summary_{node}.json"]
+            launch_cmd += ["--interval", str(monitoring.sample_interval)]
+            monitor_log = self.runtime.log_dir / f"perf_monitor_{node}.out"
+            try:
+                handle = start_srun_process(
+                    command=launch_cmd,
+                    nodelist=[node],
+                    output=str(monitor_log),
+                    container_image=str(self.runtime.container_image),
+                    container_mounts=container_mounts,
+                )
+                started.append((node, handle))
+                logger.info("perf monitor started on %s (interval=%.1fs)", node, monitoring.sample_interval)
+            except Exception as err:
+                # Deliberate best-effort: monitoring must never block the benchmark.
+                logger.warning("Failed to start perf monitor on %s: %s - monitoring skipped for this node", node, err)
+
+        return started
+
+    def _stop_perf_monitor(self, procs: list[tuple[str, "subprocess.Popen"]]) -> None:
+        """Stop all perfmon processes, allowing each to write its summary JSON.
+
+        Every live process is sent SIGINT first (perfmon's handler ends its
+        sampling loop), and only then are the processes waited on against one
+        shared 30s deadline. Signalling all nodes up front means the monitors
+        stop sampling at the same moment and the total shutdown time is bounded
+        by roughly 30s instead of 30s per node. Any process still running at
+        the deadline is killed.
+
+        Args:
+            procs: ``(node, process)`` pairs as returned by ``_start_perf_monitor``.
+                An empty list is a no-op.
+        """
+        if not procs:
+            return
+
+        logger.info("Stopping perf monitoring on %d node(s)", len(procs))
+
+        # NOTE(review): these handles are presumably srun client processes. Stock
+        # srun treats a lone SIGINT as a task-status request and only forwards it
+        # to tasks on a second SIGINT within one second (or with --disable-status)
+        # - confirm how start_srun_process configures srun, otherwise perfmon
+        # never sees the signal and is killed at the deadline without a summary.
+
+        # Phase 1: signal everything that is still alive.
+        signalled: list[tuple[str, subprocess.Popen]] = []
+        for node, proc in procs:
+            if proc.poll() is not None:
+                logger.warning("perf monitor on %s already exited (code %d)", node, proc.returncode)
+                continue
+            try:
+                proc.send_signal(signal.SIGINT)
+            except ProcessLookupError:
+                logger.warning("perf monitor on %s vanished before SIGINT", node)
+                continue
+            signalled.append((node, proc))
+
+        # Phase 2: wait against a single deadline shared by all nodes. A zero
+        # timeout still polls once, so an already-finished process is reported
+        # as a clean stop rather than a timeout.
+        deadline = time.monotonic() + 30
+        for node, proc in signalled:
+            try:
+                proc.wait(timeout=max(0.0, deadline - time.monotonic()))
+                logger.info("perf monitor on %s stopped cleanly", node)
+            except subprocess.TimeoutExpired:
+                logger.warning("perf monitor on %s did not stop within 30s, killing", node)
+                proc.kill()
+                proc.wait()
+
     def _run_benchmark_script(
         self,
         runner: "BenchmarkRunner",
diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py
@@ -1227,6 +1227,25 @@ class OutputConfig:
     Schema: ClassVar[type[Schema]] = Schema
 
 
+@dataclass(frozen=True)
+class MonitoringConfig:
+    """Built-in GPU performance monitoring during benchmark execution.
+
+    When enabled, one perfmon process runs per worker node and writes per-node
+    output files to the job log directory:
+      - perf_samples_{node}.csv   time-series sampled every ``sample_interval`` seconds
+                                  (GPU util, memory, power, temp)
+      - perf_summary_{node}.json  per-GPU averages over the benchmark window
+
+    Uses nvidia-smi — no external dependencies required.
+    Failures are non-fatal: monitoring is skipped for affected nodes, benchmark continues.
+
+    NOTE(review): the set of monitored nodes is whatever the runtime reports as
+    worker nodes; whether that excludes the head node is not visible here — confirm.
+    """
+
+    # Switch for the feature once a ``monitoring`` section is present. Leaving
+    # the section out entirely (``SrtConfig.monitoring`` is None) also disables it.
+    enabled: bool = True
+    # Seconds between nvidia-smi polls; forwarded to perfmon as ``--interval``.
+    # Not validated here — presumably must be > 0; confirm whether the schema
+    # should enforce that.
+    sample_interval: float = 1.0
+
+    Schema: ClassVar[type[Schema]] = Schema
+
+
 @dataclass(frozen=True)
 class HealthCheckConfig:
     """Health check configuration."""
@@ -1307,6 +1326,9 @@ class SrtConfig:
     # Reporting configuration (status API, future: logs to S3, etc.)
     reporting: ReportingConfig | None = None
 
+    # Built-in GPU performance monitoring (runs on all worker nodes during benchmark)
+    monitoring: MonitoringConfig | None = None
+
     Schema: ClassVar[type[Schema]] = Schema
 
     def __post_init__(self):
diff --git a/src/srtctl/monitor/perfmon.py b/src/srtctl/monitor/perfmon.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Lightweight GPU performance monitor.
+
+Polls nvidia-smi at a fixed interval and writes:
+  - per-second CSV samples   (--output-csv)
+  - aggregate summary JSON   (--output-json, written on SIGINT/exit)
+
+Usage:
+    python3 perfmon.py --output-csv /logs/perf_samples_node1.csv \\
+                       --output-json /logs/perf_summary_node1.json \\
+                       --interval 1.0
+"""
+
+import argparse
+import csv
+import json
+import signal
+import subprocess
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+
+# Properties requested from nvidia-smi via --query-gpu for every sample.
+_QUERY = "index,utilization.gpu,memory.used,memory.total,power.draw,temperature.gpu"
+# Record keys for the parsed columns. They are zipped positionally with the
+# nvidia-smi output columns, so this list must keep the same length and order
+# as the properties in _QUERY.
+_FIELDS = ["gpu", "util_pct", "mem_used_mb", "mem_total_mb", "power_w", "temp_c"]
+
+
+def _sample(timeout: float = 10.0) -> list[dict]:
+    """Take one nvidia-smi reading for every GPU on this node.
+
+    Args:
+        timeout: Maximum seconds to wait for nvidia-smi. Without a bound, a
+            wedged nvidia-smi would block the monitor loop indefinitely and
+            prevent it from reacting to a stop request.
+
+    Returns:
+        One dict per GPU keyed by ``_FIELDS``, with values as the raw stripped
+        strings printed by nvidia-smi. Returns an empty list when nvidia-smi
+        cannot be run, exits non-zero, or times out, so a transient failure
+        costs one sample instead of ending the monitor.
+    """
+    try:
+        out = subprocess.check_output(
+            ["nvidia-smi", f"--query-gpu={_QUERY}", "--format=csv,noheader,nounits"],
+            text=True,
+            timeout=timeout,
+        )
+    except (OSError, subprocess.SubprocessError):
+        # OSError covers a missing or non-executable binary; SubprocessError
+        # covers both a non-zero exit (CalledProcessError) and TimeoutExpired.
+        return []
+
+    rows = []
+    for line in out.strip().splitlines():
+        parts = [p.strip() for p in line.split(",")]
+        # Drop malformed lines rather than mis-assigning columns.
+        if len(parts) == len(_FIELDS):
+            rows.append(dict(zip(_FIELDS, parts)))
+    return rows
+
+
+def _summarize(samples: list[dict]) -> dict:
+    """Aggregate raw samples into per-GPU averages.
+
+    Args:
+        samples: Sample records; each must carry a ``"gpu"`` key. Metric values
+            are expected to be numeric strings, but placeholders are tolerated.
+
+    Returns:
+        Mapping of ``"gpu_<index>"`` to a dict holding the number of samples
+        for that GPU and the mean of each metric rounded to two decimals. A
+        metric is ``None`` when no sample had a numeric value for it. An empty
+        input yields an empty dict.
+    """
+    by_gpu: dict[str, list[dict]] = {}
+    for s in samples:
+        by_gpu.setdefault(s["gpu"], []).append(s)
+
+    def avg(rows: list[dict], field: str) -> float | None:
+        # Readings that are missing or not numeric (e.g. "[N/A]", or other
+        # placeholders such as "[Not Supported]") are skipped instead of
+        # raising, so one unsupported metric cannot lose the whole summary.
+        vals = []
+        for row in rows:
+            try:
+                vals.append(float(row[field]))
+            except (KeyError, TypeError, ValueError):
+                continue
+        return round(sum(vals) / len(vals), 2) if vals else None
+
+    return {
+        f"gpu_{gpu_idx}": {
+            "samples": len(rows),
+            "avg_util_pct": avg(rows, "util_pct"),
+            "avg_mem_used_mb": avg(rows, "mem_used_mb"),
+            "mem_total_mb": avg(rows, "mem_total_mb"),
+            "avg_power_w": avg(rows, "power_w"),
+            "avg_temp_c": avg(rows, "temp_c"),
+        }
+        for gpu_idx, rows in by_gpu.items()
+    }
+
+
+def main() -> None:
+    """Poll GPUs until asked to stop, then write the summary.
+
+    Command-line arguments:
+        --output-csv   Path of the CSV that receives one row per GPU per poll.
+        --output-json  Path of the summary JSON, written once sampling ends
+                       (only if at least one sample was collected).
+        --interval     Seconds to pause between polls; must be positive.
+
+    SIGINT and SIGTERM both request a graceful stop. The summary is also
+    written if the sampling loop ends with an unexpected error, so samples
+    gathered up to that point are not lost.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--output-csv", required=True)
+    parser.add_argument("--output-json", required=True)
+    parser.add_argument("--interval", type=float, default=1.0)
+    args = parser.parse_args()
+    # Written as "not > 0" so NaN is rejected as well as zero and negatives;
+    # zero would spin nvidia-smi in a tight loop and negatives break time.sleep.
+    if not args.interval > 0:
+        parser.error("--interval must be a positive number of seconds")
+
+    samples: list[dict] = []
+    stop = False
+
+    def handle_stop(sig, frame):
+        nonlocal stop
+        stop = True
+
+    signal.signal(signal.SIGINT, handle_stop)
+    # SIGTERM is what a normal job cancellation delivers; treat it like SIGINT
+    # so the summary is still produced.
+    signal.signal(signal.SIGTERM, handle_stop)
+
+    try:
+        with Path(args.output_csv).open("w", newline="") as f:
+            writer: csv.DictWriter | None = None
+            while not stop:
+                ts = datetime.now(timezone.utc).isoformat()
+                for row in _sample():
+                    record = {"timestamp": ts, **row}
+                    if writer is None:
+                        # Header is derived from the first record actually seen.
+                        writer = csv.DictWriter(f, fieldnames=list(record.keys()))
+                        writer.writeheader()
+                    writer.writerow(record)
+                    samples.append(record)
+                f.flush()
+
+                # time.sleep() resumes after a signal handler returns, so one
+                # long sleep would delay shutdown by up to a full interval.
+                # Sleep in short slices and re-check the stop flag instead.
+                wake_at = time.monotonic() + args.interval
+                while not stop:
+                    remaining = wake_at - time.monotonic()
+                    if remaining <= 0:
+                        break
+                    time.sleep(min(remaining, 0.2))
+    finally:
+        if samples:
+            Path(args.output_json).write_text(json.dumps(_summarize(samples), indent=2))
+
+
+# Script entry point: the monitor is launched as a standalone script, not imported.
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_monitoring.py b/tests/test_monitoring.py