@@ -41,10 +41,18 @@ start_gpu_monitor() {
4141 GPU_MONITOR_PID=$!
4242 echo " [GPU Monitor] Started NVIDIA (PID=$GPU_MONITOR_PID , interval=${interval} s, output=$output )"
4343 elif command -v amd-smi & > /dev/null; then
44- # Use amd-smi native watch mode (-w) which includes timestamps automatically.
45- # Pipe through awk to: skip preamble lines, keep first CSV header, skip repeated headers.
44+ # amd-smi metric flags: -p power, -c clocks, -t temperature, -u usage,
45+ # -w <interval> native watch mode (emits a timestamp column per sample),
46+ # --csv. The awk filter keeps the first CSV header line and drops
47+ # amd-smi's preamble / repeated headers. Header match is case-insensitive
48+ # (tolower) so a capitalized "Timestamp," header — should amd-smi ever
49+ # emit one — still passes through; aggregate_power's column detection is
50+ # case-insensitive too. NOTE: amd-smi timestamps are node-local wall
51+ # clock, so multinode aggregation assumes cluster clocks are NTP-synced
52+ # (same assumption as nvidia-smi; aggregate_power windows by absolute
53+ # epoch from benchmark_serving.py).
4654 amd-smi metric -p -c -t -u -w " $interval " --csv 2> /dev/null \
47- | awk ' /^timestamp,/{if(!h){print;h=1};next} h{print}' > " $output " &
55+ | awk ' tolower($0) ~ /^timestamp,/{if(!h){print;h=1};next} h{print}' > " $output " &
4856 GPU_MONITOR_PID=$!
4957 echo " [GPU Monitor] Started AMD (PID=$GPU_MONITOR_PID , interval=${interval} s, output=$output )"
5058 else
@@ -63,11 +71,75 @@ stop_gpu_monitor() {
6371 local lines
6472 lines=$( wc -l < " $GPU_METRICS_CSV " )
6573 echo " [GPU Monitor] Collected $lines rows -> $GPU_METRICS_CSV "
74+ # Echo the captured header so a vendor-SMI schema mismatch (the one
75+ # thing that silently yields 0 usable power samples downstream) is
76+ # visible in CI logs without re-running on hardware.
77+ echo " [GPU Monitor] CSV header: $( head -1 " $GPU_METRICS_CSV " 2> /dev/null) "
6678 fi
6779 fi
6880 GPU_MONITOR_PID=" "
6981}
7082
# Start a per-node GPU power monitor for multi-node disaggregated runs.
#
# This is the AMD/SGLang/vLLM analogue of NVIDIA srt-slurm's per-node perfmon
# (PR #35): there is no orchestrator to spawn nvidia-smi on each node, so each
# node starts its own amd-smi/nvidia-smi monitor here. The output filename
# encodes the worker role and index in exactly the format
# utils/aggregate_power.py's _parse_perfmon_label expects:
#
#   perf_samples_<role>_w<worker_idx>_<host>.csv
#
# so the downstream aggregation can attribute energy per worker and (for disagg)
# per stage. role must be one of: prefill, decode, agg, frontend.
#
# Output goes to $PERFMON_OUTPUT_DIR, which job.slurm points at the NFS-shared
# /benchmark_logs/perfmon mount so every node's CSV lands in one directory the
# runner can collect. The monitor runs for the whole server lifetime;
# aggregate_power.py windows the samples down to each concurrency's benchmark
# load window using the timestamps benchmark_serving.py writes.
#
# Best-effort by design: an unset output dir, an unknown role, an invalid
# worker index, an uncreatable output dir, an unresolvable hostname, or a
# failing start_gpu_monitor is logged and skipped. The function always
# returns 0 — also when the caller runs under `set -e` / `set -u` — because a
# monitoring hiccup must never fail the benchmark.
#
# Globals:
#   PERFMON_OUTPUT_DIR       (read) directory that receives the CSV; unset or
#                            empty disables the monitor
#   PERFMON_SAMPLE_INTERVAL  (read) default sampling interval in seconds when
#                            the third argument is omitted (falls back to 1)
# Arguments:
#   $1 - role: prefill | decode | agg | frontend
#   $2 - worker index: non-negative decimal integer
#   $3 - optional sampling interval in seconds
# Outputs:
#   Progress / skip messages on stdout.
# Returns:
#   0 always.
#
# Usage: start_perf_monitor <role> <worker_idx> [interval_seconds]
start_perf_monitor() {
    # ${N:-} keeps the function usable under `set -u` when arguments are omitted.
    local role="${1:-}"
    local worker_idx="${2:-}"
    local interval="${3:-${PERFMON_SAMPLE_INTERVAL:-1}}"

    local out_dir="${PERFMON_OUTPUT_DIR:-}"
    if [[ -z "$out_dir" ]]; then
        echo "[perfmon] PERFMON_OUTPUT_DIR unset — skipping per-node power monitor"
        return 0
    fi
    case "$role" in
        prefill|decode|agg|frontend) ;;
        *)
            echo "[perfmon] unknown role '$role' (expected prefill|decode|agg|frontend) — skipping monitor"
            return 0
            ;;
    esac
    # The index is interpolated into the output path. Digits-only keeps the
    # "_w<idx>_" filename anchor intact and prevents a stray '/' or '..' from
    # steering the CSV outside $out_dir.
    if [[ ! "$worker_idx" =~ ^[0-9]+$ ]]; then
        echo "[perfmon] invalid worker index '$worker_idx' (expected a non-negative integer) — skipping monitor"
        return 0
    fi
    if ! mkdir -p "$out_dir" 2>/dev/null; then
        echo "[perfmon] cannot create $out_dir — skipping per-node power monitor"
        return 0
    fi

    # Sanitize the host component so the filename stays parseable by
    # aggregate_power's regex (role/idx anchors are unambiguous, but keep the
    # host free of separators that could confuse a future tightening). Prefer
    # the short hostname; fall back to the FQDN. The `|| host=""` guards keep a
    # failing hostname/tr from tripping errexit in the caller; an empty result
    # is replaced by a placeholder so the host field is never blank.
    local host
    host=$(hostname -s 2>/dev/null || hostname) || host=""
    host=$(printf '%s' "$host" | tr -c 'A-Za-z0-9.-' '_') || host=""
    if [[ -z "$host" ]]; then
        host="unknown"
    fi

    local out="${out_dir}/perf_samples_${role}_w${worker_idx}_${host}.csv"
    echo "[perfmon] starting per-node power monitor: role=$role worker=$worker_idx host=$host interval=${interval}s -> $out"
    # Called inside a conditional so a non-zero status is reported instead of
    # aborting a caller that runs with errexit enabled.
    if ! start_gpu_monitor --output "$out" --interval "$interval"; then
        echo "[perfmon] start_gpu_monitor failed — continuing without per-node power monitor"
    fi
    return 0
}
142+
71143# Check if required environment variables are set
72144# Usage: check_env_vars VAR1 VAR2 VAR3 ...
73145# Exits with code 1 if any variable is not set
0 commit comments