|
57 | 57 | - Power: timestamp + column whose name contains "power" (excluding |
58 | 58 | "limit"/"cap"/"max"/"min"). NVIDIA: "power.draw [W]". AMD: "socket_power". |
59 | 59 | srt-slurm: "power_w". |
60 | | - - Temperature: column name contains "temp". NVIDIA: "temperature.gpu". AMD: |
61 | | - "temperature". srt-slurm: "temp_c". Unit: Celsius. |
62 | | - - Utilization: column name starts with "utilization" or contains "util". |
| 60 | + - Temperature: column name contains "temp"; hotspot/junction columns are |
| 61 | + preferred over the first match because data-center AMD parts report edge |
| 62 | + temperature as N/A. NVIDIA: "temperature.gpu". AMD amd-smi: "edge_temperature" |
| 63 | + / "hotspot_temperature" (the hotspot/junction column is picked). srt-slurm: "temp_c". Unit: Celsius. |
| 64 | + - Utilization: column name contains "util" (which also covers names starting |
| 65 | + with "utilization") or contains amd-smi's "gfx_activity" (umc_activity / mm_activity are not matched). |
63 | 66 | NVIDIA: "utilization.gpu". srt-slurm: "util_pct". Unit: percent. |
64 | | - - Memory: column name contains "mem" but not "total"/"clock"/"util" — so |
65 | | - "memory.total", "clocks.current.memory" (a frequency), and |
66 | | - "utilization.memory" (a percent) are all rejected; only memory *used* is |
67 | | - picked. NVIDIA: "memory.used [MiB]". srt-slurm: "mem_used_mb". Unit: MiB/MB. |
| 67 | + - Memory used: column mentions memory/vram AND "used" — picks NVIDIA |
| 68 | + "memory.used [MiB]", srt-slurm "mem_used_mb", amd-smi "used_vram"; rejects |
| 69 | + decoys lacking "used" (memory.total / total_vram / free_vram, the memory |
| 70 | + *clock* "clocks.current.memory", utilization.memory, mem_temperature, |
| 71 | + mem_voltage). Unit: MiB/MB. |
68 | 72 |
|
69 | 73 | Power is required for aggregation to fire; the other metrics degrade gracefully |
70 | 74 | when their columns are absent (those fields are simply omitted from the output). |
|
90 | 94 | _POWER_COL_RE = re.compile(r"power", re.IGNORECASE) |
91 | 95 | _POWER_EXCLUDE_RE = re.compile(r"limit|cap|max|min", re.IGNORECASE) |
92 | 96 | _TEMP_COL_RE = re.compile(r"temp", re.IGNORECASE) |
93 | | -_UTIL_COL_RE = re.compile(r"^utilization|util", re.IGNORECASE) |
94 | | -_MEM_COL_RE = re.compile(r"mem", re.IGNORECASE) |
95 | | -# Exclude "total" (memory.total), "clock" (clocks.current.memory — a frequency, |
96 | | -# not memory used), and "util" (utilization.memory — a percent). nvidia-smi's |
97 | | -# query emits clocks.current.memory BEFORE any used-memory column, so without |
98 | | -# these excludes _MEM_COL_RE would grab the memory *clock* (~2500 MHz) as |
99 | | -# avg_mem_used_mb. |
100 | | -_MEM_EXCLUDE_RE = re.compile(r"total|clock|util", re.IGNORECASE) |
| 97 | +# Data-center AMD parts (MI300/MI355) report edge temperature as N/A and expose |
| 98 | +# the real die temperature as hotspot/junction; prefer those when present so |
| 99 | +# avg_temp_c isn't computed over an all-N/A edge column. NVIDIA's single |
| 100 | +# "temperature.gpu" and srt-slurm's "temp_c" have neither token and fall through |
| 101 | +# to the first temperature column unchanged. |
| 102 | +_TEMP_PREFER_RE = re.compile(r"hotspot|junction", re.IGNORECASE) |
| 103 | +# Utilization: NVIDIA "utilization.gpu", srt-slurm "util_pct", AMD amd-smi |
| 104 | +# "gfx_activity" (the GPU/graphics-engine busy percent). amd-smi's other usage |
| 105 | +# columns — umc_activity (memory controller), mm_activity (multimedia) — are |
| 106 | +# intentionally NOT matched so gfx_activity is the one picked. |
| 107 | +_UTIL_COL_RE = re.compile(r"^utilization|util|gfx_activity", re.IGNORECASE) |
| 108 | +# Memory *used*: match positively on a column that mentions both memory/vram and |
| 109 | +# "used" rather than broad "mem" + a growing exclude list. This naturally picks |
| 110 | +# NVIDIA "memory.used [MiB]", srt-slurm "mem_used_mb", and amd-smi "used_vram" |
| 111 | +# while rejecting same-prefix decoys that lack "used": memory.total / total_vram / |
| 112 | +# free_vram, clocks.current.memory (a frequency), utilization.memory (a percent), |
| 113 | +# and amd-smi's mem_temperature / mem_voltage. |
| 114 | +_MEM_COL_RE = re.compile(r"(?:mem|vram).*used|used.*(?:mem|vram)", re.IGNORECASE) |
101 | 115 | _TIMESTAMP_COL_RE = re.compile(r"time", re.IGNORECASE) |
102 | 116 | _GPU_INDEX_COL_RE = re.compile(r"^(index|gpu|gpu_id|gpu_index|card|device)$", re.IGNORECASE) |
103 | 117 | _NUMBER_RE = re.compile(r"-?\d+(?:\.\d+)?") |
@@ -208,12 +222,14 @@ def _detect_all_columns(header: list[str]) -> dict[str, str | None]: |
208 | 222 | (c for c in header if _POWER_COL_RE.search(c) and not _POWER_EXCLUDE_RE.search(c)), |
209 | 223 | None, |
210 | 224 | ) |
211 | | - temp_col = next((c for c in header if _TEMP_COL_RE.search(c)), None) |
212 | | - util_col = next((c for c in header if _UTIL_COL_RE.search(c)), None) |
213 | | - mem_col = next( |
214 | | - (c for c in header if _MEM_COL_RE.search(c) and not _MEM_EXCLUDE_RE.search(c)), |
215 | | - None, |
| 225 | + temp_cols = [c for c in header if _TEMP_COL_RE.search(c)] |
| 226 | + # Prefer hotspot/junction (the real die temp on data-center AMD parts) over |
| 227 | + # the first temperature column (edge on AMD, temperature.gpu on NVIDIA). |
| 228 | + temp_col = next((c for c in temp_cols if _TEMP_PREFER_RE.search(c)), None) or ( |
| 229 | + temp_cols[0] if temp_cols else None |
216 | 230 | ) |
| 231 | + util_col = next((c for c in header if _UTIL_COL_RE.search(c)), None) |
| 232 | + mem_col = next((c for c in header if _MEM_COL_RE.search(c)), None) |
217 | 233 | gpu_col = next((c for c in header if _GPU_INDEX_COL_RE.match(c.strip())), None) |
218 | 234 | return { |
219 | 235 | "timestamp": timestamp_col, |
|
0 commit comments