You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
gpu.clock.speed.graphics,gauge,16,megahertz,,Current clock speed for the graphics domain,0,gpu,clock.speed.graphics,,
3
6
gpu.clock.speed.graphics.max,gauge,16,megahertz,,Maximum clock speed for the graphics domain,0,gpu,clock.speed.graphics.max,,
4
7
gpu.clock.speed.memory,gauge,16,megahertz,,Current clock speed for the memory,0,gpu,clock.speed.memory,,
@@ -15,9 +18,14 @@ gpu.clock.throttle_reasons.sw_power_cap,gauge,16,,,GPU clocks that are throttled
15
18
gpu.clock.throttle_reasons.sw_thermal_slowdown,gauge,16,,,GPU clocks that are throttled to avoid exceeding temperature limits,0,gpu,clock.throttle_reasons.sw_thermal_slowdown,,
16
19
gpu.clock.throttle_reasons.sync_boost,gauge,16,,,GPU clocks that are throttled to match clock speed of another GPU in the current sync boost group,0,gpu,clock.throttle_reasons.sync_boost,,
17
20
gpu.core.limit,gauge,16,core,,Number of GPU cores that the process/container/device has available,0,gpu,core.limit,,
21
+
gpu.decoder_active,gauge,16,percent,,Percentage of time the decoder was active,0,gpu,decoder_active,,
18
22
gpu.decoder_utilization,gauge,16,percent,,Percentage of time the decoder was active,0,gpu,decoder_utilization,,
19
23
gpu.device.total,gauge,16,,,Number of GPU devices found in the host,0,gpu,device.total,,
24
+
gpu.device.unhealthy,gauge,16,,,Whether the GPU device is unhealthy,0,gpu,device.unhealthy,,
20
25
gpu.dram_active,gauge,16,percent,,Percentage of time the DRAM was active,0,gpu,dram_active,,
26
+
gpu.ecc.repair_pending.channel,gauge,16,,,Whether a channel repair is pending due to ECC errors,0,gpu,ecc.repair_pending.channel,,
27
+
gpu.ecc.repair_pending.tpc,gauge,16,,,Whether a TPC repair is pending due to ECC errors,0,gpu,ecc.repair_pending.tpc,,
28
+
gpu.encoder_active,gauge,16,percent,,Percentage of time the encoder was active,0,gpu,encoder_active,,
21
29
gpu.encoder_utilization,gauge,16,percent,,Percentage of time the encoder was active,0,gpu,encoder_utilization,,
22
30
gpu.errors.ecc.corrected.total,gauge,16,,,"Corrected ECC (Error Correcting Code) errors indicate GPU memory corruption that was detected and successfully corrected by the hardware. While these errors do not immediately impact application correctness, recurring corrected errors can be an early sign of degrading memory or underlying hardware issues.",0,gpu,errors.ecc.corrected.total,,
23
31
gpu.errors.ecc.uncorrected.total,gauge,16,,,"Uncorrectable ECC (Error Correcting Code) errors indicate GPU memory corruption that could not be automatically fixed. These errors can lead to data corruption, application crashes, or incorrect computation results.",0,gpu,errors.ecc.uncorrected.total,,
@@ -33,31 +41,57 @@ gpu.memory.bar1.total,gauge,16,byte,,Total BAR1 memory (in bytes).,0,gpu,memory.
33
41
gpu.memory.bar1.used,gauge,16,byte,,Allocated (used) BAR1 memory (in bytes),0,gpu,memory.bar1.used,,
34
42
gpu.memory.free,gauge,16,byte,,Unallocated device memory (in bytes).,0,gpu,memory.free,,
35
43
gpu.memory.limit,gauge,16,byte,,The maximum amount of memory a process/container/device could allocate,0,gpu,memory.limit,,
36
-
gpu.memory.reserved,gauge,16,byte,,Device memory (in bytes) reserved for system use (driver or firmware)..,0,gpu,memory.reserved,,
44
+
gpu.memory.reserved,gauge,16,byte,,Device memory (in bytes) reserved for system use (driver or firmware).,0,gpu,memory.reserved,,
37
45
gpu.memory.temperature,gauge,16,degree celsius,,Temperature of the memory chip,0,gpu,memory.temperature,,
46
+
gpu.nvlink.ber.effective,gauge,16,,,NVLink effective bit error rate for all links (errors not corrected by FEC/recovery mechanisms).,0,gpu,nvlink.ber.effective,,
47
+
gpu.nvlink.ber.symbol,gauge,16,,,Symbol bit error rate for all NVLINK links,0,gpu,nvlink.ber.symbol,,
38
48
gpu.nvlink.count.active,gauge,16,,,Number of active nvlinks for the device,0,gpu,,,
39
49
gpu.nvlink.count.inactive,gauge,16,,,Number of inactive nvlinks for the device,0,gpu,,,
40
50
gpu.nvlink.count.total,gauge,16,,,Number of total nvlinks for the device,0,gpu,,,
51
+
gpu.nvlink.errors.buffer.overrun,gauge,16,,,NVLink buffer overrun error counter total for all links,0,gpu,nvlink.errors.buffer.overrun,,
41
52
gpu.nvlink.errors.crc.data,gauge,16,,,NVLink data CRC error counter total for all links,0,gpu,nvlink.errors.crc.data,,
42
53
gpu.nvlink.errors.crc.flit,gauge,16,,,NVLink flow control CRC error counter total for all links,0,gpu,nvlink.errors.crc.flit,,
43
54
gpu.nvlink.errors.ecc,gauge,16,,,NVLink ECC error counter total for all links,0,gpu,nvlink.errors.ecc,,
55
+
gpu.nvlink.errors.effective,gauge,16,,,NVLink effective error counter total for all links,0,gpu,nvlink.errors.effective,,
56
+
gpu.nvlink.errors.local.link.integrity,gauge,16,,,NVLink local link integrity error counter total for all links,0,gpu,nvlink.errors.local.link.integrity,,
57
+
gpu.nvlink.errors.malformed.packet,gauge,16,,,NVLink malformed packet error counter total for all links,0,gpu,nvlink.errors.malformed.packet,,
44
58
gpu.nvlink.errors.recovery,gauge,16,,,NVLink recovery error counter total for all links,0,gpu,nvlink.errors.recovery,,
45
59
gpu.nvlink.errors.replay,gauge,16,,,NVLink replay error counter total for all links,0,gpu,nvlink.errors.replay,,
60
+
gpu.nvlink.errors.rx,gauge,16,,,NVLink receive error counter total for all links,0,gpu,nvlink.errors.rx,,
61
+
gpu.nvlink.errors.rx.general,gauge,16,,,NVLink general receive error counter total for all links,0,gpu,nvlink.errors.rx.general,,
62
+
gpu.nvlink.errors.rx.remote,gauge,16,,,NVLink remote receive error counter total for all links,0,gpu,nvlink.errors.rx.remote,,
63
+
gpu.nvlink.errors.symbol,gauge,16,,,NVLink symbol error counter total for all links,0,gpu,nvlink.errors.symbol,,
46
64
gpu.nvlink.nvswitch_connected,gauge,16,,,Number of NVLinks connected to the NVSwitch,0,gpu,nvlink.nvswitch_connected,,
65
+
gpu.nvlink.plr.codes_loss,gauge,16,,,NVLink PLR codes loss counter for the port,0,gpu,nvlink.plr.codes_loss,,
66
+
gpu.nvlink.plr.rx.code_err,gauge,16,,,NVLink PLR RX code error counter for the port,0,gpu,nvlink.plr.rx.code_err,,
67
+
gpu.nvlink.plr.rx.codes,gauge,16,,,NVLink PLR RX codes counter for the port,0,gpu,nvlink.plr.rx.codes,,
68
+
gpu.nvlink.plr.rx.uncorrectable_code,gauge,16,,,NVLink PLR RX uncorrectable code counter for the port,0,gpu,nvlink.plr.rx.uncorrectable_code,,
69
+
gpu.nvlink.plr.tx.codes,gauge,16,,,NVLink PLR TX codes counter for the port,0,gpu,nvlink.plr.tx.codes,,
70
+
gpu.nvlink.plr.tx.retry_codes,gauge,16,,,NVLink PLR TX retry codes counter for the port,0,gpu,nvlink.plr.tx.retry_codes,,
71
+
gpu.nvlink.plr.tx.retry_events,gauge,16,,,NVLink PLR TX retry events counter for the port,0,gpu,nvlink.plr.tx.retry_events,,
72
+
gpu.nvlink.plr.tx.retry_events_within_t_sec_max,gauge,16,,,Maximum NVLink PLR TX retry events within the configured time window for the port,0,gpu,nvlink.plr.tx.retry_events_within_t_sec_max,,
73
+
gpu.nvlink.plr.tx.sync_events,gauge,16,,,NVLink PLR TX sync events counter for the port,0,gpu,nvlink.plr.tx.sync_events,,
74
+
gpu.nvlink.recovery.events.failed,gauge,16,,,Failed NVLink recovery events total for all links,0,gpu,nvlink.recovery.events.failed,,
75
+
gpu.nvlink.recovery.events.successful,gauge,16,,,Successful NVLink recovery events total for all links,0,gpu,nvlink.recovery.events.successful,,
76
+
gpu.nvlink.rx.packets,gauge,16,,,NVLink received packet counter total for all links,0,gpu,nvlink.rx.packets,,
47
77
gpu.nvlink.speed,gauge,16,megabit,second,Common speed of all NVLINK links,0,gpu,nvlink.speed,,
48
78
gpu.nvlink.throughput.data.rx,gauge,16,kilobyte,second,Total RX data throughput of all NVLINK links,0,gpu,nvlink.throughput.data.rx,,
49
79
gpu.nvlink.throughput.data.tx,gauge,16,kilobyte,second,Total TX data throughput of all NVLINK links,0,gpu,nvlink.throughput.data.tx,,
50
80
gpu.nvlink.throughput.raw.rx,gauge,16,kilobyte,second,Total RX of all NVLINK links,0,gpu,nvlink.throughput.raw.rx,,
51
81
gpu.nvlink.throughput.raw.tx,gauge,16,kilobyte,second,Total TX of all NVLINK links,0,gpu,nvlink.throughput.raw.tx,,
82
+
gpu.nvlink.tx.discards,gauge,16,,,NVLink transmitted discard counter total for all links,0,gpu,nvlink.tx.discards,,
83
+
gpu.nvlink.tx.packets,gauge,16,,,NVLink transmitted packet counter total for all links,0,gpu,nvlink.tx.packets,,
gpu.pci.throughput.rx,gauge,16,byte,second,Bytes received through PCI to the GPU device per second,0,gpu,pci.throughput.rx,,
54
86
gpu.pci.throughput.tx,gauge,16,byte,second,Bytes transmitted through PCI from the GPU device per second,0,gpu,pci.throughput.tx,,
55
87
gpu.performance_state,gauge,16,,,Returns the current performance state of the device,0,gpu,performance_state,,
56
88
gpu.power.management_limit,gauge,16,milliwatt,,Upper boundary for the device power draw.,0,gpu,power.management_limit,,
57
89
gpu.power.usage,gauge,16,milliwatt,,"Power usage for the GPU device. On GA100 and older architectures this is the instantaneous power at that moment; on newer architectures it represents the average power draw over one second.",0,gpu,power.usage,,
58
90
gpu.process.core.usage,gauge,16,core,,Average number of GPU cores that a process was using in the interval. Only emitted when processes are active.,0,gpu,process.core.usage,,
91
+
gpu.process.decoder_active,gauge,16,percent,,Percentage of time the decoder was active for a specific process,0,gpu,process.decoder_active,,
59
92
gpu.process.decoder_utilization,gauge,16,percent,,Percentage of time the decoder was active for a specific process,0,gpu,process.decoder_utilization,,
60
93
gpu.process.dram_active,gauge,16,percent,,Percentage of time the DRAM was active for a specific process,0,gpu,process.dram_active,,
94
+
gpu.process.encoder_active,gauge,16,percent,,Percentage of time the encoder was active for a specific process,0,gpu,process.encoder_active,,
61
95
gpu.process.encoder_utilization,gauge,16,percent,,Percentage of time the encoder was active for a specific process,0,gpu,process.encoder_utilization,,
62
96
gpu.process.memory.usage,gauge,16,byte,,The memory used by this process at the point the metric was given. Only emitted when processes are active.,0,gpu,process.memory.usage,,
63
97
gpu.process.sm_active,gauge,16,percent,,Percentage of time the streaming multiprocessor was active for a specific process,0,gpu,process.sm_active,,
@@ -67,7 +101,8 @@ gpu.remapped_rows.pending,gauge,16,,,Number of rows pending remapping,0,gpu,rema
67
101
gpu.remapped_rows.uncorrectable,gauge,16,,,Number of rows remapped due to uncorrectable errors,0,gpu,remapped_rows.uncorrectable,,
gpu.sm_active,gauge,16,percent,,Percentage of time the streaming multiprocessor was active,0,gpu,sm_active,,
70
-
gpu.sm_occupancy,gauge,16,percent,,Percentage of the Streaming Multiprocessors that were active in the interval,0,gpu,sm_occupancy,,
104
+
gpu.sm_occupancy,gauge,16,percent,,Percentage of the warps available in the Streaming Multiprocessors that were active in the interval,0,gpu,sm_occupancy,,
105
+
gpu.sm_utilization,gauge,16,percent,,Percentage of the Streaming Multiprocessors that were active in the interval,0,gpu,sm_utilization,,
71
106
gpu.temperature,gauge,16,degree celsius,,Temperature of the GPU device,0,gpu,temperature,,
72
107
gpu.tensor_active,gauge,16,percent,,Percentage of the time that the tensor calculation engine was active. Only for Hopper and newer GPUs,0,gpu,tensor_active,,
73
108
gpu.total_energy_consumption,gauge,16,millijoule,,Total energy consumed by the device since the driver was reloaded,0,gpu,total_energy_consumption,,
0 commit comments