Skip to content

Commit 94d61dd

Browse files
authored
gpu: update metric metadata (DataDog#23380)
* Update metadata * PR comments
1 parent b96f8c0 commit 94d61dd

1 file changed

Lines changed: 37 additions & 2 deletions

File tree

gpu/metadata.csv

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags
2+
gpu.c2c.errors.interrupt,gauge,16,,,C2C interrupt error counter total,0,gpu,c2c.errors.interrupt,,
3+
gpu.c2c.errors.replay,gauge,16,,,C2C replay error counter total,0,gpu,c2c.errors.replay,,
4+
gpu.c2c.errors.replay.b2b,gauge,16,,,C2C back-to-back replay error counter total,0,gpu,c2c.errors.replay.b2b,,
25
gpu.clock.speed.graphics,gauge,16,megahertz,,Current clock speed for the graphics domain,0,gpu,clock.speed.graphics,,
36
gpu.clock.speed.graphics.max,gauge,16,megahertz,,Maximum clock speed for the graphics domain,0,gpu,clock.speed.graphics.max,,
47
gpu.clock.speed.memory,gauge,16,megahertz,,Current clock speed for the memory,0,gpu,clock.speed.memory,,
@@ -15,9 +18,14 @@ gpu.clock.throttle_reasons.sw_power_cap,gauge,16,,,GPU clocks that are throttled
1518
gpu.clock.throttle_reasons.sw_thermal_slowdown,gauge,16,,,GPU clocks that are throttled to avoid exceeding temperature limits,0,gpu,clock.throttle_reasons.sw_thermal_slowdown,,
1619
gpu.clock.throttle_reasons.sync_boost,gauge,16,,,GPU clocks that are throttled to match clock speed of another GPU in the current sync boost group,0,gpu,clock.throttle_reasons.sync_boost,,
1720
gpu.core.limit,gauge,16,core,,Number of GPU cores that the process/container/device has available,0,gpu,core.limit,,
21+
gpu.decoder_active,gauge,16,percent,,Percentage of time the decoder was active,0,gpu,decoder_active,,
1822
gpu.decoder_utilization,gauge,16,percent,,Percentage of time the decoder was active,0,gpu,decoder_utilization,,
1923
gpu.device.total,gauge,16,,,Number of GPU devices found in the host,0,gpu,device.total,,
24+
gpu.device.unhealthy,gauge,16,,,Whether the GPU device is unhealthy,0,gpu,device.unhealthy,,
2025
gpu.dram_active,gauge,16,percent,,Percentage of time the DRAM was active,0,gpu,dram_active,,
26+
gpu.ecc.repair_pending.channel,gauge,16,,,Whether a channel repair is pending due to ECC errors,0,gpu,ecc.repair_pending.channel,,
27+
gpu.ecc.repair_pending.tpc,gauge,16,,,Whether a TPC repair is pending due to ECC errors,0,gpu,ecc.repair_pending.tpc,,
28+
gpu.encoder_active,gauge,16,percent,,Percentage of time the encoder was active,0,gpu,encoder_active,,
2129
gpu.encoder_utilization,gauge,16,percent,,Percentage of time the encoder was active,0,gpu,encoder_utilization,,
2230
gpu.errors.ecc.corrected.total,gauge,16,,,"Corrected ECC (Error Correcting Code) errors indicate GPU memory corruption that was detected and successfully corrected by the hardware. While these errors do not immediately impact application correctness, recurring corrected errors can be an early sign of degrading memory or underlying hardware issues.",0,gpu,errors.ecc.corrected.total,,
2331
gpu.errors.ecc.uncorrected.total,gauge,16,,,"Uncorrectable ECC (Error Correcting Code) errors indicate GPU memory corruption that could not be automatically fixed. These errors can lead to data corruption, application crashes, or incorrect computation results.",0,gpu,errors.ecc.uncorrected.total,,
@@ -33,31 +41,57 @@ gpu.memory.bar1.total,gauge,16,byte,,Total BAR1 memory (in bytes).,0,gpu,memory.
3341
gpu.memory.bar1.used,gauge,16,byte,,Allocated BAR1 memory (in bytes).,0,gpu,memory.bar1.used,,
3442
gpu.memory.free,gauge,16,byte,,Unallocated device memory (in bytes).,0,gpu,memory.free,,
3543
gpu.memory.limit,gauge,16,byte,,The maximum amount of memory a process/container/device could allocate,0,gpu,memory.limit,,
36-
gpu.memory.reserved,gauge,16,byte,,Device memory (in bytes) reserved for system use (driver or firmware)..,0,gpu,memory.reserved,,
44+
gpu.memory.reserved,gauge,16,byte,,Device memory (in bytes) reserved for system use (driver or firmware).,0,gpu,memory.reserved,,
3745
gpu.memory.temperature,gauge,16,degree celsius,,Temperature of the memory chip,0,gpu,memory.temperature,,
46+
gpu.nvlink.ber.effective,gauge,16,,,Effective bit error rate for all NVLINK links (errors not corrected by FEC/recovery mechanisms).,0,gpu,nvlink.ber.effective,,
47+
gpu.nvlink.ber.symbol,gauge,16,,,Symbol bit error rate for all NVLINK links,0,gpu,nvlink.ber.symbol,,
3848
gpu.nvlink.count.active,gauge,16,,,Number of active nvlinks for the device,0,gpu,,,
3949
gpu.nvlink.count.inactive,gauge,16,,,Number of inactive nvlinks for the device,0,gpu,,,
4050
gpu.nvlink.count.total,gauge,16,,,Number of total nvlinks for the device,0,gpu,,,
51+
gpu.nvlink.errors.buffer.overrun,gauge,16,,,NVLink buffer overrun error counter total for all links,0,gpu,nvlink.errors.buffer.overrun,,
4152
gpu.nvlink.errors.crc.data,gauge,16,,,NVLink data CRC error counter total for all links,0,gpu,nvlink.errors.crc.data,,
4253
gpu.nvlink.errors.crc.flit,gauge,16,,,NVLink flow control CRC error counter total for all links,0,gpu,nvlink.errors.crc.flit,,
4354
gpu.nvlink.errors.ecc,gauge,16,,,NVLink ECC error counter total for all links,0,gpu,nvlink.errors.ecc,,
55+
gpu.nvlink.errors.effective,gauge,16,,,NVLink effective error counter total for all links,0,gpu,nvlink.errors.effective,,
56+
gpu.nvlink.errors.local.link.integrity,gauge,16,,,NVLink local link integrity error counter total for all links,0,gpu,nvlink.errors.local.link.integrity,,
57+
gpu.nvlink.errors.malformed.packet,gauge,16,,,NVLink malformed packet error counter total for all links,0,gpu,nvlink.errors.malformed.packet,,
4458
gpu.nvlink.errors.recovery,gauge,16,,,NVLink recovery error counter total for all links,0,gpu,nvlink.errors.recovery,,
4559
gpu.nvlink.errors.replay,gauge,16,,,NVLink replay error counter total for all links,0,gpu,nvlink.errors.replay,,
60+
gpu.nvlink.errors.rx,gauge,16,,,NVLink receive error counter total for all links,0,gpu,nvlink.errors.rx,,
61+
gpu.nvlink.errors.rx.general,gauge,16,,,NVLink general receive error counter total for all links,0,gpu,nvlink.errors.rx.general,,
62+
gpu.nvlink.errors.rx.remote,gauge,16,,,NVLink remote receive error counter total for all links,0,gpu,nvlink.errors.rx.remote,,
63+
gpu.nvlink.errors.symbol,gauge,16,,,NVLink symbol error counter total for all links,0,gpu,nvlink.errors.symbol,,
4664
gpu.nvlink.nvswitch_connected,gauge,16,,,Number of NVLinks connected to the NVSwitch,0,gpu,nvlink.nvswitch_connected,,
65+
gpu.nvlink.plr.codes_loss,gauge,16,,,NVLink PLR codes loss counter for the port,0,gpu,nvlink.plr.codes_loss,,
66+
gpu.nvlink.plr.rx.code_err,gauge,16,,,NVLink PLR RX code error counter for the port,0,gpu,nvlink.plr.rx.code_err,,
67+
gpu.nvlink.plr.rx.codes,gauge,16,,,NVLink PLR RX codes counter for the port,0,gpu,nvlink.plr.rx.codes,,
68+
gpu.nvlink.plr.rx.uncorrectable_code,gauge,16,,,NVLink PLR RX uncorrectable code counter for the port,0,gpu,nvlink.plr.rx.uncorrectable_code,,
69+
gpu.nvlink.plr.tx.codes,gauge,16,,,NVLink PLR TX codes counter for the port,0,gpu,nvlink.plr.tx.codes,,
70+
gpu.nvlink.plr.tx.retry_codes,gauge,16,,,NVLink PLR TX retry codes counter for the port,0,gpu,nvlink.plr.tx.retry_codes,,
71+
gpu.nvlink.plr.tx.retry_events,gauge,16,,,NVLink PLR TX retry events counter for the port,0,gpu,nvlink.plr.tx.retry_events,,
72+
gpu.nvlink.plr.tx.retry_events_within_t_sec_max,gauge,16,,,Maximum NVLink PLR TX retry events within the configured time window for the port,0,gpu,nvlink.plr.tx.retry_events_within_t_sec_max,,
73+
gpu.nvlink.plr.tx.sync_events,gauge,16,,,NVLink PLR TX sync events counter for the port,0,gpu,nvlink.plr.tx.sync_events,,
74+
gpu.nvlink.recovery.events.failed,gauge,16,,,Failed NVLink recovery events total for all links,0,gpu,nvlink.recovery.events.failed,,
75+
gpu.nvlink.recovery.events.successful,gauge,16,,,Successful NVLink recovery events total for all links,0,gpu,nvlink.recovery.events.successful,,
76+
gpu.nvlink.rx.packets,gauge,16,,,NVLink received packet counter total for all links,0,gpu,nvlink.rx.packets,,
4777
gpu.nvlink.speed,gauge,16,megabit,second,Common speed of all NVLINK links,0,gpu,nvlink.speed,,
4878
gpu.nvlink.throughput.data.rx,gauge,16,kilobyte,second,Total RX data throughput of all NVLINK links,0,gpu,nvlink.throughput.data.rx,,
4979
gpu.nvlink.throughput.data.tx,gauge,16,kilobyte,second,Total TX data throughput of all NVLINK links,0,gpu,nvlink.throughput.data.tx,,
5080
gpu.nvlink.throughput.raw.rx,gauge,16,kilobyte,second,Total raw RX throughput of all NVLINK links,0,gpu,nvlink.throughput.raw.rx,,
5181
gpu.nvlink.throughput.raw.tx,gauge,16,kilobyte,second,Total raw TX throughput of all NVLINK links,0,gpu,nvlink.throughput.raw.tx,,
82+
gpu.nvlink.tx.discards,gauge,16,,,NVLink transmitted discard counter total for all links,0,gpu,nvlink.tx.discards,,
83+
gpu.nvlink.tx.packets,gauge,16,,,NVLink transmitted packet counter total for all links,0,gpu,nvlink.tx.packets,,
5284
gpu.pci.replay_counter,gauge,16,,,PCI replay counter,0,gpu,pci.replay_counter,,
5385
gpu.pci.throughput.rx,gauge,16,byte,second,Bytes received through PCI to the GPU device per second,0,gpu,pci.throughput.rx,,
5486
gpu.pci.throughput.tx,gauge,16,byte,second,Bytes transmitted through PCI from the GPU device per second,0,gpu,pci.throughput.tx,,
5587
gpu.performance_state,gauge,16,,,Current performance state of the device,0,gpu,performance_state,,
5688
gpu.power.management_limit,gauge,16,milliwatt,,Upper boundary for the device power draw.,0,gpu,power.management_limit,,
5789
gpu.power.usage,gauge,16,milliwatt,,"Power usage for the GPU device. On GA100 and older architectures this is the instantaneous power at that moment, in newer ones it represents the average power draw over one second",0,gpu,power.usage,,
5890
gpu.process.core.usage,gauge,16,core,,Average number of GPU cores that a process was using in the interval. Only emitted when processes are active.,0,gpu,process.core.usage,,
91+
gpu.process.decoder_active,gauge,16,percent,,Percentage of time the decoder was active for a specific process,0,gpu,process.decoder_active,,
5992
gpu.process.decoder_utilization,gauge,16,percent,,Percentage of time the decoder was active for a specific process,0,gpu,process.decoder_utilization,,
6093
gpu.process.dram_active,gauge,16,percent,,Percentage of time the DRAM was active for a specific process,0,gpu,process.dram_active,,
94+
gpu.process.encoder_active,gauge,16,percent,,Percentage of time the encoder was active for a specific process,0,gpu,process.encoder_active,,
6195
gpu.process.encoder_utilization,gauge,16,percent,,Percentage of time the encoder was active for a specific process,0,gpu,process.encoder_utilization,,
6296
gpu.process.memory.usage,gauge,16,byte,,The memory used by this process at the point the metric was given. Only emitted when processes are active.,0,gpu,process.memory.usage,,
6397
gpu.process.sm_active,gauge,16,percent,,Percentage of time the streaming multiprocessor was active for a specific process,0,gpu,process.sm_active,,
@@ -67,7 +101,8 @@ gpu.remapped_rows.pending,gauge,16,,,Number of rows pending remapping,0,gpu,rema
67101
gpu.remapped_rows.uncorrectable,gauge,16,,,Number of rows remapped due to uncorrectable errors,0,gpu,remapped_rows.uncorrectable,,
68102
gpu.slowdown_temperature,gauge,16,degree celsius,,Slowdown temperature,0,gpu,slowdown_temperature,,
69103
gpu.sm_active,gauge,16,percent,,Percentage of time the streaming multiprocessor was active,0,gpu,sm_active,,
70-
gpu.sm_occupancy,gauge,16,percent,,Percentage of the Streaming Multiprocessors that were active in the interval,0,gpu,sm_occupancy,,
104+
gpu.sm_occupancy,gauge,16,percent,,Percentage of the warps available in the Streaming Multiprocessors that were active in the interval,0,gpu,sm_occupancy,,
105+
gpu.sm_utilization,gauge,16,percent,,Percentage of the Streaming Multiprocessors that were active in the interval,0,gpu,sm_utilization,,
71106
gpu.temperature,gauge,16,degree celsius,,Temperature of the GPU device,0,gpu,temperature,,
72107
gpu.tensor_active,gauge,16,percent,,Percentage of the time that the tensor calculation engine was active. Only for Hopper and newer GPUs,0,gpu,tensor_active,,
73108
gpu.total_energy_consumption,gauge,16,millijoule,,Total energy consumed by the device since the driver was reloaded,0,gpu,total_energy_consumption,,

0 commit comments

Comments
 (0)