Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions plugins/inputs/nvidia_smi/nvidia_smi.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/influxdata/telegraf/plugins/inputs"
"github.com/influxdata/telegraf/plugins/inputs/nvidia_smi/schema_v11"
"github.com/influxdata/telegraf/plugins/inputs/nvidia_smi/schema_v12"
"github.com/influxdata/telegraf/plugins/inputs/nvidia_smi/schema_v13"
)

//go:embed sample.conf
Expand Down Expand Up @@ -114,14 +115,16 @@ func (smi *NvidiaSMI) parse(acc telegraf.Accumulator, data []byte) error {
return schema_v11.Parse(acc, data)
case "v12":
return schema_v12.Parse(acc, data)
case "v13":
return schema_v13.Parse(acc, data)
}

smi.once.Do(func() {
smi.Log.Warnf(`Unknown schema version %q, using latest know schema for parsing.
smi.Log.Warnf(`Unknown schema version %q, using latest known schema for parsing.
Please report this as an issue to https://github.com/influxdata/telegraf together
with a sample output of 'nvidia_smi -q -x'!`, schema)
})
return schema_v12.Parse(acc, data)
return schema_v13.Parse(acc, data)
}

func init() {
Expand Down
53 changes: 53 additions & 0 deletions plugins/inputs/nvidia_smi/nvidia_smi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,59 @@ func TestGatherValidXML(t *testing.T) {
time.Unix(1689872450, 0)),
},
},
{
name: "RTX 3080 schema v13",
filename: "rtx-3080-v13.xml",
expected: []telegraf.Metric{
metric.New(
"nvidia_smi",
map[string]string{
"arch": "Ampere",
"compute_mode": "Default",
"index": "0",
"name": "NVIDIA GeForce RTX 3080",
"pstate": "P2",
"uuid": "GPU-19d6d965-2acc-f646-00f8-4c76979aabb4",
},
map[string]interface{}{
"clocks_current_graphics": 1950,
"clocks_current_memory": 9251,
"clocks_current_sm": 1950,
"clocks_current_video": 1710,
"cuda_version": "13.2",
"display_active": "Disabled",
"display_mode": "Requested",
"driver_version": "595.58.03",
"ecc_errors_channel_repair_pending": "No",
"ecc_errors_tpc_repair_pending": "No",
"encoder_stats_average_fps": 0,
"encoder_stats_average_latency": 0,
"encoder_stats_session_count": 0,
"fan_speed": 0,
"fbc_stats_average_fps": 0,
"fbc_stats_average_latency": 0,
"fbc_stats_session_count": 0,
"memory_free": 660,
"memory_reserved": 397,
"memory_total": 10240,
"memory_used": 9184,
"pcie_link_gen_current": 4,
"pcie_link_width_current": 16,
"power_draw": 142.33,
"power_limit": 320.0,
"serial": "REDACTED",
"temperature_gpu": 27,
"utilization_decoder": 0,
"utilization_encoder": 0,
"utilization_gpu": 65,
"utilization_jpeg": 0,
"utilization_memory": 9,
"utilization_ofa": 0,
"vbios_version": "94.02.71.40.72",
},
time.Unix(1744280581, 0)),
},
},
{
name: "RTC 3090 schema v12",
filename: "rtx-3090-v12.xml",
Expand Down
150 changes: 150 additions & 0 deletions plugins/inputs/nvidia_smi/schema_v13/parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
package schema_v13

import (
"encoding/xml"
"strconv"
"time"

"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/inputs/nvidia_smi/common"
)

// Parse decodes the XML report in buf (nvidia-smi output, schema v13) and
// emits its content through acc as three measurements:
//
//   - "nvidia_smi": one metric per GPU,
//   - "nvidia_smi_mig": one metric per MIG device of a GPU,
//   - "nvidia_smi_process": one metric per process entry of a GPU.
//
// The only error returned is the one produced by unmarshalling buf. All
// metrics of one call share the same timestamp: the one found in the report
// or, if that is absent or not in ANSIC layout, the current time.
func Parse(acc telegraf.Accumulator, buf []byte) error {
	var report smi
	if err := xml.Unmarshal(buf, &report); err != nil {
		return err
	}

	// Start from "now" and only replace it when the report carries a
	// timestamp that parses cleanly; parse failures are ignored on purpose.
	ts := time.Now()
	if raw := report.Timestamp; raw != "" {
		if parsed, err := time.ParseInLocation(time.ANSIC, raw, time.Local); err == nil {
			ts = parsed
		}
	}

	// entry describes one field to record: the conversion kind handed to
	// common.SetIfUsed, the field key and the raw value taken from the report.
	type entry struct{ kind, key, value string }

	// fill hands the entries to common.SetIfUsed strictly in slice order.
	fill := func(dst map[string]interface{}, entries []entry) {
		for _, e := range entries {
			common.SetIfUsed(e.kind, dst, e.key, e.value)
		}
	}

	for idx := range report.Gpu {
		gpu := &report.Gpu[idx]

		// gpuTags applies the tag set shared by the GPU metric and the
		// metrics of its MIG devices.
		gpuTags := func(dst map[string]string) {
			common.SetTagIfUsed(dst, "pstate", gpu.PerformanceState)
			common.SetTagIfUsed(dst, "name", gpu.ProductName)
			common.SetTagIfUsed(dst, "arch", gpu.ProductArchitecture)
			common.SetTagIfUsed(dst, "uuid", gpu.UUID)
			common.SetTagIfUsed(dst, "compute_mode", gpu.ComputeMode)
		}

		// The GPU index is its position in the report.
		tags := map[string]string{"index": strconv.Itoa(idx)}
		gpuTags(tags)

		fields := make(map[string]interface{}, 49)
		fill(fields, []entry{
			// Driver and board identification.
			{"str", "driver_version", report.DriverVersion},
			{"str", "cuda_version", report.CudaVersion},
			{"str", "serial", gpu.Serial},
			{"str", "vbios_version", gpu.VbiosVersion},
			{"str", "display_active", gpu.DisplayActive},
			{"str", "display_mode", gpu.DisplayMode},
			{"str", "current_ecc", gpu.EccMode.CurrentEcc},
			{"int", "fan_speed", gpu.FanSpeed},

			// Framebuffer memory.
			{"int", "memory_total", gpu.FbMemoryUsage.Total},
			{"int", "memory_used", gpu.FbMemoryUsage.Used},
			{"int", "memory_free", gpu.FbMemoryUsage.Free},
			{"int", "memory_reserved", gpu.FbMemoryUsage.Reserved},

			// ECC error counters.
			{"int", "ecc_errors_volatile_dram_correctable", gpu.EccErrors.Volatile.DramCorrectable},
			{"int", "ecc_errors_volatile_dram_uncorrectable", gpu.EccErrors.Volatile.DramUncorrectable},
			{"int", "ecc_errors_volatile_sram_correctable", gpu.EccErrors.Volatile.SramCorrectable},
			{"int", "ecc_errors_volatile_sram_uncorrectable", gpu.EccErrors.Volatile.SramUncorrectable},
			{"int", "ecc_errors_volatile_sram_uncorrectable_parity", gpu.EccErrors.Volatile.SramUncorrectableParity},
			{"int", "ecc_errors_volatile_sram_uncorrectable_secded", gpu.EccErrors.Volatile.SramUncorrectableSecded},
			{"int", "ecc_errors_aggregate_dram_correctable", gpu.EccErrors.Aggregate.DramCorrectable},
			{"int", "ecc_errors_aggregate_dram_uncorrectable", gpu.EccErrors.Aggregate.DramUncorrectable},
			{"int", "ecc_errors_aggregate_sram_correctable", gpu.EccErrors.Aggregate.SramCorrectable},
			{"int", "ecc_errors_aggregate_sram_uncorrectable", gpu.EccErrors.Aggregate.SramUncorrectable},
			{"int", "ecc_errors_aggregate_sram_uncorrectable_parity", gpu.EccErrors.Aggregate.SramUncorrectableParity},
			{"int", "ecc_errors_aggregate_sram_uncorrectable_secded", gpu.EccErrors.Aggregate.SramUncorrectableSecded},
			{"str", "ecc_errors_aggregate_sram_threshold_exceeded", gpu.EccErrors.Aggregate.SramThresholdExceeded},
			{"int", "ecc_errors_aggregate_sram_uncorrectable_l2", gpu.EccErrors.AggregateUncorrectableSramSources.SramL2},
			{
				"int", "ecc_errors_aggregate_sram_uncorrectable_microcontroller",
				gpu.EccErrors.AggregateUncorrectableSramSources.SramMicrocontroller,
			},
			{"int", "ecc_errors_aggregate_sram_uncorrectable_other", gpu.EccErrors.AggregateUncorrectableSramSources.SramOther},
			{"int", "ecc_errors_aggregate_sram_uncorrectable_pcie", gpu.EccErrors.AggregateUncorrectableSramSources.SramPcie},
			{"int", "ecc_errors_aggregate_sram_uncorrectable_sm", gpu.EccErrors.AggregateUncorrectableSramSources.SramSm},
			{"str", "ecc_errors_channel_repair_pending", gpu.EccErrors.ChannelRepairPending},
			{"str", "ecc_errors_tpc_repair_pending", gpu.EccErrors.TpcRepairPending},
			{"str", "ecc_errors_unrepairable_memory", gpu.EccErrors.UnrepairableMemory},

			// Retired pages and remapped rows.
			{"int", "retired_pages_multiple_single_bit", gpu.RetiredPages.MultipleSingleBitRetirement.RetiredCount},
			{"int", "retired_pages_double_bit", gpu.RetiredPages.DoubleBitRetirement.RetiredCount},
			{"str", "retired_pages_blacklist", gpu.RetiredPages.PendingBlacklist},
			{"str", "retired_pages_pending", gpu.RetiredPages.PendingRetirement},
			{"int", "remapped_rows_correctable", gpu.RemappedRows.RemappedRowCorr},
			{"int", "remapped_rows_uncorrectable", gpu.RemappedRows.RemappedRowUnc},
			{"str", "remapped_rows_pending", gpu.RemappedRows.RemappedRowPending},
			{"str", "remapped_rows_failure", gpu.RemappedRows.RemappedRowFailure},

			// Temperature, utilization and PCIe link state.
			{"int", "temperature_gpu", gpu.Temperature.GpuTemp},
			{"int", "utilization_gpu", gpu.Utilization.GpuUtil},
			{"int", "utilization_memory", gpu.Utilization.MemoryUtil},
			{"int", "utilization_encoder", gpu.Utilization.EncoderUtil},
			{"int", "utilization_decoder", gpu.Utilization.DecoderUtil},
			{"int", "utilization_jpeg", gpu.Utilization.JpegUtil},
			{"int", "utilization_ofa", gpu.Utilization.OfaUtil},
			{"int", "pcie_link_gen_current", gpu.Pci.PciGpuLinkInfo.PcieGen.CurrentLinkGen},
			{"int", "pcie_link_width_current", gpu.Pci.PciGpuLinkInfo.LinkWidths.CurrentLinkWidth},

			// Encoder and frame-buffer-capture statistics.
			{"int", "encoder_stats_session_count", gpu.EncoderStats.SessionCount},
			{"int", "encoder_stats_average_fps", gpu.EncoderStats.AverageFps},
			{"int", "encoder_stats_average_latency", gpu.EncoderStats.AverageLatency},
			{"int", "fbc_stats_session_count", gpu.FbcStats.SessionCount},
			{"int", "fbc_stats_average_fps", gpu.FbcStats.AverageFps},
			{"int", "fbc_stats_average_latency", gpu.FbcStats.AverageLatency},

			// Clocks.
			{"int", "clocks_current_graphics", gpu.Clocks.GraphicsClock},
			{"int", "clocks_current_sm", gpu.Clocks.SmClock},
			{"int", "clocks_current_memory", gpu.Clocks.MemClock},
			{"int", "clocks_current_video", gpu.Clocks.VideoClock},

			// Power readings: several report elements feed the same key, so
			// the order of these entries must not change. Presumably
			// common.SetIfUsed skips empty values, which would let the last
			// populated source win -- confirm against the helper.
			{"float", "power_draw", gpu.PowerReadings.PowerDraw},
			{"float", "power_draw", gpu.PowerReadings.InstantPowerDraw},
			{"float", "power_limit", gpu.PowerReadings.PowerLimit},
			{"float", "power_draw", gpu.GpuPowerReadings.PowerDraw},
			{"float", "power_draw", gpu.GpuPowerReadings.InstantPowerDraw},
			{"float", "power_limit", gpu.GpuPowerReadings.CurrentPowerLimit},
			{"float", "power_limit", gpu.GpuPowerReadings.PowerLimit},
			{"float", "module_power_draw", gpu.ModulePowerReadings.PowerDraw},
			{"float", "module_power_draw", gpu.ModulePowerReadings.InstantPowerDraw},
		})
		acc.AddFields("nvidia_smi", fields, tags, ts)

		// One metric per MIG device, tagged with the device's own indices
		// plus the tags of the GPU it belongs to.
		for _, mig := range gpu.MigDevices.MigDevice {
			migTags := make(map[string]string, 8)
			common.SetTagIfUsed(migTags, "index", mig.Index)
			common.SetTagIfUsed(migTags, "gpu_index", mig.GpuInstanceID)
			common.SetTagIfUsed(migTags, "compute_index", mig.ComputeInstanceID)
			gpuTags(migTags)

			migFields := make(map[string]interface{}, 8)
			fill(migFields, []entry{
				{"int", "sram_uncorrectable", mig.EccErrorCount.VolatileCount.SramUncorrectable},
				{"int", "memory_fb_total", mig.FbMemoryUsage.Total},
				{"int", "memory_fb_reserved", mig.FbMemoryUsage.Reserved},
				{"int", "memory_fb_used", mig.FbMemoryUsage.Used},
				{"int", "memory_fb_free", mig.FbMemoryUsage.Free},
				{"int", "memory_bar1_total", mig.Bar1MemoryUsage.Total},
				{"int", "memory_bar1_used", mig.Bar1MemoryUsage.Used},
				{"int", "memory_bar1_free", mig.Bar1MemoryUsage.Free},
			})

			acc.AddFields("nvidia_smi_mig", migFields, migTags, ts)
		}

		// One metric per process entry listed for this GPU.
		for _, proc := range gpu.Processes.ProcessInfo {
			procTags := make(map[string]string, 2)
			common.SetTagIfUsed(procTags, "name", proc.ProcessName)
			common.SetTagIfUsed(procTags, "type", proc.Type)

			procFields := make(map[string]interface{}, 2)
			fill(procFields, []entry{
				{"int", "pid", proc.Pid},
				{"int", "used_memory", proc.UsedMemory},
			})

			acc.AddFields("nvidia_smi_process", procFields, procTags, ts)
		}
	}

	return nil
}
Loading
Loading