Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 38 additions & 16 deletions cmd/gpu-kubelet-plugin/allocatable.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ type AllocatableDevice struct {
MigDynamic *MigSpec
MigStatic *MigDeviceInfo
Vfio *VfioDeviceInfo

// taints holds DRA device taints set by the health monitor. Published
// as part of the device's ResourceSlice entry so the scheduler and
// kubelet honour them per KEP-5055.
taints []resourceapi.DeviceTaint
}

func (d AllocatableDevice) Type() string {
Expand Down Expand Up @@ -145,22 +150,6 @@ func (d *AllocatableDevice) GetGPUPCIBusID() string {
panic("unexpected type for AllocatableDevice")
}

func (d *AllocatableDevice) IsHealthy() bool {
switch d.Type() {
case GpuDeviceType:
return d.Gpu.health == Healthy
case MigStaticDeviceType:
// TODO: review -- what about the parent?
return d.MigStatic.health == Healthy
case MigDynamicDeviceType:
// TODOMIG: For now, pretend health -- this device maybe hasn't
// manifested yet. Or has it? We could adopt the health status of the
// parent, but that's also not meaningful I think.
return true
}
panic("unexpected type for AllocatableDevice")
}

func (d AllocatableDevices) GetGPUs() []*AllocatableDevice {
var devices []*AllocatableDevice
for _, device := range d {
Expand Down Expand Up @@ -324,3 +313,36 @@ func (d *PerGPUAllocatableDevices) RemoveSiblingDevices(device *AllocatableDevic
}
}
}

// Taints returns a copy of the device's taints to prevent data races
// when being read concurrently by the ResourceSlice builder.
func (d *AllocatableDevice) Taints() []resourceapi.DeviceTaint {
return slices.Clone(d.taints)
}

// AddOrUpdateTaint adds a new taint or updates an existing one with the same
// key. The value and effect are always updated to the latest event received.
// Meaning, if a device receives multiple events for the same taint dimension
// (e.g., XID 48 followed by XID 63), the value is overwritten and only the most recent event data is retained.
// Returns true if the taint set was modified.
func (d *AllocatableDevice) AddOrUpdateTaint(taint *resourceapi.DeviceTaint) bool {
for i, existing := range d.taints {
if existing.Key == taint.Key {

// 1. If nothing actually changed, exit early to avoid API calls
if existing.Value == taint.Value && existing.Effect == taint.Effect {
return false
}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AddOrUpdateTaint deduplicates by key, so if a device gets XID 48 then XID 63, the value overwrites to "63" and the first XID info is lost. This is consistent with Option A's one-key-per-dimension design, but should be documented — e.g., a comment here noting that only the most recent XID is retained per device.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done


// 2. Otherwise, update the fields and the timestamp
d.taints[i].Value = taint.Value
d.taints[i].Effect = taint.Effect
d.taints[i].TimeAdded = nil // reset timestamp for the API server
return true
}
}

// 3. Key doesn't exist yet, append the dereferenced struct
d.taints = append(d.taints, *taint)
return true
}
185 changes: 143 additions & 42 deletions cmd/gpu-kubelet-plugin/device_health.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,93 @@ import (
"sync"

"github.com/NVIDIA/go-nvml/pkg/nvml"
resourceapi "k8s.io/api/resource/v1"
"k8s.io/klog/v2"
)

const (
FullGPUInstanceID uint32 = 0xFFFFFFFF
)

const (
TaintKeyXID = DriverName + "/xid"
TaintKeyGPULost = DriverName + "/gpu-lost"
TaintKeyUnmonitored = DriverName + "/unmonitored"
)

// TODO(issue #1001):Remove this hardcoded constant and switch to the upstream
// resourceapi.DeviceTaintEffectNone once the k8s.io/api dependency is bumped
// to a version that includes it (KEP-5055 beta).
// DeviceTaintEffectNone is an informational effect that does not affect
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This hardcoded constant is fragile. If the upstream DeviceTaintEffectNone lands with a different string value, this silently breaks at runtime with no compile-time safety net.

Consider pinning the vendored k8s.io/api dependency to a version that includes it, or at minimum add a build-time or init-time assertion that validates the value matches upstream once the dependency is bumped. The TODO is good but should also reference a tracking issue.

Copy link
Copy Markdown
Contributor Author

@guptaNswati guptaNswati Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mmm its going to be the same https://pkg.go.dev/k8s.io/kubernetes/pkg/apis/resource#DeviceTaintEffect

I created the issue: #1001

// scheduling or eviction.
const DeviceTaintEffectNone resourceapi.DeviceTaintEffect = "None"

// DeviceHealthEventType classifies the category of health event detected by
// the NVML health monitor.
type DeviceHealthEventType string

const (
HealthEventXID DeviceHealthEventType = "xid"
HealthEventGPULost DeviceHealthEventType = "gpu-lost"
HealthEventUnmonitored DeviceHealthEventType = "unmonitored"
)

// DeviceHealthEvent carries a typed health notification from the NVML health
// monitor to the driver's event handler, enabling the driver to set the
// appropriate DRA device taint per the Option A schema (KEP-5055).
// Devices is a batch: for GPU_LOST and unmonitored events where all affected devices
// are aggregated into a single event so the consumer applies one ResourceSlice
// update instead of N.
type DeviceHealthEvent struct {
Devices []*AllocatableDevice
EventType DeviceHealthEventType
// inspired by NVML Event type and only meaningful for xid errors.
// may have to create a custom type based on future device-api
EventData uint64
}

// healthEventToTaint maps a DeviceHealthEvent to the corresponding DRA
// DeviceTaint using the Option A taint key schema: one key per health
// dimension under the gpu.nvidia.com domain.
func healthEventToTaint(monitor deviceHealthMonitor, event *DeviceHealthEvent) *resourceapi.DeviceTaint {
switch event.EventType {
case HealthEventXID:
effect := resourceapi.DeviceTaintEffectNoSchedule
if monitor != nil && monitor.IsEventNonFatal(event) {
effect = DeviceTaintEffectNone
}
return &resourceapi.DeviceTaint{
Key: TaintKeyXID,
Value: strconv.FormatUint(event.EventData, 10),
Effect: effect,
}
case HealthEventGPULost:
return &resourceapi.DeviceTaint{
Key: TaintKeyGPULost,
Effect: resourceapi.DeviceTaintEffectNoSchedule,
}
case HealthEventUnmonitored:
return &resourceapi.DeviceTaint{
Key: TaintKeyUnmonitored,
Effect: DeviceTaintEffectNone,
}
default:
klog.Errorf("Unknown health event type %q, defaulting to unmonitored taint", event.EventType)
return &resourceapi.DeviceTaint{
Key: TaintKeyUnmonitored,
Effect: DeviceTaintEffectNone,
}
}
}

// For a MIG device the placement is defined by the 3-tuple <parent UUID, GI, CI>.
// For a full device the returned 3-tuple is the device's uuid and (FullGPUInstanceID) 0xFFFFFFFF for the other two elements.
type devicePlacementMap map[string]map[uint32]map[uint32]*AllocatableDevice

type nvmlDeviceHealthMonitor struct {
nvmllib nvml.Interface
eventSet nvml.EventSet
unhealthy chan *AllocatableDevice
unhealthy chan *DeviceHealthEvent
deviceByPlacement devicePlacementMap
skippedXids map[uint64]bool
wg sync.WaitGroup
Expand All @@ -61,7 +133,7 @@ func newNvmlDeviceHealthMonitor(config *Config, perGPUAllocatable *PerGPUAllocat
all := perGPUAllocatable.GetAllDevices()
m := &nvmlDeviceHealthMonitor{
nvmllib: nvdevlib.nvmllib,
unhealthy: make(chan *AllocatableDevice, len(all)),
unhealthy: make(chan *DeviceHealthEvent, len(all)),
deviceByPlacement: getDevicePlacementMap(all),
skippedXids: xidsToSkip(config.flags.additionalXidsToIgnore),
}
Expand Down Expand Up @@ -106,25 +178,25 @@ func (m *nvmlDeviceHealthMonitor) registerEventsForDevices() {
for parentUUID, giMap := range m.deviceByPlacement {
gpu, ret := m.nvmllib.DeviceGetHandleByUUID(parentUUID)
if ret != nvml.SUCCESS {
klog.Warningf("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy", parentUUID, ret)
m.markAllMigDevicesUnhealthy(giMap)
klog.Warningf("Unable to get device handle from UUID[%s]: %v; marking devices as unmonitored", parentUUID, ret)
m.sendHealthEventForDevices(giMap, HealthEventUnmonitored)
continue
}

supportedEvents, ret := gpu.GetSupportedEventTypes()
if ret != nvml.SUCCESS {
klog.Warningf("unable to determine the supported events for %s: %v; marking it as unhealthy", parentUUID, ret)
m.markAllMigDevicesUnhealthy(giMap)
klog.Warningf("unable to determine the supported events for %s: %v; marking devices as unmonitored", parentUUID, ret)
m.sendHealthEventForDevices(giMap, HealthEventUnmonitored)
continue
}

ret = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet)
if ret == nvml.ERROR_NOT_SUPPORTED {
klog.Warningf("Device %v is too old to support healthchecking.", parentUUID)
}
if ret != nvml.SUCCESS {
klog.Warningf("unable to register events for %s: %v; marking it as unhealthy", parentUUID, ret)
m.markAllMigDevicesUnhealthy(giMap)
m.sendHealthEventForDevices(giMap, HealthEventUnmonitored)
} else if ret != nvml.SUCCESS {
klog.Warningf("unable to register events for %s: %v; marking devices as unmonitored", parentUUID, ret)
m.sendHealthEventForDevices(giMap, HealthEventUnmonitored)
}
}
}
Expand Down Expand Up @@ -162,8 +234,8 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) {
// Ref doc: [https://docs.nvidia.com/deploy/nvml-api/group__nvmlEvents.html#group__nvmlEvents_1g9714b0ca9a34c7a7780f87fee16b205c].
if ret != nvml.SUCCESS {
if ret == nvml.ERROR_GPU_IS_LOST {
klog.Warningf("GPU is lost error: %v; Marking all devices as unhealthy", ret)
m.markAllDevicesUnhealthy()
klog.Warningf("GPU is lost error: %v; Tainting all devices with %s", ret, TaintKeyGPULost)
m.sendHealthEventForAllDevices(HealthEventGPULost)
continue
}
klog.V(6).Infof("Error waiting for NVML event: %v. Retrying...", ret)
Expand All @@ -180,19 +252,14 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) {
continue
}

if m.skippedXids[xid] {
klog.V(6).Infof("Skipping XID event: Data=%d, Type=%d, GI=%d, CI=%d", xid, eType, gi, ci)
continue
}

klog.V(4).Infof("Processing event XID=%d event", xid)
// this seems an extreme action.
// should we just log the error and proceed anyway.
// TODO: look into how to properly handle this error.
eventUUID, ret := event.Device.GetUUID()
if ret != nvml.SUCCESS {
klog.Warningf("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", event, ret)
m.markAllDevicesUnhealthy()
klog.Warningf("Failed to determine uuid for event %v: %v; Tainting all devices with %s", event, ret, TaintKeyGPULost)
m.sendHealthEventForAllDevices(HealthEventGPULost)
continue
}
affectedDevice := m.deviceByPlacement.get(eventUUID, gi, ci)
Expand All @@ -201,41 +268,65 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) {
continue
}

klog.V(4).Infof("Sending unhealthy notification for device %s due to event type:%v and event data:%d", affectedDevice.UUID(), eType, xid)
m.unhealthy <- affectedDevice
klog.V(4).Infof("Sending XID=%d health event for device %s", xid, affectedDevice.UUID())
m.unhealthy <- &DeviceHealthEvent{
Devices: []*AllocatableDevice{affectedDevice},
EventType: HealthEventXID,
EventData: xid,
}
}
}
}

func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *AllocatableDevice {
func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *DeviceHealthEvent {
return m.unhealthy
}

func (m *nvmlDeviceHealthMonitor) markAllDevicesUnhealthy() {
// sendHealthEventForAllDevices aggregates every device across all GPUs into a
// single batched DeviceHealthEvent so the consumer makes one ResourceSlice
// update.
func (m *nvmlDeviceHealthMonitor) sendHealthEventForAllDevices(eventType DeviceHealthEventType) {
var devices []*AllocatableDevice
for _, giMap := range m.deviceByPlacement {
m.markAllMigDevicesUnhealthy(giMap)
devices = append(devices, flattenMIGDeviceMap(giMap)...)
}
m.sendBatchedHealthEvent(devices, eventType)
}

// sendHealthEventForDevices aggregates all devices under a single parent GPU
// into one batched DeviceHealthEvent.
func (m *nvmlDeviceHealthMonitor) sendHealthEventForDevices(giMap map[uint32]map[uint32]*AllocatableDevice, eventType DeviceHealthEventType) {
m.sendBatchedHealthEvent(flattenMIGDeviceMap(giMap), eventType)
}

// markAllMigDevicesUnhealthy is a helper function to mark every mig device under a parent as unhealthy.
func (m *nvmlDeviceHealthMonitor) markAllMigDevicesUnhealthy(giMap map[uint32]map[uint32]*AllocatableDevice) {
// flattenMIGDeviceMap flattens a GI→CI device map into a slice.
func flattenMIGDeviceMap(giMap map[uint32]map[uint32]*AllocatableDevice) []*AllocatableDevice {
var devices []*AllocatableDevice
for _, ciMap := range giMap {
for _, dev := range ciMap {
// Non-blocking send to avoid deadlocks if channel is full.
select {
case m.unhealthy <- dev:
klog.V(6).Infof("Marked device %s as unhealthy", dev.UUID())
// TODO: The non-blocking send protects the health-monitor goroutine from deadlocks,
// but dropping an unhealthy notification means the device's health transition may
// never reach the consumer. Consider follow-up improvements:
// - increase the channel buffer beyond len(allocatable) to reduce backpressure;
// - introduce a special "all devices unhealthy" message when bulk updates occur;
// - or revisit whether blocking briefly here is acceptable.
default:
klog.Errorf("Unhealthy channel full. Dropping unhealthy notification for device %s", dev.UUID())
}
devices = append(devices, dev)
}
}
return devices
}

// sendBatchedHealthEvent sends a single DeviceHealthEvent containing all
// affected devices. Uses a non-blocking send to protect the monitor goroutine
// from deadlocks when the channel is full.
func (m *nvmlDeviceHealthMonitor) sendBatchedHealthEvent(devices []*AllocatableDevice, eventType DeviceHealthEventType) {
if len(devices) == 0 {
return
}
event := &DeviceHealthEvent{
Devices: devices,
EventType: eventType,
}
select {
case m.unhealthy <- event:
klog.V(6).Infof("Sent batched %s health event for %d device(s)", eventType, len(devices))
default:
klog.Errorf("Health event channel full; dropping batched %s event for %d device(s)", eventType, len(devices))
}
}

// The purpose of this function is to allow for a O(1) lookup of
Expand Down Expand Up @@ -268,7 +359,7 @@ func getDevicePlacementMap(allocatable AllocatableDevices) devicePlacementMap {
continue
}
giID = d.MigStatic.gIInfo.Id
ciID = d.MigStatic.gIInfo.Id
ciID = d.MigStatic.cIInfo.Id

default:
// This may be a problem; and should be logged
Expand Down Expand Up @@ -304,8 +395,8 @@ func (p devicePlacementMap) get(uuid string, gi, ci uint32) *AllocatableDevice {
}

// getAdditionalXids returns a list of additional Xids to skip from the specified string.
// The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values.
// Invalid values nare ignored.
// The input is treated as a comma-separated string and all valid uint64 values are considered as Xid values.
// Invalid values are ignored.
// TODO: add list of EXPLICIT XIDs from [https://github.com/NVIDIA/k8s-device-plugin/pull/1443].
func getAdditionalXids(input string) []uint64 {
if input == "" {
Expand Down Expand Up @@ -353,3 +444,13 @@ func xidsToSkip(additionalXids string) map[uint64]bool {
}
return skippedXids
}

// IsEventNonFatal evaluates whether a hardware event is considered an application-level
// warning (None) rather than a critical hardware failure (NoSchedule).
// Currently, it only checks for XID events.
func (m *nvmlDeviceHealthMonitor) IsEventNonFatal(event *DeviceHealthEvent) bool {
if event.EventType == HealthEventXID {
return m.skippedXids[event.EventData]
}
return false
}
Loading
Loading