kubernetes-sigs · guptaNswati · Apr 10, 2026 · Apr 3, 2026 · Apr 7, 2026 · Apr 9, 2026
diff --git a/cmd/gpu-kubelet-plugin/allocatable.go b/cmd/gpu-kubelet-plugin/allocatable.go
@@ -50,6 +50,11 @@ type AllocatableDevice struct {
 	MigDynamic *MigSpec
 	MigStatic  *MigDeviceInfo
 	Vfio       *VfioDeviceInfo
+
+	// taints holds DRA device taints set by the health monitor. Published
+	// as part of the device's ResourceSlice entry so the scheduler and
+	// kubelet honour them per KEP-5055.
+	taints []resourceapi.DeviceTaint
 }
 
 func (d AllocatableDevice) Type() string {
@@ -145,22 +150,6 @@ func (d *AllocatableDevice) GetGPUPCIBusID() string {
 	panic("unexpected type for AllocatableDevice")
 }
 
-func (d *AllocatableDevice) IsHealthy() bool {
-	switch d.Type() {
-	case GpuDeviceType:
-		return d.Gpu.health == Healthy
-	case MigStaticDeviceType:
-		// TODO: review -- what about the parent?
-		return d.MigStatic.health == Healthy
-	case MigDynamicDeviceType:
-		// TODOMIG: For now, pretend health -- this device maybe hasn't
-		// manifested yet. Or has it? We could adopt the health status of the
-		// parent, but that's also not meaningful I think.
-		return true
-	}
-	panic("unexpected type for AllocatableDevice")
-}
-
 func (d AllocatableDevices) GetGPUs() []*AllocatableDevice {
 	var devices []*AllocatableDevice
 	for _, device := range d {
@@ -324,3 +313,36 @@ func (d *PerGPUAllocatableDevices) RemoveSiblingDevices(device *AllocatableDevic
 		}
 	}
 }
+
+// Taints returns a copy of the device's taints to prevent data races
+// when being read concurrently by the ResourceSlice builder.
+func (d *AllocatableDevice) Taints() []resourceapi.DeviceTaint {
+	return slices.Clone(d.taints)
+}
+
+// AddOrUpdateTaint adds a new taint or updates an existing one with the same
+// key. The value and effect are always updated to the latest event received.
+// Meaning, if a device receives multiple events for the same taint dimension
+// (e.g., XID 48 followed by XID 63), the value is overwritten and only the most recent event data is retained.
+// Returns true if the taint set was modified.
+func (d *AllocatableDevice) AddOrUpdateTaint(taint *resourceapi.DeviceTaint) bool {
+	for i, existing := range d.taints {
+		if existing.Key == taint.Key {
+
+			// 1. If nothing actually changed, exit early to avoid API calls
+			if existing.Value == taint.Value && existing.Effect == taint.Effect {
+				return false
+			}
+
+			// 2. Otherwise, update the fields and the timestamp
+			d.taints[i].Value = taint.Value
+			d.taints[i].Effect = taint.Effect
+			d.taints[i].TimeAdded = nil // reset timestamp for the API server
+			return true
+		}
+	}
+
+	// 3. Key doesn't exist yet, append the dereferenced struct
+	d.taints = append(d.taints, *taint)
+	return true
+}
diff --git a/cmd/gpu-kubelet-plugin/device_health.go b/cmd/gpu-kubelet-plugin/device_health.go
@@ -24,21 +24,93 @@ import (
 	"sync"
 
 	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	resourceapi "k8s.io/api/resource/v1"
 	"k8s.io/klog/v2"
 )
 
 const (
 	FullGPUInstanceID uint32 = 0xFFFFFFFF
 )
 
+const (
+	TaintKeyXID         = DriverName + "/xid"
+	TaintKeyGPULost     = DriverName + "/gpu-lost"
+	TaintKeyUnmonitored = DriverName + "/unmonitored"
+)
+
+// TODO(issue #1001):Remove this hardcoded constant and switch to the upstream
+// resourceapi.DeviceTaintEffectNone once the k8s.io/api dependency is bumped
+// to a version that includes it (KEP-5055 beta).
+// DeviceTaintEffectNone is an informational effect that does not affect
+// scheduling or eviction.
+const DeviceTaintEffectNone resourceapi.DeviceTaintEffect = "None"
+
+// DeviceHealthEventType classifies the category of health event detected by
+// the NVML health monitor.
+type DeviceHealthEventType string
+
+const (
+	HealthEventXID         DeviceHealthEventType = "xid"
+	HealthEventGPULost     DeviceHealthEventType = "gpu-lost"
+	HealthEventUnmonitored DeviceHealthEventType = "unmonitored"
+)
+
+// DeviceHealthEvent carries a typed health notification from the NVML health
+// monitor to the driver's event handler, enabling the driver to set the
+// appropriate DRA device taint per the Option A schema (KEP-5055).
+// Devices is a batch: for GPU_LOST and unmonitored events where all affected devices
+// are aggregated into a single event so the consumer applies one ResourceSlice
+// update instead of N.
+type DeviceHealthEvent struct {
+	Devices   []*AllocatableDevice
+	EventType DeviceHealthEventType
+	// inspired by NVML Event type and only meaningful for xid errors.
+	// may have to create a custom type based on future device-api
+	EventData uint64
+}
+
+// healthEventToTaint maps a DeviceHealthEvent to the corresponding DRA
+// DeviceTaint using the Option A taint key schema: one key per health
+// dimension under the gpu.nvidia.com domain.
+func healthEventToTaint(monitor deviceHealthMonitor, event *DeviceHealthEvent) *resourceapi.DeviceTaint {
+	switch event.EventType {
+	case HealthEventXID:
+		effect := resourceapi.DeviceTaintEffectNoSchedule
+		if monitor != nil && monitor.IsEventNonFatal(event) {
+			effect = DeviceTaintEffectNone
+		}
+		return &resourceapi.DeviceTaint{
+			Key:    TaintKeyXID,
+			Value:  strconv.FormatUint(event.EventData, 10),
+			Effect: effect,
+		}
+	case HealthEventGPULost:
+		return &resourceapi.DeviceTaint{
+			Key:    TaintKeyGPULost,
+			Effect: resourceapi.DeviceTaintEffectNoSchedule,
+		}
+	case HealthEventUnmonitored:
+		return &resourceapi.DeviceTaint{
+			Key:    TaintKeyUnmonitored,
+			Effect: DeviceTaintEffectNone,
+		}
+	default:
+		klog.Errorf("Unknown health event type %q, defaulting to unmonitored taint", event.EventType)
+		return &resourceapi.DeviceTaint{
+			Key:    TaintKeyUnmonitored,
+			Effect: DeviceTaintEffectNone,
+		}
+	}
+}
+
 // For a MIG device the placement is defined by the 3-tuple <parent UUID, GI, CI>.
 // For a full device the returned 3-tuple is the device's uuid and (FullGPUInstanceID) 0xFFFFFFFF for the other two elements.
 type devicePlacementMap map[string]map[uint32]map[uint32]*AllocatableDevice
 
 type nvmlDeviceHealthMonitor struct {
 	nvmllib           nvml.Interface
 	eventSet          nvml.EventSet
-	unhealthy         chan *AllocatableDevice
+	unhealthy         chan *DeviceHealthEvent
 	deviceByPlacement devicePlacementMap
 	skippedXids       map[uint64]bool
 	wg                sync.WaitGroup
@@ -61,7 +133,7 @@ func newNvmlDeviceHealthMonitor(config *Config, perGPUAllocatable *PerGPUAllocat
 	all := perGPUAllocatable.GetAllDevices()
 	m := &nvmlDeviceHealthMonitor{
 		nvmllib:           nvdevlib.nvmllib,
-		unhealthy:         make(chan *AllocatableDevice, len(all)),
+		unhealthy:         make(chan *DeviceHealthEvent, len(all)),
 		deviceByPlacement: getDevicePlacementMap(all),
 		skippedXids:       xidsToSkip(config.flags.additionalXidsToIgnore),
 	}
@@ -106,25 +178,25 @@ func (m *nvmlDeviceHealthMonitor) registerEventsForDevices() {
 	for parentUUID, giMap := range m.deviceByPlacement {
 		gpu, ret := m.nvmllib.DeviceGetHandleByUUID(parentUUID)
 		if ret != nvml.SUCCESS {
-			klog.Warningf("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy", parentUUID, ret)
-			m.markAllMigDevicesUnhealthy(giMap)
+			klog.Warningf("Unable to get device handle from UUID[%s]: %v; marking devices as unmonitored", parentUUID, ret)
+			m.sendHealthEventForDevices(giMap, HealthEventUnmonitored)
 			continue
 		}
 
 		supportedEvents, ret := gpu.GetSupportedEventTypes()
 		if ret != nvml.SUCCESS {
-			klog.Warningf("unable to determine the supported events for %s: %v; marking it as unhealthy", parentUUID, ret)
-			m.markAllMigDevicesUnhealthy(giMap)
+			klog.Warningf("unable to determine the supported events for %s: %v; marking devices as unmonitored", parentUUID, ret)
+			m.sendHealthEventForDevices(giMap, HealthEventUnmonitored)
 			continue
 		}
 
 		ret = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet)
 		if ret == nvml.ERROR_NOT_SUPPORTED {
 			klog.Warningf("Device %v is too old to support healthchecking.", parentUUID)
-		}
-		if ret != nvml.SUCCESS {
-			klog.Warningf("unable to register events for %s: %v; marking it as unhealthy", parentUUID, ret)
-			m.markAllMigDevicesUnhealthy(giMap)
+			m.sendHealthEventForDevices(giMap, HealthEventUnmonitored)
+		} else if ret != nvml.SUCCESS {
+			klog.Warningf("unable to register events for %s: %v; marking devices as unmonitored", parentUUID, ret)
+			m.sendHealthEventForDevices(giMap, HealthEventUnmonitored)
 		}
 	}
 }
@@ -162,8 +234,8 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) {
 			// Ref doc: [https://docs.nvidia.com/deploy/nvml-api/group__nvmlEvents.html#group__nvmlEvents_1g9714b0ca9a34c7a7780f87fee16b205c].
 			if ret != nvml.SUCCESS {
 				if ret == nvml.ERROR_GPU_IS_LOST {
-					klog.Warningf("GPU is lost error: %v; Marking all devices as unhealthy", ret)
-					m.markAllDevicesUnhealthy()
+					klog.Warningf("GPU is lost error: %v; Tainting all devices with %s", ret, TaintKeyGPULost)
+					m.sendHealthEventForAllDevices(HealthEventGPULost)
 					continue
 				}
 				klog.V(6).Infof("Error waiting for NVML event: %v. Retrying...", ret)
@@ -180,19 +252,14 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) {
 				continue
 			}
 
-			if m.skippedXids[xid] {
-				klog.V(6).Infof("Skipping XID event: Data=%d, Type=%d, GI=%d, CI=%d", xid, eType, gi, ci)
-				continue
-			}
-
 			klog.V(4).Infof("Processing event XID=%d event", xid)
 			// this seems an extreme action.
 			// should we just log the error and proceed anyway.
 			// TODO: look into how to properly handle this error.
 			eventUUID, ret := event.Device.GetUUID()
 			if ret != nvml.SUCCESS {
-				klog.Warningf("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", event, ret)
-				m.markAllDevicesUnhealthy()
+				klog.Warningf("Failed to determine uuid for event %v: %v; Tainting all devices with %s", event, ret, TaintKeyGPULost)
+				m.sendHealthEventForAllDevices(HealthEventGPULost)
 				continue
 			}
 			affectedDevice := m.deviceByPlacement.get(eventUUID, gi, ci)
@@ -201,41 +268,65 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) {
 				continue
 			}
 
-			klog.V(4).Infof("Sending unhealthy notification for device %s due to event type:%v and event data:%d", affectedDevice.UUID(), eType, xid)
-			m.unhealthy <- affectedDevice
+			klog.V(4).Infof("Sending XID=%d health event for device %s", xid, affectedDevice.UUID())
+			m.unhealthy <- &DeviceHealthEvent{
+				Devices:   []*AllocatableDevice{affectedDevice},
+				EventType: HealthEventXID,
+				EventData: xid,
+			}
 		}
 	}
 }
 
-func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *AllocatableDevice {
+func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *DeviceHealthEvent {
 	return m.unhealthy
 }
 
-func (m *nvmlDeviceHealthMonitor) markAllDevicesUnhealthy() {
+// sendHealthEventForAllDevices aggregates every device across all GPUs into a
+// single batched DeviceHealthEvent so the consumer makes one ResourceSlice
+// update.
+func (m *nvmlDeviceHealthMonitor) sendHealthEventForAllDevices(eventType DeviceHealthEventType) {
+	var devices []*AllocatableDevice
 	for _, giMap := range m.deviceByPlacement {
-		m.markAllMigDevicesUnhealthy(giMap)
+		devices = append(devices, flattenMIGDeviceMap(giMap)...)
 	}
+	m.sendBatchedHealthEvent(devices, eventType)
+}
+
+// sendHealthEventForDevices aggregates all devices under a single parent GPU
+// into one batched DeviceHealthEvent.
+func (m *nvmlDeviceHealthMonitor) sendHealthEventForDevices(giMap map[uint32]map[uint32]*AllocatableDevice, eventType DeviceHealthEventType) {
+	m.sendBatchedHealthEvent(flattenMIGDeviceMap(giMap), eventType)
 }
 
-// markAllMigDevicesUnhealthy is a helper function to mark every mig device under a parent as unhealthy.
-func (m *nvmlDeviceHealthMonitor) markAllMigDevicesUnhealthy(giMap map[uint32]map[uint32]*AllocatableDevice) {
+// flattenMIGDeviceMap flattens a GI→CI device map into a slice.
+func flattenMIGDeviceMap(giMap map[uint32]map[uint32]*AllocatableDevice) []*AllocatableDevice {
+	var devices []*AllocatableDevice
 	for _, ciMap := range giMap {
 		for _, dev := range ciMap {
-			// Non-blocking send to avoid deadlocks if channel is full.
-			select {
-			case m.unhealthy <- dev:
-				klog.V(6).Infof("Marked device %s as unhealthy", dev.UUID())
-			// TODO: The non-blocking send protects the health-monitor goroutine from deadlocks,
-			// but dropping an unhealthy notification means the device's health transition may
-			// never reach the consumer. Consider follow-up improvements:
-			//   - increase the channel buffer beyond len(allocatable) to reduce backpressure;
-			//   - introduce a special "all devices unhealthy" message when bulk updates occur;
-			//   - or revisit whether blocking briefly here is acceptable.
-			default:
-				klog.Errorf("Unhealthy channel full. Dropping unhealthy notification for device %s", dev.UUID())
-			}
+			devices = append(devices, dev)
 		}
 	}
+	return devices
+}
+
+// sendBatchedHealthEvent sends a single DeviceHealthEvent containing all
+// affected devices. Uses a non-blocking send to protect the monitor goroutine
+// from deadlocks when the channel is full.
+func (m *nvmlDeviceHealthMonitor) sendBatchedHealthEvent(devices []*AllocatableDevice, eventType DeviceHealthEventType) {
+	if len(devices) == 0 {
+		return
+	}
+	event := &DeviceHealthEvent{
+		Devices:   devices,
+		EventType: eventType,
+	}
+	select {
+	case m.unhealthy <- event:
+		klog.V(6).Infof("Sent batched %s health event for %d device(s)", eventType, len(devices))
+	default:
+		klog.Errorf("Health event channel full; dropping batched %s event for %d device(s)", eventType, len(devices))
+	}
 }
 
 // The purpose of this function is to allow for a O(1) lookup of
@@ -268,7 +359,7 @@ func getDevicePlacementMap(allocatable AllocatableDevices) devicePlacementMap {
 				continue
 			}
 			giID = d.MigStatic.gIInfo.Id
-			ciID = d.MigStatic.gIInfo.Id
+			ciID = d.MigStatic.cIInfo.Id
 
 		default:
 			// This may be a problem; and should be logged
@@ -304,8 +395,8 @@ func (p devicePlacementMap) get(uuid string, gi, ci uint32) *AllocatableDevice {
 }
 
 // getAdditionalXids returns a list of additional Xids to skip from the specified string.
-// The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values.
-// Invalid values nare ignored.
+// The input is treated as a comma-separated string and all valid uint64 values are considered as Xid values.
+// Invalid values are ignored.
 // TODO: add list of EXPLICIT XIDs from [https://github.com/NVIDIA/k8s-device-plugin/pull/1443].
 func getAdditionalXids(input string) []uint64 {
 	if input == "" {
@@ -353,3 +444,13 @@ func xidsToSkip(additionalXids string) map[uint64]bool {
 	}
 	return skippedXids
 }
+
+// IsEventNonFatal evaluates whether a hardware event is considered an application-level
+// warning (None) rather than a critical hardware failure (NoSchedule).
+// Currently, it only checks for XID events.
+func (m *nvmlDeviceHealthMonitor) IsEventNonFatal(event *DeviceHealthEvent) bool {
+	if event.EventType == HealthEventXID {
+		return m.skippedXids[event.EventData]
+	}
+	return false
+}