batch GPU_LOST and unmonitored health events to reduce ResourceSlice updates

guptaNswati · guptaNswati · commit 31dfda9516b1 · 2026-04-09T22:29:14.000Z
Signed-off-by: Swati Gupta <swatig@nvidia.com>
diff --git a/cmd/gpu-kubelet-plugin/device_health.go b/cmd/gpu-kubelet-plugin/device_health.go
@@ -58,8 +58,11 @@ const (
 // DeviceHealthEvent carries a typed health notification from the NVML health
 // monitor to the driver's event handler, enabling the driver to set the
 // appropriate DRA device taint per the Option A schema (KEP-5055).
+// Devices is a batch: for GPU_LOST and unmonitored events, all affected devices
+// are aggregated into a single event so the consumer applies one ResourceSlice
+// update instead of N. XID events carry exactly one device.
 type DeviceHealthEvent struct {
-	Device    *AllocatableDevice
+	Devices   []*AllocatableDevice
 	EventType DeviceHealthEventType
 	// inspired by NVML Event type and only meaningful for xid errors.
 	// may have to create a custom type based on future device-api
@@ -267,7 +270,7 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) {
 
 			klog.V(4).Infof("Sending XID=%d health event for device %s", xid, affectedDevice.UUID())
 			m.unhealthy <- &DeviceHealthEvent{
-				Device:    affectedDevice,
+				Devices:   []*AllocatableDevice{affectedDevice},
 				EventType: HealthEventXID,
 				EventData: xid,
 			}
@@ -279,30 +282,51 @@ func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *DeviceHealthEvent {
 	return m.unhealthy
 }
 
+// sendHealthEventForAllDevices aggregates every device across all GPUs into a
+// single batched DeviceHealthEvent so the consumer makes one ResourceSlice
+// update.
 func (m *nvmlDeviceHealthMonitor) sendHealthEventForAllDevices(eventType DeviceHealthEventType) {
+	var devices []*AllocatableDevice
 	for _, giMap := range m.deviceByPlacement {
-		m.sendHealthEventForDevices(giMap, eventType)
+		devices = append(devices, collectDevices(giMap)...)
 	}
+	m.sendBatchedHealthEvent(devices, eventType)
 }
 
-// sendHealthEventForDevices sends a DeviceHealthEvent for every device under
-// the given parent-level map. Uses non-blocking sends to protect the monitor
-// goroutine from deadlocks when the channel is full.
+// sendHealthEventForDevices aggregates all devices under a single parent GPU
+// into one batched DeviceHealthEvent.
 func (m *nvmlDeviceHealthMonitor) sendHealthEventForDevices(giMap map[uint32]map[uint32]*AllocatableDevice, eventType DeviceHealthEventType) {
+	m.sendBatchedHealthEvent(collectDevices(giMap), eventType)
+}
+
+// collectDevices flattens a GI→CI device map into a slice.
+func collectDevices(giMap map[uint32]map[uint32]*AllocatableDevice) []*AllocatableDevice {
+	var devices []*AllocatableDevice
 	for _, ciMap := range giMap {
 		for _, dev := range ciMap {
-			event := &DeviceHealthEvent{
-				Device:    dev,
-				EventType: eventType,
-			}
-			select {
-			case m.unhealthy <- event:
-				klog.V(6).Infof("Sent %s health event for device %s", eventType, dev.UUID())
-			default:
-				klog.Errorf("Health event channel full; dropping %s event for device %s", eventType, dev.UUID())
-			}
+			devices = append(devices, dev)
 		}
 	}
+	return devices
+}
+
+// sendBatchedHealthEvent sends a single DeviceHealthEvent containing all
+// affected devices. Uses a non-blocking send to protect the monitor goroutine
+// from deadlocks when the channel is full.
+func (m *nvmlDeviceHealthMonitor) sendBatchedHealthEvent(devices []*AllocatableDevice, eventType DeviceHealthEventType) {
+	if len(devices) == 0 {
+		return
+	}
+	event := &DeviceHealthEvent{
+		Devices:   devices,
+		EventType: eventType,
+	}
+	select {
+	case m.unhealthy <- event:
+		klog.V(6).Infof("Sent batched %s health event for %d device(s)", eventType, len(devices))
+	default:
+		klog.Errorf("Health event channel full; dropping batched %s event for %d device(s)", eventType, len(devices))
+	}
 }
 
 // The purpose of this function is to allow for a O(1) lookup of
diff --git a/cmd/gpu-kubelet-plugin/driver.go b/cmd/gpu-kubelet-plugin/driver.go
@@ -470,13 +470,19 @@ func (d *driver) deviceHealthEvents(ctx context.Context, nodeName string) {
 				klog.V(6).Info("Health monitor channel closed")
 				return
 			}
-			uuid := event.Device.UUID()
-			klog.Warningf("Received %s health event for device %s", event.EventType, uuid)
 
 			taint := healthEventToTaint(event, d.deviceHealthMonitor)
-			if !d.state.AddDeviceTaint(event.Device, taint) {
+			modified := false
+			for _, dev := range event.Devices {
+				klog.Warningf("Received %s health event for device %s", event.EventType, dev.UUID())
+				if d.state.AddDeviceTaint(dev, taint) {
+					modified = true
+				}
+			}
+			if !modified {
 				continue
 			}
+
 			var resourceSlice resourceslice.Slice
 			for _, devices := range d.state.perGPUAllocatable.allocatablesMap {
 				for _, dev := range devices {
@@ -501,21 +507,25 @@ func (d *driver) deviceHealthEvents(ctx context.Context, nodeName string) {
 			// This is a temporary compromise while device taints/tolerations (KEP-5055)
 			// are available as a Beta feature. An interim improvement could be adding
 			// a retry/backoff or switch to patch updates instead of full republish.
-			klog.V(4).Infof("Republishing ResourceSlice: device %s tainted with %s=%q (effect=%s)",
-				uuid, taint.Key, taint.Value, taint.Effect)
+			klog.V(4).Infof("Republishing ResourceSlice: %d device(s) tainted with %s=%q (effect=%s)",
+				len(event.Devices), taint.Key, taint.Value, taint.Effect)
 
 			resources := resourceslice.DriverResources{
 				Pools: map[string]resourceslice.Pool{
 					nodeName: {Slices: []resourceslice.Slice{resourceSlice}},
 				},
 			}
 
-			// TODO: Instead of acting on per event basis, add event aggregation on the receiving side before `publishResources`.
-			// Evaluate two batching strategies:
-			// 1. Channel drain: Non-blocking pull of all pending events (Pro: zero latency; Con: susceptible to NVML lag).
+			// NOTE: GPU_LOST and unmonitored events are already batched at the
+			// sender (all affected devices arrive in a single DeviceHealthEvent).
+			// XID events are still per-device and may cause repeated publishes.
+			// TODO: Add receiver-side event aggregation before PublishResources.
+			// Evaluate two strategies:
+			// 1. Channel drain: non-blocking pull of all pending events (Pro: zero latency; Con: susceptible to NVML lag).
 			// 2. Timer debounce: e.g., 50ms window (Pro: standard K8s API protection; Con: slight delay).
+			// This also needs to be handled properly in the recovery path.
 			if err := d.pluginhelper.PublishResources(ctx, resources); err != nil {
-				klog.Errorf("Failed to publish resources after taint update for device %s: %v", uuid, err)
+				klog.Errorf("Failed to publish resources after taint update: %v", err)
 			}
 		}
 	}