@@ -58,8 +58,11 @@ const (
5858// DeviceHealthEvent carries a typed health notification from the NVML health
5959// monitor to the driver's event handler, enabling the driver to set the
6060// appropriate DRA device taint per the Option A schema (KEP-5055).
61+ // Devices is a batch: for GPU_LOST and unmonitored events, all affected devices
62+ // are aggregated into a single event so the consumer applies one ResourceSlice
63+ // update instead of N. XID events carry a single-element slice.
6164type DeviceHealthEvent struct {
62- Device * AllocatableDevice
65+ Devices [] * AllocatableDevice
6366 EventType DeviceHealthEventType
6467 // inspired by NVML Event type and only meaningful for xid errors.
6568 // may have to create a custom type based on future device-api
@@ -267,7 +270,7 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) {
267270
268271 klog .V (4 ).Infof ("Sending XID=%d health event for device %s" , xid , affectedDevice .UUID ())
269272 m .unhealthy <- & DeviceHealthEvent {
270- Device : affectedDevice ,
273+ Devices : [] * AllocatableDevice { affectedDevice } ,
271274 EventType : HealthEventXID ,
272275 EventData : xid ,
273276 }
@@ -279,30 +282,51 @@ func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *DeviceHealthEvent {
279282 return m .unhealthy
280283}
281284
285+ // sendHealthEventForAllDevices aggregates every device across all GPUs into a
286+ // single batched DeviceHealthEvent so the consumer makes one ResourceSlice
287+ // update.
282288func (m * nvmlDeviceHealthMonitor ) sendHealthEventForAllDevices (eventType DeviceHealthEventType ) {
289+ var devices []* AllocatableDevice
283290 for _ , giMap := range m .deviceByPlacement {
284- m . sendHealthEventForDevices ( giMap , eventType )
291+ devices = append ( devices , collectDevices ( giMap ) ... )
285292 }
293+ m .sendBatchedHealthEvent (devices , eventType )
286294}
287295
288- // sendHealthEventForDevices sends a DeviceHealthEvent for every device under
289- // the given parent-level map. Uses non-blocking sends to protect the monitor
290- // goroutine from deadlocks when the channel is full.
296+ // sendHealthEventForDevices aggregates all devices under a single parent GPU
297+ // into one batched DeviceHealthEvent.
291298func (m * nvmlDeviceHealthMonitor ) sendHealthEventForDevices (giMap map [uint32 ]map [uint32 ]* AllocatableDevice , eventType DeviceHealthEventType ) {
299+ m .sendBatchedHealthEvent (collectDevices (giMap ), eventType )
300+ }
301+
302+ // collectDevices flattens a GI→CI device map into a slice (nil if empty, order unspecified).
303+ func collectDevices (giMap map [uint32 ]map [uint32 ]* AllocatableDevice ) []* AllocatableDevice {
304+ var devices []* AllocatableDevice
292305 for _ , ciMap := range giMap {
293306 for _ , dev := range ciMap {
294- event := & DeviceHealthEvent {
295- Device : dev ,
296- EventType : eventType ,
297- }
298- select {
299- case m .unhealthy <- event :
300- klog .V (6 ).Infof ("Sent %s health event for device %s" , eventType , dev .UUID ())
301- default :
302- klog .Errorf ("Health event channel full; dropping %s event for device %s" , eventType , dev .UUID ())
303- }
307+ devices = append (devices , dev )
304308 }
305309 }
310+ return devices
311+ }
312+
313+ // sendBatchedHealthEvent sends a single DeviceHealthEvent containing all
314+ // affected devices; an empty slice is a no-op. The send is non-blocking so the
315+ // monitor goroutine never stalls: if the channel is full, the whole batch is dropped and logged.
316+ func (m * nvmlDeviceHealthMonitor ) sendBatchedHealthEvent (devices []* AllocatableDevice , eventType DeviceHealthEventType ) {
317+ if len (devices ) == 0 {
318+ return
319+ }
320+ event := & DeviceHealthEvent {
321+ Devices : devices ,
322+ EventType : eventType ,
323+ }
324+ select {
325+ case m .unhealthy <- event :
326+ klog .V (6 ).Infof ("Sent batched %s health event for %d device(s)" , eventType , len (devices ))
327+ default :
328+ klog .Errorf ("Health event channel full; dropping batched %s event for %d device(s)" , eventType , len (devices ))
329+ }
306330}
307331
308332// The purpose of this function is to allow for a O(1) lookup of
0 commit comments