@@ -72,31 +72,31 @@ type DeviceHealthEvent struct {
7272// healthEventToTaint maps a DeviceHealthEvent to the corresponding DRA
7373// DeviceTaint using the Option A taint key schema: one key per health
7474// dimension under the gpu.nvidia.com domain.
75- func healthEventToTaint (event * DeviceHealthEvent , monitor deviceHealthMonitor ) resourceapi.DeviceTaint {
75+ func healthEventToTaint (monitor deviceHealthMonitor , event * DeviceHealthEvent ) * resourceapi.DeviceTaint {
7676 switch event .EventType {
7777 case HealthEventXID :
7878 effect := resourceapi .DeviceTaintEffectNoSchedule
7979 if monitor != nil && monitor .IsEventNonFatal (event ) {
8080 effect = DeviceTaintEffectNone
8181 }
82- return resourceapi.DeviceTaint {
82+ return & resourceapi.DeviceTaint {
8383 Key : TaintKeyXID ,
8484 Value : strconv .FormatUint (event .EventData , 10 ),
8585 Effect : effect ,
8686 }
8787 case HealthEventGPULost :
88- return resourceapi.DeviceTaint {
88+ return & resourceapi.DeviceTaint {
8989 Key : TaintKeyGPULost ,
9090 Effect : resourceapi .DeviceTaintEffectNoSchedule ,
9191 }
9292 case HealthEventUnmonitored :
93- return resourceapi.DeviceTaint {
93+ return & resourceapi.DeviceTaint {
9494 Key : TaintKeyUnmonitored ,
9595 Effect : DeviceTaintEffectNone ,
9696 }
9797 default :
9898 klog .Errorf ("Unknown health event type %q, defaulting to unmonitored taint" , event .EventType )
99- return resourceapi.DeviceTaint {
99+ return & resourceapi.DeviceTaint {
100100 Key : TaintKeyUnmonitored ,
101101 Effect : DeviceTaintEffectNone ,
102102 }
@@ -193,8 +193,8 @@ func (m *nvmlDeviceHealthMonitor) registerEventsForDevices() {
193193 ret = gpu .RegisterEvents (eventMask & supportedEvents , m .eventSet )
194194 if ret == nvml .ERROR_NOT_SUPPORTED {
195195 klog .Warningf ("Device %v is too old to support healthchecking." , parentUUID )
196- }
197- if ret != nvml .SUCCESS {
196+ m . sendHealthEventForDevices ( giMap , HealthEventUnmonitored )
197+ } else if ret != nvml .SUCCESS {
198198 klog .Warningf ("unable to register events for %s: %v; marking devices as unmonitored" , parentUUID , ret )
199199 m .sendHealthEventForDevices (giMap , HealthEventUnmonitored )
200200 }
@@ -288,19 +288,19 @@ func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *DeviceHealthEvent {
288288func (m * nvmlDeviceHealthMonitor ) sendHealthEventForAllDevices (eventType DeviceHealthEventType ) {
289289 var devices []* AllocatableDevice
290290 for _ , giMap := range m .deviceByPlacement {
291- devices = append (devices , collectDevices (giMap )... )
291+ devices = append (devices , flattenMIGDeviceMap (giMap )... )
292292 }
293293 m .sendBatchedHealthEvent (devices , eventType )
294294}
295295
296296// sendHealthEventForDevices aggregates all devices under a single parent GPU
297297// into one batched DeviceHealthEvent.
298298func (m * nvmlDeviceHealthMonitor ) sendHealthEventForDevices (giMap map [uint32 ]map [uint32 ]* AllocatableDevice , eventType DeviceHealthEventType ) {
299- m .sendBatchedHealthEvent (collectDevices (giMap ), eventType )
299+ m .sendBatchedHealthEvent (flattenMIGDeviceMap (giMap ), eventType )
300300}
301301
302- // collectDevices flattens a GI→CI device map into a slice.
303- func collectDevices (giMap map [uint32 ]map [uint32 ]* AllocatableDevice ) []* AllocatableDevice {
302+ // flattenMIGDeviceMap flattens a GI→CI device map into a slice.
303+ func flattenMIGDeviceMap (giMap map [uint32 ]map [uint32 ]* AllocatableDevice ) []* AllocatableDevice {
304304 var devices []* AllocatableDevice
305305 for _ , ciMap := range giMap {
306306 for _ , dev := range ciMap {
0 commit comments