Skip to content

Commit 7e8685e

Browse files
committed
Add unit test for health events
Signed-off-by: Swati Gupta <swatig@nvidia.com> Signed-off-by: Swati Gupta <swatig@nvidia.com> Signed-off-by: Swati Gupta <swatig@nvidia.com> Signed-off-by: Swati Gupta <swatig@nvidia.com>
1 parent f591b45 commit 7e8685e

6 files changed

Lines changed: 310 additions & 33 deletions

File tree

cmd/gpu-kubelet-plugin/allocatable.go

Lines changed: 11 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -325,31 +325,24 @@ func (d *AllocatableDevice) Taints() []resourceapi.DeviceTaint {
325325
// Meaning, if a device receives multiple events for the same taint dimension
326326
// (e.g., XID 48 followed by XID 63), the value is overwritten and only the most recent event data is retained.
327327
// Returns true if the taint set was modified.
328-
func (d *AllocatableDevice) AddOrUpdateTaint(taint resourceapi.DeviceTaint) bool {
328+
func (d *AllocatableDevice) AddOrUpdateTaint(taint *resourceapi.DeviceTaint) bool {
329329
for i, existing := range d.taints {
330330
if existing.Key == taint.Key {
331-
changed := false
332331

333-
// Overwrite Value if it changed
334-
if existing.Value != taint.Value {
335-
d.taints[i].Value = taint.Value
336-
changed = true
332+
// 1. If nothing actually changed, exit early to avoid API calls
333+
if existing.Value == taint.Value && existing.Effect == taint.Effect {
334+
return false
337335
}
338336

339-
if existing.Effect != taint.Effect {
340-
d.taints[i].Effect = taint.Effect
341-
changed = true
342-
}
343-
344-
if changed {
345-
d.taints[i].TimeAdded = nil // reset timestamp for the API server
346-
}
347-
348-
return changed
337+
// 2. Otherwise, update the fields and the timestamp
338+
d.taints[i].Value = taint.Value
339+
d.taints[i].Effect = taint.Effect
340+
d.taints[i].TimeAdded = nil // reset timestamp for the API server
341+
return true
349342
}
350343
}
351344

352-
// Key doesn't exist yet, append the new taint
353-
d.taints = append(d.taints, taint)
345+
// 3. Key doesn't exist yet, append the dereferenced struct
346+
d.taints = append(d.taints, *taint)
354347
return true
355348
}

cmd/gpu-kubelet-plugin/device_health.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -72,31 +72,31 @@ type DeviceHealthEvent struct {
7272
// healthEventToTaint maps a DeviceHealthEvent to the corresponding DRA
7373
// DeviceTaint using the Option A taint key schema: one key per health
7474
// dimension under the gpu.nvidia.com domain.
75-
func healthEventToTaint(event *DeviceHealthEvent, monitor deviceHealthMonitor) resourceapi.DeviceTaint {
75+
func healthEventToTaint(monitor deviceHealthMonitor, event *DeviceHealthEvent) *resourceapi.DeviceTaint {
7676
switch event.EventType {
7777
case HealthEventXID:
7878
effect := resourceapi.DeviceTaintEffectNoSchedule
7979
if monitor != nil && monitor.IsEventNonFatal(event) {
8080
effect = DeviceTaintEffectNone
8181
}
82-
return resourceapi.DeviceTaint{
82+
return &resourceapi.DeviceTaint{
8383
Key: TaintKeyXID,
8484
Value: strconv.FormatUint(event.EventData, 10),
8585
Effect: effect,
8686
}
8787
case HealthEventGPULost:
88-
return resourceapi.DeviceTaint{
88+
return &resourceapi.DeviceTaint{
8989
Key: TaintKeyGPULost,
9090
Effect: resourceapi.DeviceTaintEffectNoSchedule,
9191
}
9292
case HealthEventUnmonitored:
93-
return resourceapi.DeviceTaint{
93+
return &resourceapi.DeviceTaint{
9494
Key: TaintKeyUnmonitored,
9595
Effect: DeviceTaintEffectNone,
9696
}
9797
default:
9898
klog.Errorf("Unknown health event type %q, defaulting to unmonitored taint", event.EventType)
99-
return resourceapi.DeviceTaint{
99+
return &resourceapi.DeviceTaint{
100100
Key: TaintKeyUnmonitored,
101101
Effect: DeviceTaintEffectNone,
102102
}
@@ -193,8 +193,8 @@ func (m *nvmlDeviceHealthMonitor) registerEventsForDevices() {
193193
ret = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet)
194194
if ret == nvml.ERROR_NOT_SUPPORTED {
195195
klog.Warningf("Device %v is too old to support healthchecking.", parentUUID)
196-
}
197-
if ret != nvml.SUCCESS {
196+
m.sendHealthEventForDevices(giMap, HealthEventUnmonitored)
197+
} else if ret != nvml.SUCCESS {
198198
klog.Warningf("unable to register events for %s: %v; marking devices as unmonitored", parentUUID, ret)
199199
m.sendHealthEventForDevices(giMap, HealthEventUnmonitored)
200200
}
@@ -288,19 +288,19 @@ func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *DeviceHealthEvent {
288288
func (m *nvmlDeviceHealthMonitor) sendHealthEventForAllDevices(eventType DeviceHealthEventType) {
289289
var devices []*AllocatableDevice
290290
for _, giMap := range m.deviceByPlacement {
291-
devices = append(devices, collectDevices(giMap)...)
291+
devices = append(devices, flattenMIGDeviceMap(giMap)...)
292292
}
293293
m.sendBatchedHealthEvent(devices, eventType)
294294
}
295295

296296
// sendHealthEventForDevices aggregates all devices under a single parent GPU
297297
// into one batched DeviceHealthEvent.
298298
func (m *nvmlDeviceHealthMonitor) sendHealthEventForDevices(giMap map[uint32]map[uint32]*AllocatableDevice, eventType DeviceHealthEventType) {
299-
m.sendBatchedHealthEvent(collectDevices(giMap), eventType)
299+
m.sendBatchedHealthEvent(flattenMIGDeviceMap(giMap), eventType)
300300
}
301301

302-
// collectDevices flattens a GI→CI device map into a slice.
303-
func collectDevices(giMap map[uint32]map[uint32]*AllocatableDevice) []*AllocatableDevice {
302+
// flattenMIGDeviceMap flattens a GI→CI device map into a slice.
303+
func flattenMIGDeviceMap(giMap map[uint32]map[uint32]*AllocatableDevice) []*AllocatableDevice {
304304
var devices []*AllocatableDevice
305305
for _, ciMap := range giMap {
306306
for _, dev := range ciMap {

0 commit comments

Comments
 (0)