diff --git a/cmd/gpu-kubelet-plugin/allocatable.go b/cmd/gpu-kubelet-plugin/allocatable.go index 3c37542b6..fce548ac9 100644 --- a/cmd/gpu-kubelet-plugin/allocatable.go +++ b/cmd/gpu-kubelet-plugin/allocatable.go @@ -50,6 +50,11 @@ type AllocatableDevice struct { MigDynamic *MigSpec MigStatic *MigDeviceInfo Vfio *VfioDeviceInfo + + // taints holds DRA device taints set by the health monitor. Published + // as part of the device's ResourceSlice entry so the scheduler and + // kubelet honour them per KEP-5055. + taints []resourceapi.DeviceTaint } func (d AllocatableDevice) Type() string { @@ -145,22 +150,6 @@ func (d *AllocatableDevice) GetGPUPCIBusID() string { panic("unexpected type for AllocatableDevice") } -func (d *AllocatableDevice) IsHealthy() bool { - switch d.Type() { - case GpuDeviceType: - return d.Gpu.health == Healthy - case MigStaticDeviceType: - // TODO: review -- what about the parent? - return d.MigStatic.health == Healthy - case MigDynamicDeviceType: - // TODOMIG: For now, pretend health -- this device maybe hasn't - // manifested yet. Or has it? We could adopt the health status of the - // parent, but that's also not meaningful I think. - return true - } - panic("unexpected type for AllocatableDevice") -} - func (d AllocatableDevices) GetGPUs() []*AllocatableDevice { var devices []*AllocatableDevice for _, device := range d { @@ -324,3 +313,36 @@ func (d *PerGPUAllocatableDevices) RemoveSiblingDevices(device *AllocatableDevic } } } + +// Taints returns a copy of the device's taints to prevent data races +// when being read concurrently by the ResourceSlice builder. +func (d *AllocatableDevice) Taints() []resourceapi.DeviceTaint { + return slices.Clone(d.taints) +} + +// AddOrUpdateTaint adds a new taint or updates an existing one with the same +// key. The value and effect are always updated to the latest event received. +// Meaning, if a device receives multiple events for the same taint dimension +// (e.g., XID 48 followed by XID 63), the value is overwritten and only the most recent event data is retained. +// Returns true if the taint set was modified. +func (d *AllocatableDevice) AddOrUpdateTaint(taint *resourceapi.DeviceTaint) bool { + for i, existing := range d.taints { + if existing.Key == taint.Key { + + // 1. If nothing actually changed, exit early to avoid API calls + if existing.Value == taint.Value && existing.Effect == taint.Effect { + return false + } + + // 2. Otherwise, update the fields and the timestamp + d.taints[i].Value = taint.Value + d.taints[i].Effect = taint.Effect + d.taints[i].TimeAdded = nil // reset timestamp for the API server + return true + } + } + + // 3. Key doesn't exist yet, append the dereferenced struct + d.taints = append(d.taints, *taint) + return true +} diff --git a/cmd/gpu-kubelet-plugin/device_health.go b/cmd/gpu-kubelet-plugin/device_health.go index 3f9a1a01e..76b13f666 100644 --- a/cmd/gpu-kubelet-plugin/device_health.go +++ b/cmd/gpu-kubelet-plugin/device_health.go @@ -24,6 +24,7 @@ import ( "sync" "github.com/NVIDIA/go-nvml/pkg/nvml" + resourceapi "k8s.io/api/resource/v1" "k8s.io/klog/v2" ) @@ -31,6 +32,77 @@ const ( FullGPUInstanceID uint32 = 0xFFFFFFFF ) +const ( + TaintKeyXID = DriverName + "/xid" + TaintKeyGPULost = DriverName + "/gpu-lost" + TaintKeyUnmonitored = DriverName + "/unmonitored" +) + +// TODO(issue #1001):Remove this hardcoded constant and switch to the upstream +// resourceapi.DeviceTaintEffectNone once the k8s.io/api dependency is bumped +// to a version that includes it (KEP-5055 beta). +// DeviceTaintEffectNone is an informational effect that does not affect +// scheduling or eviction. +const DeviceTaintEffectNone resourceapi.DeviceTaintEffect = "None" + +// DeviceHealthEventType classifies the category of health event detected by +// the NVML health monitor. +type DeviceHealthEventType string + +const ( + HealthEventXID DeviceHealthEventType = "xid" + HealthEventGPULost DeviceHealthEventType = "gpu-lost" + HealthEventUnmonitored DeviceHealthEventType = "unmonitored" +) + +// DeviceHealthEvent carries a typed health notification from the NVML health +// monitor to the driver's event handler, enabling the driver to set the +// appropriate DRA device taint per the Option A schema (KEP-5055). +// Devices is a batch: for GPU_LOST and unmonitored events where all affected devices +// are aggregated into a single event so the consumer applies one ResourceSlice +// update instead of N. +type DeviceHealthEvent struct { + Devices []*AllocatableDevice + EventType DeviceHealthEventType + // inspired by NVML Event type and only meaningful for xid errors. + // may have to create a custom type based on future device-api + EventData uint64 +} + +// healthEventToTaint maps a DeviceHealthEvent to the corresponding DRA +// DeviceTaint using the Option A taint key schema: one key per health +// dimension under the gpu.nvidia.com domain. +func healthEventToTaint(monitor deviceHealthMonitor, event *DeviceHealthEvent) *resourceapi.DeviceTaint { + switch event.EventType { + case HealthEventXID: + effect := resourceapi.DeviceTaintEffectNoSchedule + if monitor != nil && monitor.IsEventNonFatal(event) { + effect = DeviceTaintEffectNone + } + return &resourceapi.DeviceTaint{ + Key: TaintKeyXID, + Value: strconv.FormatUint(event.EventData, 10), + Effect: effect, + } + case HealthEventGPULost: + return &resourceapi.DeviceTaint{ + Key: TaintKeyGPULost, + Effect: resourceapi.DeviceTaintEffectNoSchedule, + } + case HealthEventUnmonitored: + return &resourceapi.DeviceTaint{ + Key: TaintKeyUnmonitored, + Effect: DeviceTaintEffectNone, + } + default: + klog.Errorf("Unknown health event type %q, defaulting to unmonitored taint", event.EventType) + return &resourceapi.DeviceTaint{ + Key: TaintKeyUnmonitored, + Effect: DeviceTaintEffectNone, + } + } +} + // For a MIG device the placement is defined by the 3-tuple . // For a full device the returned 3-tuple is the device's uuid and (FullGPUInstanceID) 0xFFFFFFFF for the other two elements. type devicePlacementMap map[string]map[uint32]map[uint32]*AllocatableDevice @@ -38,7 +110,7 @@ type devicePlacementMap map[string]map[uint32]map[uint32]*AllocatableDevice type nvmlDeviceHealthMonitor struct { nvmllib nvml.Interface eventSet nvml.EventSet - unhealthy chan *AllocatableDevice + unhealthy chan *DeviceHealthEvent deviceByPlacement devicePlacementMap skippedXids map[uint64]bool wg sync.WaitGroup @@ -61,7 +133,7 @@ func newNvmlDeviceHealthMonitor(config *Config, perGPUAllocatable *PerGPUAllocat all := perGPUAllocatable.GetAllDevices() m := &nvmlDeviceHealthMonitor{ nvmllib: nvdevlib.nvmllib, - unhealthy: make(chan *AllocatableDevice, len(all)), + unhealthy: make(chan *DeviceHealthEvent, len(all)), deviceByPlacement: getDevicePlacementMap(all), skippedXids: xidsToSkip(config.flags.additionalXidsToIgnore), } @@ -106,25 +178,25 @@ func (m *nvmlDeviceHealthMonitor) registerEventsForDevices() { for parentUUID, giMap := range m.deviceByPlacement { gpu, ret := m.nvmllib.DeviceGetHandleByUUID(parentUUID) if ret != nvml.SUCCESS { - klog.Warningf("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy", parentUUID, ret) - m.markAllMigDevicesUnhealthy(giMap) + klog.Warningf("Unable to get device handle from UUID[%s]: %v; marking devices as unmonitored", parentUUID, ret) + m.sendHealthEventForDevices(giMap, HealthEventUnmonitored) continue } supportedEvents, ret := gpu.GetSupportedEventTypes() if ret != nvml.SUCCESS { - klog.Warningf("unable to determine the supported events for %s: %v; marking it as unhealthy", parentUUID, ret) - m.markAllMigDevicesUnhealthy(giMap) + klog.Warningf("unable to determine the supported events for %s: %v; marking devices as unmonitored", parentUUID, ret) + m.sendHealthEventForDevices(giMap, HealthEventUnmonitored) continue } ret = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet) if ret == nvml.ERROR_NOT_SUPPORTED { klog.Warningf("Device %v is too old to support healthchecking.", parentUUID) - } - if ret != nvml.SUCCESS { - klog.Warningf("unable to register events for %s: %v; marking it as unhealthy", parentUUID, ret) - m.markAllMigDevicesUnhealthy(giMap) + m.sendHealthEventForDevices(giMap, HealthEventUnmonitored) + } else if ret != nvml.SUCCESS { + klog.Warningf("unable to register events for %s: %v; marking devices as unmonitored", parentUUID, ret) + m.sendHealthEventForDevices(giMap, HealthEventUnmonitored) } } } @@ -162,8 +234,8 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) { // Ref doc: [https://docs.nvidia.com/deploy/nvml-api/group__nvmlEvents.html#group__nvmlEvents_1g9714b0ca9a34c7a7780f87fee16b205c]. if ret != nvml.SUCCESS { if ret == nvml.ERROR_GPU_IS_LOST { - klog.Warningf("GPU is lost error: %v; Marking all devices as unhealthy", ret) - m.markAllDevicesUnhealthy() + klog.Warningf("GPU is lost error: %v; Tainting all devices with %s", ret, TaintKeyGPULost) + m.sendHealthEventForAllDevices(HealthEventGPULost) continue } klog.V(6).Infof("Error waiting for NVML event: %v. Retrying...", ret) @@ -180,19 +252,14 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) { continue } - if m.skippedXids[xid] { - klog.V(6).Infof("Skipping XID event: Data=%d, Type=%d, GI=%d, CI=%d", xid, eType, gi, ci) - continue - } - klog.V(4).Infof("Processing event XID=%d event", xid) // this seems an extreme action. // should we just log the error and proceed anyway. // TODO: look into how to properly handle this error. eventUUID, ret := event.Device.GetUUID() if ret != nvml.SUCCESS { - klog.Warningf("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", event, ret) - m.markAllDevicesUnhealthy() + klog.Warningf("Failed to determine uuid for event %v: %v; Tainting all devices with %s", event, ret, TaintKeyGPULost) + m.sendHealthEventForAllDevices(HealthEventGPULost) continue } affectedDevice := m.deviceByPlacement.get(eventUUID, gi, ci) @@ -201,41 +268,65 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) { continue } - klog.V(4).Infof("Sending unhealthy notification for device %s due to event type:%v and event data:%d", affectedDevice.UUID(), eType, xid) - m.unhealthy <- affectedDevice + klog.V(4).Infof("Sending XID=%d health event for device %s", xid, affectedDevice.UUID()) + m.unhealthy <- &DeviceHealthEvent{ + Devices: []*AllocatableDevice{affectedDevice}, + EventType: HealthEventXID, + EventData: xid, + } } } } -func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *AllocatableDevice { +func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *DeviceHealthEvent { return m.unhealthy } -func (m *nvmlDeviceHealthMonitor) markAllDevicesUnhealthy() { +// sendHealthEventForAllDevices aggregates every device across all GPUs into a +// single batched DeviceHealthEvent so the consumer makes one ResourceSlice +// update. +func (m *nvmlDeviceHealthMonitor) sendHealthEventForAllDevices(eventType DeviceHealthEventType) { + var devices []*AllocatableDevice for _, giMap := range m.deviceByPlacement { - m.markAllMigDevicesUnhealthy(giMap) + devices = append(devices, flattenMIGDeviceMap(giMap)...) } + m.sendBatchedHealthEvent(devices, eventType) +} + +// sendHealthEventForDevices aggregates all devices under a single parent GPU +// into one batched DeviceHealthEvent. +func (m *nvmlDeviceHealthMonitor) sendHealthEventForDevices(giMap map[uint32]map[uint32]*AllocatableDevice, eventType DeviceHealthEventType) { + m.sendBatchedHealthEvent(flattenMIGDeviceMap(giMap), eventType) } -// markAllMigDevicesUnhealthy is a helper function to mark every mig device under a parent as unhealthy. -func (m *nvmlDeviceHealthMonitor) markAllMigDevicesUnhealthy(giMap map[uint32]map[uint32]*AllocatableDevice) { +// flattenMIGDeviceMap flattens a GI→CI device map into a slice. +func flattenMIGDeviceMap(giMap map[uint32]map[uint32]*AllocatableDevice) []*AllocatableDevice { + var devices []*AllocatableDevice for _, ciMap := range giMap { for _, dev := range ciMap { - // Non-blocking send to avoid deadlocks if channel is full. - select { - case m.unhealthy <- dev: - klog.V(6).Infof("Marked device %s as unhealthy", dev.UUID()) - // TODO: The non-blocking send protects the health-monitor goroutine from deadlocks, - // but dropping an unhealthy notification means the device's health transition may - // never reach the consumer. Consider follow-up improvements: - // - increase the channel buffer beyond len(allocatable) to reduce backpressure; - // - introduce a special "all devices unhealthy" message when bulk updates occur; - // - or revisit whether blocking briefly here is acceptable. - default: - klog.Errorf("Unhealthy channel full. Dropping unhealthy notification for device %s", dev.UUID()) - } + devices = append(devices, dev) } } + return devices +} + +// sendBatchedHealthEvent sends a single DeviceHealthEvent containing all +// affected devices. Uses a non-blocking send to protect the monitor goroutine +// from deadlocks when the channel is full. +func (m *nvmlDeviceHealthMonitor) sendBatchedHealthEvent(devices []*AllocatableDevice, eventType DeviceHealthEventType) { + if len(devices) == 0 { + return + } + event := &DeviceHealthEvent{ + Devices: devices, + EventType: eventType, + } + select { + case m.unhealthy <- event: + klog.V(6).Infof("Sent batched %s health event for %d device(s)", eventType, len(devices)) + default: + klog.Errorf("Health event channel full; dropping batched %s event for %d device(s)", eventType, len(devices)) + } } // The purpose of this function is to allow for a O(1) lookup of @@ -268,7 +359,7 @@ func getDevicePlacementMap(allocatable AllocatableDevices) devicePlacementMap { continue } giID = d.MigStatic.gIInfo.Id - ciID = d.MigStatic.gIInfo.Id + ciID = d.MigStatic.cIInfo.Id default: // This may be a problem; and should be logged @@ -304,8 +395,8 @@ func (p devicePlacementMap) get(uuid string, gi, ci uint32) *AllocatableDevice { } // getAdditionalXids returns a list of additional Xids to skip from the specified string. -// The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values. -// Invalid values nare ignored. +// The input is treated as a comma-separated string and all valid uint64 values are considered as Xid values. +// Invalid values are ignored. // TODO: add list of EXPLICIT XIDs from [https://github.com/NVIDIA/k8s-device-plugin/pull/1443]. func getAdditionalXids(input string) []uint64 { if input == "" { @@ -353,3 +444,13 @@ func xidsToSkip(additionalXids string) map[uint64]bool { } return skippedXids } + +// IsEventNonFatal evaluates whether a hardware event is considered an application-level +// warning (None) rather than a critical hardware failure (NoSchedule). +// Currently, it only checks for XID events. +func (m *nvmlDeviceHealthMonitor) IsEventNonFatal(event *DeviceHealthEvent) bool { + if event.EventType == HealthEventXID { + return m.skippedXids[event.EventData] + } + return false +} diff --git a/cmd/gpu-kubelet-plugin/device_health_test.go b/cmd/gpu-kubelet-plugin/device_health_test.go new file mode 100644 index 000000000..c71443f73 --- /dev/null +++ b/cmd/gpu-kubelet-plugin/device_health_test.go @@ -0,0 +1,286 @@ +/* +Copyright The Kubernetes Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "context" + "testing" + + resourceapi "k8s.io/api/resource/v1" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// mockHealthMonitor implements deviceHealthMonitor for testing healthEventToTaint. +type mockHealthMonitor struct { + nonFatalXids map[uint64]bool +} + +func (m *mockHealthMonitor) Start(context.Context) error { return nil } +func (m *mockHealthMonitor) Stop() {} +func (m *mockHealthMonitor) Unhealthy() <-chan *DeviceHealthEvent { return nil } +func (m *mockHealthMonitor) IsEventNonFatal(e *DeviceHealthEvent) bool { + if e.EventType == HealthEventXID { + return m.nonFatalXids[e.EventData] + } + return false +} + +func TestAddOrUpdateTaint_NewTaint(t *testing.T) { + dev := &AllocatableDevice{} + taint := &resourceapi.DeviceTaint{ + Key: TaintKeyXID, + Value: "48", + Effect: resourceapi.DeviceTaintEffectNoSchedule, + } + + changed := dev.AddOrUpdateTaint(taint) + + require.True(t, changed) + require.Len(t, dev.Taints(), 1) + assert.Equal(t, TaintKeyXID, dev.Taints()[0].Key) + assert.Equal(t, "48", dev.Taints()[0].Value) + assert.Equal(t, resourceapi.DeviceTaintEffectNoSchedule, dev.Taints()[0].Effect) +} + +func TestAddOrUpdateTaint_DuplicateNoChange(t *testing.T) { + dev := &AllocatableDevice{} + taint := &resourceapi.DeviceTaint{ + Key: TaintKeyGPULost, + Effect: resourceapi.DeviceTaintEffectNoSchedule, + } + + dev.AddOrUpdateTaint(taint) + changed := dev.AddOrUpdateTaint(taint) + + assert.False(t, changed, "identical taint should not count as a change") + assert.Len(t, dev.Taints(), 1) +} + +func TestAddOrUpdateTaint_UpdateValue(t *testing.T) { + dev := &AllocatableDevice{} + dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{ + Key: TaintKeyXID, + Value: "48", + Effect: resourceapi.DeviceTaintEffectNoSchedule, + }) + + changed := dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{ + Key: TaintKeyXID, + Value: "63", + Effect: resourceapi.DeviceTaintEffectNoSchedule, + }) + + require.True(t, changed) + require.Len(t, dev.Taints(), 1) + assert.Equal(t, "63", dev.Taints()[0].Value, "value should be overwritten to latest XID") +} + +func TestAddOrUpdateTaint_UpdateEffect(t *testing.T) { + dev := &AllocatableDevice{} + dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{ + Key: TaintKeyXID, + Value: "48", + Effect: DeviceTaintEffectNone, + }) + + changed := dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{ + Key: TaintKeyXID, + Value: "48", + Effect: resourceapi.DeviceTaintEffectNoSchedule, + }) + + require.True(t, changed) + assert.Equal(t, resourceapi.DeviceTaintEffectNoSchedule, dev.Taints()[0].Effect) +} + +func TestAddOrUpdateTaint_DifferentKeysAppended(t *testing.T) { + dev := &AllocatableDevice{} + dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{ + Key: TaintKeyXID, + Value: "48", + Effect: resourceapi.DeviceTaintEffectNoSchedule, + }) + dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{ + Key: TaintKeyGPULost, + Effect: resourceapi.DeviceTaintEffectNoSchedule, + }) + + taints := dev.Taints() + require.Len(t, taints, 2) + assert.Equal(t, TaintKeyXID, taints[0].Key) + assert.Equal(t, TaintKeyGPULost, taints[1].Key) +} + +func TestAddOrUpdateTaint_TimeAddedResetOnChange(t *testing.T) { + dev := &AllocatableDevice{} + dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{ + Key: TaintKeyXID, + Value: "48", + Effect: DeviceTaintEffectNone, + }) + + dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{ + Key: TaintKeyXID, + Value: "63", + Effect: resourceapi.DeviceTaintEffectNoSchedule, + }) + + assert.Nil(t, dev.Taints()[0].TimeAdded, "TimeAdded should be nil so the API server sets a fresh timestamp") +} + +func TestHealthEventToTaint(t *testing.T) { + monitor := &mockHealthMonitor{ + nonFatalXids: map[uint64]bool{13: true, 31: true}, + } + + tests := []struct { + name string + event *DeviceHealthEvent + monitor deviceHealthMonitor + expectedKey string + expectedValue string + expectedEffect resourceapi.DeviceTaintEffect + }{ + { + name: "fatal XID", + event: &DeviceHealthEvent{ + EventType: HealthEventXID, + EventData: 48, + }, + monitor: monitor, + expectedKey: TaintKeyXID, + expectedValue: "48", + expectedEffect: resourceapi.DeviceTaintEffectNoSchedule, + }, + { + name: "non-fatal XID (skipped)", + event: &DeviceHealthEvent{ + EventType: HealthEventXID, + EventData: 13, + }, + monitor: monitor, + expectedKey: TaintKeyXID, + expectedValue: "13", + expectedEffect: DeviceTaintEffectNone, + }, + { + name: "XID with nil monitor defaults to fatal", + event: &DeviceHealthEvent{ + EventType: HealthEventXID, + EventData: 13, + }, + monitor: nil, + expectedKey: TaintKeyXID, + expectedValue: "13", + expectedEffect: resourceapi.DeviceTaintEffectNoSchedule, + }, + { + name: "GPU lost", + event: &DeviceHealthEvent{ + EventType: HealthEventGPULost, + }, + monitor: monitor, + expectedKey: TaintKeyGPULost, + expectedValue: "", + expectedEffect: resourceapi.DeviceTaintEffectNoSchedule, + }, + { + name: "unmonitored", + event: &DeviceHealthEvent{ + EventType: HealthEventUnmonitored, + }, + monitor: monitor, + expectedKey: TaintKeyUnmonitored, + expectedValue: "", + expectedEffect: DeviceTaintEffectNone, + }, + { + name: "unknown event type defaults to unmonitored", + event: &DeviceHealthEvent{ + EventType: DeviceHealthEventType("bogus"), + }, + monitor: monitor, + expectedKey: TaintKeyUnmonitored, + expectedValue: "", + expectedEffect: DeviceTaintEffectNone, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + taint := healthEventToTaint(tc.monitor, tc.event) + assert.Equal(t, tc.expectedKey, taint.Key) + assert.Equal(t, tc.expectedValue, taint.Value) + assert.Equal(t, tc.expectedEffect, taint.Effect) + }) + } +} + +func TestIsEventNonFatal(t *testing.T) { + m := &nvmlDeviceHealthMonitor{ + skippedXids: map[uint64]bool{ + 13: true, + 31: true, + 43: true, + }, + } + + tests := []struct { + name string + event *DeviceHealthEvent + expected bool + }{ + { + name: "skipped XID is non-fatal", + event: &DeviceHealthEvent{ + EventType: HealthEventXID, + EventData: 13, + }, + expected: true, + }, + { + name: "non-skipped XID is fatal", + event: &DeviceHealthEvent{ + EventType: HealthEventXID, + EventData: 48, + }, + expected: false, + }, + { + name: "GPU_LOST is always fatal", + event: &DeviceHealthEvent{ + EventType: HealthEventGPULost, + }, + expected: false, + }, + { + name: "unmonitored is not an XID event", + event: &DeviceHealthEvent{ + EventType: HealthEventUnmonitored, + }, + expected: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.expected, m.IsEventNonFatal(tc.event)) + }) + } +} diff --git a/cmd/gpu-kubelet-plugin/device_state.go b/cmd/gpu-kubelet-plugin/device_state.go index a7f73b5f5..ff5d2254f 100644 --- a/cmd/gpu-kubelet-plugin/device_state.go +++ b/cmd/gpu-kubelet-plugin/device_state.go @@ -654,12 +654,6 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res if device == nil { return nil, fmt.Errorf("allocatable not found for device %q", result.Device) } - // only proceed with config mapping if device is healthy. - if featuregates.Enabled(featuregates.NVMLDeviceHealthCheck) { - if !device.IsHealthy() { - return nil, fmt.Errorf("device %q is not healthy", result.Device) - } - } for _, c := range slices.Backward(configs) { if slices.Contains(c.Requests, result.Request) { if _, ok := c.Config.(*configapi.GpuConfig); ok && device.Type() != GpuDeviceType { @@ -1090,31 +1084,6 @@ func GetOpaqueDeviceConfigs( return resultConfigs, nil } -// TODO: this needs a code comment. -func (s *DeviceState) UpdateDeviceHealthStatus(d *AllocatableDevice, hs HealthStatus) { - s.Lock() - defer s.Unlock() - - switch d.Type() { - case GpuDeviceType: - d.Gpu.health = hs - case MigDynamicDeviceType: - // Here we do not have access to a concrete MIG device. The - // 'allocatable' MIG device is an abstract representation to a specific - // MIG device. - klog.Warningf("UpdateDeviceHealthStatus() called for abstract, dynamic MIG device %s: %s", d.MigDynamic.CanonicalName(), hs) - case MigStaticDeviceType: - // Does it make sense to update the health for a MIG device? Do we - // receive health events that are specific to individual MIG devices? If - // yes: does this allow for making conclusions about the parent device? - d.MigStatic.health = hs - default: - klog.V(6).Infof("Cannot update health status for unknown device type: %s", d.Type()) - return - } - klog.V(4).Infof("Updated device: %s health status to %s", d.UUID(), hs) -} - // requestedNonAdminDevices returns the set of device names requested by the claim, // excluding admin-access allocations. func (s *DeviceState) requestedNonAdminDevices(claim *resourceapi.ResourceClaim) map[string]struct{} { @@ -1232,3 +1201,11 @@ func syncPreparedDevicesGaugeFromCheckpoint(nodeName string, cp *Checkpoint) { } } } + +// AddDeviceTaint adds or updates a DRA device taint on the given device under +// the DeviceState lock. Returns true if the taint set was actually modified. +func (s *DeviceState) AddDeviceTaint(d *AllocatableDevice, taint *resourceapi.DeviceTaint) bool { + s.Lock() + defer s.Unlock() + return d.AddOrUpdateTaint(taint) +} diff --git a/cmd/gpu-kubelet-plugin/deviceinfo.go b/cmd/gpu-kubelet-plugin/deviceinfo.go index 6089ea4cf..33f5b662f 100644 --- a/cmd/gpu-kubelet-plugin/deviceinfo.go +++ b/cmd/gpu-kubelet-plugin/deviceinfo.go @@ -27,15 +27,6 @@ import ( "k8s.io/utils/ptr" ) -// Defined similarly as https://pkg.go.dev/k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1#Healthy. -type HealthStatus string - -const ( - Healthy HealthStatus = "Healthy" - // With NVMLDeviceHealthCheck, Unhealthy means that there are critcal xid errors on the device. - Unhealthy HealthStatus = "Unhealthy" -) - // Represents a specific, full, physical GPU device. type GpuInfo struct { UUID string `json:"uuid"` @@ -54,7 +45,6 @@ type GpuInfo struct { pcieRootAttr *deviceattribute.DeviceAttribute migProfiles []*MigProfileInfo addressingMode *string - health HealthStatus // The following properties that can only be known after inspecting MIG // profiles. @@ -88,7 +78,6 @@ type MigDeviceInfo struct { parent *GpuInfo giProfileInfo *nvml.GpuInstanceProfileInfo ciProfileInfo *nvml.ComputeInstanceProfileInfo - health HealthStatus } type VfioDeviceInfo struct { diff --git a/cmd/gpu-kubelet-plugin/driver.go b/cmd/gpu-kubelet-plugin/driver.go index 8c8cf5bc0..35eb5d0e8 100644 --- a/cmd/gpu-kubelet-plugin/driver.go +++ b/cmd/gpu-kubelet-plugin/driver.go @@ -47,7 +47,9 @@ const DriverPrepUprepFlockFileName = "pu.lock" type deviceHealthMonitor interface { Start(context.Context) error Stop() - Unhealthy() <-chan *AllocatableDevice + Unhealthy() <-chan *DeviceHealthEvent + // Allows the driver to query the HealthMonitor's health policy + IsEventNonFatal(event *DeviceHealthEvent) bool } type driver struct { @@ -462,47 +464,38 @@ func (d *driver) deviceHealthEvents(ctx context.Context, nodeName string) { case <-ctx.Done(): klog.V(6).Info("Stop processing device health notifications") return - case device, ok := <-d.deviceHealthMonitor.Unhealthy(): + case event, ok := <-d.deviceHealthMonitor.Unhealthy(): if !ok { // NVML based deviceHealthMonitor is expected to close only during driver Shutdown. klog.V(6).Info("Health monitor channel closed") return } - uuid := device.UUID() - klog.Warningf("Received unhealthy notification for device: %s", uuid) - - if !device.IsHealthy() { - klog.V(6).Infof("Device: %s is aleady marked unhealthy. Skip republishing ResourceSlice", uuid) + taint := healthEventToTaint(d.deviceHealthMonitor, event) + modified := false + for _, dev := range event.Devices { + klog.Warningf("Received %s health event for device %s", event.EventType, dev.UUID()) + if d.state.AddDeviceTaint(dev, taint) { + modified = true + } + } + if !modified { continue } - // Mark device as unhealthy. - d.state.UpdateDeviceHealthStatus(device, Unhealthy) - - // Republish resource slice with only healthy devices - // There is no remediation loop right now meaning if the unhealthy device is fixed, - // driver needs to be restarted to publish the ResourceSlice with all devices var resourceSlice resourceslice.Slice for _, devices := range d.state.perGPUAllocatable.allocatablesMap { for _, dev := range devices { - uuid := dev.UUID() - if dev.IsHealthy() { - klog.V(6).Infof("Device: %s is healthy, added to ResoureSlice", uuid) - resourceSlice.Devices = append(resourceSlice.Devices, dev.GetDevice()) - } else { - klog.Warningf("Device: %s is unhealthy, will be removed from ResoureSlice", uuid) + d := dev.GetDevice() + + taints := dev.Taints() + if len(taints) > 0 { + d.Taints = taints } + resourceSlice.Devices = append(resourceSlice.Devices, d) } } - klog.V(4).Info("Rebulishing resourceslice with healthy devices") - resources := resourceslice.DriverResources{ - Pools: map[string]resourceslice.Pool{ - nodeName: {Slices: []resourceslice.Slice{resourceSlice}}, - }, - } - // NOTE: We only log an error on publish failure and do not retry. // If this publish fails, our in-memory health update succeeds but the // ResourceSlice in the API server remains stale and still advertises the @@ -514,10 +507,25 @@ func (d *driver) deviceHealthEvents(ctx context.Context, nodeName string) { // This is a temporary compromise while device taints/tolerations (KEP-5055) // are available as a Beta feature. An interim improvement could be adding // a retry/backoff or switch to patch updates instead of full republish. + klog.V(4).Infof("Republishing ResourceSlice: %d device(s) tainted with %s=%q (effect=%s)", + len(event.Devices), taint.Key, taint.Value, taint.Effect) + + resources := resourceslice.DriverResources{ + Pools: map[string]resourceslice.Pool{ + nodeName: {Slices: []resourceslice.Slice{resourceSlice}}, + }, + } + + // NOTE: GPU_LOST and unmonitored events are already batched at the + // sender (all affected devices arrive in a single DeviceHealthEvent). + // XID events are still per-device and may cause repeated publishes. + // TODO: Add receiver-side event aggregation before PublishResources. + // Evaluate two strategies: + // 1. Channel drain: non-blocking pull of all pending events (Pro: zero latency; Con: susceptible to NVML lag). + // 2. Timer debounce: e.g., 50ms window (Pro: standard K8s API protection; Con: slight delay). + // This also needs to be handle properly in the recovery path. if err := d.pluginhelper.PublishResources(ctx, resources); err != nil { - klog.Errorf("Failed to publish resources after device health status update: %v", err) - } else { - klog.V(4).Info("Successfully republished resources without unhealthy device") + klog.Errorf("Failed to publish resources after taint update: %v", err) } } } diff --git a/cmd/gpu-kubelet-plugin/nvlib.go b/cmd/gpu-kubelet-plugin/nvlib.go index fa620c3e6..3e18005cf 100644 --- a/cmd/gpu-kubelet-plugin/nvlib.go +++ b/cmd/gpu-kubelet-plugin/nvlib.go @@ -570,7 +570,6 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) pciBusIDAttr: pciBusIDAttr, pcieRootAttr: pcieRootAttr, migProfiles: migProfiles, - health: Healthy, addressingMode: addressingMode, } @@ -749,7 +748,6 @@ func (l deviceLib) getMigDevices(gpuInfo *GpuInfo) (map[string]*MigDeviceInfo, e gIInfo: &giInfo, ciProfileInfo: ciProfileInfo, cIInfo: &ciInfo, - health: Healthy, } return nil })