-
Notifications
You must be signed in to change notification settings - Fork 155
Add taints on health events #983
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e58c6da
8007167
f591b45
7e8685e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,21 +24,93 @@ import ( | |
| "sync" | ||
|
|
||
| "github.com/NVIDIA/go-nvml/pkg/nvml" | ||
| resourceapi "k8s.io/api/resource/v1" | ||
| "k8s.io/klog/v2" | ||
| ) | ||
|
|
||
| const ( | ||
| FullGPUInstanceID uint32 = 0xFFFFFFFF | ||
| ) | ||
|
|
||
| const ( | ||
| TaintKeyXID = DriverName + "/xid" | ||
| TaintKeyGPULost = DriverName + "/gpu-lost" | ||
| TaintKeyUnmonitored = DriverName + "/unmonitored" | ||
| ) | ||
|
|
||
| // TODO(issue #1001):Remove this hardcoded constant and switch to the upstream | ||
| // resourceapi.DeviceTaintEffectNone once the k8s.io/api dependency is bumped | ||
| // to a version that includes it (KEP-5055 beta). | ||
| // DeviceTaintEffectNone is an informational effect that does not affect | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This hardcoded constant is fragile. If the upstream Consider pinning the vendored
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. mmm its going to be the same https://pkg.go.dev/k8s.io/kubernetes/pkg/apis/resource#DeviceTaintEffect I created the issue: #1001 |
||
| // scheduling or eviction. | ||
| const DeviceTaintEffectNone resourceapi.DeviceTaintEffect = "None" | ||
|
|
||
| // DeviceHealthEventType classifies the category of health event detected by | ||
| // the NVML health monitor. | ||
| type DeviceHealthEventType string | ||
|
|
||
| const ( | ||
| HealthEventXID DeviceHealthEventType = "xid" | ||
| HealthEventGPULost DeviceHealthEventType = "gpu-lost" | ||
| HealthEventUnmonitored DeviceHealthEventType = "unmonitored" | ||
| ) | ||
|
|
||
| // DeviceHealthEvent carries a typed health notification from the NVML health | ||
| // monitor to the driver's event handler, enabling the driver to set the | ||
| // appropriate DRA device taint per the Option A schema (KEP-5055). | ||
| // Devices is a batch: for GPU_LOST and unmonitored events where all affected devices | ||
| // are aggregated into a single event so the consumer applies one ResourceSlice | ||
| // update instead of N. | ||
| type DeviceHealthEvent struct { | ||
| Devices []*AllocatableDevice | ||
| EventType DeviceHealthEventType | ||
| // inspired by NVML Event type and only meaningful for xid errors. | ||
| // may have to create a custom type based on future device-api | ||
| EventData uint64 | ||
| } | ||
|
|
||
| // healthEventToTaint maps a DeviceHealthEvent to the corresponding DRA | ||
| // DeviceTaint using the Option A taint key schema: one key per health | ||
| // dimension under the gpu.nvidia.com domain. | ||
| func healthEventToTaint(monitor deviceHealthMonitor, event *DeviceHealthEvent) *resourceapi.DeviceTaint { | ||
| switch event.EventType { | ||
| case HealthEventXID: | ||
| effect := resourceapi.DeviceTaintEffectNoSchedule | ||
| if monitor != nil && monitor.IsEventNonFatal(event) { | ||
| effect = DeviceTaintEffectNone | ||
| } | ||
| return &resourceapi.DeviceTaint{ | ||
| Key: TaintKeyXID, | ||
| Value: strconv.FormatUint(event.EventData, 10), | ||
| Effect: effect, | ||
| } | ||
| case HealthEventGPULost: | ||
| return &resourceapi.DeviceTaint{ | ||
| Key: TaintKeyGPULost, | ||
| Effect: resourceapi.DeviceTaintEffectNoSchedule, | ||
| } | ||
| case HealthEventUnmonitored: | ||
| return &resourceapi.DeviceTaint{ | ||
| Key: TaintKeyUnmonitored, | ||
| Effect: DeviceTaintEffectNone, | ||
| } | ||
| default: | ||
| klog.Errorf("Unknown health event type %q, defaulting to unmonitored taint", event.EventType) | ||
| return &resourceapi.DeviceTaint{ | ||
| Key: TaintKeyUnmonitored, | ||
| Effect: DeviceTaintEffectNone, | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // For a MIG device the placement is defined by the 3-tuple <parent UUID, GI, CI>. | ||
| // For a full device the returned 3-tuple is the device's uuid and (FullGPUInstanceID) 0xFFFFFFFF for the other two elements. | ||
| type devicePlacementMap map[string]map[uint32]map[uint32]*AllocatableDevice | ||
|
|
||
| type nvmlDeviceHealthMonitor struct { | ||
| nvmllib nvml.Interface | ||
| eventSet nvml.EventSet | ||
| unhealthy chan *AllocatableDevice | ||
| unhealthy chan *DeviceHealthEvent | ||
| deviceByPlacement devicePlacementMap | ||
| skippedXids map[uint64]bool | ||
| wg sync.WaitGroup | ||
|
|
@@ -61,7 +133,7 @@ func newNvmlDeviceHealthMonitor(config *Config, perGPUAllocatable *PerGPUAllocat | |
| all := perGPUAllocatable.GetAllDevices() | ||
| m := &nvmlDeviceHealthMonitor{ | ||
| nvmllib: nvdevlib.nvmllib, | ||
| unhealthy: make(chan *AllocatableDevice, len(all)), | ||
| unhealthy: make(chan *DeviceHealthEvent, len(all)), | ||
| deviceByPlacement: getDevicePlacementMap(all), | ||
| skippedXids: xidsToSkip(config.flags.additionalXidsToIgnore), | ||
| } | ||
|
|
@@ -106,25 +178,25 @@ func (m *nvmlDeviceHealthMonitor) registerEventsForDevices() { | |
| for parentUUID, giMap := range m.deviceByPlacement { | ||
| gpu, ret := m.nvmllib.DeviceGetHandleByUUID(parentUUID) | ||
| if ret != nvml.SUCCESS { | ||
| klog.Warningf("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy", parentUUID, ret) | ||
| m.markAllMigDevicesUnhealthy(giMap) | ||
| klog.Warningf("Unable to get device handle from UUID[%s]: %v; marking devices as unmonitored", parentUUID, ret) | ||
| m.sendHealthEventForDevices(giMap, HealthEventUnmonitored) | ||
| continue | ||
| } | ||
|
|
||
| supportedEvents, ret := gpu.GetSupportedEventTypes() | ||
| if ret != nvml.SUCCESS { | ||
| klog.Warningf("unable to determine the supported events for %s: %v; marking it as unhealthy", parentUUID, ret) | ||
| m.markAllMigDevicesUnhealthy(giMap) | ||
| klog.Warningf("unable to determine the supported events for %s: %v; marking devices as unmonitored", parentUUID, ret) | ||
| m.sendHealthEventForDevices(giMap, HealthEventUnmonitored) | ||
| continue | ||
| } | ||
|
|
||
| ret = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet) | ||
| if ret == nvml.ERROR_NOT_SUPPORTED { | ||
| klog.Warningf("Device %v is too old to support healthchecking.", parentUUID) | ||
| } | ||
| if ret != nvml.SUCCESS { | ||
| klog.Warningf("unable to register events for %s: %v; marking it as unhealthy", parentUUID, ret) | ||
| m.markAllMigDevicesUnhealthy(giMap) | ||
| m.sendHealthEventForDevices(giMap, HealthEventUnmonitored) | ||
| } else if ret != nvml.SUCCESS { | ||
| klog.Warningf("unable to register events for %s: %v; marking devices as unmonitored", parentUUID, ret) | ||
| m.sendHealthEventForDevices(giMap, HealthEventUnmonitored) | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -162,8 +234,8 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) { | |
| // Ref doc: [https://docs.nvidia.com/deploy/nvml-api/group__nvmlEvents.html#group__nvmlEvents_1g9714b0ca9a34c7a7780f87fee16b205c]. | ||
| if ret != nvml.SUCCESS { | ||
| if ret == nvml.ERROR_GPU_IS_LOST { | ||
| klog.Warningf("GPU is lost error: %v; Marking all devices as unhealthy", ret) | ||
| m.markAllDevicesUnhealthy() | ||
| klog.Warningf("GPU is lost error: %v; Tainting all devices with %s", ret, TaintKeyGPULost) | ||
| m.sendHealthEventForAllDevices(HealthEventGPULost) | ||
| continue | ||
| } | ||
| klog.V(6).Infof("Error waiting for NVML event: %v. Retrying...", ret) | ||
|
|
@@ -180,19 +252,14 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) { | |
| continue | ||
| } | ||
|
|
||
| if m.skippedXids[xid] { | ||
| klog.V(6).Infof("Skipping XID event: Data=%d, Type=%d, GI=%d, CI=%d", xid, eType, gi, ci) | ||
| continue | ||
| } | ||
|
|
||
| klog.V(4).Infof("Processing event XID=%d event", xid) | ||
| // this seems an extreme action. | ||
| // should we just log the error and proceed anyway. | ||
| // TODO: look into how to properly handle this error. | ||
| eventUUID, ret := event.Device.GetUUID() | ||
| if ret != nvml.SUCCESS { | ||
| klog.Warningf("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", event, ret) | ||
| m.markAllDevicesUnhealthy() | ||
| klog.Warningf("Failed to determine uuid for event %v: %v; Tainting all devices with %s", event, ret, TaintKeyGPULost) | ||
| m.sendHealthEventForAllDevices(HealthEventGPULost) | ||
| continue | ||
| } | ||
| affectedDevice := m.deviceByPlacement.get(eventUUID, gi, ci) | ||
|
|
@@ -201,41 +268,65 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) { | |
| continue | ||
| } | ||
|
|
||
| klog.V(4).Infof("Sending unhealthy notification for device %s due to event type:%v and event data:%d", affectedDevice.UUID(), eType, xid) | ||
| m.unhealthy <- affectedDevice | ||
| klog.V(4).Infof("Sending XID=%d health event for device %s", xid, affectedDevice.UUID()) | ||
| m.unhealthy <- &DeviceHealthEvent{ | ||
| Devices: []*AllocatableDevice{affectedDevice}, | ||
| EventType: HealthEventXID, | ||
| EventData: xid, | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *AllocatableDevice { | ||
| func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *DeviceHealthEvent { | ||
| return m.unhealthy | ||
| } | ||
|
|
||
| func (m *nvmlDeviceHealthMonitor) markAllDevicesUnhealthy() { | ||
| // sendHealthEventForAllDevices aggregates every device across all GPUs into a | ||
| // single batched DeviceHealthEvent so the consumer makes one ResourceSlice | ||
| // update. | ||
| func (m *nvmlDeviceHealthMonitor) sendHealthEventForAllDevices(eventType DeviceHealthEventType) { | ||
| var devices []*AllocatableDevice | ||
| for _, giMap := range m.deviceByPlacement { | ||
| m.markAllMigDevicesUnhealthy(giMap) | ||
| devices = append(devices, flattenMIGDeviceMap(giMap)...) | ||
| } | ||
| m.sendBatchedHealthEvent(devices, eventType) | ||
| } | ||
|
|
||
| // sendHealthEventForDevices aggregates all devices under a single parent GPU | ||
| // into one batched DeviceHealthEvent. | ||
| func (m *nvmlDeviceHealthMonitor) sendHealthEventForDevices(giMap map[uint32]map[uint32]*AllocatableDevice, eventType DeviceHealthEventType) { | ||
| m.sendBatchedHealthEvent(flattenMIGDeviceMap(giMap), eventType) | ||
| } | ||
|
|
||
| // markAllMigDevicesUnhealthy is a helper function to mark every mig device under a parent as unhealthy. | ||
| func (m *nvmlDeviceHealthMonitor) markAllMigDevicesUnhealthy(giMap map[uint32]map[uint32]*AllocatableDevice) { | ||
| // flattenMIGDeviceMap flattens a GI→CI device map into a slice. | ||
| func flattenMIGDeviceMap(giMap map[uint32]map[uint32]*AllocatableDevice) []*AllocatableDevice { | ||
| var devices []*AllocatableDevice | ||
| for _, ciMap := range giMap { | ||
| for _, dev := range ciMap { | ||
| // Non-blocking send to avoid deadlocks if channel is full. | ||
| select { | ||
| case m.unhealthy <- dev: | ||
| klog.V(6).Infof("Marked device %s as unhealthy", dev.UUID()) | ||
| // TODO: The non-blocking send protects the health-monitor goroutine from deadlocks, | ||
| // but dropping an unhealthy notification means the device's health transition may | ||
| // never reach the consumer. Consider follow-up improvements: | ||
| // - increase the channel buffer beyond len(allocatable) to reduce backpressure; | ||
| // - introduce a special "all devices unhealthy" message when bulk updates occur; | ||
| // - or revisit whether blocking briefly here is acceptable. | ||
| default: | ||
| klog.Errorf("Unhealthy channel full. Dropping unhealthy notification for device %s", dev.UUID()) | ||
| } | ||
| devices = append(devices, dev) | ||
| } | ||
| } | ||
| return devices | ||
| } | ||
|
|
||
| // sendBatchedHealthEvent sends a single DeviceHealthEvent containing all | ||
| // affected devices. Uses a non-blocking send to protect the monitor goroutine | ||
| // from deadlocks when the channel is full. | ||
| func (m *nvmlDeviceHealthMonitor) sendBatchedHealthEvent(devices []*AllocatableDevice, eventType DeviceHealthEventType) { | ||
| if len(devices) == 0 { | ||
| return | ||
| } | ||
| event := &DeviceHealthEvent{ | ||
| Devices: devices, | ||
| EventType: eventType, | ||
| } | ||
| select { | ||
| case m.unhealthy <- event: | ||
| klog.V(6).Infof("Sent batched %s health event for %d device(s)", eventType, len(devices)) | ||
| default: | ||
| klog.Errorf("Health event channel full; dropping batched %s event for %d device(s)", eventType, len(devices)) | ||
| } | ||
| } | ||
|
|
||
| // The purpose of this function is to allow for a O(1) lookup of | ||
|
|
@@ -268,7 +359,7 @@ func getDevicePlacementMap(allocatable AllocatableDevices) devicePlacementMap { | |
| continue | ||
| } | ||
| giID = d.MigStatic.gIInfo.Id | ||
| ciID = d.MigStatic.gIInfo.Id | ||
| ciID = d.MigStatic.cIInfo.Id | ||
|
|
||
| default: | ||
| // This may be a problem; and should be logged | ||
|
|
@@ -304,8 +395,8 @@ func (p devicePlacementMap) get(uuid string, gi, ci uint32) *AllocatableDevice { | |
| } | ||
|
|
||
| // getAdditionalXids returns a list of additional Xids to skip from the specified string. | ||
| // The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values. | ||
| // Invalid values nare ignored. | ||
| // The input is treated as a comma-separated string and all valid uint64 values are considered as Xid values. | ||
| // Invalid values are ignored. | ||
| // TODO: add list of EXPLICIT XIDs from [https://github.com/NVIDIA/k8s-device-plugin/pull/1443]. | ||
| func getAdditionalXids(input string) []uint64 { | ||
| if input == "" { | ||
|
|
@@ -353,3 +444,13 @@ func xidsToSkip(additionalXids string) map[uint64]bool { | |
| } | ||
| return skippedXids | ||
| } | ||
|
|
||
| // IsEventNonFatal evaluates whether a hardware event is considered an application-level | ||
| // warning (None) rather than a critical hardware failure (NoSchedule). | ||
| // Currently, it only checks for XID events. | ||
| func (m *nvmlDeviceHealthMonitor) IsEventNonFatal(event *DeviceHealthEvent) bool { | ||
| if event.EventType == HealthEventXID { | ||
| return m.skippedXids[event.EventData] | ||
| } | ||
| return false | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AddOrUpdateTaintdeduplicates by key, so if a device gets XID 48 then XID 63, the value overwrites to"63"and the first XID info is lost. This is consistent with Option A's one-key-per-dimension design, but should be documented — e.g., a comment here noting that only the most recent XID is retained per device.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done