Skip to content

Commit cd7171c

Browse files
authored
Add xpumd health source support for GPU plugin (#2279)
The GPU plugin currently gets health data from the Level-Zero sidecar, which requires a privileged container. XPUMD 2.x exposes a local gRPC streaming API (WatchDeviceHealth) that provides equivalent health info without needing a privileged sidecar. Signed-off-by: Tuomas Katila <tuomas.katila@intel.com> Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
1 parent 8249468 commit cd7171c

10 files changed

Lines changed: 816 additions & 83 deletions

File tree

cmd/gpu_plugin/README.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Table of Contents
1919
* [CDI support](#cdi-support)
2020
* [KMD and UMD](#kmd-and-umd)
2121
* [Health management](#health-management)
22+
* [xpumd health source](#xpumd-health-source)
2223
* [by-path mounting](#by-path-mounting)
2324
* [Issues with media workloads on multi-GPU setups](#issues-with-media-workloads-on-multi-gpu-setups)
2425
* [Workaround for QSV and VA-API](#workaround-for-qsv-and-va-api)
@@ -59,6 +60,7 @@ For workloads on different KMDs, see [KMD and UMD](#kmd-and-umd).
5960
| -enable-monitoring | - | disabled | Enable '*_monitoring' resource that provides access to all Intel GPU devices on the node, [see use](./monitoring.md) |
6061
| -monitoring-mode | string | single | How monitoring resources are registered: single or split |
6162
| -health-management | - | disabled | Enable health management by requesting data from oneAPI/Level-Zero interface. Requires [GPU Level-Zero](../gpu_levelzero/) sidecar. See [health management](#health-management) |
63+
| -xpumd-endpoint | string | "" | Unix socket path for xpumd health service (e.g. `/run/xpumd/intelxpuinfo.sock`). When set, xpumd is used as the health data source instead of the Level-Zero sidecar. Cannot be combined with `-health-management`. Temperature limits are specified in xpumd service configuration, not with GPU plugin flags. See [xpumd health source](#xpumd-health-source) |
6264
| -wsl | - | disabled | Adapt plugin to run in the WSL environment. Requires [GPU Level-Zero](../gpu_levelzero/) sidecar. |
6365
| -shared-dev-num | int | 1 | Number of containers that can share the same GPU device |
6466
| -allow-ids | string | "" | A list of PCI Device IDs that are allowed to be registered as resources. Default is empty (=all registered). Cannot be used together with `deny-ids`. |
@@ -265,6 +267,22 @@ Kubernetes Device Plugin API allows passing device's healthiness to Kubelet. By
265267

266268
Temperature limit can be provided via the command line argument, default is 100C.
267269

270+
### xpumd health source
271+
272+
As an alternative to the Level-Zero sidecar, the GPU plugin can obtain device health data from [Intel XPU Manager (xpumd)](https://github.com/intel/xpumanager) v2.x, which provides equivalent health information without requiring a privileged sidecar container.
273+
274+
When xpumd is running on the host it creates a unix socket (default: `/run/xpumd/intelxpuinfo.sock`). The GPU plugin connects to this socket and streams health events. A device is reported as `Unhealthy` if any health domain reports severity `WARNING` or higher.
275+
276+
To use xpumd as the health source, deploy the plugin with the provided Kustomize overlay:
277+
278+
```bash
279+
kubectl apply -k 'https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/gpu_plugin/overlays/xpumd?ref=<RELEASE_VERSION>'
280+
```
281+
282+
The overlay mounts `/run/xpumd` from the host (read-only) into the plugin pod and passes the required `-xpumd-endpoint` and `-enable-monitoring` flags automatically.
283+
284+
> **Note**: The `-xpumd-endpoint` and (sidecar) `-health-management` flags are mutually exclusive. Sidecar-specific temperature limit flags (`-temp-limit`, `-gpu-temp-limit`, `-memory-temp-limit`) are not applicable when using `xpumd` as the health source.
285+
268286
### By-path mounting
269287

270288
The DRM devices for the Intel GPUs register `by-path` symlinks under `/dev/dri/by-path`. For each GPU character device, there is a corresponding symlink in the by-path directory:

cmd/gpu_plugin/gpu_plugin.go

Lines changed: 157 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
package main
1616

1717
import (
18+
"context"
1819
"flag"
1920
"fmt"
2021
"io/fs"
@@ -32,6 +33,7 @@ import (
3233
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
3334

3435
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice"
36+
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/xpumdservice"
3537
gpulevelzero "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
3638
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
3739
cdispec "tags.cncf.io/container-device-interface/specs-go"
@@ -71,6 +73,9 @@ const (
7173

7274
// Period of device scans.
7375
scanPeriod = 5 * time.Second
76+
77+
// Default limit for temperatures.
78+
defaultTempLimit = 100
7479
)
7580

7681
type cliOptions struct {
@@ -79,6 +84,7 @@ type cliOptions struct {
7984
denyIDs string
8085
bypathMount string
8186
monitoringMode string
87+
xpumdEndpoint string
8288
sharedDevNum int
8389
globalTempLimit int
8490
memoryTempLimit int
@@ -88,6 +94,20 @@ type cliOptions struct {
8894
healthManagement bool
8995
}
9096

97+
// argError is the error type used for rejected command line arguments.
type argError struct {
	msg string
}

// Error implements the error interface. The stored message is prefixed with
// "argument error: ".
func (e argError) Error() string {
	return "argument error: " + e.msg
}

// newArgError returns an argError carrying msg, typed as a plain error.
func newArgError(msg string) error {
	return argError{msg: msg}
}
110+
91111
func validatePCIDeviceIDs(pciIDList string) error {
92112
if pciIDList == "" {
93113
return nil
@@ -98,11 +118,11 @@ func validatePCIDeviceIDs(pciIDList string) error {
98118
for id := range strings.SplitSeq(pciIDList, ",") {
99119
id = strings.TrimSpace(id)
100120
if id == "" {
101-
return os.ErrNotExist
121+
return newArgError("empty PCI ID")
102122
}
103123

104124
if !r.MatchString(id) {
105-
return os.ErrInvalid
125+
return newArgError("invalid PCI ID: " + id)
106126
}
107127
}
108128

@@ -191,6 +211,7 @@ type devicePlugin struct {
191211
scanResources chan bool
192212

193213
levelzeroService levelzeroservice.LevelzeroService
214+
xpumdService xpumdservice.XpumdService
194215

195216
sysfsDrmDir string
196217
devFsRoot string
@@ -255,15 +276,33 @@ func logHealthStatusChange(card, newStatus string, statuses map[string]string) {
255276
}
256277
}
257278

258-
func (dp *devicePlugin) healthStatusForCard(cardPath string) string {
259-
if dp.levelzeroService == nil {
260-
return pluginapi.Healthy
261-
}
262-
279+
// bdfForCard resolves the PCI BDF address for a card sysfs path by following
280+
// the "device" symlink. It returns ("", false) when the link cannot be read.
281+
func bdfForCard(cardPath string) (string, bool) {
263282
link, err := os.Readlink(filepath.Join(cardPath, "device"))
264283
if err != nil {
265284
klog.Warning("couldn't read device link for", cardPath)
266285

286+
return "", false
287+
}
288+
289+
return filepath.Base(link), true
290+
}
291+
292+
func (dp *devicePlugin) healthStatusForCard(cardPath string) string {
293+
if dp.xpumdService != nil {
294+
return dp.healthStatusForCardXpumd(cardPath)
295+
} else if dp.levelzeroService != nil {
296+
return dp.healthStatusForCardLZ(cardPath)
297+
}
298+
299+
return pluginapi.Healthy
300+
}
301+
302+
// healthStatusForCardLZ checks device health using the Level-Zero service.
303+
func (dp *devicePlugin) healthStatusForCardLZ(cardPath string) string {
304+
bdfAddr, ok := bdfForCard(cardPath)
305+
if !ok {
267306
return pluginapi.Healthy
268307
}
269308

@@ -272,8 +311,6 @@ func (dp *devicePlugin) healthStatusForCard(cardPath string) string {
272311
// Check status changes after the function exits
273312
defer func() { logHealthStatusChange(cardPath, health, dp.healthStatuses) }()
274313

275-
bdfAddr := filepath.Base(link)
276-
277314
dh, err := dp.levelzeroService.GetDeviceHealth(bdfAddr)
278315
if err != nil {
279316
klog.Warningf("Device health retrieval failed: %v", err)
@@ -311,6 +348,34 @@ func (dp *devicePlugin) healthStatusForCard(cardPath string) string {
311348
return health
312349
}
313350

351+
// healthStatusForCardXpumd checks device health using the xpumd service.
352+
func (dp *devicePlugin) healthStatusForCardXpumd(cardPath string) string {
353+
bdfAddr, ok := bdfForCard(cardPath)
354+
if !ok {
355+
return pluginapi.Healthy
356+
}
357+
358+
health := pluginapi.Healthy
359+
360+
// Check status changes after the function exits
361+
defer func() { logHealthStatusChange(cardPath, health, dp.healthStatuses) }()
362+
363+
healthy, err := dp.xpumdService.GetDeviceHealth(bdfAddr)
364+
if err != nil {
365+
klog.Warningf("xpumd device health retrieval failed: %v", err)
366+
367+
return health
368+
}
369+
370+
klog.V(4).Infof("xpumd health for %s: Healthy=%t", bdfAddr, healthy)
371+
372+
if !healthy {
373+
health = pluginapi.Unhealthy
374+
}
375+
376+
return health
377+
}
378+
314379
// Implement the PreferredAllocator interface.
315380
func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
316381
response := &pluginapi.PreferredAllocationResponse{}
@@ -706,23 +771,67 @@ func (dp *devicePlugin) Allocate(request *pluginapi.AllocateRequest) (*pluginapi
706771
return nil, &dpapi.UseDefaultMethodError{}
707772
}
708773

709-
func checkAllowDenyOptions(opts cliOptions) bool {
774+
func checkBasics(opts cliOptions) error {
775+
if opts.sharedDevNum < 1 {
776+
return newArgError("the number of containers sharing the same GPU must greater than zero")
777+
}
778+
779+
var str = opts.preferredAllocationPolicy
780+
if !(str == "balanced" || str == "packed" || str == "none") {
781+
return newArgError("invalid value for preferredAllocationPolicy, the valid values: balanced, packed, none")
782+
}
783+
710784
if len(opts.allowIDs) > 0 && len(opts.denyIDs) > 0 {
711-
klog.Error("Cannot use both allow-ids and deny-ids options at the same time. Please use only one of them.")
712-
return false
785+
return newArgError("cannot use both allow-ids and deny-ids options at the same time")
713786
}
714787

715788
if err := validatePCIDeviceIDs(opts.allowIDs); err != nil {
716-
klog.Error("Failed to validate allow-ids: ", err)
717-
return false
789+
return fmt.Errorf("failed to validate allow-ids: %w", err)
718790
}
719791

720792
if err := validatePCIDeviceIDs(opts.denyIDs); err != nil {
721-
klog.Error("Failed to validate deny-ids: ", err)
722-
return false
793+
return fmt.Errorf("failed to validate deny-ids: %w", err)
723794
}
724795

725-
return true
796+
switch opts.monitoringMode {
797+
case monitoringModeSingle:
798+
case monitoringModeSplit:
799+
default:
800+
return newArgError(fmt.Sprintf("invalid value for monitoring-mode, valid values: %s, %s",
801+
monitoringModeSplit, monitoringModeSingle))
802+
}
803+
804+
return nil
805+
}
806+
807+
func checkArgs(opts cliOptions) error {
808+
if err := checkBasics(opts); err != nil {
809+
return fmt.Errorf("%w", err)
810+
}
811+
812+
if opts.wslScan {
813+
if opts.enableMonitoring {
814+
return newArgError("monitoring is not supported within WSL.")
815+
}
816+
817+
if opts.healthManagement || opts.xpumdEndpoint != "" {
818+
return newArgError("health management is not supported within WSL.")
819+
}
820+
}
821+
822+
if opts.healthManagement && opts.xpumdEndpoint != "" {
823+
return newArgError("cannot use both Level-Zero sidecar and xpumd for health management.")
824+
}
825+
826+
if opts.xpumdEndpoint != "" {
827+
if opts.globalTempLimit != defaultTempLimit ||
828+
opts.gpuTempLimit != defaultTempLimit ||
829+
opts.memoryTempLimit != defaultTempLimit {
830+
return newArgError("temperature limits do not work with xpumd health source")
831+
}
832+
}
833+
834+
return nil
726835
}
727836

728837
func main() {
@@ -734,69 +843,57 @@ func main() {
734843
flag.StringVar(&prefix, "prefix", "", "Prefix for devfs & sysfs paths")
735844
flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable monitoring (= all GPUs) resource(s). See also --monitoring-mode")
736845
flag.StringVar(&opts.monitoringMode, "monitoring-mode", monitoringModeSingle, "monitoring resource mode when --enable-monitoring is set: single (combined gpu.intel.com/monitoring resource) or split (per-driver i915_monitoring/xe_monitoring resources)")
737-
flag.BoolVar(&opts.healthManagement, "health-management", false, "enable GPU health management")
846+
flag.BoolVar(&opts.healthManagement, "health-management", false, "enable Level-Zero sidecar based GPU health management")
847+
flag.StringVar(&opts.xpumdEndpoint, "xpumd-endpoint", "", "enable xpumd based health management. Argument is unix socket path for the xpumd health service (e.g. /run/xpumd/intelxpuinfo.sock). When set, health data is retrieved from xpumd")
738848
flag.StringVar(&opts.bypathMount, "bypath", bypathOptionSingle, "DRI device 'by-path/' directory mounting options: single, none, all. Default: single")
739849
flag.BoolVar(&opts.wslScan, "wsl", false, "scan for / use WSL devices")
740-
flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device")
741-
flag.IntVar(&opts.globalTempLimit, "temp-limit", 100, "Global temperature limit at which device is marked unhealthy")
742-
flag.IntVar(&opts.gpuTempLimit, "gpu-temp-limit", 100, "GPU temperature limit at which device is marked unhealthy")
743-
flag.IntVar(&opts.memoryTempLimit, "memory-temp-limit", 100, "Memory temperature limit at which device is marked unhealthy")
850+
flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device.")
851+
flag.IntVar(&opts.globalTempLimit, "temp-limit", defaultTempLimit, "Global temperature limit at which device is marked unhealthy. Use with health-managmement.")
852+
flag.IntVar(&opts.gpuTempLimit, "gpu-temp-limit", defaultTempLimit, "GPU temperature limit at which device is marked unhealthy. Use with health-managmement.")
853+
flag.IntVar(&opts.memoryTempLimit, "memory-temp-limit", defaultTempLimit, "Memory temperature limit at which device is marked unhealthy. Use with health-managmement.")
744854
flag.StringVar(&opts.preferredAllocationPolicy, "allocation-policy", "none", "modes of allocating GPU devices: balanced, packed and none")
745855
flag.StringVar(&opts.allowIDs, "allow-ids", "", "comma-separated list of device IDs to allow (e.g. 0x49c5,0x49c6)")
746856
flag.StringVar(&opts.denyIDs, "deny-ids", "", "comma-separated list of device IDs to deny (e.g. 0x49c5,0x49c6)")
747857

748858
flag.Parse()
749859

750-
if opts.sharedDevNum < 1 {
751-
klog.Error("The number of containers sharing the same GPU must greater than zero")
752-
os.Exit(1)
753-
}
754-
755-
var str = opts.preferredAllocationPolicy
756-
if !(str == "balanced" || str == "packed" || str == "none") {
757-
klog.Error("invalid value for preferredAllocationPolicy, the valid values: balanced, packed, none")
758-
os.Exit(1)
759-
}
860+
klog.V(1).Infof("GPU device plugin started with %s preferred allocation policy", opts.preferredAllocationPolicy)
760861

761-
if !checkAllowDenyOptions(opts) {
762-
klog.Error("Invalid allow/deny options.")
862+
plugin := newDevicePlugin(prefix+sysFsRoot, prefix+devFsRoot, opts)
763863

764-
os.Exit(1)
864+
if err := checkArgs(plugin.options); err != nil {
865+
klog.Fatal("Argument check failed: ", err)
765866
}
766867

767-
switch opts.monitoringMode {
768-
case monitoringModeSingle:
769-
case monitoringModeSplit:
770-
default:
771-
klog.Fatalf("invalid value for monitoring-mode, valid values: %s, %s", monitoringModeSplit, monitoringModeSingle)
772-
}
868+
// Setup xpumd service if enabled
869+
setupXpumdService(plugin)
870+
// Setup Level-Zero service if enabled
871+
setupLevelZeroService(plugin)
773872

774-
klog.V(1).Infof("GPU device plugin started with %s preferred allocation policy", opts.preferredAllocationPolicy)
775-
776-
plugin := newDevicePlugin(prefix+sysFsRoot, prefix+devFsRoot, opts)
873+
manager := dpapi.NewManager(namespace, plugin)
874+
manager.Run()
875+
}
777876

778-
if plugin.options.wslScan {
779-
klog.Info("WSL mode requested")
877+
func setupLevelZeroService(plugin *devicePlugin) {
878+
if !plugin.options.healthManagement && !plugin.options.wslScan {
879+
return
880+
}
780881

781-
if plugin.options.enableMonitoring {
782-
klog.Error("Monitoring is not supported within WSL. Please disable monitoring.")
882+
klog.Info("levelzero service requested: ", gpulevelzero.DefaultUnixSocketPath)
783883

784-
os.Exit(1)
785-
}
884+
plugin.levelzeroService = levelzeroservice.NewLevelzero(gpulevelzero.DefaultUnixSocketPath)
786885

787-
if plugin.options.healthManagement {
788-
klog.Error("Health management is not supported within WSL. Please disable health management.")
886+
go plugin.levelzeroService.Run(true)
887+
}
789888

790-
os.Exit(1)
791-
}
889+
func setupXpumdService(plugin *devicePlugin) {
890+
if plugin.options.xpumdEndpoint == "" {
891+
return
792892
}
793893

794-
if plugin.options.healthManagement || plugin.options.wslScan {
795-
plugin.levelzeroService = levelzeroservice.NewLevelzero(gpulevelzero.DefaultUnixSocketPath)
894+
klog.Info("xpumd health source requested: ", plugin.options.xpumdEndpoint)
796895

797-
go plugin.levelzeroService.Run(true)
798-
}
896+
plugin.xpumdService = xpumdservice.NewXpumd(plugin.options.xpumdEndpoint)
799897

800-
manager := dpapi.NewManager(namespace, plugin)
801-
manager.Run()
898+
go plugin.xpumdService.Run(context.Background())
802899
}

0 commit comments

Comments
 (0)