|
| 1 | +From 3c576ab7e84f9c8ea8cc89691df6bc001b5d5dee Mon Sep 17 00:00:00 2001 |
| 2 | +From: Michael Henriksen <mhenriks@redhat.com> |
| 3 | +Date: Fri, 17 Apr 2026 23:29:54 -0400 |
| 4 | +Subject: [PATCH] Fix VM with PCI hostdev failing to restart after hotplug |
| 5 | + block volume |
| 6 | + |
| 7 | +When a hotplug block volume is mounted into the virt-launcher pod, |
| 8 | +allowBlockMajorMinor() calls cgroupManager.Set() to add the block |
| 9 | +device to the cgroup allowlist. On cgroups v2, this replaces the |
| 10 | +entire eBPF device filter program. The v2Manager rebuilds the program |
| 11 | +from its in-memory rule cache, which is initialized from |
| 12 | +generateDeviceRulesForVMI() and does not include devices provisioned |
| 13 | +by device plugins. This wipes access to device-plugin-provided nodes |
| 14 | +such as /dev/vfio/* (PCI/MDEV/GPU/SR-IOV passthrough) and |
| 15 | +/dev/bus/usb/* (USB passthrough), causing libvirt to fail with |
| 16 | +"pci backend driver type 'default' is not supported" when starting |
| 17 | +the domain. |
| 18 | + |
| 19 | +Fix by recursively scanning /dev/vfio/ and /dev/bus/usb/ inside the |
| 20 | +container and including all discovered device rules in the initial |
| 21 | +cache so they are preserved when the eBPF program is rebuilt. |
| 22 | + |
| 23 | +Fixes: https://github.com/kubevirt/kubevirt/issues/17124 |
| 24 | + |
| 25 | +Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
| 26 | +Signed-off-by: Michael Henriksen <mhenriks@redhat.com>
| 28 | +--- |
| 29 | + pkg/virt-handler/cgroup/BUILD.bazel | 3 + |
| 30 | + pkg/virt-handler/cgroup/cgroup_test.go | 50 ++++++++++++++ |
| 31 | + pkg/virt-handler/cgroup/util.go | 95 +++++++++++++++++++++++--- |
| 32 | + tests/storage/hotplug.go | 77 +++++++++++++++++++++ |
| 33 | + 4 files changed, 214 insertions(+), 11 deletions(-) |
| 34 | + |
| 35 | +diff --git a/pkg/virt-handler/cgroup/BUILD.bazel b/pkg/virt-handler/cgroup/BUILD.bazel |
| 36 | +index ace69f1d78..4f4ec95714 100644 |
| 37 | +--- a/pkg/virt-handler/cgroup/BUILD.bazel |
| 38 | ++++ b/pkg/virt-handler/cgroup/BUILD.bazel |
| 39 | +@@ -40,6 +40,9 @@ go_test( |
| 40 | + embed = [":go_default_library"], |
| 41 | + race = "on", |
| 42 | + deps = [ |
| 43 | ++ "//pkg/safepath:go_default_library", |
| 44 | ++ "//pkg/virt-handler/isolation:go_default_library", |
| 45 | ++ "//staging/src/kubevirt.io/api/core/v1:go_default_library", |
| 46 | + "//staging/src/kubevirt.io/client-go/testutils:go_default_library", |
| 47 | + "//vendor/github.com/onsi/ginkgo/v2:go_default_library", |
| 48 | + "//vendor/github.com/onsi/gomega:go_default_library", |
| 49 | +diff --git a/pkg/virt-handler/cgroup/cgroup_test.go b/pkg/virt-handler/cgroup/cgroup_test.go |
| 50 | +index 50b5198e2a..53450e2a06 100644 |
| 51 | +--- a/pkg/virt-handler/cgroup/cgroup_test.go |
| 52 | ++++ b/pkg/virt-handler/cgroup/cgroup_test.go |
| 53 | +@@ -20,12 +20,20 @@ |
| 54 | + package cgroup |
| 55 | + |
| 56 | + import ( |
| 57 | ++ "os" |
| 58 | ++ "path/filepath" |
| 59 | ++ |
| 60 | + . "github.com/onsi/ginkgo/v2" |
| 61 | + . "github.com/onsi/gomega" |
| 62 | + runc_cgroups "github.com/opencontainers/runc/libcontainer/cgroups" |
| 63 | + runc_configs "github.com/opencontainers/runc/libcontainer/configs" |
| 64 | + "github.com/opencontainers/runc/libcontainer/devices" |
| 65 | + "go.uber.org/mock/gomock" |
| 66 | ++ |
| 67 | ++ v1 "kubevirt.io/api/core/v1" |
| 68 | ++ |
| 69 | ++ "kubevirt.io/kubevirt/pkg/safepath" |
| 70 | ++ "kubevirt.io/kubevirt/pkg/virt-handler/isolation" |
| 71 | + ) |
| 72 | + |
| 73 | + var _ = Describe("cgroup manager", func() { |
| 74 | +@@ -195,3 +203,45 @@ var _ = Describe("cgroup manager", func() { |
| 75 | + ), |
| 76 | + ) |
| 77 | + }) |
| 78 | ++ |
| 79 | ++var _ = Describe("generateDeviceRulesForVMI", func() { |
| 80 | ++ var ( |
| 81 | ++ ctrl *gomock.Controller |
| 82 | ++ tempDir string |
| 83 | ++ ) |
| 84 | ++ |
| 85 | ++ BeforeEach(func() { |
| 86 | ++ ctrl = gomock.NewController(GinkgoT()) |
| 87 | ++ tempDir = GinkgoT().TempDir() |
| 88 | ++ Expect(os.MkdirAll(filepath.Join(tempDir, "dev"), 0755)).To(Succeed()) |
| 89 | ++ }) |
| 90 | ++ |
| 91 | ++ newMockIsolationWithMountRoot := func() isolation.IsolationResult { |
| 92 | ++ mountRoot, err := safepath.NewPathNoFollow(tempDir) |
| 93 | ++ Expect(err).ToNot(HaveOccurred()) |
| 94 | ++ |
| 95 | ++ mockIso := isolation.NewMockIsolationResult(ctrl) |
| 96 | ++ mockIso.EXPECT().MountRoot().Return(mountRoot, nil) |
| 97 | ++ return mockIso |
| 98 | ++ } |
| 99 | ++ |
| 100 | ++ It("should not fail when /dev/vfio does not exist", func() { |
| 101 | ++ rules, err := generateDeviceRulesForVMI(&v1.VirtualMachineInstance{}, newMockIsolationWithMountRoot(), "") |
| 102 | ++ Expect(err).ToNot(HaveOccurred()) |
| 103 | ++ Expect(rules).To(BeEmpty()) |
| 104 | ++ }) |
| 105 | ++ |
| 106 | ++ It("should not fail when /dev/vfio exists but is empty", func() { |
| 107 | ++ Expect(os.MkdirAll(filepath.Join(tempDir, "dev", "vfio"), 0755)).To(Succeed()) |
| 108 | ++ rules, err := generateDeviceRulesForVMI(&v1.VirtualMachineInstance{}, newMockIsolationWithMountRoot(), "") |
| 109 | ++ Expect(err).ToNot(HaveOccurred()) |
| 110 | ++ Expect(rules).To(BeEmpty()) |
| 111 | ++ }) |
| 112 | ++ |
| 113 | ++ It("should not fail when /dev/bus/usb exists but is empty", func() { |
| 114 | ++ Expect(os.MkdirAll(filepath.Join(tempDir, "dev", "bus", "usb"), 0755)).To(Succeed()) |
| 115 | ++ rules, err := generateDeviceRulesForVMI(&v1.VirtualMachineInstance{}, newMockIsolationWithMountRoot(), "") |
| 116 | ++ Expect(err).ToNot(HaveOccurred()) |
| 117 | ++ Expect(rules).To(BeEmpty()) |
| 118 | ++ }) |
| 119 | ++}) |
| 120 | +diff --git a/pkg/virt-handler/cgroup/util.go b/pkg/virt-handler/cgroup/util.go |
| 121 | +index 892113c83d..be115bad14 100644 |
| 122 | +--- a/pkg/virt-handler/cgroup/util.go |
| 123 | ++++ b/pkg/virt-handler/cgroup/util.go |
| 124 | +@@ -59,6 +59,9 @@ const ( |
| 125 | + V2 CgroupVersion = "v2" |
| 126 | + |
| 127 | + loggingVerbosity = 2 |
| 128 | ++ |
| 129 | ++ rwmPermissions = "rwm" |
| 130 | ++ rwPermissions = "rw" |
| 131 | + ) |
| 132 | + |
| 133 | + var ( |
| 134 | +@@ -126,6 +129,18 @@ func getSourceBlockToFsMigratedVolumes(vmi *v1.VirtualMachineInstance, host stri |
| 135 | + return vols |
| 136 | + } |
| 137 | + |
| 138 | ++func getDevicePermissionsFromCgroups() devices.Permissions { |
| 139 | ++ if cgroups.IsCgroup2UnifiedMode() { |
| 140 | ++ return rwmPermissions |
| 141 | ++ } else { |
| 142 | ++ return rwPermissions |
| 143 | ++ } |
| 144 | ++} |
| 145 | ++ |
| 146 | ++func getDeviceRwmPermissions() devices.Permissions { |
| 147 | ++ return rwmPermissions |
| 148 | ++} |
| 149 | ++ |
| 150 | + // This builds up the known persistent block devices allow list for a VMI (as in, hotplugged volumes are handled separately) |
| 151 | + // This will be maintained and extended as new devices likely have to end up on this list as well |
| 152 | + // For example - https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/ |
| 153 | +@@ -159,7 +174,7 @@ func generateDeviceRulesForVMI(vmi *v1.VirtualMachineInstance, isolationRes isol |
| 154 | + } |
| 155 | + return nil, fmt.Errorf("failed to resolve path for volume %s: %v", volume.Name, err) |
| 156 | + } |
| 157 | +- if deviceRule, err := newAllowedDeviceRule(path); err != nil { |
| 158 | ++ if deviceRule, err := newAllowedDeviceRule(path, getDeviceRwmPermissions()); err != nil { |
| 159 | + return nil, fmt.Errorf("failed to create device rule for %s: %v", path, err) |
| 160 | + } else if deviceRule != nil { |
| 161 | + log.Log.V(loggingVerbosity).Infof("device rule for volume %s: %v", volume.Name, deviceRule) |
| 162 | +@@ -171,7 +186,7 @@ func generateDeviceRulesForVMI(vmi *v1.VirtualMachineInstance, isolationRes isol |
| 163 | + if err != nil { |
| 164 | + return nil, err |
| 165 | + } |
| 166 | +- if deviceRule, err := newAllowedDeviceRule(path); err != nil { |
| 167 | ++ if deviceRule, err := newAllowedDeviceRule(path, getDeviceRwmPermissions()); err != nil { |
| 168 | + return nil, fmt.Errorf("failed to create device rule for %s: %v", path, err) |
| 169 | + } else if deviceRule != nil { |
| 170 | + log.Log.V(loggingVerbosity).Infof("device rule for volume rng: %v", deviceRule) |
| 171 | +@@ -183,7 +198,7 @@ func generateDeviceRulesForVMI(vmi *v1.VirtualMachineInstance, isolationRes isol |
| 172 | + if err != nil { |
| 173 | + return nil, err |
| 174 | + } |
| 175 | +- if deviceRule, err := newAllowedDeviceRule(path); err != nil { |
| 176 | ++ if deviceRule, err := newAllowedDeviceRule(path, getDeviceRwmPermissions()); err != nil { |
| 177 | + return nil, fmt.Errorf("failed to create device rule for %s: %v", path, err) |
| 178 | + } else if deviceRule != nil { |
| 179 | + log.Log.V(loggingVerbosity).Infof("device rule for volume vsock: %v", deviceRule) |
| 180 | +@@ -191,10 +206,73 @@ func generateDeviceRulesForVMI(vmi *v1.VirtualMachineInstance, isolationRes isol |
| 181 | + } |
| 182 | + } |
| 183 | + |
| 184 | ++ // Device-plugin-provisioned devices (VFIO, USB) must be in the cgroup |
| 185 | ++ // rule cache so they survive eBPF program rebuilds during hotplug. |
| 186 | ++ for _, devDir := range []string{ |
| 187 | ++ filepath.Join("dev", "vfio"), |
| 188 | ++ filepath.Join("dev", "bus", "usb"), |
| 189 | ++ } { |
| 190 | ++ rules, err := discoverDeviceRulesInDir(mountRoot, devDir) |
| 191 | ++ if err != nil { |
| 192 | ++ return nil, fmt.Errorf("failed to discover device rules in %s: %v", devDir, err) |
| 193 | ++ } |
| 194 | ++ vmiDeviceRules = append(vmiDeviceRules, rules...) |
| 195 | ++ } |
| 196 | ++ |
| 197 | + return vmiDeviceRules, nil |
| 198 | + } |
| 199 | + |
| 200 | +-func newAllowedDeviceRule(devicePath *safepath.Path) (*devices.Rule, error) { |
| 201 | ++// discoverDeviceRulesInDir recursively scans a directory under the |
| 202 | ++// container's filesystem and creates allow rules for all device nodes |
| 203 | ++// found. These devices are provisioned by device plugins or the container |
| 204 | ++// runtime and must be preserved in the v2 cgroup manager's rule cache so |
| 205 | ++// they are not lost when the eBPF device filter is rebuilt by subsequent |
| 206 | ++// Set() calls (e.g. during hotplug volume mounting). |
| 207 | ++func discoverDeviceRulesInDir(mountRoot *safepath.Path, relPath string) ([]*devices.Rule, error) { |
| 208 | ++ dirPath, err := safepath.JoinNoFollow(mountRoot, relPath) |
| 209 | ++ if err != nil { |
| 210 | ++ if errors.Is(err, os.ErrNotExist) { |
| 211 | ++ return nil, nil |
| 212 | ++ } |
| 213 | ++ return nil, err |
| 214 | ++ } |
| 215 | ++ |
| 216 | ++ var entries []os.DirEntry |
| 217 | ++ err = dirPath.ExecuteNoFollow(func(path string) (err error) { |
| 218 | ++ entries, err = os.ReadDir(path) |
| 219 | ++ return err |
| 220 | ++ }) |
| 221 | ++ if err != nil { |
| 222 | ++ return nil, err |
| 223 | ++ } |
| 224 | ++ |
| 225 | ++ var rules []*devices.Rule |
| 226 | ++ for _, entry := range entries { |
| 227 | ++ if entry.IsDir() { |
| 228 | ++ subRules, err := discoverDeviceRulesInDir(mountRoot, filepath.Join(relPath, entry.Name())) |
| 229 | ++ if err != nil { |
| 230 | ++ return nil, err |
| 231 | ++ } |
| 232 | ++ rules = append(rules, subRules...) |
| 233 | ++ continue |
| 234 | ++ } |
| 235 | ++ devPath, err := safepath.JoinNoFollow(dirPath, entry.Name()) |
| 236 | ++ if err != nil { |
| 237 | ++ return nil, err |
| 238 | ++ } |
| 239 | ++ rule, err := newAllowedDeviceRule(devPath, getDeviceRwmPermissions()) |
| 240 | ++ if err != nil { |
| 241 | ++ return nil, fmt.Errorf("failed to create device rule for %s/%s: %v", relPath, entry.Name(), err) |
| 242 | ++ } |
| 243 | ++ if rule != nil { |
| 244 | ++ log.Log.V(loggingVerbosity).Infof("device rule for %s/%s: %v", relPath, entry.Name(), rule) |
| 245 | ++ rules = append(rules, rule) |
| 246 | ++ } |
| 247 | ++ } |
| 248 | ++ return rules, nil |
| 249 | ++} |
| 250 | ++ |
| 251 | ++func newAllowedDeviceRule(devicePath *safepath.Path, devicePermissions devices.Permissions) (*devices.Rule, error) { |
| 252 | + fileInfo, err := safepath.StatAtNoFollow(devicePath) |
| 253 | + if err != nil { |
| 254 | + return nil, err |
| 255 | +@@ -211,7 +289,7 @@ func newAllowedDeviceRule(devicePath *safepath.Path) (*devices.Rule, error) { |
| 256 | + Type: deviceType, |
| 257 | + Major: int64(unix.Major(stat.Rdev)), |
| 258 | + Minor: int64(unix.Minor(stat.Rdev)), |
| 259 | +- Permissions: "rwm", |
| 260 | ++ Permissions: devicePermissions, |
| 261 | + Allow: true, |
| 262 | + }, nil |
| 263 | + } |
| 264 | +@@ -224,12 +302,7 @@ func GenerateDefaultDeviceRules() []*devices.Rule { |
| 265 | + |
| 266 | + const toAllow = true |
| 267 | + |
| 268 | +- var permissions devices.Permissions |
| 269 | +- if cgroups.IsCgroup2UnifiedMode() { |
| 270 | +- permissions = "rwm" |
| 271 | +- } else { |
| 272 | +- permissions = "rw" |
| 273 | +- } |
| 274 | ++ permissions := getDevicePermissionsFromCgroups() |
| 275 | + |
| 276 | + defaultRules := []*devices.Rule{ |
| 277 | + { // /dev/ptmx (PTY master multiplex) |
| 278 | +diff --git a/tests/storage/hotplug.go b/tests/storage/hotplug.go |
| 279 | +index 2a40430c29..cc3567386f 100644 |
| 280 | +--- a/tests/storage/hotplug.go |
| 281 | ++++ b/tests/storage/hotplug.go |
| 282 | +@@ -2275,6 +2275,83 @@ var _ = Describe(SIG("Hotplug", func() { |
| 283 | + verifyVolumeNolongerAccessible(vmi, targets[0]) |
| 284 | + }) |
| 285 | + }) |
| 286 | ++ |
| 287 | ++ // Regression test for https://github.com/kubevirt/kubevirt/issues/17124 |
| 288 | ++ Context("with PCI hostdev", Serial, func() { |
| 289 | ++ const deviceName = "example.org/soundcard" |
| 290 | ++ |
| 291 | ++ BeforeEach(func() { |
| 292 | ++ kvconfig.EnableFeatureGate(featuregate.HostDevicesGate) |
| 293 | ++ |
| 294 | ++ kv := libkubevirt.GetCurrentKv(virtClient) |
| 295 | ++ config := kv.Spec.Configuration |
| 296 | ++ config.PermittedHostDevices = &v1.PermittedHostDevices{ |
| 297 | ++ PciHostDevices: []v1.PciHostDevice{ |
| 298 | ++ { |
| 299 | ++ PCIVendorSelector: "8086:2668", |
| 300 | ++ ResourceName: deviceName, |
| 301 | ++ }, |
| 302 | ++ }, |
| 303 | ++ } |
| 304 | ++ kvconfig.UpdateKubeVirtConfigValueAndWait(config) |
| 305 | ++ }) |
| 306 | ++ |
| 307 | ++ AfterEach(func() { |
| 308 | ++ kv := libkubevirt.GetCurrentKv(virtClient) |
| 309 | ++ config := kv.Spec.Configuration |
| 310 | ++ config.PermittedHostDevices = &v1.PermittedHostDevices{} |
| 311 | ++ kvconfig.UpdateKubeVirtConfigValueAndWait(config) |
| 312 | ++ kvconfig.DisableFeatureGate(featuregate.HostDevicesGate) |
| 313 | ++ }) |
| 314 | ++ |
| 315 | ++ It("should restart a VM after hotplugging a block volume", func() { |
| 316 | ++ sc, exists := libstorage.GetRWOBlockStorageClass() |
| 317 | ++ if !exists { |
| 318 | ++ Skip("no RWO block storage class available") |
| 319 | ++ } |
| 320 | ++ |
| 321 | ++ vmiSpec := libvmifact.NewAlpineWithTestTooling() |
| 322 | ++ vmiSpec.Spec.Domain.Devices.HostDevices = []v1.HostDevice{ |
| 323 | ++ {Name: "sound0", DeviceName: deviceName}, |
| 324 | ++ } |
| 325 | ++ vm, err := virtClient.VirtualMachine(testsuite.GetTestNamespace(nil)).Create( |
| 326 | ++ context.Background(), |
| 327 | ++ libvmi.NewVirtualMachine(vmiSpec, libvmi.WithRunStrategy(v1.RunStrategyAlways)), |
| 328 | ++ metav1.CreateOptions{}, |
| 329 | ++ ) |
| 330 | ++ Expect(err).ToNot(HaveOccurred()) |
| 331 | ++ Eventually(matcher.ThisVM(vm)).WithTimeout(300 * time.Second).WithPolling(time.Second).Should(matcher.BeReady()) |
| 332 | ++ |
| 333 | ++ vmi, err := virtClient.VirtualMachineInstance(vm.Namespace).Get(context.Background(), vm.Name, metav1.GetOptions{}) |
| 334 | ++ Expect(err).ToNot(HaveOccurred()) |
| 335 | ++ libwait.WaitForSuccessfulVMIStart(vmi, libwait.WithTimeout(240)) |
| 336 | ++ |
| 337 | ++ dvBuilder := libdv.NewDataVolume( |
| 338 | ++ libdv.WithBlankImageSource(), |
| 339 | ++ libdv.WithStorage( |
| 340 | ++ libdv.StorageWithStorageClass(sc), |
| 341 | ++ libdv.StorageWithVolumeSize(cd.BlankVolumeSize), |
| 342 | ++ libdv.StorageWithVolumeMode(k8sv1.PersistentVolumeBlock), |
| 343 | ++ ), |
| 344 | ++ ) |
| 345 | ++ dv, err := virtClient.CdiClient().CdiV1beta1().DataVolumes(testsuite.GetTestNamespace(nil)).Create( |
| 346 | ++ context.Background(), dvBuilder, metav1.CreateOptions{}) |
| 347 | ++ Expect(err).ToNot(HaveOccurred()) |
| 348 | ++ libstorage.EventuallyDV(dv, 240, Or(matcher.HaveSucceeded(), matcher.WaitForFirstConsumer())) |
| 349 | ++ |
| 350 | ++ By("Hotplugging a block volume to the running VM") |
| 351 | ++ addVolumeVMWithSource(vm.Name, vm.Namespace, getAddVolumeOptions("hotplug-vol", v1.DiskBusSCSI, &v1.HotplugVolumeSource{ |
| 352 | ++ DataVolume: &v1.DataVolumeSource{Name: dv.Name}, |
| 353 | ++ }, false, false, "")) |
| 354 | ++ verifyVolumeStatus(vmi, v1.VolumeReady, "", "hotplug-vol") |
| 355 | ++ |
| 356 | ++ By("Restarting the VM") |
| 357 | ++ vm = libvmops.StopVirtualMachine(vm) |
| 358 | ++ err = virtClient.VirtualMachine(vm.Namespace).Start(context.Background(), vm.Name, &v1.StartOptions{}) |
| 359 | ++ Expect(err).ToNot(HaveOccurred()) |
| 360 | ++ Eventually(matcher.ThisVM(vm), 300*time.Second, time.Second).Should(matcher.BeReady()) |
| 361 | ++ }) |
| 362 | ++ }) |
| 363 | + })) |
| 364 | + |
| 365 | + func verifyVolumeAndDiskVMAdded(virtClient kubecli.KubevirtClient, vm *v1.VirtualMachine, volumeNames ...string) { |
| 366 | +-- |
| 367 | +2.34.1 |
| 368 | + |
0 commit comments