Skip to content

Commit b5f02b4

Browse files
skip mdev-mode GPUs in waitForVFs
Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
1 parent 7f01855 commit b5f02b4

1 file changed

Lines changed: 19 additions & 0 deletions

File tree

cmd/nvidia-validator/main.go

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1788,6 +1788,7 @@ func (v *VGPUManager) runValidation(silent bool) (hostDriver bool, err error) {
17881788
func waitForVFs(ctx context.Context, timeout time.Duration) error {
17891789
pollInterval := time.Duration(sleepIntervalSecondsFlag) * time.Second
17901790
nvpciLib := nvpci.New()
1791+
nvmdevLib := nvmdev.New()
17911792

17921793
return wait.PollUntilContextTimeout(ctx, pollInterval, timeout, true, func(ctx context.Context) (bool, error) {
17931794
gpus, err := nvpciLib.GetGPUs()
@@ -1812,6 +1813,24 @@ func waitForVFs(ctx context.Context, timeout time.Duration) error {
18121813
return true, nil
18131814
}
18141815

1816+
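// The vGPU stack is ready when mdev parent devices are registered (the PF on
1817+
// Turing, the VFs on Ampere+ with SR-IOV) or when all SR-IOV VFs are enabled.
1818+
// A missing /sys/class/mdev_bus means the host has no mdev support; it is not treated as an error, so the check falls through to the SR-IOV VF count below.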
1819+
if _, statErr := os.Stat("/sys/class/mdev_bus"); statErr == nil {
1820+
parents, err := nvmdevLib.GetAllParentDevices()
1821+
if err != nil {
1822+
log.Warnf("Error listing mdev parent devices: %v", err)
1823+
return false, nil
1824+
}
1825+
if len(parents) > 0 {
1826+
log.Infof("vGPU stack ready: %d mdev parent device(s)", len(parents))
1827+
return true, nil
1828+
}
1829+
} else if !os.IsNotExist(statErr) {
1830+
log.Warnf("Error checking mdev_bus: %v", statErr)
1831+
return false, nil
1832+
}
1833+
18151834
if totalEnabled == totalExpected {
18161835
log.Infof("All %d VF(s) enabled on %d NVIDIA GPU(s)", totalEnabled, pfCount)
18171836
return true, nil

0 commit comments

Comments
 (0)