Skip to content

Commit 71c0e97

Browse files
skip mdev-mode GPUs in waitForVFs
Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
1 parent 7f01855 commit 71c0e97

1 file changed

Lines changed: 55 additions & 32 deletions

File tree

cmd/nvidia-validator/main.go

Lines changed: 55 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -1747,9 +1747,9 @@ func (v *VGPUManager) validate() error {
17471747
return err
17481748
}
17491749

1750-
log.Info("Waiting for VFs to be available...")
1751-
if err := waitForVFs(ctx, defaultVFWaitTimeout); err != nil {
1752-
return fmt.Errorf("vGPU Manager VFs not ready: %w", err)
1750+
log.Info("Waiting for parent devices to be available...")
1751+
if err := waitForParentDevices(ctx, defaultVFWaitTimeout); err != nil {
1752+
return fmt.Errorf("vGPU Manager parent devices not ready: %w", err)
17531753
}
17541754

17551755
statusFile := vGPUManagerStatusFile
@@ -1783,43 +1783,66 @@ func (v *VGPUManager) runValidation(silent bool) (hostDriver bool, err error) {
17831783
return hostDriver, runCommand(command, args, silent)
17841784
}
17851785

1786-
// waitForVFs waits for Virtual Functions to be created on all NVIDIA GPUs.
1787-
// It polls sriov_numvfs until all GPUs have their full VF count enabled.
1788-
func waitForVFs(ctx context.Context, timeout time.Duration) error {
1786+
// waitForParentDevices polls until the vGPU stack is ready: either at least one
1787+
// NVIDIA mdev parent device is registered (the PF on Turing, the VFs on Ampere+
1788+
// with SR-IOV), or all VFs are enabled across the SR-IOV capable NVIDIA GPUs.
1789+
func waitForParentDevices(ctx context.Context, timeout time.Duration) error {
17891790
pollInterval := time.Duration(sleepIntervalSecondsFlag) * time.Second
1790-
nvpciLib := nvpci.New()
17911791

17921792
return wait.PollUntilContextTimeout(ctx, pollInterval, timeout, true, func(ctx context.Context) (bool, error) {
1793-
gpus, err := nvpciLib.GetGPUs()
1794-
if err != nil {
1795-
log.Warnf("Error getting GPUs: %v", err)
1796-
return false, nil
1797-
}
1798-
1799-
var totalExpected, totalEnabled uint64
1800-
var pfCount int
1801-
for _, gpu := range gpus {
1802-
sriovInfo := gpu.SriovInfo
1803-
if sriovInfo.IsPF() {
1804-
pfCount++
1805-
totalExpected += sriovInfo.PhysicalFunction.TotalVFs
1806-
totalEnabled += sriovInfo.PhysicalFunction.NumVFs
1807-
}
1808-
}
1809-
1810-
if totalExpected == 0 {
1811-
log.Info("No SR-IOV capable GPUs found, skipping VF wait")
1793+
if mdevParentDevicesExist() || vfsExist() {
18121794
return true, nil
18131795
}
1796+
return false, nil
1797+
})
1798+
}
18141799

1815-
if totalEnabled == totalExpected {
1816-
log.Infof("All %d VF(s) enabled on %d NVIDIA GPU(s)", totalEnabled, pfCount)
1817-
return true, nil
1800+
func mdevParentDevicesExist() bool {
1801+
nvmdevLib := nvmdev.New()
1802+
parents, err := nvmdevLib.GetAllParentDevices()
1803+
if err != nil {
1804+
log.Warnf("could not get mdev parent devices: %v", err)
1805+
return false
1806+
}
1807+
if len(parents) == 0 {
1808+
log.Info("found 0 mdev parent devices")
1809+
return false
1810+
}
1811+
log.Infof("found %d mdev parent devices", len(parents))
1812+
return true
1813+
}
1814+
1815+
func vfsExist() bool {
1816+
nvpciLib := nvpci.New()
1817+
gpus, err := nvpciLib.GetGPUs()
1818+
if err != nil {
1819+
log.Warnf("error getting GPUs: %v", err)
1820+
return false
1821+
}
1822+
1823+
var totalExpected, totalEnabled uint64
1824+
var pfCount int
1825+
for _, gpu := range gpus {
1826+
sriovInfo := gpu.SriovInfo
1827+
if sriovInfo.IsPF() {
1828+
pfCount++
1829+
totalExpected += sriovInfo.PhysicalFunction.TotalVFs
1830+
totalEnabled += sriovInfo.PhysicalFunction.NumVFs
18181831
}
1832+
}
18191833

1820-
log.Infof("Waiting for VFs: %d/%d enabled across %d GPU(s)", totalEnabled, totalExpected, pfCount)
1821-
return false, nil
1822-
})
1834+
if totalExpected == 0 {
1835+
log.Info("no SR-IOV capable GPUs found")
1836+
return false
1837+
}
1838+
1839+
if totalEnabled == totalExpected {
1840+
log.Infof("all %d VF(s) enabled on %d NVIDIA GPU(s)", totalEnabled, pfCount)
1841+
return true
1842+
}
1843+
1844+
log.Infof("not all VFs have been created. %d/%d enabled across %d GPU(s)", totalEnabled, totalExpected, pfCount)
1845+
return false
18231846
}
18241847

18251848
func (c *CCManager) validate() error {

0 commit comments

Comments
 (0)