Skip to content

Commit 0584143

Browse files
skip mdev-mode GPUs in waitForVFs
Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
1 parent 7f01855 commit 0584143

1 file changed

Lines changed: 75 additions & 33 deletions

File tree

cmd/nvidia-validator/main.go

Lines changed: 75 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -237,8 +237,8 @@ const (
237237
wslNvidiaSMIPath = "/usr/lib/wsl/lib/nvidia-smi"
238238
// shell indicates what shell to use when invoking commands in a subprocess
239239
shell = "sh"
240-
// defaultVFWaitTimeout is the default timeout for waiting for VFs to be created
241-
defaultVFWaitTimeout = 5 * time.Minute
240+
// defaultVGPUReadinessTimeout is the default timeout for waiting for the vGPU stack to be ready
241+
defaultVGPUReadinessTimeout = 5 * time.Minute
242242
// constants for driver components
243243
GDRCOPY = "gdrcopy"
244244
NVIDIAFS = "nvidia-fs"
@@ -1747,9 +1747,9 @@ func (v *VGPUManager) validate() error {
17471747
return err
17481748
}
17491749

1750-
log.Info("Waiting for VFs to be available...")
1751-
if err := waitForVFs(ctx, defaultVFWaitTimeout); err != nil {
1752-
return fmt.Errorf("vGPU Manager VFs not ready: %w", err)
1750+
log.Info("Waiting for parent devices to be available...")
1751+
if err := waitForParentDevices(ctx, defaultVGPUReadinessTimeout); err != nil {
1752+
return fmt.Errorf("vGPU Manager parent devices not ready: %w", err)
17531753
}
17541754

17551755
statusFile := vGPUManagerStatusFile
@@ -1783,43 +1783,85 @@ func (v *VGPUManager) runValidation(silent bool) (hostDriver bool, err error) {
17831783
return hostDriver, runCommand(command, args, silent)
17841784
}
17851785

1786-
// waitForVFs waits for Virtual Functions to be created on all NVIDIA GPUs.
1787-
// It polls sriov_numvfs until all GPUs have their full VF count enabled.
1788-
func waitForVFs(ctx context.Context, timeout time.Duration) error {
1786+
// waitForParentDevices polls until the vGPU stack is ready. NVIDIA mdev parent
1787+
// devices must always be registered (PF on Turing, VFs on Ampere+ SR-IOV); when
1788+
// the driver has enabled SR-IOV VFs, all of those VFs must be enabled as well.
//
// The readiness check is evaluated immediately and then once every
// sleepIntervalSecondsFlag seconds. The check itself never returns an error,
// so a non-nil result comes only from the poll helper (presumably on timeout
// or ctx cancellation — confirm against the wait package documentation).
1789+
func waitForParentDevices(ctx context.Context, timeout time.Duration) error {
17891790
pollInterval := time.Duration(sleepIntervalSecondsFlag) * time.Second
1790-
nvpciLib := nvpci.New()
17911791

17921792
return wait.PollUntilContextTimeout(ctx, pollInterval, timeout, true, func(ctx context.Context) (bool, error) {
1793-
gpus, err := nvpciLib.GetGPUs()
1794-
if err != nil {
1795-
log.Warnf("Error getting GPUs: %v", err)
1796-
return false, nil
1797-
}
1798-
1799-
var totalExpected, totalEnabled uint64
1800-
var pfCount int
1801-
for _, gpu := range gpus {
1802-
sriovInfo := gpu.SriovInfo
1803-
if sriovInfo.IsPF() {
1804-
pfCount++
1805-
totalExpected += sriovInfo.PhysicalFunction.TotalVFs
1806-
totalEnabled += sriovInfo.PhysicalFunction.NumVFs
1807-
}
1793+
// SR-IOV path: every VF must be enabled AND mdev parents must be registered.
if driverUsingSRIOV() {
1794+
return allVFsReady() && mdevParentDevicesExist(), nil
18081795
}
1796+
// No VFs enabled by the driver: readiness depends on mdev parents alone.
return mdevParentDevicesExist(), nil
1797+
})
1798+
}
18091799

1810-
if totalExpected == 0 {
1811-
log.Info("No SR-IOV capable GPUs found, skipping VF wait")
1812-
return true, nil
1800+
// driverUsingSRIOV reports whether the NVIDIA driver has enabled any SR-IOV
1801+
// Virtual Functions. T4 (mdev-only) returns false even though the silicon
1802+
// advertises sriov_totalvfs; A100 in vGPU mode returns true once the driver
1803+
// begins enabling VFs.
//
// A failure to enumerate GPUs is logged as a warning and reported as false.
1804+
func driverUsingSRIOV() bool {
1805+
nvpciLib := nvpci.New()
1806+
gpus, err := nvpciLib.GetGPUs()
1807+
if err != nil {
1808+
log.Warnf("error getting GPUs: %v", err)
1809+
return false
1810+
}
1811+
for _, gpu := range gpus {
1812+
// One physical function with a non-zero NumVFs is enough to answer true.
if gpu.SriovInfo.IsPF() && gpu.SriovInfo.PhysicalFunction.NumVFs > 0 {
1813+
return true
18131814
}
1815+
}
1816+
return false
1817+
}
18141818

1815-
if totalEnabled == totalExpected {
1816-
log.Infof("All %d VF(s) enabled on %d NVIDIA GPU(s)", totalEnabled, pfCount)
1817-
return true, nil
1819+
// mdevParentDevicesExist reports whether at least one mdev parent device is
// returned by nvmdev's GetAllParentDevices. A lookup error is logged as a
// warning and reported as false; the number of parents found is logged on
// every call.
func mdevParentDevicesExist() bool {
1820+
nvmdevLib := nvmdev.New()
1821+
parents, err := nvmdevLib.GetAllParentDevices()
1822+
if err != nil {
1823+
log.Warnf("could not get mdev parent devices: %v", err)
1824+
return false
1825+
}
1826+
if len(parents) == 0 {
1827+
log.Info("found 0 mdev parent devices")
1828+
return false
1829+
}
1830+
log.Infof("found %d mdev parent devices", len(parents))
1831+
return true
1832+
}
1833+
1834+
// allVFsReady reports whether every SR-IOV physical function among the NVIDIA
// GPUs has all of its VFs enabled, i.e. the sum of NumVFs equals the sum of
// TotalVFs across all PFs.
//
// It returns false when GPU enumeration fails (logged as a warning), when the
// summed TotalVFs is zero (no SR-IOV capable GPUs), or when some VFs are still
// missing.
func allVFsReady() bool {
1835+
nvpciLib := nvpci.New()
1836+
gpus, err := nvpciLib.GetGPUs()
1837+
if err != nil {
1838+
log.Warnf("error getting GPUs: %v", err)
1839+
return false
1840+
}
1841+
1842+
var totalExpected, totalEnabled uint64
1843+
var pfCount int
1844+
for _, gpu := range gpus {
1845+
sriovInfo := gpu.SriovInfo
1846+
// Only physical functions contribute to the expected/enabled VF totals.
if sriovInfo.IsPF() {
1847+
pfCount++
1848+
totalExpected += sriovInfo.PhysicalFunction.TotalVFs
1849+
totalEnabled += sriovInfo.PhysicalFunction.NumVFs
18181850
}
1851+
}
18191852

1820-
log.Infof("Waiting for VFs: %d/%d enabled across %d GPU(s)", totalEnabled, totalExpected, pfCount)
1821-
return false, nil
1822-
})
1853+
// Nothing to wait for is reported as "not ready" here, not as success.
if totalExpected == 0 {
1854+
log.Info("no SR-IOV capable GPUs found")
1855+
return false
1856+
}
1857+
1858+
if totalEnabled == totalExpected {
1859+
log.Infof("all %d VF(s) enabled on %d NVIDIA GPU(s)", totalEnabled, pfCount)
1860+
return true
1861+
}
1862+
1863+
log.Infof("not all VFs have been created. %d/%d enabled across %d GPU(s)", totalEnabled, totalExpected, pfCount)
1864+
return false
18231865
}
18241866

18251867
func (c *CCManager) validate() error {

0 commit comments

Comments
 (0)