@@ -237,8 +237,8 @@ const (
237237 wslNvidiaSMIPath = "/usr/lib/wsl/lib/nvidia-smi"
238238 // shell indicates what shell to use when invoking commands in a subprocess
239239 shell = "sh"
240- // defaultVFWaitTimeout is the default timeout for waiting for VFs to be created
241- defaultVFWaitTimeout = 5 * time .Minute
240+ // defaultVGPUReadinessTimeout is the default timeout for waiting for the vGPU stack to be ready
241+ defaultVGPUReadinessTimeout = 5 * time .Minute
242242 // constants for driver components
243243 GDRCOPY = "gdrcopy"
244244 NVIDIAFS = "nvidia-fs"
@@ -1747,9 +1747,9 @@ func (v *VGPUManager) validate() error {
17471747 return err
17481748 }
17491749
1750- log .Info ("Waiting for VFs to be available..." )
1751- if err := waitForVFs (ctx , defaultVFWaitTimeout ); err != nil {
1752- return fmt .Errorf ("vGPU Manager VFs not ready: %w" , err )
1750+ log .Info ("Waiting for parent devices to be available..." )
1751+ if err := waitForParentDevices (ctx , defaultVGPUReadinessTimeout ); err != nil {
1752+ return fmt .Errorf ("vGPU Manager parent devices not ready: %w" , err )
17531753 }
17541754
17551755 statusFile := vGPUManagerStatusFile
@@ -1783,43 +1783,85 @@ func (v *VGPUManager) runValidation(silent bool) (hostDriver bool, err error) {
17831783 return hostDriver , runCommand (command , args , silent )
17841784}
17851785
1786- // waitForVFs waits for Virtual Functions to be created on all NVIDIA GPUs.
1787- // It polls sriov_numvfs until all GPUs have their full VF count enabled.
1788- func waitForVFs (ctx context.Context , timeout time.Duration ) error {
1786+ // waitForParentDevices polls until the vGPU stack is ready — either NVIDIA
1787+ // mdev parent devices have been registered (PF on Turing, VFs on Ampere+
1788+ // SR-IOV) or all SR-IOV VFs are enabled.
1789+ func waitForParentDevices (ctx context.Context , timeout time.Duration ) error {
17891790 pollInterval := time .Duration (sleepIntervalSecondsFlag ) * time .Second
1790- nvpciLib := nvpci .New ()
17911791
17921792 return wait .PollUntilContextTimeout (ctx , pollInterval , timeout , true , func (ctx context.Context ) (bool , error ) {
1793- gpus , err := nvpciLib .GetGPUs ()
1794- if err != nil {
1795- log .Warnf ("Error getting GPUs: %v" , err )
1796- return false , nil
1797- }
1798-
1799- var totalExpected , totalEnabled uint64
1800- var pfCount int
1801- for _ , gpu := range gpus {
1802- sriovInfo := gpu .SriovInfo
1803- if sriovInfo .IsPF () {
1804- pfCount ++
1805- totalExpected += sriovInfo .PhysicalFunction .TotalVFs
1806- totalEnabled += sriovInfo .PhysicalFunction .NumVFs
1807- }
1793+ if driverUsingSRIOV () {
1794+ return allVFsReady () && mdevParentDevicesExist (), nil
18081795 }
1796+ return mdevParentDevicesExist (), nil
1797+ })
1798+ }
18091799
1810- if totalExpected == 0 {
1811- log .Info ("No SR-IOV capable GPUs found, skipping VF wait" )
1812- return true , nil
1800+ // driverUsingSRIOV reports whether the NVIDIA driver has enabled any SR-IOV
1801+ // Virtual Functions. T4 (mdev-only) returns false even though the silicon
1802+ // advertises sriov_totalvfs; A100 in vGPU mode returns true once the driver
1803+ // begins enabling VFs.
1804+ func driverUsingSRIOV () bool {
1805+ nvpciLib := nvpci .New ()
1806+ gpus , err := nvpciLib .GetGPUs ()
1807+ if err != nil {
1808+ log .Warnf ("error getting GPUs: %v" , err )
1809+ return false
1810+ }
1811+ for _ , gpu := range gpus {
1812+ if gpu .SriovInfo .IsPF () && gpu .SriovInfo .PhysicalFunction .NumVFs > 0 {
1813+ return true
18131814 }
1815+ }
1816+ return false
1817+ }
18141818
1815- if totalEnabled == totalExpected {
1816- log .Infof ("All %d VF(s) enabled on %d NVIDIA GPU(s)" , totalEnabled , pfCount )
1817- return true , nil
1819+ func mdevParentDevicesExist () bool {
1820+ nvmdevLib := nvmdev .New ()
1821+ parents , err := nvmdevLib .GetAllParentDevices ()
1822+ if err != nil {
1823+ log .Warnf ("could not get mdev parent devices: %v" , err )
1824+ return false
1825+ }
1826+ if len (parents ) == 0 {
1827+ log .Info ("found 0 mdev parent devices" )
1828+ return false
1829+ }
1830+ log .Infof ("found %d mdev parent devices" , len (parents ))
1831+ return true
1832+ }
1833+
1834+ func allVFsReady () bool {
1835+ nvpciLib := nvpci .New ()
1836+ gpus , err := nvpciLib .GetGPUs ()
1837+ if err != nil {
1838+ log .Warnf ("error getting GPUs: %v" , err )
1839+ return false
1840+ }
1841+
1842+ var totalExpected , totalEnabled uint64
1843+ var pfCount int
1844+ for _ , gpu := range gpus {
1845+ sriovInfo := gpu .SriovInfo
1846+ if sriovInfo .IsPF () {
1847+ pfCount ++
1848+ totalExpected += sriovInfo .PhysicalFunction .TotalVFs
1849+ totalEnabled += sriovInfo .PhysicalFunction .NumVFs
18181850 }
1851+ }
18191852
1820- log .Infof ("Waiting for VFs: %d/%d enabled across %d GPU(s)" , totalEnabled , totalExpected , pfCount )
1821- return false , nil
1822- })
1853+ if totalExpected == 0 {
1854+ log .Info ("no SR-IOV capable GPUs found" )
1855+ return false
1856+ }
1857+
1858+ if totalEnabled == totalExpected {
1859+ log .Infof ("all %d VF(s) enabled on %d NVIDIA GPU(s)" , totalEnabled , pfCount )
1860+ return true
1861+ }
1862+
1863+ log .Infof ("not all VFs have been created. %d/%d enabled across %d GPU(s)" , totalEnabled , totalExpected , pfCount )
1864+ return false
18231865}
18241866
18251867func (c * CCManager ) validate () error {
0 commit comments