@@ -1747,9 +1747,9 @@ func (v *VGPUManager) validate() error {
17471747 return err
17481748 }
17491749
1750- log .Info ("Waiting for VFs to be available..." )
1751- if err := waitForVFs (ctx , defaultVFWaitTimeout ); err != nil {
1752- return fmt .Errorf ("vGPU Manager VFs not ready: %w" , err )
1750+ log .Info ("Waiting for parent devices to be available..." )
1751+ if err := waitForParentDevices (ctx , defaultVFWaitTimeout ); err != nil {
1752+ return fmt .Errorf ("vGPU Manager parent devices not ready: %w" , err )
17531753 }
17541754
17551755 statusFile := vGPUManagerStatusFile
@@ -1783,43 +1783,66 @@ func (v *VGPUManager) runValidation(silent bool) (hostDriver bool, err error) {
17831783 return hostDriver , runCommand (command , args , silent )
17841784}
17851785
1786- // waitForVFs waits for Virtual Functions to be created on all NVIDIA GPUs.
1787- // It polls sriov_numvfs until all GPUs have their full VF count enabled.
1788- func waitForVFs (ctx context.Context , timeout time.Duration ) error {
1786+ // waitForParentDevices polls until the vGPU stack is ready — either NVIDIA
1787+ // mdev parent devices have been registered (PF on Turing, VFs on Ampere+
1788+ // SR-IOV) or all SR-IOV VFs are enabled.
//
// The poll condition itself never returns an error, so a non-nil result can
// only originate from wait.PollUntilContextTimeout (ctx cancelled or timeout
// elapsed); that value is returned to the caller unchanged.
1789+ func waitForParentDevices (ctx context.Context , timeout time.Duration ) error {
// The poll period is sleepIntervalSecondsFlag interpreted as whole seconds.
17891790 pollInterval := time .Duration (sleepIntervalSecondsFlag ) * time .Second
// NOTE(review): lines whose number is followed by '-' below belong to the
// removed waitForVFs implementation shown by the diff; they are not part of
// this function.
1790- nvpciLib := nvpci .New ()
17911791
// The literal `true` is presumably the `immediate` argument of
// k8s.io/apimachinery's wait.PollUntilContextTimeout (condition evaluated
// once before the first sleep) — confirm against the imported wait package.
17921792 return wait .PollUntilContextTimeout (ctx , pollInterval , timeout , true , func (ctx context.Context ) (bool , error ) {
1793- gpus , err := nvpciLib .GetGPUs ()
1794- if err != nil {
1795- log .Warnf ("Error getting GPUs: %v" , err )
1796- return false , nil
1797- }
1798-
1799- var totalExpected , totalEnabled uint64
1800- var pfCount int
1801- for _ , gpu := range gpus {
1802- sriovInfo := gpu .SriovInfo
1803- if sriovInfo .IsPF () {
1804- pfCount ++
1805- totalExpected += sriovInfo .PhysicalFunction .TotalVFs
1806- totalEnabled += sriovInfo .PhysicalFunction .NumVFs
1807- }
1808- }
1809-
1810- if totalExpected == 0 {
1811- log .Info ("No SR-IOV capable GPUs found, skipping VF wait" )
// Either check succeeding ends the wait. The mdev check runs first and
// short-circuits, so vfsExist is only consulted when no mdev parent
// devices were found on this poll.
1793+ if mdevParentDevicesExist () || vfsExist () {
18121794 return true , nil
18131795 }
// Neither check passed: report "not done" with a nil error so polling
// continues rather than aborting.
1796+ return false , nil
1797+ })
1798+ }
18141799
1815- if totalEnabled == totalExpected {
1816- log .Infof ("All %d VF(s) enabled on %d NVIDIA GPU(s)" , totalEnabled , pfCount )
1817- return true , nil
// mdevParentDevicesExist reports whether nvmdev's GetAllParentDevices returns
// at least one parent device. A lookup error is logged as a warning and
// treated the same as "none found" (false). The outcome is logged on every
// call, and a new nvmdev handle is constructed on every call.
1800+ func mdevParentDevicesExist () bool {
1801+ nvmdevLib := nvmdev .New ()
1802+ parents , err := nvmdevLib .GetAllParentDevices ()
1803+ if err != nil {
// The error is deliberately not propagated: the function only answers
// yes/no, so a failed lookup is reported as "no".
1804+ log .Warnf ("could not get mdev parent devices: %v" , err )
1805+ return false
1806+ }
1807+ if len (parents ) == 0 {
1808+ log .Info ("found 0 mdev parent devices" )
1809+ return false
1810+ }
1811+ log .Infof ("found %d mdev parent devices" , len (parents ))
1812+ return true
1813+ }
1814+
// vfsExist reports whether every SR-IOV physical function among the GPUs
// returned by nvpci's GetGPUs has all of its virtual functions enabled.
// Despite the name, a partially created set of VFs yields false: the result
// is true only when the summed NumVFs equals the summed TotalVFs.
//
// It returns false when GetGPUs fails (the error is logged as a warning) and
// when the summed TotalVFs is zero, i.e. no SR-IOV capable GPU was found.
//
// NOTE(review): the removed waitForVFs code treated "no SR-IOV capable GPUs"
// as success and skipped the wait; here it is a negative result — confirm
// this change is intended for callers that poll on this function.
1815+ func vfsExist () bool {
1816+ nvpciLib := nvpci .New ()
1817+ gpus , err := nvpciLib .GetGPUs ()
1818+ if err != nil {
1819+ log .Warnf ("error getting GPUs: %v" , err )
1820+ return false
1821+ }
1822+
// Totals are accumulated across all physical functions; GPUs for which
// IsPF() is false contribute nothing.
1823+ var totalExpected , totalEnabled uint64
1824+ var pfCount int
1825+ for _ , gpu := range gpus {
1826+ sriovInfo := gpu .SriovInfo
1827+ if sriovInfo .IsPF () {
1828+ pfCount ++
1829+ totalExpected += sriovInfo .PhysicalFunction .TotalVFs
1830+ totalEnabled += sriovInfo .PhysicalFunction .NumVFs
18181831 }
1832+ }
18191833
// NOTE(review): the three '-' lines below are the tail of the removed
// waitForVFs closure shown by the diff; they are not part of vfsExist.
1820- log .Infof ("Waiting for VFs: %d/%d enabled across %d GPU(s)" , totalEnabled , totalExpected , pfCount )
1821- return false , nil
1822- })
1834+ if totalExpected == 0 {
1835+ log .Info ("no SR-IOV capable GPUs found" )
1836+ return false
1837+ }
1838+
// The comparison is on the aggregate sums, not per GPU.
1839+ if totalEnabled == totalExpected {
1840+ log .Infof ("all %d VF(s) enabled on %d NVIDIA GPU(s)" , totalEnabled , pfCount )
1841+ return true
1842+ }
1843+
1844+ log .Infof ("not all VFs have been created. %d/%d enabled across %d GPU(s)" , totalEnabled , totalExpected , pfCount )
1845+ return false
18231846}
18241847
18251848func (c * CCManager ) validate () error {
0 commit comments