@@ -1788,6 +1788,7 @@ func (v *VGPUManager) runValidation(silent bool) (hostDriver bool, err error) {
17881788func waitForVFs (ctx context.Context , timeout time.Duration ) error {
17891789 pollInterval := time .Duration (sleepIntervalSecondsFlag ) * time .Second
17901790 nvpciLib := nvpci .New ()
1791+ nvmdevLib := nvmdev .New ()
17911792
17921793 return wait .PollUntilContextTimeout (ctx , pollInterval , timeout , true , func (ctx context.Context ) (bool , error ) {
17931794 gpus , err := nvpciLib .GetGPUs ()
@@ -1812,6 +1813,24 @@ func waitForVFs(ctx context.Context, timeout time.Duration) error {
18121813 return true , nil
18131814 }
18141815
1816+ // vGPU stack is ready when mdev parents are registered (PF on Turing,
1817+ // VFs on Ampere+ SR-IOV) or all SR-IOV VFs are enabled. Missing
1818+ // /sys/class/mdev_bus = no mdev support, not a retryable error.
1819+ if _ , statErr := os .Stat ("/sys/class/mdev_bus" ); statErr == nil {
1820+ parents , err := nvmdevLib .GetAllParentDevices ()
1821+ if err != nil {
1822+ log .Warnf ("Error listing mdev parent devices: %v" , err )
1823+ return false , nil
1824+ }
1825+ if len (parents ) > 0 {
1826+ log .Infof ("vGPU stack ready: %d mdev parent device(s)" , len (parents ))
1827+ return true , nil
1828+ }
1829+ } else if ! os .IsNotExist (statErr ) {
1830+ log .Warnf ("Error checking mdev_bus: %v" , statErr )
1831+ return false , nil
1832+ }
1833+
18151834 if totalEnabled == totalExpected {
18161835 log .Infof ("All %d VF(s) enabled on %d NVIDIA GPU(s)" , totalEnabled , pfCount )
18171836 return true , nil
0 commit comments