Skip to content

Commit 53b7aa2

Browse files
committed
e2e: add SSH retry logic for reboot scenarios
Add configurable SSH connectivity retry mechanism for scenarios where nodes may reboot during provisioning (e.g., MIG-enabled GPU nodes). - Add WaitForSSHAfterReboot config option to enable retry behavior - Implement retry logic with exponential backoff using wait.PollUntilContextTimeout - Add reboot detection for common SSH error patterns - Enable 5-minute retry window for Ubuntu 2404 MIG GPU test - Maintain backward compatibility with existing single-attempt behavior This resolves flaky test failures when GPU driver installation triggers automatic reboots during node provisioning. Signed-off-by: Suraj Deshmukh <suraj.deshmukh@microsoft.com>
1 parent 4e7e10f commit 53b7aa2

3 files changed

Lines changed: 85 additions & 2 deletions

File tree

e2e/scenario_gpu_managed_experience_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package e2e
33
import (
44
"context"
55
"testing"
6+
"time"
67

78
"github.com/Azure/agentbaker/e2e/components"
89
"github.com/Azure/agentbaker/e2e/config"
@@ -198,8 +199,9 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning_MIG(t *testing.T) {
198199
GPU: true,
199200
},
200201
Config: Config{
201-
Cluster: ClusterKubenet,
202-
VHD: config.VHDUbuntu2404Gen2Containerd,
202+
Cluster: ClusterKubenet,
203+
VHD: config.VHDUbuntu2404Gen2Containerd,
204+
WaitForSSHAfterReboot: 5 * time.Minute,
203205
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
204206
nbc.AgentPoolProfile.VMSize = "Standard_NC24ads_A100_v4"
205207
nbc.ConfigGPUDriverIfNeeded = true

e2e/test_helpers.go

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
2323
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6"
2424
"github.com/stretchr/testify/require"
25+
"k8s.io/apimachinery/pkg/util/wait"
2526
ctrruntimelog "sigs.k8s.io/controller-runtime/pkg/log"
2627
"sigs.k8s.io/controller-runtime/pkg/log/zap"
2728
)
@@ -649,7 +650,81 @@ func CreateSIGImageVersionFromDisk(ctx context.Context, s *Scenario, version str
649650
return &customVHD
650651
}
651652

653+
// isRebootRelatedSSHError checks if the error is related to a system reboot
654+
func isRebootRelatedSSHError(err error, stderr string) bool {
655+
if err == nil {
656+
return false
657+
}
658+
659+
rebootIndicators := []string{
660+
"System is going down",
661+
"pam_nologin",
662+
"Connection closed by",
663+
"Connection refused",
664+
"Connection timed out",
665+
}
666+
667+
errMsg := err.Error()
668+
for _, indicator := range rebootIndicators {
669+
if strings.Contains(errMsg, indicator) || strings.Contains(stderr, indicator) {
670+
return true
671+
}
672+
}
673+
return false
674+
}
675+
652676
func validateSSHConnectivity(ctx context.Context, s *Scenario) error {
677+
// If WaitForSSHAfterReboot is not set, use the original single-attempt behavior
678+
if s.Config.WaitForSSHAfterReboot == 0 {
679+
return attemptSSHConnection(ctx, s)
680+
}
681+
682+
// Retry logic with exponential backoff for scenarios that may reboot
683+
s.T.Logf("SSH connectivity validation will retry for up to %s if reboot-related errors are encountered", s.Config.WaitForSSHAfterReboot)
684+
startTime := time.Now()
685+
var lastSSHError error
686+
687+
err := wait.PollUntilContextTimeout(ctx, 5*time.Second, s.Config.WaitForSSHAfterReboot, true, func(ctx context.Context) (bool, error) {
688+
err := attemptSSHConnection(ctx, s)
689+
if err == nil {
690+
elapsed := time.Since(startTime)
691+
s.T.Logf("SSH connectivity established after %s", toolkit.FormatDuration(elapsed))
692+
return true, nil
693+
}
694+
695+
// Save the last error for better error messages
696+
lastSSHError = err
697+
698+
// Extract stderr from the error
699+
stderr := ""
700+
if strings.Contains(err.Error(), "Stderr:") {
701+
parts := strings.Split(err.Error(), "Stderr:")
702+
if len(parts) > 1 {
703+
stderr = parts[1]
704+
}
705+
}
706+
707+
// Check if this is a reboot-related error
708+
if isRebootRelatedSSHError(err, stderr) {
709+
s.T.Logf("Detected reboot-related SSH error, will retry: %v", err)
710+
return false, nil // Continue polling
711+
}
712+
713+
// Not a reboot error, fail immediately
714+
return false, err
715+
})
716+
717+
// If we timed out while retrying reboot-related errors, provide a better error message
718+
if err != nil && lastSSHError != nil {
719+
elapsed := time.Since(startTime)
720+
return fmt.Errorf("SSH connection failed after waiting %s for node to reboot and come back up. Last SSH error: %w", toolkit.FormatDuration(elapsed), lastSSHError)
721+
}
722+
723+
return err
724+
}
725+
726+
// attemptSSHConnection performs a single SSH connectivity check
727+
func attemptSSHConnection(ctx context.Context, s *Scenario) error {
653728
connectionTest := fmt.Sprintf("%s echo 'SSH_CONNECTION_OK'", sshString(s.Runtime.VMPrivateIP))
654729
connectionResult, err := execOnPrivilegedPod(ctx, s.Runtime.Cluster.Kube, defaultNamespace, s.Runtime.Cluster.DebugPod.Name, connectionTest)
655730

e2e/types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"strconv"
1111
"strings"
1212
"testing"
13+
"time"
1314

1415
aksnodeconfigv1 "github.com/Azure/agentbaker/aks-node-controller/pkg/gen/aksnodeconfig/v1"
1516
"github.com/Azure/agentbaker/e2e/config"
@@ -166,6 +167,11 @@ type Config struct {
166167
// It shouldn't be used for majority of scenarios, currently only used for scenarios where the node is not expected to be reachable via ssh
167168
SkipSSHConnectivityValidation bool
168169

170+
// WaitForSSHAfterReboot if set to non-zero duration, SSH connectivity validation will retry with exponential backoff
171+
// for up to this duration when encountering reboot-related errors. This is useful for scenarios where the node
172+
// reboots during provisioning (e.g., MIG-enabled GPU nodes). Default (zero value) means no retry.
173+
WaitForSSHAfterReboot time.Duration
174+
169175
// if VHDCaching is set then a VHD will be created first for the test scenario and then a VM will be created from that VHD.
170176
// The main purpose is to validate VHD Caching logic and ensure a reboot step between basePrep and nodePrep doesn't break anything.
171177
VHDCaching bool

0 commit comments

Comments
 (0)