@@ -19,6 +19,7 @@ import (
1919 "go.uber.org/zap"
2020 "golang.org/x/oauth2"
2121 "google.golang.org/grpc"
22+ "google.golang.org/grpc/backoff"
2223 "google.golang.org/grpc/codes"
2324 "google.golang.org/grpc/credentials"
2425 "google.golang.org/grpc/credentials/oauth"
@@ -46,6 +47,14 @@ func getServiceClient(token string, cfg *Config) (v1.SecureTLSBootstrapServiceCl
4647 return nil , nil , fmt .Errorf ("failed to get TLS config: %w" , err )
4748 }
4849
50+ // override max delay to 3s (default is 120s) - this ensures the gRPC subchannel
51+ // re-attempts a real TCP+TLS connection at least every 3s, which aligns with
52+ // the ~2s RPC-level retry cadence. Without this cap, the subchannel exponential
53+ // backoff grows to 120s, causing the retry interceptor to receive cached errors
54+ // from the last real attempt rather than triggering new connection attempts.
55+ grpcConnectionBackoffConfig := backoff .DefaultConfig
56+ grpcConnectionBackoffConfig .MaxDelay = 3 * time .Second
57+
4958 conn , err := grpc .NewClient (
5059 fmt .Sprintf ("%s:443" , cfg .APIServerFQDN ),
5160 grpc .WithUserAgent (internalhttp .GetUserAgent ()),
@@ -55,6 +64,15 @@ func getServiceClient(token string, cfg *Config) (v1.SecureTLSBootstrapServiceCl
5564 AccessToken : token ,
5665 }),
5766 }),
67+ // transport/connection-level config
68+ grpc .WithConnectParams (grpc.ConnectParams {
69+ Backoff : grpcConnectionBackoffConfig ,
70+ // MinConnectTimeout is the minimum time each connection attempt is given to complete (default: 20s).
71+ // 5s balances fast retry cycles (~8s/cycle) against headroom for first-connection
72+ // latency through new LB paths — healthy intra-Azure TCP+TLS 1.3 handshakes complete in <1s.
73+ MinConnectTimeout : 5 * time .Second ,
74+ }),
75+ // RPC-level retry config
5876 grpc .WithUnaryInterceptor (retry .UnaryClientInterceptor (
5977 retry .WithOnRetryCallback (getGRPCOnRetryCallbackFunc ()),
6078 retry .WithBackoff (retry .BackoffLinearWithJitter (2 * time .Second , 0.25 )),
0 commit comments