@@ -50,8 +50,9 @@ const maxRetryCount = 3
5050type retrier struct {
5151 dialers []protect.RDialer
5252 dialerOpts settings.DialerOpts
53- raddr * net.TCPAddr
54- laddr * net.TCPAddr // laddr may be nil; TCPAddr.IP may be nil.
53+ racing bool
54+ raddr net.Addr
55+ laddr net.Addr // laddr may be nil; TCPAddr.IP may be nil.
5556
5657 // Flags indicating whether the caller has called CloseRead and CloseWrite.
5758 readDone atomic.Bool
@@ -119,7 +120,7 @@ func calcTimeout(rtt time.Duration) time.Duration {
119120 // These values were chosen to have a <1% false positive rate based on test data.
120121 // False positives trigger an unnecessary retry, which can make connections slower, so they are
121122 // worth avoiding. However, overly long timeouts make retry slower and less useful.
122- return 800 * time .Millisecond + max (2 * rtt , 100 * time .Millisecond )
123+ return 400 * time .Millisecond + max (2 * rtt , 100 * time .Millisecond )
123124}
124125
125126// DialWithSplitRetry returns a TCP connection that transparently retries by
@@ -146,6 +147,33 @@ func DialWithSplitRetry(d *protect.RDial, laddr, raddr *net.TCPAddr) (*retrier,
146147 return r , nil
147148}
148149
150+ func dialerOptsForRace () settings.DialerOpts {
151+ return settings.DialerOpts {
152+ Strat : settings .SplitNever ,
153+ Retry : settings .RetryWithSplit ,
154+ }
155+ }
156+
157+ func DialRace (ds []protect.RDialer , laddr , raddr net.Addr ) (* retrier , error ) {
158+ r := & retrier {
159+ dialers : ds ,
160+ dialerOpts : dialerOptsForRace (),
161+ racing : true ,
162+ laddr : laddr , // may be nil
163+ raddr : raddr , // must not be nil
164+ retryDoneCh : make (chan struct {}),
165+ }
166+
167+ r .mu .Lock ()
168+ defer r .mu .Unlock ()
169+
170+ if _ , err := r .dialLocked (); err != nil {
171+ return nil , err
172+ }
173+ return r , nil
174+ }
175+
176+ // SyscallConn implements core.DuplexConn.
149177func (r * retrier ) SyscallConn () (syscall.RawConn , error ) {
150178 r .mu .Lock ()
151179 c := r .conn
@@ -157,6 +185,7 @@ func (r *retrier) SyscallConn() (syscall.RawConn, error) {
157185 return nil , syscall .EINVAL
158186}
159187
188+ // SetKeepAlive implements core.DuplexConn.
160189func (r * retrier ) SetKeepAlive (y bool ) error {
161190 r .mu .Lock ()
162191 c := r .conn
@@ -242,8 +271,8 @@ func (r *retrier) dialLocked() (c core.DuplexConn, err error) {
242271 r .conn = c // c may be nil
243272 r .timeout = calcTimeout (rtt )
244273
245- logeif (err )("retrier: dial(%s) %s=>%s; strat: %d, rtt: %dms; err? %v" ,
246- r .dialerOpts , laddr (c ), r .raddr , strat , rtt .Milliseconds (), err )
274+ logeif (err )("retrier: dial(%s) %s=>%s; strat: %d (race? %t) , rtt: %dms; err? %v" ,
275+ r .dialerOpts , laddr (c ), r .raddr , strat , r . racing , rtt .Milliseconds (), err )
247276
248277 return
249278}
@@ -285,8 +314,8 @@ func (r *retrier) retryWriteReadLocked(buf []byte) (int, error) {
285314
286315 var nw int
287316 nw , r .retryErr = newConn .Write (r .tee )
288- logeif (r .retryErr )("retrier: retryLocked: strat(%s) %s=>%s; write? %d/%d; err? %v" ,
289- r .dialerOpts , laddr (newConn ), r .raddr , nw , len (r .tee ), r .retryErr )
317+ logeif (r .retryErr )("retrier: retryLocked: strat(%s, racing? %t ) %s=>%s; write? %d/%d; err? %v" ,
318+ r .dialerOpts , r . racing , laddr (newConn ), r .raddr , nw , len (r .tee ), r .retryErr )
290319 if r .retryErr != nil {
291320 return 0 , r .retryErr
292321 }
@@ -309,8 +338,8 @@ func (r *retrier) retryWriteReadLocked(buf []byte) (int, error) {
309338 _ = newConn .SetWriteDeadline (r .writeDeadline )
310339 }
311340
312- logedcond (readdone || writedone )("retrier: retryLocked: done! strat(%s) %s=>%s; write? %d/%d; closed r/w? %t/%t; deadline r/w: %v/%v" ,
313- r .dialerOpts , laddr (newConn ), r .raddr , nw , len (r .tee ), readdone , writedone , time .Since (r .readDeadline ).Seconds (), time .Since (r .writeDeadline ).Seconds ())
341+ logedcond (readdone || writedone )("retrier: retryLocked: done! strat(%s; racing? %t ) %s=>%s; write? %d/%d; closed r/w? %t/%t; deadline r/w: %v/%v" ,
342+ r .dialerOpts , r . racing , laddr (newConn ), r .raddr , nw , len (r .tee ), readdone , writedone , time .Since (r .readDeadline ).Seconds (), time .Since (r .writeDeadline ).Seconds ())
314343
315344 return newConn .Read (buf )
316345}
@@ -349,16 +378,20 @@ func (r *retrier) Read(buf []byte) (n int, err error) {
349378 if ! r .retryCompleted () {
350379 defer close (r .retryDoneCh ) // signal that retry is complete or unnecessary
351380 var retryerr error
381+ canRetry := r .retryCount < maxRetryCount
382+ if r .racing {
383+ canRetry = r .dialerCount < len (r .dialers )
384+ }
352385 // retry on errs like timeouts or connection resets
353- for (c == nil || err != nil ) && r . retryCount < maxRetryCount {
386+ for (c == nil || err != nil ) && canRetry {
354387 r .retryCount ++
355388 n , retryerr = r .retryWriteReadLocked (buf )
356389 c = r .conn // re-assign c to newConn, if any; may be nil
357390 if c == nil {
358391 err = core .UniqErr (err , retryerr )
359392 }
360- logeor (retryerr , log .I )("retrier: read# %d: [%s<=%s] %d; err? %v" ,
361- r .retryCount , laddr (c ), r .raddr , n , retryerr )
393+ logeor (retryerr , log .I )("retrier: read# %d + (racing? %t / c: %d) : [%s<=%s] %d; err? %v" ,
394+ r .retryCount , r . racing , r . dialerCount , laddr (c ), r .raddr , n , retryerr )
362395 }
363396 if c != nil && core .IsNotNil (c ) {
364397 _ = c .SetReadDeadline (r .readDeadline )
0 commit comments