Skip to content

Commit 67be2a4

Browse files
committed
retrier: when read timeout is 0, assume arb high duration
Otherwise, a read deadline computed from a 0 duration (i.e. time.Now().Add(0), as opposed to the zero time.Time) closes conns immediately, which is the opposite of what we want: when the timeout duration is 0, it should behave as if the read deadline were the zero time.Time, i.e. no deadline (infinity).
1 parent 153f185 commit 67be2a4

1 file changed

Lines changed: 19 additions & 7 deletions

File tree

intra/dialers/retrier.go

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ package dialers
2626
import (
2727
"context"
2828
"io"
29+
"math"
2930
"net"
3031
"net/netip"
3132
"strings"
@@ -420,10 +421,9 @@ func (r *retrier) retryWriteReadLocked(buf []byte) (int, error) {
420421
core.CloseOp(newConn, core.CopW)
421422
}
422423

423-
logedcond(readdone || writedone)("retrier: retryLocked: done! strat(%s; mult? %d %T) %s=>%s; write? %d/%d; closed r/w? %t/%t; deadline r/w: %v/%v",
424-
r.dialerID(), len(r.dialers), newConn, laddr(newConn), r.raddr, nw, len(r.tee), readdone, writedone, core.FmtTimeAsPeriod(r.readDeadline), core.FmtTimeAsPeriod(r.writeDeadline))
424+
logedcond(readdone || writedone)("retrier: retryLocked: done! strat(%s; mult? %d %T) %s=>%s; write? %d/%d; closed r/w? %t/%t; rtt: %s, deadline r/w: %v/%v",
425+
r.dialerID(), len(r.dialers), newConn, laddr(newConn), r.raddr, nw, len(r.tee), readdone, writedone, core.FmtPeriod(r.timeout), core.FmtTimeAsPeriod(r.readDeadline), core.FmtTimeAsPeriod(r.writeDeadline))
425426

426-
newConn.SetReadDeadline(time.Now().Add(r.timeout))
427427
// all of buf was written to c
428428
// require a response within a short timeout on r.conn (same as newConn)
429429
r.shorterReadDeadlineForRetryLocked()
@@ -545,6 +545,17 @@ func (r *retrier) shorterReadDeadlineForRetryLocked() {
545545
}
546546
}
547547

548+
// readTimeout returns the duration to use as this retrier's read timeout.
// It returns r.timeout when that is positive. Otherwise, if r.readDeadline
// is the zero time, it returns an arbitrarily high duration rather than 0;
// else it returns the time remaining until r.readDeadline, which is zero or
// negative once that deadline has passed.
func (r *retrier) readTimeout() time.Duration {
549+
if r.timeout > 0 {
550+
return r.timeout
551+
}
552+
if r.readDeadline.IsZero() {
553+
// math.MaxInt64 >> 10 is 2^53-1 ns, about 2501h 59m 59.254s: a comfortably high duration in nanos
554+
return math.MaxInt64 >> 10
555+
}
556+
return time.Until(r.readDeadline)
557+
}
558+
548559
// Write data in b to retrier's underlying conn, r.conn
549560
func (r *retrier) Write(b []byte) (int, error) {
550561
// Double-checked locking pattern. This avoids lock acquisition on
@@ -576,15 +587,16 @@ func (r *retrier) Write(b []byte) (int, error) {
576587
// by the retry procedure. Block until we have a final socket (which will
577588
// already have replayed r.tee), and retry.
578589
// ie, wait until first write is done on the final socket.
579-
maxExpectedReadTimeout := r.timeout * maxRetryCount
590+
until := r.readTimeout()
591+
maxUntil := max(until, until*maxRetryCount)
580592
if r.multidial {
581-
maxExpectedReadTimeout = r.timeout * time.Duration(len(r.dialers))
593+
maxUntil = max(maxUntil, maxUntil*time.Duration(len(r.dialers)))
582594
}
583595
select {
584596
case <-r.retryDoneCh:
585-
case <-time.After(3 * maxExpectedReadTimeout): // arb high timeout; it should rarely if ever needed
597+
case <-time.After(maxUntil): // arb high timeout; it should rarely if ever needed
586598
log.W("retrier: write: %s: 1st write timed-out waiting for %s [calc-rtt: %s] 1st read b/w [%s=>%s], mult: %d, b: %d/%d, err: %v",
587-
r.dialerID(), core.FmtPeriod(3*maxExpectedReadTimeout), core.FmtPeriod(r.timeout), src, r.raddr, len(r.dialers), n, len(b), err)
599+
r.dialerID(), core.FmtPeriod(maxUntil), core.FmtPeriod(r.timeout), src, r.raddr, len(r.dialers), n, len(b), err)
588600
return n, core.JoinErr(err, errRetryTimeout)
589601
}
590602

0 commit comments

Comments
 (0)