Skip to content

Commit 5f31a6b

Browse files
committed
tls: add read deadline to containers/image registry connections
A stalled TLS connection to a container registry (e.g. quay.io) can cause image pulls to hang indefinitely. The HTTP response body read blocks forever in tls.Conn.Read with no timeout, starving the entire pull pipeline and leaving pods stuck in ContainerCreating for hours. Wrap the HTTP transport dialer with a deadlineConn that enforces a 5-minute read deadline via SetReadDeadline on every Read call. When triggered, bodyReader treats the timeout the same as ECONNRESET and attempts a Range-based reconnect to resume the download. Also add a 2-minute ResponseHeaderTimeout to the transport. Ref: https://redhat.atlassian.net/browse/OCPBUGS-79544
1 parent d39d7e5 commit 5f31a6b

2 files changed

Lines changed: 49 additions & 1 deletion

File tree

image/docker/body_reader.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"io"
88
"math"
99
"math/rand/v2"
10+
"net"
1011
"net/http"
1112
"net/url"
1213
"strconv"
@@ -17,6 +18,16 @@ import (
1718
"github.com/sirupsen/logrus"
1819
)
1920

21+
// isRetryableNetworkError returns true for network errors that indicate
22+
// a stalled or interrupted connection that may be resolved by reconnecting.
23+
func isRetryableNetworkError(err error) bool {
24+
var netErr net.Error
25+
if errors.As(err, &netErr) && netErr.Timeout() {
26+
return true
27+
}
28+
return false
29+
}
30+
2031
const (
2132
// bodyReaderMinimumProgress is the minimum progress we consider a good reason to retry
2233
bodyReaderMinimumProgress = 1 * 1024 * 1024
@@ -147,7 +158,7 @@ func (br *bodyReader) Read(p []byte) (int, error) {
147158
br.lastSuccessTime = time.Now()
148159
return n, err // Unlike the default: case, don’t log anything.
149160

150-
case errors.Is(err, io.ErrUnexpectedEOF) || errors.Is(err, syscall.ECONNRESET):
161+
case errors.Is(err, io.ErrUnexpectedEOF) || errors.Is(err, syscall.ECONNRESET) || isRetryableNetworkError(err):
151162
originalErr := err
152163
redactedURL := br.logURL.Redacted()
153164
if err := br.errorIfNotReconnecting(originalErr, redactedURL); err != nil {

image/docker/docker_client.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"errors"
88
"fmt"
99
"io"
10+
"net"
1011
"net/http"
1112
"net/url"
1213
"os"
@@ -39,6 +40,14 @@ import (
3940
)
4041

4142
const (
43+
// registryReadTimeout is the maximum duration a single read from a registry
44+
// connection can block without returning any data before we consider the
45+
// connection stalled. This is enforced via SetReadDeadline on the underlying
46+
// net.Conn. When triggered, the read returns a timeout error which
47+
// bodyReader treats as a reconnectable condition (like ECONNRESET), allowing
48+
// it to resume the download with a Range request.
49+
registryReadTimeout = 5 * time.Minute
50+
4251
dockerHostname = "docker.io"
4352
dockerV1Hostname = "index.docker.io"
4453
dockerRegistry = "registry-1.docker.io"
@@ -963,6 +972,22 @@ func (bt *bearerToken) readFromHTTPResponseBody(res *http.Response) error {
963972
return nil
964973
}
965974

975+
// deadlineConn wraps a net.Conn to enforce a read deadline on every Read call.
976+
// If no data arrives within the configured readTimeout, the Read returns a
977+
// net.Error with Timeout() == true, which bodyReader treats as a reconnectable
978+
// condition and retries with a Range request.
979+
type deadlineConn struct {
980+
net.Conn
981+
readTimeout time.Duration
982+
}
983+
984+
func (c *deadlineConn) Read(p []byte) (int, error) {
985+
if err := c.Conn.SetReadDeadline(time.Now().Add(c.readTimeout)); err != nil {
986+
return 0, err
987+
}
988+
return c.Conn.Read(p)
989+
}
990+
966991
// detectPropertiesHelper performs the work of detectProperties which executes
967992
// it at most once.
968993
func (c *dockerClient) detectPropertiesHelper(ctx context.Context) error {
@@ -973,6 +998,17 @@ func (c *dockerClient) detectPropertiesHelper(ctx context.Context) error {
973998
}
974999
tr := tlsclientconfig.NewTransport()
9751000
tr.TLSClientConfig = c.tlsClientConfig
1001+
// Wrap the dialer to set per-read deadlines on the underlying connection.
1002+
// This ensures that a stalled registry response (e.g. TLS read that never
1003+
// returns data) will time out instead of blocking indefinitely.
1004+
originalDial := tr.DialContext
1005+
tr.DialContext = func(ctx context.Context, network, addr string) (net.Conn, error) {
1006+
conn, err := originalDial(ctx, network, addr)
1007+
if err != nil {
1008+
return nil, err
1009+
}
1010+
return &deadlineConn{Conn: conn, readTimeout: registryReadTimeout}, nil
1011+
}
9761012
// if set DockerProxyURL explicitly, use the DockerProxyURL instead of system proxy
9771013
if c.sys != nil && c.sys.DockerProxyURL != nil {
9781014
tr.Proxy = http.ProxyURL(c.sys.DockerProxyURL)
@@ -982,6 +1018,7 @@ func (c *dockerClient) detectPropertiesHelper(ctx context.Context) error {
9821018
return c.sys.DockerProxy(request.URL)
9831019
}
9841020
}
1021+
tr.ResponseHeaderTimeout = 2 * time.Minute
9851022
c.client = &http.Client{Transport: tr}
9861023

9871024
ping := func(scheme string) error {

0 commit comments

Comments
 (0)