Skip to content

Commit c82910b

Browse files
Reconfigure API timeouts (#617)
Co-authored-by: Jakub Novak <jakub@e2b.dev>
1 parent 27c4423 commit c82910b

File tree

4 files changed

+28
-15
lines changed

4 files changed

+28
-15
lines changed

packages/api/main.go

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,11 @@ const (
4545
maxMultipartMemory = 1 << 27 // 128 MiB
4646
maxUploadLimit = 1 << 28 // 256 MiB
4747

48-
maxReadHeaderTimeout = 60 * time.Second
49-
maxReadTimeout = 75 * time.Second
50-
maxWriteTimeout = 75 * time.Second
48+
maxReadTimeout = 75 * time.Second
49+
maxWriteTimeout = 75 * time.Second
50+
// This timeout should be > 600 s (the GCP LB upstream idle timeout) to prevent a race condition
51+
// https://cloud.google.com/load-balancing/docs/https#timeouts_and_retries:~:text=The%20load%20balancer%27s%20backend%20keepalive,is%20greater%20than%20600%20seconds
52+
idleTimeout = 620 * time.Second
5153

5254
defaultPort = 80
5355
)
@@ -171,12 +173,13 @@ func NewGinServer(ctx context.Context, logger *zap.Logger, apiStore *handlers.AP
171173
r.MaxMultipartMemory = maxMultipartMemory
172174

173175
s := &http.Server{
174-
Handler: r,
175-
Addr: fmt.Sprintf("0.0.0.0:%d", port),
176-
ReadHeaderTimeout: maxReadHeaderTimeout,
177-
ReadTimeout: maxReadTimeout,
178-
WriteTimeout: maxWriteTimeout,
179-
BaseContext: func(net.Listener) context.Context { return ctx },
176+
Handler: r,
177+
Addr: fmt.Sprintf("0.0.0.0:%d", port),
178+
// Configure timeouts to be greater than the proxy timeouts.
179+
ReadTimeout: maxReadTimeout,
180+
WriteTimeout: maxWriteTimeout,
181+
IdleTimeout: idleTimeout,
182+
BaseContext: func(net.Listener) context.Context { return ctx },
180183
}
181184

182185
return s

packages/client-proxy/internal/proxy/proxy.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,10 @@ const (
2323
maxRetries = 3
2424

2525
minOrchestratorProxyConns = 3
26-
idleTimeout = 620 * time.Second
26+
// This timeout should be > 600 s (the GCP LB upstream idle timeout) to prevent a race condition
27+
// Also it's a good practice to set it to a value higher than the idle timeout of the backend service
28+
// https://cloud.google.com/load-balancing/docs/https#timeouts_and_retries:~:text=The%20load%20balancer%27s%20backend%20keepalive,is%20greater%20than%20600%20seconds
29+
idleTimeout = 610 * time.Second
2730

2831
// We use a constant connection key, because we don't have to separate connection pools
2932
// as we need to do when connecting to sandboxes (from orchestrator proxy) to prevent reuse of pool connections

packages/orchestrator/internal/proxy/proxy.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@ import (
1919

2020
const (
2121
minSandboxConns = 1
22-
idleTimeout = 630 * time.Second
22+
// This timeout should be > 600 s (the GCP LB upstream idle timeout) to prevent a race condition
23+
// Also, it is good practice to increase this timeout at each successive layer of the stack
24+
// https://cloud.google.com/load-balancing/docs/https#timeouts_and_retries:~:text=The%20load%20balancer%27s%20backend%20keepalive,is%20greater%20than%20600%20seconds
25+
idleTimeout = 620 * time.Second
2326
)
2427

2528
type SandboxProxy struct {

packages/shared/pkg/proxy/proxy.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
)
1313

1414
const maxClientConns = 8192 // Reasonably big number that is lower than the number of available ports.
15+
const idleTimeoutBufferUpstreamDownstream = 10
1516

1617
type Proxy struct {
1718
http.Server
@@ -33,10 +34,13 @@ func New(
3334

3435
return &Proxy{
3536
Server: http.Server{
36-
Addr: fmt.Sprintf(":%d", port),
37-
ReadTimeout: 0,
38-
WriteTimeout: 0,
39-
IdleTimeout: idleTimeout,
37+
Addr: fmt.Sprintf(":%d", port),
38+
ReadTimeout: 0,
39+
WriteTimeout: 0,
40+
// Downstream (client side) idle > upstream (server side) idle
41+
// otherwise a request arriving between the downstream and upstream idle timeouts
42+
// will try to reuse an upstream connection that is already in `CLOSE_WAIT` state, resulting in an error
43+
IdleTimeout: idleTimeout + idleTimeoutBufferUpstreamDownstream,
4044
ReadHeaderTimeout: 0,
4145
Handler: handler(p, getDestination),
4246
},

0 commit comments

Comments
 (0)