Skip to content

Commit bde9fa2

Browse files
authored
chore(api): RequestTimeout + distinguish between client cancel and request timeout in telemetry (#2165)
* chore(api): distinguish between client cancel and request timeout * fix: set correct status * fix: correctly cancel timeout * fix: lint * chore: simplify * chore: clean up * fix: setting the error * chore: don't reset status code if not client cancel * chore: simplify the logic in timeout middleware * chore: remove unnecessary exlude routes * chore: remove dead code * fix: remove dead code
1 parent 70213ce commit bde9fa2

7 files changed

Lines changed: 118 additions & 88 deletions

File tree

packages/api/internal/middleware/otel/metrics/middleware.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import (
1111
"go.opentelemetry.io/otel/attribute"
1212
"go.opentelemetry.io/otel/metric"
1313
semconv "go.opentelemetry.io/otel/semconv/v1.7.0"
14+
15+
sharedmiddleware "github.com/e2b-dev/infra/packages/shared/pkg/middleware"
1416
)
1517

1618
const MetricPrefix = "metric."
@@ -76,9 +78,9 @@ func Middleware(meterProvider metric.MeterProvider, service string, options ...O
7678
)
7779

7880
code := ginCtx.Writer.Status()
79-
if errors.Is(ctx.Err(), context.Canceled) {
81+
if errors.Is(sharedmiddleware.CancelCause(ginCtx), context.Canceled) {
8082
// 499 is the nginx convention for "client closed request before server responded"
81-
code = 499
83+
code = sharedmiddleware.StatusClientClosedRequest
8284
}
8385

8486
groupedCode := code / 100 * 100

packages/api/internal/middleware/otel/tracing/middleware.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
oteltrace "go.opentelemetry.io/otel/trace"
3131

3232
"github.com/e2b-dev/infra/packages/shared/pkg/logger"
33+
sharedmiddleware "github.com/e2b-dev/infra/packages/shared/pkg/middleware"
3334
"github.com/e2b-dev/infra/packages/shared/pkg/telemetry"
3435
)
3536

@@ -112,8 +113,11 @@ func Middleware(tracerProvider oteltrace.TracerProvider, service string) gin.Han
112113
c.Next()
113114

114115
status := c.Writer.Status()
115-
if errors.Is(ctx.Err(), context.Canceled) {
116-
status = 499
116+
cause := sharedmiddleware.CancelCause(c)
117+
if errors.Is(cause, sharedmiddleware.ErrRequestTimeout) {
118+
span.SetAttributes(attribute.Bool("request.timeout", true))
119+
} else if errors.Is(cause, context.Canceled) {
120+
status = sharedmiddleware.StatusClientClosedRequest
117121
span.SetAttributes(attribute.Bool("client.canceled", true))
118122
}
119123

packages/api/internal/middleware/timeout.go

Lines changed: 0 additions & 41 deletions
This file was deleted.

packages/api/main.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ const (
6363
// Must be less than maxWriteTimeout so the context cancels before the
6464
// server's write deadline kills the connection (WriteTimeout does NOT
6565
// cancel r.Context(); see https://github.com/golang/go/issues/59602).
66-
// requestTimeout = 60 * time.Second
66+
requestTimeout = 70 * time.Second
6767

6868
// This timeout should be > 600 (GCP LB upstream idle timeout) to prevent race condition
6969
// https://cloud.google.com/load-balancing/docs/https#timeouts_and_retries%23:~:text=The%20load%20balancer%27s%20backend%20keepalive,is%20greater%20than%20600%20seconds
@@ -133,7 +133,7 @@ func NewGinServer(ctx context.Context, config cfg.Config, tel *telemetry.Client,
133133
},
134134
}),
135135
gin.Recovery(),
136-
// customMiddleware.RequestTimeout(requestTimeout), //nolint:contextcheck // Gin middleware sets context via c.Request.WithContext
136+
sharedmiddleware.RequestTimeout(requestTimeout), //nolint:contextcheck // Gin middleware sets context via c.Request.WithContext
137137
)
138138

139139
corsConfig := cors.DefaultConfig()

packages/shared/pkg/middleware/logging.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,20 +72,24 @@ func LoggingMiddleware(logger logger.Logger, conf Config) gin.HandlerFunc {
7272
end = end.UTC()
7373
}
7474

75-
status := c.Writer.Status()
76-
if errors.Is(ctx.Err(), context.Canceled) {
77-
status = 499
78-
}
79-
8075
fields := []zapcore.Field{
81-
zap.Int("status", status),
8276
zap.String("method", c.Request.Method),
8377
zap.String("path", path),
8478
zap.String("query", query),
8579
zap.String("user-agent", c.Request.UserAgent()),
8680
zap.Duration("latency", latency),
8781
}
8882

83+
status := c.Writer.Status()
84+
cause := CancelCause(c)
85+
if errors.Is(cause, ErrRequestTimeout) {
86+
fields = append(fields, zap.Bool("request_timeout", true))
87+
} else if errors.Is(cause, context.Canceled) {
88+
status = StatusClientClosedRequest // 499
89+
fields = append(fields, zap.Bool("client_canceled", true))
90+
}
91+
92+
fields = append(fields, zap.Int("status", status))
8993
if conf.TimeFormat != "" {
9094
fields = append(fields, zap.String("time", end.Format(conf.TimeFormat)))
9195
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package middleware
2+
3+
import (
4+
"context"
5+
"errors"
6+
"time"
7+
8+
"github.com/gin-gonic/gin"
9+
)
10+
11+
// ErrRequestTimeout is the cancel cause set when the per-request timeout fires.
12+
// Callers can distinguish this from a client disconnection by checking:
13+
//
14+
// errors.Is(CancelCause(c), middleware.ErrRequestTimeout)
15+
var ErrRequestTimeout = errors.New("request timeout exceeded")
16+
17+
// cancelCauseKey is the gin context key where RequestTimeout snapshots the
18+
// cancel cause before defer-cancel runs.
19+
const cancelCauseKey = "middleware.cancelCause"
20+
21+
// CancelCause returns the cancel cause captured by the timeout middleware.
22+
// It returns nil for normal (non-canceled/non-timed-out) requests.
23+
func CancelCause(c *gin.Context) error {
24+
if val, exists := c.Get(cancelCauseKey); exists {
25+
if err, ok := val.(error); ok {
26+
return err
27+
}
28+
}
29+
30+
return nil
31+
}
32+
33+
// StatusClientClosedRequest is the de-facto status code (used by nginx) for a
34+
// client that closed the connection before the server could send a response.
35+
// It is not an official IANA code but is widely recognised in logs and metrics.
36+
const StatusClientClosedRequest = 499
37+
38+
// RequestTimeout returns a Gin middleware that sets a context deadline on each
39+
// request. This is needed because http.Server.WriteTimeout does NOT cancel
40+
// r.Context() (see https://github.com/golang/go/issues/59602), so without an
41+
// explicit deadline, blocking calls like pgxpool.Acquire will wait indefinitely
42+
// when the connection pool is saturated.
43+
//
44+
// After the handler returns, the middleware checks context.Cause to distinguish
45+
// two cancellation scenarios and sets an appropriate status if nothing was
46+
// written yet:
47+
// - server-side timeout (cause is ErrRequestTimeout) → 408 Request Timeout
48+
// - client disconnect (ctx.Err() == context.Canceled) → 499 Client Closed Request
49+
//
50+
// Routes matching any of the excludedRoutes patterns are skipped (useful for
51+
// health checks and long-polling endpoints).
52+
func RequestTimeout(timeout time.Duration) gin.HandlerFunc {
53+
return func(c *gin.Context) {
54+
ctx, cancel := context.WithTimeoutCause(c.Request.Context(), timeout, ErrRequestTimeout)
55+
defer cancel()
56+
57+
c.Request = c.Request.WithContext(ctx)
58+
c.Next()
59+
60+
// Snapshot the cause *before* defer-cancel fires so outer
61+
// middlewares can distinguish timeout vs client-disconnect
62+
// via CancelCause(c) without racing with the deferred cancel.
63+
if err := context.Cause(ctx); err != nil {
64+
c.Set(cancelCauseKey, err)
65+
}
66+
}
67+
}

packages/api/internal/middleware/timeout_test.go renamed to packages/shared/pkg/middleware/timeout_test.go

Lines changed: 29 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -35,27 +35,6 @@ func TestRequestTimeout_SetsDeadline(t *testing.T) {
3535
require.Equal(t, http.StatusOK, w.Code)
3636
}
3737

38-
func TestRequestTimeout_Returns408WhenHandlerDoesNotWrite(t *testing.T) {
39-
t.Parallel()
40-
41-
r := gin.New()
42-
r.Use(RequestTimeout(100 * time.Millisecond))
43-
r.GET("/slow", func(c *gin.Context) {
44-
<-c.Request.Context().Done()
45-
})
46-
47-
w := httptest.NewRecorder()
48-
req, _ := http.NewRequestWithContext(context.Background(), http.MethodGet, "/slow", nil)
49-
50-
start := time.Now()
51-
r.ServeHTTP(w, req)
52-
elapsed := time.Since(start)
53-
54-
assert.Less(t, elapsed, 500*time.Millisecond, "should have timed out promptly")
55-
require.Equal(t, http.StatusRequestTimeout, w.Code)
56-
assert.Equal(t, "request timed out", w.Body.String())
57-
}
58-
5938
func TestRequestTimeout_CancelsBlockingHandler(t *testing.T) {
6039
t.Parallel()
6140

@@ -80,38 +59,53 @@ func TestRequestTimeout_CancelsBlockingHandler(t *testing.T) {
8059
assert.Less(t, elapsed, 500*time.Millisecond, "handler should have been unblocked by context timeout")
8160
}
8261

83-
func TestRequestTimeout_ExcludedRouteHasNoDeadline(t *testing.T) {
62+
func TestRequestTimeout_NormalRequestContextNotCanceled(t *testing.T) {
8463
t.Parallel()
8564

65+
// Simulate an outer middleware that reads CancelCause after c.Next().
66+
// The context itself will be canceled by defer cancel(), but CancelCause
67+
// should return nil for normal (non-timed-out) requests.
68+
var outerCause error
69+
outerMiddleware := func(c *gin.Context) {
70+
c.Next()
71+
outerCause = CancelCause(c)
72+
}
73+
8674
r := gin.New()
87-
r.Use(RequestTimeout(500*time.Millisecond, "/health"))
88-
r.GET("/health", func(c *gin.Context) {
89-
_, ok := c.Request.Context().Deadline()
90-
assert.False(t, ok, "excluded route should not have a deadline")
75+
r.Use(outerMiddleware)
76+
r.Use(RequestTimeout(5 * time.Second))
77+
r.GET("/test", func(c *gin.Context) {
9178
c.Status(http.StatusOK)
9279
})
9380

9481
w := httptest.NewRecorder()
95-
req, _ := http.NewRequestWithContext(t.Context(), http.MethodGet, "/health", nil)
82+
req, _ := http.NewRequestWithContext(context.Background(), http.MethodGet, "/test", nil)
9683
r.ServeHTTP(w, req)
9784

9885
require.Equal(t, http.StatusOK, w.Code)
86+
assert.NoError(t, outerCause, "CancelCause should return nil for normal requests")
9987
}
10088

101-
func TestRequestTimeout_ExcludedRouteWithParam(t *testing.T) {
89+
func TestRequestTimeout_TimeoutContextVisibleToOuterMiddleware(t *testing.T) {
10290
t.Parallel()
10391

92+
var outerCause error
93+
outerMiddleware := func(c *gin.Context) {
94+
c.Next()
95+
outerCause = CancelCause(c)
96+
}
97+
10498
r := gin.New()
105-
r.Use(RequestTimeout(500*time.Millisecond, "/templates/:templateID/builds/:buildID/logs"))
106-
r.GET("/templates/:templateID/builds/:buildID/logs", func(c *gin.Context) {
107-
_, ok := c.Request.Context().Deadline()
108-
assert.False(t, ok, "excluded parameterized route should not have a deadline")
109-
c.Status(http.StatusOK)
99+
r.Use(outerMiddleware)
100+
r.Use(RequestTimeout(50 * time.Millisecond))
101+
r.GET("/slow", func(c *gin.Context) {
102+
<-c.Request.Context().Done()
110103
})
111104

112105
w := httptest.NewRecorder()
113-
req, _ := http.NewRequestWithContext(t.Context(), http.MethodGet, "/templates/abc123/builds/build456/logs", nil)
106+
req, _ := http.NewRequestWithContext(context.Background(), http.MethodGet, "/slow", nil)
114107
r.ServeHTTP(w, req)
115108

116-
require.Equal(t, http.StatusOK, w.Code)
109+
require.ErrorIs(t, outerCause, ErrRequestTimeout,
110+
"outer middleware should see ErrRequestTimeout as the cause when the timeout fires")
117111
}

0 commit comments

Comments
 (0)