Skip to content

Commit 9d8b7e1

Browse files
CRE-4405: Add gateway request tracing
Add httptrace metrics to network/httpclient used by the Gateway
1 parent b538c88 commit 9d8b7e1

5 files changed

Lines changed: 233 additions & 6 deletions

File tree

core/cmd/shell.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ import (
5151
"github.com/smartcontractkit/chainlink/v2/core/services/ccv/ccvcommon"
5252
"github.com/smartcontractkit/chainlink/v2/core/services/chainlink"
5353
"github.com/smartcontractkit/chainlink/v2/core/services/cre"
54+
gatewaynetwork "github.com/smartcontractkit/chainlink/v2/core/services/gateway/network"
5455
"github.com/smartcontractkit/chainlink/v2/core/services/keystore"
5556
"github.com/smartcontractkit/chainlink/v2/core/services/llo"
5657
"github.com/smartcontractkit/chainlink/v2/core/services/llo/retirement"
@@ -80,6 +81,7 @@ func metricViews() []sdkmetric.View {
8081
ccvcommon.MetricViews(),
8182
ocr3beholderwrapper.MetricViews(),
8283
ocr3_1beholderwrapper.MetricViews(),
84+
gatewaynetwork.HTTPClientMetricViews(),
8385
)
8486
}
8587

core/services/gateway/network/httpclient.go

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"io"
99
"maps"
1010
"net/http"
11+
"net/http/httptrace"
1112
"slices"
1213
"strings"
1314

@@ -169,9 +170,10 @@ func responseHeadersFromNetHeader(h http.Header) (map[string]string, map[string]
169170
}
170171

171172
type httpClient struct {
172-
client *safeurl.WrappedClient
173-
config HTTPClientConfig
174-
lggr logger.Logger
173+
client *safeurl.WrappedClient
174+
config HTTPClientConfig
175+
lggr logger.Logger
176+
metrics *httpClientMetrics
175177
}
176178

177179
// NewHTTPClient creates a new NewHTTPClient
@@ -196,10 +198,16 @@ func NewHTTPClient(config HTTPClientConfig, lggr logger.Logger) (HTTPClient, err
196198
SetCheckRedirect(disableRedirects).
197199
Build()
198200

201+
metrics, err := newHTTPClientMetrics()
202+
if err != nil {
203+
return nil, err
204+
}
205+
199206
return &httpClient{
200-
config: config,
201-
client: safeurl.Client(safeConfig),
202-
lggr: lggr,
207+
config: config,
208+
client: safeurl.Client(safeConfig),
209+
lggr: lggr,
210+
metrics: metrics,
203211
}, nil
204212
}
205213

@@ -297,8 +305,13 @@ func (c *httpClient) Send(ctx context.Context, req HTTPRequest) (*HTTPResponse,
297305
timeoutCtx, cancel := context.WithTimeout(ctx, to)
298306
defer cancel()
299307

308+
requestStart := time.Now()
309+
trace, traceState := newClientTrace(ctx, req.Method, requestStart, c.metrics)
310+
timeoutCtx = httptrace.WithClientTrace(timeoutCtx, trace)
311+
300312
r, err := http.NewRequestWithContext(timeoutCtx, req.Method, req.URL, bytes.NewBuffer(req.Body))
301313
if err != nil {
314+
c.metrics.recordTotal(ctx, req.Method, 0, false, false, time.Since(requestStart))
302315
return nil, err
303316
}
304317
for k, values := range requestToNetHeader(req) {
@@ -309,6 +322,7 @@ func (c *httpClient) Send(ctx context.Context, req HTTPRequest) (*HTTPResponse,
309322

310323
resp, err := c.client.Do(r)
311324
if err != nil {
325+
c.metrics.recordTotal(ctx, req.Method, 0, false, traceState.connReused.Load(), time.Since(requestStart))
312326
if isBlockedRequest(err) {
313327
c.lggr.Warnw("HTTP request blocked", "err", err)
314328
return nil, fmt.Errorf("%w: %w", ErrBlockedRequest, err)
@@ -324,10 +338,13 @@ func (c *httpClient) Send(ctx context.Context, req HTTPRequest) (*HTTPResponse,
324338
reader := http.MaxBytesReader(nil, resp.Body, int64(n))
325339
body, err := io.ReadAll(reader)
326340
if err != nil {
341+
c.metrics.recordTotal(ctx, req.Method, resp.StatusCode, false, traceState.connReused.Load(), time.Since(requestStart))
327342
c.lggr.Errorw("failed to read HTTP response body", "err", err)
328343
return nil, errors.Join(err, ErrHTTPRead)
329344
}
330345

346+
c.metrics.recordTotal(ctx, req.Method, resp.StatusCode, true, traceState.connReused.Load(), time.Since(requestStart))
347+
331348
headers, multiHeaders := responseHeadersFromNetHeader(resp.Header)
332349
c.lggr.Debugw("received HTTP response", "statusCode", resp.StatusCode)
333350
return &HTTPResponse{
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
package network
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"strconv"
7+
"time"
8+
9+
"github.com/prometheus/client_golang/prometheus"
10+
"go.opentelemetry.io/otel/attribute"
11+
"go.opentelemetry.io/otel/metric"
12+
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
13+
14+
"github.com/smartcontractkit/chainlink-common/pkg/beholder"
15+
)
16+
17+
// TracePhase identifies a phase of the HTTP client request lifecycle as
18+
// observed via net/http/httptrace.
19+
type TracePhase string
20+
21+
const (
22+
PhaseGetConn TracePhase = "get_conn"
23+
PhaseDNSLookup TracePhase = "dns_lookup"
24+
PhaseTCPConnect TracePhase = "tcp_connect"
25+
PhaseTLSHandshake TracePhase = "tls_handshake"
26+
PhaseWroteRequest TracePhase = "wrote_request"
27+
PhaseTimeToFirstByte TracePhase = "time_to_first_byte"
28+
PhaseTotal TracePhase = "total"
29+
)
30+
31+
type httpClientMetrics struct {
32+
phaseDuration metric.Int64Histogram
33+
}
34+
35+
func newHTTPClientMetrics() (*httpClientMetrics, error) {
36+
phaseDuration, err := beholder.GetMeter().Int64Histogram(
37+
"platform_gateway_http_client_phase_duration_ms",
38+
metric.WithUnit("ms"),
39+
metric.WithDescription("HTTP client request phase duration observed via httptrace. The count of phase=total observations is the request count, partitioned by method, statusCode, success, and connectionReused."),
40+
)
41+
if err != nil {
42+
return nil, fmt.Errorf("failed to create platform_gateway_http_client_phase_duration_ms histogram: %w", err)
43+
}
44+
45+
return &httpClientMetrics{
46+
phaseDuration: phaseDuration,
47+
}, nil
48+
}
49+
50+
func (m *httpClientMetrics) recordPhase(ctx context.Context, method string, phase TracePhase, d time.Duration) {
51+
m.phaseDuration.Record(ctx, d.Milliseconds(), metric.WithAttributes(
52+
attribute.String("method", method),
53+
attribute.String("phase", string(phase)),
54+
))
55+
}
56+
57+
// recordTotal records the total request lifetime with the result attributes.
58+
// The histogram's count for phase=total doubles as the request counter.
59+
func (m *httpClientMetrics) recordTotal(ctx context.Context, method string, statusCode int, success, connReused bool, d time.Duration) {
60+
m.phaseDuration.Record(ctx, d.Milliseconds(), metric.WithAttributes(
61+
attribute.String("method", method),
62+
attribute.String("phase", string(PhaseTotal)),
63+
attribute.String("statusCode", strconv.Itoa(statusCode)),
64+
attribute.String("success", strconv.FormatBool(success)),
65+
attribute.String("connectionReused", strconv.FormatBool(connReused)),
66+
))
67+
}
68+
69+
// HTTPClientMetricViews returns histogram bucket definitions for the HTTP client trace metrics.
70+
// Due to the OTEL specification, all histogram buckets must be defined when the beholder client is created.
71+
func HTTPClientMetricViews() []sdkmetric.View {
72+
return []sdkmetric.View{
73+
sdkmetric.NewView(
74+
sdkmetric.Instrument{Name: "platform_gateway_http_client_phase_duration_ms"},
75+
sdkmetric.Stream{Aggregation: sdkmetric.AggregationExplicitBucketHistogram{
76+
// 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768 (ms)
77+
Boundaries: prometheus.ExponentialBuckets(1, 2, 16),
78+
}},
79+
),
80+
}
81+
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package network
2+
3+
import (
4+
"context"
5+
"testing"
6+
"time"
7+
8+
"github.com/stretchr/testify/require"
9+
)
10+
11+
func TestNewHTTPClientMetrics(t *testing.T) {
12+
m, err := newHTTPClientMetrics()
13+
require.NoError(t, err)
14+
require.NotNil(t, m)
15+
}
16+
17+
func TestHTTPClientMetrics_RecordPhase(t *testing.T) {
18+
m, err := newHTTPClientMetrics()
19+
require.NoError(t, err)
20+
21+
ctx := context.Background()
22+
for _, phase := range []TracePhase{
23+
PhaseGetConn,
24+
PhaseDNSLookup,
25+
PhaseTCPConnect,
26+
PhaseTLSHandshake,
27+
PhaseWroteRequest,
28+
PhaseTimeToFirstByte,
29+
PhaseTotal,
30+
} {
31+
m.recordPhase(ctx, "GET", phase, 25*time.Millisecond)
32+
}
33+
}
34+
35+
func TestHTTPClientMetrics_RecordTotal(t *testing.T) {
36+
m, err := newHTTPClientMetrics()
37+
require.NoError(t, err)
38+
39+
ctx := context.Background()
40+
m.recordTotal(ctx, "GET", 200, true, false, 25*time.Millisecond)
41+
m.recordTotal(ctx, "POST", 500, true, true, 100*time.Millisecond)
42+
m.recordTotal(ctx, "GET", 0, false, false, 5*time.Millisecond)
43+
}
44+
45+
func TestHTTPClientMetricViews(t *testing.T) {
46+
views := HTTPClientMetricViews()
47+
require.Len(t, views, 1)
48+
}
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
package network
2+
3+
import (
4+
"context"
5+
"crypto/tls"
6+
"net/http/httptrace"
7+
"sync/atomic"
8+
"time"
9+
)
10+
11+
// requestTrace holds out-of-band signals captured during a single HTTP request
12+
// so the caller can include them in the final request metric.
13+
type requestTrace struct {
14+
connReused atomic.Bool
15+
}
16+
17+
// newClientTrace returns an httptrace.ClientTrace that records per-phase
18+
// durations on m, along with a requestTrace exposing connection reuse.
19+
//
20+
// httptrace may invoke callbacks concurrently so each phase keeps its own atomic start timestamp.
21+
func newClientTrace(ctx context.Context, method string, requestStart time.Time, m *httpClientMetrics) (*httptrace.ClientTrace, *requestTrace) {
22+
rt := &requestTrace{}
23+
24+
var (
25+
getConnStart atomic.Pointer[time.Time]
26+
dnsStart atomic.Pointer[time.Time]
27+
connectStart atomic.Pointer[time.Time]
28+
tlsStart atomic.Pointer[time.Time]
29+
)
30+
31+
storeNow := func(p *atomic.Pointer[time.Time]) {
32+
now := time.Now()
33+
p.Store(&now)
34+
}
35+
36+
recordPhase := func(start *atomic.Pointer[time.Time], phase TracePhase) {
37+
s := start.Load()
38+
if s == nil {
39+
return
40+
}
41+
m.recordPhase(ctx, method, phase, time.Since(*s))
42+
}
43+
44+
return &httptrace.ClientTrace{
45+
GetConn: func(string) {
46+
storeNow(&getConnStart)
47+
},
48+
GotConn: func(info httptrace.GotConnInfo) {
49+
recordPhase(&getConnStart, PhaseGetConn)
50+
if info.Reused {
51+
rt.connReused.Store(true)
52+
}
53+
},
54+
DNSStart: func(httptrace.DNSStartInfo) {
55+
storeNow(&dnsStart)
56+
},
57+
DNSDone: func(httptrace.DNSDoneInfo) {
58+
recordPhase(&dnsStart, PhaseDNSLookup)
59+
},
60+
ConnectStart: func(string, string) {
61+
storeNow(&connectStart)
62+
},
63+
ConnectDone: func(string, string, error) {
64+
recordPhase(&connectStart, PhaseTCPConnect)
65+
},
66+
TLSHandshakeStart: func() {
67+
storeNow(&tlsStart)
68+
},
69+
TLSHandshakeDone: func(tls.ConnectionState, error) {
70+
recordPhase(&tlsStart, PhaseTLSHandshake)
71+
},
72+
WroteRequest: func(httptrace.WroteRequestInfo) {
73+
m.recordPhase(ctx, method, PhaseWroteRequest, time.Since(requestStart))
74+
},
75+
GotFirstResponseByte: func() {
76+
m.recordPhase(ctx, method, PhaseTimeToFirstByte, time.Since(requestStart))
77+
},
78+
}, rt
79+
}

0 commit comments

Comments
 (0)