|
| 1 | +// RPC client observability using Beholder. |
| 2 | +// |
| 3 | +// This file defines rpc_request_latency_ms and rpc_request_errors_total, emitted |
| 4 | +// from the RPC client when RPCClientBase is constructed with a non-nil RPCClientMetrics. |
| 5 | +// Metrics are queryable in Prometheus/Grafana by env, network, chain_id, and rpc_provider. |
| 6 | +// |
| 7 | +// Example Prometheus/Grafana queries: |
| 8 | +// |
| 9 | +// - Latency over time (e.g. p99 by env and chain): |
| 10 | +// histogram_quantile(0.99, sum(rate(rpc_request_latency_ms_bucket[5m])) by (le, env, network, chain_id)) |
| 11 | +// |
| 12 | +// - Error rate over time (errors per second by env and chain): |
| 13 | +// sum(rate(rpc_request_errors_total[5m])) by (env, network, chain_id, rpc_provider) |
| 14 | +// |
| 15 | +// - Request rate by call type: |
| 16 | +// sum(rate(rpc_request_latency_ms_count[5m])) by (call, env, network) |
| 17 | +package metrics |
| 18 | + |
| 19 | +import ( |
| 20 | + "context" |
| 21 | + "fmt" |
| 22 | + |
| 23 | + "github.com/prometheus/client_golang/prometheus" |
| 24 | + "github.com/prometheus/client_golang/prometheus/promauto" |
| 25 | + "go.opentelemetry.io/otel/attribute" |
| 26 | + "go.opentelemetry.io/otel/metric" |
| 27 | + |
| 28 | + "github.com/smartcontractkit/chainlink-common/pkg/beholder" |
| 29 | +) |
| 30 | + |
// Metric names shared by the Beholder instruments and the Prometheus collectors
// declared in this file.
const (
	// RPCRequestLatencyMs names the histogram of per-call RPC latency, measured
	// in milliseconds.
	RPCRequestLatencyMs = "rpc_request_latency_ms"

	// RPCRequestErrorsTotal names the counter of RPC calls that returned an error.
	RPCRequestErrorsTotal = "rpc_request_errors_total"
)
| 37 | + |
| 38 | +var ( |
| 39 | + rpcRequestLatencyBuckets = []float64{ |
| 40 | + 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 30000, |
| 41 | + } |
| 42 | + promRPCRequestLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ |
| 43 | + Name: RPCRequestLatencyMs, |
| 44 | + Help: "RPC request latency in milliseconds (per call)", |
| 45 | + Buckets: rpcRequestLatencyBuckets, |
| 46 | + }, []string{"env", "network", "chain_id", "rpc_provider", "call"}) |
| 47 | + promRPCRequestErrors = promauto.NewCounterVec(prometheus.CounterOpts{ |
| 48 | + Name: RPCRequestErrorsTotal, |
| 49 | + Help: "Total number of failed RPC requests", |
| 50 | + }, []string{"env", "network", "chain_id", "rpc_provider", "call"}) |
| 51 | +) |
| 52 | + |
// RPCClientMetrics is the recording surface for RPC client observability.
// Implementations report per-call latency and failures so they can be broken
// down by environment, network, chain_id and rpc_provider.
type RPCClientMetrics interface {
	// RecordRequest reports one completed RPC call.
	//
	// callName identifies the operation (for example "latest_block" or
	// "latest_finalized_block") and latencyMs is the call duration in
	// milliseconds. Latency is always recorded; a non-nil err additionally
	// counts the call as a failure.
	RecordRequest(ctx context.Context, callName string, latencyMs float64, err error)
}
| 60 | + |
| 61 | +var _ RPCClientMetrics = (*rpcClientMetrics)(nil) |
| 62 | + |
| 63 | +type rpcClientMetrics struct { |
| 64 | + env string |
| 65 | + network string |
| 66 | + chainID string |
| 67 | + rpcProvider string |
| 68 | + latency metric.Float64Histogram |
| 69 | + errorsTotal metric.Int64Counter |
| 70 | +} |
| 71 | + |
// RPCClientMetricsConfig carries the label values attached to every RPC client
// metric sample. Any field may be left empty; an empty value is still emitted
// as a label.
type RPCClientMetricsConfig struct {
	// Env is the deployment environment, e.g. "staging" or "production".
	Env string
	// Network is the chain/network name.
	Network string
	// ChainID is the chain ID.
	ChainID string
	// RPCProvider is the RPC provider or node name (optional).
	RPCProvider string
}
| 80 | + |
| 81 | +// NewRPCClientMetrics creates RPC client metrics that publish to Beholder and Prometheus. |
| 82 | +// Callers (e.g. chain-specific RPC clients or multinode) should pass env, network, chainID, and optionally rpcProvider |
| 83 | +// so metrics can be queried in Grafana by environment, chain/network, and RPC provider. |
| 84 | +func NewRPCClientMetrics(cfg RPCClientMetricsConfig) (RPCClientMetrics, error) { |
| 85 | + latency, err := beholder.GetMeter().Float64Histogram(RPCRequestLatencyMs) |
| 86 | + if err != nil { |
| 87 | + return nil, fmt.Errorf("failed to register RPC request latency metric: %w", err) |
| 88 | + } |
| 89 | + errorsTotal, err := beholder.GetMeter().Int64Counter(RPCRequestErrorsTotal) |
| 90 | + if err != nil { |
| 91 | + return nil, fmt.Errorf("failed to register RPC request errors metric: %w", err) |
| 92 | + } |
| 93 | + return &rpcClientMetrics{ |
| 94 | + env: cfg.Env, |
| 95 | + network: cfg.Network, |
| 96 | + chainID: cfg.ChainID, |
| 97 | + rpcProvider: cfg.RPCProvider, |
| 98 | + latency: latency, |
| 99 | + errorsTotal: errorsTotal, |
| 100 | + }, nil |
| 101 | +} |
| 102 | + |
| 103 | +func (m *rpcClientMetrics) RecordRequest(ctx context.Context, callName string, latencyMs float64, err error) { |
| 104 | + attrs := metric.WithAttributes( |
| 105 | + attribute.String("env", m.env), |
| 106 | + attribute.String("network", m.network), |
| 107 | + attribute.String("chain_id", m.chainID), |
| 108 | + attribute.String("rpc_provider", m.rpcProvider), |
| 109 | + attribute.String("call", callName), |
| 110 | + ) |
| 111 | + promRPCRequestLatency.WithLabelValues(m.env, m.network, m.chainID, m.rpcProvider, callName).Observe(latencyMs) |
| 112 | + m.latency.Record(ctx, latencyMs, attrs) |
| 113 | + if err != nil { |
| 114 | + promRPCRequestErrors.WithLabelValues(m.env, m.network, m.chainID, m.rpcProvider, callName).Inc() |
| 115 | + m.errorsTotal.Add(ctx, 1, attrs) |
| 116 | + } |
| 117 | +} |
| 118 | + |
// NoopRPCClientMetrics is an implementation that records nothing, for use
// when metrics are disabled.
type NoopRPCClientMetrics struct{}

// RecordRequest discards its arguments and does nothing.
func (NoopRPCClientMetrics) RecordRequest(_ context.Context, _ string, _ float64, _ error) {}
| 123 | + |
| 124 | +// Ensure NoopRPCClientMetrics implements RPCClientMetrics. |
| 125 | +var _ RPCClientMetrics = NoopRPCClientMetrics{} |
0 commit comments