Skip to content

Commit be2bc6b

Browse files
authored
add pollHealthChecker interface for optional RPC health checks (#83)
* add pollHealthChecker interface for optional RPC health checks

  Add an optional interface for chain-specific RPC clients to run extra health checks during alive-loop polling. Failures count toward the poll failure threshold. This enables chain integrations to detect issues such as missing historical state.
* added fixes for build and lint
* Introduce nodeStateFinalizedStateNotAvailable and separate polling for finalized state availability with configurable threshold and regex-based error classification.
* added fixes for lint and mock
* Add FinalizedStateUnavailable to ClientErrors
* Update metrics dependency
* Update metrics dependency to include IncrementFinalizedStateFailed
* Fix goimports formatting
* Add CheckFinalizedStateAvailability mock expectations to tests
* Remove redundant PollHealthCheck from RPCClient interface
* lint fix-1
* Refactor finalizedStateNotAvailableLoop: dial once, respect createVerifiedConn state
* Fix flaky test: assert on log message instead of transient FSM state
* Add threshold > 0 guard and FSM transition test for FinalizedStateNotAvailable
* Skip finalized state check when threshold is 0, log check enabled/disabled at startup
* fix: pass required string argument to LatestChainInfo
1 parent 8e2464d commit be2bc6b

13 files changed

Lines changed: 461 additions & 21 deletions

multinode/config/config.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ type MultiNode struct {
3434
FinalityDepth *uint32
3535
FinalityTagEnabled *bool
3636
FinalizedBlockOffset *uint32
37+
38+
// Finalized State Availability Check
39+
FinalizedStateCheckFailureThreshold *uint32
3740
}
3841

3942
func (c *MultiNodeConfig) Enabled() bool {
@@ -94,6 +97,10 @@ func (c *MultiNodeConfig) FinalityTagEnabled() bool { return *c.MultiNode.Finali
9497

9598
// FinalizedBlockOffset returns the value stored in MultiNode.FinalizedBlockOffset.
// The pointer is dereferenced without a nil check.
func (c *MultiNodeConfig) FinalizedBlockOffset() uint32 { return *c.MultiNode.FinalizedBlockOffset }
9699

100+
// FinalizedStateCheckFailureThreshold returns the value stored in
// MultiNode.FinalizedStateCheckFailureThreshold.
//
// NOTE(review): the pointer is dereferenced without a nil check, so this presumably
// relies on the field having been populated (e.g. by defaults or SetFrom) before
// the accessor is called — confirm against the config loading path.
func (c *MultiNodeConfig) FinalizedStateCheckFailureThreshold() uint32 {
101+
return *c.MultiNode.FinalizedStateCheckFailureThreshold
102+
}
103+
97104
func (c *MultiNodeConfig) SetFrom(f *MultiNodeConfig) {
98105
if f.MultiNode.Enabled != nil {
99106
c.MultiNode.Enabled = f.MultiNode.Enabled
@@ -150,4 +157,9 @@ func (c *MultiNodeConfig) SetFrom(f *MultiNodeConfig) {
150157
if f.MultiNode.FinalizedBlockOffset != nil {
151158
c.MultiNode.FinalizedBlockOffset = f.MultiNode.FinalizedBlockOffset
152159
}
160+
161+
// Finalized State Availability Check
162+
if f.MultiNode.FinalizedStateCheckFailureThreshold != nil {
163+
c.MultiNode.FinalizedStateCheckFailureThreshold = f.MultiNode.FinalizedStateCheckFailureThreshold
164+
}
153165
}

multinode/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ require (
77
github.com/pkg/errors v0.9.1
88
github.com/prometheus/client_model v0.6.2
99
github.com/smartcontractkit/chainlink-common v0.10.1-0.20260305114348-b8bbac30bfc7
10-
github.com/smartcontractkit/chainlink-framework/metrics v0.0.0-20250717121125-2350c82883e2
10+
github.com/smartcontractkit/chainlink-framework/metrics v0.0.0-20260310180305-3ee91a6d9ae9
1111
github.com/stretchr/testify v1.11.1
1212
go.uber.org/zap v1.27.1
1313
)

multinode/go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,8 @@ github.com/smartcontractkit/chainlink-common v0.10.1-0.20260305114348-b8bbac30bf
8080
github.com/smartcontractkit/chainlink-common v0.10.1-0.20260305114348-b8bbac30bfc7/go.mod h1:0ghbAr7tRO0tT5ZqBXhOyzgUO37tNNe33Yn0hskauVM=
8181
github.com/smartcontractkit/chainlink-common/pkg/chipingress v0.0.10 h1:FJAFgXS9oqASnkS03RE1HQwYQQxrO4l46O5JSzxqLgg=
8282
github.com/smartcontractkit/chainlink-common/pkg/chipingress v0.0.10/go.mod h1:oiDa54M0FwxevWwyAX773lwdWvFYYlYHHQV1LQ5HpWY=
83-
github.com/smartcontractkit/chainlink-framework/metrics v0.0.0-20250717121125-2350c82883e2 h1:ysZjKH+BpWlQhF93kr/Lc668UlCvT9NjfcsGdZT19I8=
84-
github.com/smartcontractkit/chainlink-framework/metrics v0.0.0-20250717121125-2350c82883e2/go.mod h1:jo+cUqNcHwN8IF7SInQNXDZ8qzBsyMpnLdYbDswviFc=
83+
github.com/smartcontractkit/chainlink-framework/metrics v0.0.0-20260310180305-3ee91a6d9ae9 h1:GK+2aFpW/Z5ZnMGCa9NU6o7LKHQ/9xJVZx2yMAMudnc=
84+
github.com/smartcontractkit/chainlink-framework/metrics v0.0.0-20260310180305-3ee91a6d9ae9/go.mod h1:HG/aei0MgBOpsyRLexdKGtOUO8yjSJO3iUu0Uu8KBm4=
8585
github.com/smartcontractkit/freeport v0.1.3-0.20250716200817-cb5dfd0e369e h1:Hv9Mww35LrufCdM9wtS9yVi/rEWGI1UnjHbcKKU0nVY=
8686
github.com/smartcontractkit/freeport v0.1.3-0.20250716200817-cb5dfd0e369e/go.mod h1:T4zH9R8R8lVWKfU7tUvYz2o2jMv1OpGCdpY2j2QZXzU=
8787
github.com/smartcontractkit/libocr v0.0.0-20250912173940-f3ab0246e23d h1:LokA9PoCNb8mm8mDT52c3RECPMRsGz1eCQORq+J3n74=

multinode/mock_node_metrics_test.go

Lines changed: 68 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

multinode/mock_rpc_client_test.go

Lines changed: 46 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

multinode/node.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ type NodeConfig interface {
2727
DeathDeclarationDelay() time.Duration
2828
NewHeadsPollInterval() time.Duration
2929
VerifyChainID() bool
30+
FinalizedStateCheckFailureThreshold() uint32
3031
}
3132

3233
type ChainConfig interface {
@@ -48,13 +49,15 @@ type nodeMetrics interface {
4849
IncrementNodeTransitionsToInvalidChainID(ctx context.Context, nodeName string)
4950
IncrementNodeTransitionsToUnusable(ctx context.Context, nodeName string)
5051
IncrementNodeTransitionsToSyncing(ctx context.Context, nodeName string)
52+
IncrementNodeTransitionsToFinalizedStateNotAvailable(ctx context.Context, nodeName string)
5153
RecordNodeClientVersion(ctx context.Context, nodeName string, version string)
5254
SetHighestSeenBlock(ctx context.Context, nodeName string, blockNumber int64)
5355
SetHighestFinalizedBlock(ctx context.Context, nodeName string, blockNumber int64)
5456
IncrementSeenBlocks(ctx context.Context, nodeName string)
5557
IncrementPolls(ctx context.Context, nodeName string)
5658
IncrementPollsFailed(ctx context.Context, nodeName string)
5759
IncrementPollsSuccess(ctx context.Context, nodeName string)
60+
IncrementFinalizedStateFailed(ctx context.Context, nodeName string)
5861
}
5962

6063
type Node[
@@ -273,7 +276,7 @@ func (n *node[CHAIN_ID, HEAD, RPC]) verifyChainID(callerCtx context.Context, lgg
273276
// The node is already closed, and any subsequent transition is invalid.
274277
// To make spotting such transitions a bit easier, return the invalid node state.
275278
return nodeStateLen
276-
case nodeStateDialed, nodeStateOutOfSync, nodeStateInvalidChainID, nodeStateSyncing:
279+
case nodeStateDialed, nodeStateOutOfSync, nodeStateInvalidChainID, nodeStateSyncing, nodeStateFinalizedStateNotAvailable:
277280
default:
278281
panic(fmt.Sprintf("cannot verify node in state %v", st))
279282
}

multinode/node_fsm.go

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ func (n nodeState) String() string {
3535
return "Syncing"
3636
case nodeStateFinalizedBlockOutOfSync:
3737
return "FinalizedBlockOutOfSync"
38+
case nodeStateFinalizedStateNotAvailable:
39+
return "FinalizedStateNotAvailable"
3840
default:
3941
return fmt.Sprintf("nodeState(%d)", n)
4042
}
@@ -72,6 +74,8 @@ const (
7274
nodeStateSyncing
7375
// nodeStateFinalizedBlockOutOfSync - node is lagging behind on latest finalized block
7476
nodeStateFinalizedBlockOutOfSync
77+
// nodeStateFinalizedStateNotAvailable - node cannot serve historical state at finalized block
78+
nodeStateFinalizedStateNotAvailable
7579
// nodeStateLen tracks the number of states
7680
nodeStateLen
7781
)
@@ -182,7 +186,7 @@ func (n *node[CHAIN_ID, HEAD, RPC]) transitionToAlive(fn func()) {
182186
return
183187
}
184188
switch n.state {
185-
case nodeStateDialed, nodeStateInvalidChainID, nodeStateSyncing:
189+
case nodeStateDialed, nodeStateInvalidChainID, nodeStateSyncing, nodeStateFinalizedStateNotAvailable:
186190
n.state = nodeStateAlive
187191
default:
188192
panic(transitionFail(n.state, nodeStateAlive))
@@ -266,7 +270,7 @@ func (n *node[CHAIN_ID, HEAD, RPC]) transitionToUnreachable(fn func()) {
266270
return
267271
}
268272
switch n.state {
269-
case nodeStateUndialed, nodeStateDialed, nodeStateAlive, nodeStateOutOfSync, nodeStateInvalidChainID, nodeStateSyncing:
273+
case nodeStateUndialed, nodeStateDialed, nodeStateAlive, nodeStateOutOfSync, nodeStateInvalidChainID, nodeStateSyncing, nodeStateFinalizedStateNotAvailable:
270274
n.rpc.Close()
271275
n.state = nodeStateUnreachable
272276
default:
@@ -288,6 +292,8 @@ func (n *node[CHAIN_ID, HEAD, RPC]) declareState(state nodeState) {
288292
n.declareSyncing()
289293
case nodeStateAlive:
290294
n.declareAlive()
295+
case nodeStateFinalizedStateNotAvailable:
296+
n.declareFinalizedStateNotAvailable()
291297
default:
292298
panic(fmt.Sprintf("%#v state declaration is not implemented", state))
293299
}
@@ -311,7 +317,7 @@ func (n *node[CHAIN_ID, HEAD, RPC]) transitionToInvalidChainID(fn func()) {
311317
return
312318
}
313319
switch n.state {
314-
case nodeStateDialed, nodeStateOutOfSync, nodeStateSyncing:
320+
case nodeStateDialed, nodeStateOutOfSync, nodeStateSyncing, nodeStateFinalizedStateNotAvailable:
315321
n.rpc.Close()
316322
n.state = nodeStateInvalidChainID
317323
default:
@@ -338,7 +344,7 @@ func (n *node[CHAIN_ID, HEAD, RPC]) transitionToSyncing(fn func()) {
338344
return
339345
}
340346
switch n.state {
341-
case nodeStateDialed, nodeStateOutOfSync, nodeStateInvalidChainID:
347+
case nodeStateDialed, nodeStateOutOfSync, nodeStateInvalidChainID, nodeStateFinalizedStateNotAvailable:
342348
n.rpc.Close()
343349
n.state = nodeStateSyncing
344350
default:
@@ -351,6 +357,33 @@ func (n *node[CHAIN_ID, HEAD, RPC]) transitionToSyncing(fn func()) {
351357
fn()
352358
}
353359

360+
// declareFinalizedStateNotAvailable transitions the node to
// nodeStateFinalizedStateNotAvailable and, as part of that transition, logs the
// condition and starts finalizedStateNotAvailableLoop in a new goroutine.
//
// The callback below is executed by transitionToFinalizedStateNotAvailable; it is
// not run when the node is already closed, so no goroutine is started in that case.
func (n *node[CHAIN_ID, HEAD, RPC]) declareFinalizedStateNotAvailable() {
361+
n.transitionToFinalizedStateNotAvailable(func() {
362+
// n.state has already been updated by the transition when this is logged.
n.lfcLog.Errorw("RPC Node cannot serve finalized state", "nodeState", n.state)
363+
// Register with n.wg before launching the goroutine.
// NOTE(review): presumably finalizedStateNotAvailableLoop calls n.wg.Done on exit — confirm.
n.wg.Add(1)
364+
go n.finalizedStateNotAvailableLoop()
365+
})
366+
}
367+
368+
// transitionToFinalizedStateNotAvailable moves the node from nodeStateAlive to
// nodeStateFinalizedStateNotAvailable and then invokes fn.
//
// Behavior, in order:
//   - the transition metric is incremented unconditionally, before any state check;
//   - if the node is already in nodeStateClosed the function returns without
//     changing state and without calling fn;
//   - from nodeStateAlive the RPC client is closed and the state is updated;
//   - from any other state the function panics with the transitionFail message.
//
// fn is called while n.stateMu is still held (the unlock is deferred).
func (n *node[CHAIN_ID, HEAD, RPC]) transitionToFinalizedStateNotAvailable(fn func()) {
369+
// Context derived from the node's stop channel; used only for the metric call below.
ctx, cancel := n.stopCh.NewCtx()
370+
defer cancel()
371+
// Counted even if the transition is subsequently skipped (closed node) or panics.
n.metrics.IncrementNodeTransitionsToFinalizedStateNotAvailable(ctx, n.name)
372+
n.stateMu.Lock()
373+
defer n.stateMu.Unlock()
374+
// A closed node accepts no further transitions; silently ignore the request.
if n.state == nodeStateClosed {
375+
return
376+
}
377+
switch n.state {
378+
// nodeStateAlive is the only permitted source state for this transition.
case nodeStateAlive:
379+
n.rpc.Close()
380+
n.state = nodeStateFinalizedStateNotAvailable
381+
default:
382+
panic(transitionFail(n.state, nodeStateFinalizedStateNotAvailable))
383+
}
384+
// Runs with stateMu held and with n.state already set to the new state.
fn()
385+
}
386+
354387
// transitionFail builds the message used when a state transition is attempted
// from a state that does not permit it. Both states are formatted with %#v.
// The transition functions pass the result to panic.
func transitionFail(from nodeState, to nodeState) string {
355388
return fmt.Sprintf("cannot transition from %#v to %#v", from, to)
356389
}

multinode/node_fsm_test.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ func TestUnit_Node_StateTransitions(t *testing.T) {
3636

3737
t.Run("transitionToAlive", func(t *testing.T) {
3838
const destinationState = nodeStateAlive
39-
allowedStates := []nodeState{nodeStateDialed, nodeStateInvalidChainID, nodeStateSyncing}
39+
allowedStates := []nodeState{nodeStateDialed, nodeStateInvalidChainID, nodeStateSyncing, nodeStateFinalizedStateNotAvailable}
4040
rpc := newMockRPCClient[ID, Head](t)
4141
testTransition(t, rpc, testNode.transitionToAlive, destinationState, allowedStates...)
4242
})
@@ -56,21 +56,21 @@ func TestUnit_Node_StateTransitions(t *testing.T) {
5656
})
5757
t.Run("transitionToUnreachable", func(t *testing.T) {
5858
const destinationState = nodeStateUnreachable
59-
allowedStates := []nodeState{nodeStateUndialed, nodeStateDialed, nodeStateAlive, nodeStateOutOfSync, nodeStateInvalidChainID, nodeStateSyncing}
59+
allowedStates := []nodeState{nodeStateUndialed, nodeStateDialed, nodeStateAlive, nodeStateOutOfSync, nodeStateInvalidChainID, nodeStateSyncing, nodeStateFinalizedStateNotAvailable}
6060
rpc := newMockRPCClient[ID, Head](t)
6161
rpc.On("Close")
6262
testTransition(t, rpc, testNode.transitionToUnreachable, destinationState, allowedStates...)
6363
})
6464
t.Run("transitionToInvalidChain", func(t *testing.T) {
6565
const destinationState = nodeStateInvalidChainID
66-
allowedStates := []nodeState{nodeStateDialed, nodeStateOutOfSync, nodeStateSyncing}
66+
allowedStates := []nodeState{nodeStateDialed, nodeStateOutOfSync, nodeStateSyncing, nodeStateFinalizedStateNotAvailable}
6767
rpc := newMockRPCClient[ID, Head](t)
6868
rpc.On("Close")
6969
testTransition(t, rpc, testNode.transitionToInvalidChainID, destinationState, allowedStates...)
7070
})
7171
t.Run("transitionToSyncing", func(t *testing.T) {
7272
const destinationState = nodeStateSyncing
73-
allowedStates := []nodeState{nodeStateDialed, nodeStateOutOfSync, nodeStateInvalidChainID}
73+
allowedStates := []nodeState{nodeStateDialed, nodeStateOutOfSync, nodeStateInvalidChainID, nodeStateFinalizedStateNotAvailable}
7474
rpc := newMockRPCClient[ID, Head](t)
7575
rpc.On("Close")
7676
testTransition(t, rpc, testNode.transitionToSyncing, destinationState, allowedStates...)
@@ -86,6 +86,13 @@ func TestUnit_Node_StateTransitions(t *testing.T) {
8686
node.transitionToSyncing(fn.Fn)
8787
})
8888
})
89+
t.Run("transitionToFinalizedStateNotAvailable", func(t *testing.T) {
90+
const destinationState = nodeStateFinalizedStateNotAvailable
91+
allowedStates := []nodeState{nodeStateAlive}
92+
rpc := newMockRPCClient[ID, Head](t)
93+
rpc.On("Close")
94+
testTransition(t, rpc, testNode.transitionToFinalizedStateNotAvailable, destinationState, allowedStates...)
95+
})
8996
}
9097

9198
func testTransition(t *testing.T, rpc *mockRPCClient[ID, Head], transition func(node testNode, fn func()), destinationState nodeState, allowedStates ...nodeState) {

0 commit comments

Comments
 (0)