Skip to content

Commit de6ed75

Browse files
committed
fix(l1): retry transient chain ID errors instead of shutting down the node
The L1 client ran as a node service, and node.StartService brings the whole node down whenever a service's Run returns. checkChainID returned transient RPC failures (rate limits, timeouts, an unresponsive endpoint) straight out of Run, so a single eth_chainId hiccup on a free-tier provider — e.g. "daily request count exceeded, request rate limited" — killed the node. This also violated the service contract, which asks services to log and retry non-critical errors rather than return them. Add verifyChainID, used only by Run, which retries transient failures with the client's existing resubscribe-delay loop (matching subscribeToUpdates and finalisedHeight), demoting them to warnings until the check succeeds or the context is cancelled. Retries are unbounded by design: a cap would just reintroduce the shutdown. A genuine chain-ID mismatch is a misconfiguration and stays fatal, now modelled as a typed chainIDMismatchError so the retry loop can tell it apart from transient errors. checkChainID stays single-attempt because the one-shot CatchUpL1Head migration path must keep failing fast. Fixes #1385
1 parent 62d2222 commit de6ed75

2 files changed

Lines changed: 118 additions & 19 deletions

File tree

l1/l1.go

Lines changed: 55 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,59 @@ func (c *Client) subscribeToUpdates(
143143
}
144144
}
145145

146-
// checkChainID checks that the client is connected to the right L1 client
146+
// chainIDMismatchError marks an L1/L2 network mismatch: a misconfiguration that
147+
// retrying cannot fix, so verifyChainID treats it as fatal. (Supporting custom
148+
// forked Starknet networks would mean warning here instead of erroring.)
149+
type chainIDMismatchError struct {
150+
network string
151+
}
152+
153+
func (e *chainIDMismatchError) Error() string {
154+
return fmt.Sprintf(
155+
"mismatched network id between L1 and L2. L2 network is %s; "+
156+
"is --eth-node pointing to the right network?",
157+
e.network,
158+
)
159+
}
160+
161+
// verifyChainID checks the L1 node is on the expected network, retrying transient
162+
// failures (rate limits, timeouts, an unresponsive node) as warnings until the
163+
// check passes or ctx is cancelled, so a flaky L1 never shuts the node down
164+
// (issue #1385). A network mismatch is a misconfiguration and is returned fatally.
165+
func (c *Client) verifyChainID(ctx context.Context) error {
166+
timer := time.NewTimer(0)
167+
defer timer.Stop()
168+
169+
for {
170+
select {
171+
case <-ctx.Done():
172+
return nil
173+
case <-timer.C:
174+
err := c.checkChainID(ctx)
175+
if err == nil {
176+
return nil
177+
}
178+
179+
var mismatch *chainIDMismatchError
180+
if errors.As(err, &mismatch) {
181+
return err
182+
}
183+
184+
// Transient: warn and retry, unless we're already shutting down.
185+
if ctx.Err() != nil {
186+
return nil
187+
}
188+
c.logger.Warn("Failed to verify L1 chain ID; retrying",
189+
zap.Duration("tryAgainIn", c.resubscribeDelay),
190+
zap.Error(err),
191+
)
192+
timer.Reset(c.resubscribeDelay)
193+
}
194+
}
195+
}
196+
197+
// checkChainID runs a single chain-ID verification attempt (no retry); the
198+
// one-shot CatchUpL1Head migration path relies on it failing fast.
147199
func (c *Client) checkChainID(ctx context.Context) error {
148200
const chainIDCheckTimeout = 30 * time.Second
149201
ctx, cancel := context.WithTimeout(ctx, chainIDCheckTimeout)
@@ -165,18 +217,12 @@ func (c *Client) checkChainID(ctx context.Context) error {
165217
return nil
166218
}
167219

168-
// NOTE: for now we return an error. If we want to support users who fork
169-
// Starknet to create a "custom" Starknet network, we will need to log a warning instead.
170-
return fmt.Errorf(
171-
"mismatched network id between L1 and L2. L2 network is %s; "+
172-
"is --eth-node pointing to the right network?",
173-
c.network.String(),
174-
)
220+
return &chainIDMismatchError{network: c.network.String()}
175221
}
176222

177223
func (c *Client) Run(ctx context.Context) error {
178224
defer c.l1.Close()
179-
if err := c.checkChainID(ctx); err != nil {
225+
if err := c.verifyChainID(ctx); err != nil {
180226
return err
181227
}
182228

l1/l1_test.go

Lines changed: 63 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"math/big"
77
"net"
88
"net/http"
9+
"sync/atomic"
910
"testing"
1011
"testing/synctest"
1112
"time"
@@ -141,12 +142,10 @@ func TestMismatchedChainID(t *testing.T) {
141142
require.ErrorContains(t, err, "--eth-node")
142143
}
143144

144-
// TestChainIDCheckTimeout asserts that the startup eth_chainId probe gives up
145-
// after chainIDCheckTimeout (30s in production) with a user-actionable error
146-
// when the L1 endpoint accepts the dial but never responds to eth_chainId
147-
// (e.g. --eth-node pointing at an incorrect RPC URL). The test runs inside a
148-
// synctest bubble so the 30s wait advances in virtual time and the test
149-
// completes in microseconds of wallclock.
145+
// TestChainIDCheckTimeout asserts a chain-ID probe gives up after 30s with a
146+
// user-actionable error when the L1 endpoint never answers eth_chainId. It uses
147+
// the one-shot CatchUpL1Head path, which fails fast (Run now retries instead,
148+
// per issue #1385). synctest advances the 30s wait in virtual time.
150149
func TestChainIDCheckTimeout(t *testing.T) {
151150
synctest.Test(t, func(t *testing.T) {
152151
network := networks.Mainnet
@@ -171,12 +170,14 @@ func TestChainIDCheckTimeout(t *testing.T) {
171170

172171
client := l1.NewClient(subscriber, chain, nopLog)
173172

174-
err := client.Run(t.Context())
173+
err := client.CatchUpL1Head(t.Context())
175174
require.ErrorContains(t, err, "eth_chainId did not respond within")
176175
require.ErrorContains(t, err, "--eth-node")
177176
})
178177
}
179178

179+
// TestChainIDFetchError asserts a non-timeout eth_chainId failure is wrapped and
180+
// surfaced by the fail-fast CatchUpL1Head path (Run retries it instead, #1385).
180181
func TestChainIDFetchError(t *testing.T) {
181182
t.Parallel()
182183

@@ -200,13 +201,65 @@ func TestChainIDFetchError(t *testing.T) {
200201

201202
client := l1.NewClient(subscriber, chain, nopLog)
202203

203-
ctx, cancel := context.WithTimeout(t.Context(), time.Second)
204-
t.Cleanup(cancel)
205-
err := client.Run(ctx)
204+
err := client.CatchUpL1Head(t.Context())
206205
require.ErrorContains(t, err, "retrieving Ethereum chain ID")
207206
require.ErrorIs(t, err, rpcErr)
208207
}
209208

209+
// TestTransientChainIDErrorDoesNotShutDownNode is the regression guard for issue
210+
// #1385: a transient eth_chainId failure (the rate-limit error from the issue) is
211+
// retried, not fatal. ChainID keeps failing; the node-wide context is cancelled
212+
// on the third attempt. Run must then return no error, having retried more than
213+
// once instead of aborting on the first failure (the old, node-killing behaviour).
214+
func TestTransientChainIDErrorDoesNotShutDownNode(t *testing.T) {
215+
t.Parallel()
216+
217+
network := networks.Mainnet
218+
ctrl := gomock.NewController(t)
219+
nopLog := log.NewNopZapLogger()
220+
chain := blockchain.New(
221+
memory.New(),
222+
&network,
223+
blockchain.WithNewState(statetestutils.UseNewState()),
224+
)
225+
226+
ctx, cancel := context.WithCancel(t.Context())
227+
t.Cleanup(cancel)
228+
229+
subscriber := mocks.NewMockSubscriber(ctrl)
230+
subscriber.EXPECT().Close().Times(1)
231+
232+
const cancelAfter = 3
233+
var chainIDCalls atomic.Int32
234+
rateLimitErr := errors.New("daily request count exceeded, request rate limited")
235+
subscriber.
236+
EXPECT().
237+
ChainID(gomock.Any()).
238+
DoAndReturn(func(context.Context) (*big.Int, error) {
239+
if chainIDCalls.Add(1) == cancelAfter {
240+
cancel() // shut down while L1 is still retrying
241+
}
242+
return nil, rateLimitErr
243+
}).
244+
MinTimes(cancelAfter)
245+
246+
// After cancellation Run unwinds through catch-up and the watch loop; mirror
247+
// the happy-path Run test so those downstream calls resolve cleanly.
248+
subscriber.EXPECT().WatchLogStateUpdate(gomock.Any(), gomock.Any()).Return(newFakeSubscription(), nil).AnyTimes()
249+
subscriber.EXPECT().LatestHeight(gomock.Any()).Return(uint64(0), nil).AnyTimes()
250+
subscriber.EXPECT().FinalisedHeight(gomock.Any()).Return(uint64(0), nil).AnyTimes()
251+
subscriber.EXPECT().FilterLogStateUpdate(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes()
252+
253+
client := l1.NewClient(subscriber, chain, nopLog,
254+
l1.WithResubscribeDelay(0),
255+
l1.WithPollFinalisedInterval(time.Nanosecond),
256+
)
257+
258+
require.NoError(t, client.Run(ctx))
259+
require.GreaterOrEqual(t, chainIDCalls.Load(), int32(cancelAfter),
260+
"a transient chain ID error should be retried, not shut the node down")
261+
}
262+
210263
// TestFinalisedHeightTimeoutDuringCatchUp asserts that the L1 catch-up startup
211264
// scan gives up on a hung eth_getBlockByNumber("finalized") call with a
212265
// user-actionable error pointing at --eth-node, instead of stalling forever.

0 commit comments

Comments
 (0)