fix(l1): retry transient chain ID errors instead of shutting down the node

Ehsan-saradar · Ehsan-saradar · commit de6ed754fae4 · 2026-06-28T12:25:35.000Z
The L1 client ran as a node service, and node.StartService brings the whole node down whenever a service's Run returns. Run returned checkChainID's transient RPC failures (rate limits, timeouts, an unresponsive endpoint) unchanged, so a single eth_chainId hiccup on a free-tier provider — e.g. "daily request count exceeded, request rate limited" — killed the node. This also violated the service contract, which asks services to log and retry non-critical errors rather than return them. Add verifyChainID, used only by Run, which retries transient failures with the client's existing resubscribe-delay loop (matching subscribeToUpdates and finalisedHeight), demoting them to warnings until the check succeeds or the context is cancelled (on cancellation it returns nil, so Run still exits without an error). Retries are unbounded by design: a cap would just reintroduce the shutdown. A genuine chain-ID mismatch is a misconfiguration and stays fatal, now modelled as a typed chainIDMismatchError so the retry loop can tell it apart from transient errors. checkChainID stays single-attempt because the one-shot CatchUpL1Head migration path must keep failing fast. Fixes #1385
diff --git a/l1/l1.go b/l1/l1.go
@@ -143,7 +143,59 @@ func (c *Client) subscribeToUpdates(
 	}
 }
 
-// checkChainID checks that the client is connected to the right L1 client
+// chainIDMismatchError marks an L1/L2 network mismatch: a misconfiguration that
+// retrying cannot fix, so verifyChainID treats it as fatal. (Supporting custom
+// forked Starknet networks would mean logging a warning instead of returning it.)
+type chainIDMismatchError struct {
+	network string
+}
+
+func (e *chainIDMismatchError) Error() string {
+	return fmt.Sprintf(
+		"mismatched network id between L1 and L2. L2 network is %s; "+
+			"is --eth-node pointing to the right network?",
+		e.network,
+	)
+}
+
+// verifyChainID checks the L1 node is on the expected network, retrying transient
+// failures (rate limits, timeouts, an unresponsive node) as warnings so a flaky L1
+// never shuts the node down (issue #1385). It returns nil once the check passes or
+// ctx is cancelled; only a network mismatch (a misconfiguration) returns an error.
+func (c *Client) verifyChainID(ctx context.Context) error {
+	timer := time.NewTimer(0)
+	defer timer.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return nil
+		case <-timer.C:
+			err := c.checkChainID(ctx)
+			if err == nil {
+				return nil
+			}
+
+			var mismatch *chainIDMismatchError
+			if errors.As(err, &mismatch) {
+				return err
+			}
+
+			// Transient: warn and retry, unless we're already shutting down.
+			if ctx.Err() != nil {
+				return nil
+			}
+			c.logger.Warn("Failed to verify L1 chain ID; retrying",
+				zap.Duration("tryAgainIn", c.resubscribeDelay),
+				zap.Error(err),
+			)
+			timer.Reset(c.resubscribeDelay)
+		}
+	}
+}
+
+// checkChainID runs a single chain-ID verification attempt (no retry); the
+// one-shot CatchUpL1Head migration path relies on it failing fast.
 func (c *Client) checkChainID(ctx context.Context) error {
 	const chainIDCheckTimeout = 30 * time.Second
 	ctx, cancel := context.WithTimeout(ctx, chainIDCheckTimeout)
@@ -165,18 +217,12 @@ func (c *Client) checkChainID(ctx context.Context) error {
 		return nil
 	}
 
-	// NOTE: for now we return an error. If we want to support users who fork
-	// Starknet to create a "custom" Starknet network, we will need to log a warning instead.
-	return fmt.Errorf(
-		"mismatched network id between L1 and L2. L2 network is %s; "+
-			"is --eth-node pointing to the right network?",
-		c.network.String(),
-	)
+	return &chainIDMismatchError{network: c.network.String()}
 }
 
 func (c *Client) Run(ctx context.Context) error {
 	defer c.l1.Close()
-	if err := c.checkChainID(ctx); err != nil {
+	if err := c.verifyChainID(ctx); err != nil {
 		return err
 	}
 
diff --git a/l1/l1_test.go b/l1/l1_test.go
@@ -6,6 +6,7 @@ import (
 	"math/big"
 	"net"
 	"net/http"
+	"sync/atomic"
 	"testing"
 	"testing/synctest"
 	"time"
@@ -141,12 +142,10 @@ func TestMismatchedChainID(t *testing.T) {
 	require.ErrorContains(t, err, "--eth-node")
 }
 
-// TestChainIDCheckTimeout asserts that the startup eth_chainId probe gives up
-// after chainIDCheckTimeout (30s in production) with a user-actionable error
-// when the L1 endpoint accepts the dial but never responds to eth_chainId
-// (e.g. --eth-node pointing at an incorrect RPC URL). The test runs inside a
-// synctest bubble so the 30s wait advances in virtual time and the test
-// completes in microseconds of wallclock.
+// TestChainIDCheckTimeout asserts a chain-ID probe gives up after 30s with a
+// user-actionable error when the L1 endpoint never answers eth_chainId. It uses
+// the one-shot CatchUpL1Head path, which fails fast (Run now retries instead,
+// per issue #1385). synctest advances the 30s wait in virtual time.
 func TestChainIDCheckTimeout(t *testing.T) {
 	synctest.Test(t, func(t *testing.T) {
 		network := networks.Mainnet
@@ -171,12 +170,14 @@ func TestChainIDCheckTimeout(t *testing.T) {
 
 		client := l1.NewClient(subscriber, chain, nopLog)
 
-		err := client.Run(t.Context())
+		err := client.CatchUpL1Head(t.Context())
 		require.ErrorContains(t, err, "eth_chainId did not respond within")
 		require.ErrorContains(t, err, "--eth-node")
 	})
 }
 
+// TestChainIDFetchError asserts a non-timeout eth_chainId failure is wrapped and
+// surfaced by the fail-fast CatchUpL1Head path (Run retries it instead, #1385).
 func TestChainIDFetchError(t *testing.T) {
 	t.Parallel()
 
@@ -200,13 +201,65 @@ func TestChainIDFetchError(t *testing.T) {
 
 	client := l1.NewClient(subscriber, chain, nopLog)
 
-	ctx, cancel := context.WithTimeout(t.Context(), time.Second)
-	t.Cleanup(cancel)
-	err := client.Run(ctx)
+	err := client.CatchUpL1Head(t.Context())
 	require.ErrorContains(t, err, "retrieving Ethereum chain ID")
 	require.ErrorIs(t, err, rpcErr)
 }
 
+// TestTransientChainIDErrorDoesNotShutDownNode is the regression guard for issue
+// #1385: a transient eth_chainId failure (the rate-limit error from the issue) is
+// retried, not fatal. ChainID keeps failing; the node-wide context is cancelled
+// on the third attempt. Run must then return no error, having retried more than
+// once instead of aborting on the first failure (the old, node-killing behaviour).
+func TestTransientChainIDErrorDoesNotShutDownNode(t *testing.T) {
+	t.Parallel()
+
+	network := networks.Mainnet
+	ctrl := gomock.NewController(t)
+	nopLog := log.NewNopZapLogger()
+	chain := blockchain.New(
+		memory.New(),
+		&network,
+		blockchain.WithNewState(statetestutils.UseNewState()),
+	)
+
+	ctx, cancel := context.WithCancel(t.Context())
+	t.Cleanup(cancel)
+
+	subscriber := mocks.NewMockSubscriber(ctrl)
+	subscriber.EXPECT().Close().Times(1)
+
+	const cancelAfter = 3
+	var chainIDCalls atomic.Int32
+	rateLimitErr := errors.New("daily request count exceeded, request rate limited")
+	subscriber.
+		EXPECT().
+		ChainID(gomock.Any()).
+		DoAndReturn(func(context.Context) (*big.Int, error) {
+			if chainIDCalls.Add(1) == cancelAfter {
+				cancel() // shut down while L1 is still retrying
+			}
+			return nil, rateLimitErr
+		}).
+		MinTimes(cancelAfter)
+
+	// After cancellation Run unwinds through catch-up and the watch loop; mirror
+	// the happy-path Run test so those downstream calls resolve cleanly.
+	subscriber.EXPECT().WatchLogStateUpdate(gomock.Any(), gomock.Any()).Return(newFakeSubscription(), nil).AnyTimes()
+	subscriber.EXPECT().LatestHeight(gomock.Any()).Return(uint64(0), nil).AnyTimes()
+	subscriber.EXPECT().FinalisedHeight(gomock.Any()).Return(uint64(0), nil).AnyTimes()
+	subscriber.EXPECT().FilterLogStateUpdate(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes()
+
+	client := l1.NewClient(subscriber, chain, nopLog,
+		l1.WithResubscribeDelay(0),
+		l1.WithPollFinalisedInterval(time.Nanosecond),
+	)
+
+	require.NoError(t, client.Run(ctx))
+	require.GreaterOrEqual(t, chainIDCalls.Load(), int32(cancelAfter),
+		"a transient chain ID error should be retried, not shut the node down")
+}
+
 // TestFinalisedHeightTimeoutDuringCatchUp asserts that the L1 catch-up startup
 // scan gives up on a hung eth_getBlockByNumber("finalized") call with a
 // user-actionable error pointing at --eth-node, instead of stalling forever.