NethermindEth · brbrr · Jun 30, 2026 · Jun 28, 2026 · Jun 29, 2026 · Jun 30, 2026
diff --git a/l1/l1.go b/l1/l1.go
@@ -143,7 +143,60 @@
 	}
 }
 
-// checkChainID checks that the client is connected to the right L1 client
+// chainIDMismatchError marks an L1/L2 network mismatch: a misconfiguration that
+// retrying cannot fix, so verifyChainID treats it as fatal. (Supporting custom
+// forked Starknet networks would mean warning here instead of erroring.)
+type chainIDMismatchError struct {
+	network string
+}
+
+func (e *chainIDMismatchError) Error() string {
+	return fmt.Sprintf(
+		"mismatched network id between L1 and L2. L2 network is %s; "+
+			"is --eth-node pointing to the right network?",
+		e.network,
+	)
+}
+
+// verifyChainID checks the L1 node is on the expected network, retrying transient
+// failures (rate limits, timeouts, an unresponsive node) as warnings until the
+// check passes or ctx is cancelled, so a flaky L1 never shuts the node down
+// (issue #1385). A network mismatch is a misconfiguration and is returned fatally.
+func (c *Client) verifyChainID(ctx context.Context) error {
+	timer := time.NewTimer(0)
+	defer timer.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return nil
+		case <-timer.C:
+			err := c.checkChainID(ctx)
+			if err == nil {
+				return nil
+			}
+
+			var mismatch *chainIDMismatchError
+			if errors.As(err, &mismatch) {
+				return err
+			}
+
+			// Transient: warn and retry, unless we're already shutting down.
+			if ctx.Err() != nil {
+				return nil
+			}
+			// err may contain the L1 URL (with API key), so keep it to Debug.
+			c.logger.Warn("Failed to verify L1 chain ID; retrying",
+				zap.Duration("tryAgainIn", c.resubscribeDelay),
+			)
+			c.logger.Debug("L1 chain ID verification failed", zap.Error(err))
+			timer.Reset(c.resubscribeDelay)
+		}
+	}
+}
+
+// checkChainID runs a single chain-ID verification attempt (no retry); the
+// one-shot CatchUpL1Head migration path relies on it failing fast.
 func (c *Client) checkChainID(ctx context.Context) error {
 	const chainIDCheckTimeout = 30 * time.Second
 	ctx, cancel := context.WithTimeout(ctx, chainIDCheckTimeout)
@@ -165,20 +218,19 @@
 		return nil
 	}
 
-	// NOTE: for now we return an error. If we want to support users who fork
-	// Starknet to create a "custom" Starknet network, we will need to log a warning instead.
-	return fmt.Errorf(
-		"mismatched network id between L1 and L2. L2 network is %s; "+
-			"is --eth-node pointing to the right network?",
-		c.network.String(),
-	)
+	return &chainIDMismatchError{network: c.network.String()}
 }
 
 func (c *Client) Run(ctx context.Context) error {
 	defer c.l1.Close()
-	if err := c.checkChainID(ctx); err != nil {
+	if err := c.verifyChainID(ctx); err != nil {
 		return err
 	}
+	// verifyChainID returns nil on ctx cancellation; don't start any further
+	// RPC work (which would just fail with context canceled) if we're stopping.
+	if ctx.Err() != nil {
+		return nil
+	}
 
 	// catchUpL1HeadUpdates is best-effort: a backward eth_getLogs scan can fail
 	// on free-tier RPC providers that cap range size or rate-limit. On failure

diff --git a/l1/l1_test.go b/l1/l1_test.go
@@ -6,6 +6,7 @@ import (
 	"math/big"
 	"net"
 	"net/http"
+	"sync/atomic"
 	"testing"
 	"testing/synctest"
 	"time"
@@ -141,12 +142,10 @@ func TestMismatchedChainID(t *testing.T) {
 	require.ErrorContains(t, err, "--eth-node")
 }
 
-// TestChainIDCheckTimeout asserts that the startup eth_chainId probe gives up
-// after chainIDCheckTimeout (30s in production) with a user-actionable error
-// when the L1 endpoint accepts the dial but never responds to eth_chainId
-// (e.g. --eth-node pointing at an incorrect RPC URL). The test runs inside a
-// synctest bubble so the 30s wait advances in virtual time and the test
-// completes in microseconds of wallclock.
+// TestChainIDCheckTimeout asserts a chain-ID probe gives up after 30s with a
+// user-actionable error when the L1 endpoint never answers eth_chainId. It uses
+// the one-shot CatchUpL1Head path, which fails fast (Run now retries instead,
+// per issue #1385). synctest advances the 30s wait in virtual time.
 func TestChainIDCheckTimeout(t *testing.T) {
 	synctest.Test(t, func(t *testing.T) {
 		network := networks.Mainnet
@@ -171,12 +170,14 @@ func TestChainIDCheckTimeout(t *testing.T) {
 
 		client := l1.NewClient(subscriber, chain, nopLog)
 
-		err := client.Run(t.Context())
+		err := client.CatchUpL1Head(t.Context())
 		require.ErrorContains(t, err, "eth_chainId did not respond within")
 		require.ErrorContains(t, err, "--eth-node")
 	})
 }
 
+// TestChainIDFetchError asserts a non-timeout eth_chainId failure is wrapped and
+// surfaced by the fail-fast CatchUpL1Head path (Run retries it instead, #1385).
 func TestChainIDFetchError(t *testing.T) {
 	t.Parallel()
 
@@ -200,13 +201,61 @@ func TestChainIDFetchError(t *testing.T) {
 
 	client := l1.NewClient(subscriber, chain, nopLog)
 
-	ctx, cancel := context.WithTimeout(t.Context(), time.Second)
-	t.Cleanup(cancel)
-	err := client.Run(ctx)
+	err := client.CatchUpL1Head(t.Context())
 	require.ErrorContains(t, err, "retrieving Ethereum chain ID")
 	require.ErrorIs(t, err, rpcErr)
 }
 
+// TestTransientChainIDErrorDoesNotShutDownNode is the regression guard for issue
+// #1385: a transient eth_chainId failure (the rate-limit error from the issue) is
+// retried, not fatal. ChainID keeps failing; the node-wide context is cancelled
+// on the third attempt. Run must then return no error, having retried more than
+// once instead of aborting on the first failure (the old, node-killing behaviour).
+func TestTransientChainIDErrorDoesNotShutDownNode(t *testing.T) {
+	t.Parallel()
+
+	network := networks.Mainnet
+	ctrl := gomock.NewController(t)
+	nopLog := log.NewNopZapLogger()
+	chain := blockchain.New(
+		memory.New(),
+		&network,
+		blockchain.WithNewState(statetestutils.UseNewState()),
+	)
+
+	ctx, cancel := context.WithCancel(t.Context())
+	t.Cleanup(cancel)
+
+	subscriber := mocks.NewMockSubscriber(ctrl)
+	subscriber.EXPECT().Close().Times(1)
+
+	const cancelAfter = 3
+	var chainIDCalls atomic.Int32
+	rateLimitErr := errors.New("daily request count exceeded, request rate limited")
+	subscriber.
+		EXPECT().
+		ChainID(gomock.Any()).
+		DoAndReturn(func(context.Context) (*big.Int, error) {
+			if chainIDCalls.Add(1) == cancelAfter {
+				cancel() // shut down while L1 is still retrying
+			}
+			return nil, rateLimitErr
+		}).
+		MinTimes(cancelAfter)
+
+	// Once ctx is cancelled, Run returns early after verifyChainID without
+	// entering catch-up or the watch loop, so no other Subscriber calls occur.
+
+	client := l1.NewClient(subscriber, chain, nopLog,
+		l1.WithResubscribeDelay(0),
+		l1.WithPollFinalisedInterval(time.Nanosecond),
+	)
+
+	require.NoError(t, client.Run(ctx))
+	require.GreaterOrEqual(t, chainIDCalls.Load(), int32(cancelAfter),
+		"a transient chain ID error should be retried, not shut the node down")
+}
+
 // TestFinalisedHeightTimeoutDuringCatchUp asserts that the L1 catch-up startup
 // scan gives up on a hung eth_getBlockByNumber("finalized") call with a
 // user-actionable error pointing at --eth-node, instead of stalling forever.