Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 61 additions & 9 deletions l1/l1.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,60 @@
}
}

// checkChainID checks that the client is connected to the right L1 client
// chainIDMismatchError marks an L1/L2 network mismatch: a misconfiguration that
// retrying cannot fix, so verifyChainID treats it as fatal. (Supporting custom
// forked Starknet networks would mean warning here instead of erroring.)
type chainIDMismatchError struct {
network string
}

func (e *chainIDMismatchError) Error() string {
return fmt.Sprintf(
"mismatched network id between L1 and L2. L2 network is %s; "+
"is --eth-node pointing to the right network?",
e.network,
)
}
Comment on lines +146 to +159

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no need for a custom error as I can see. It is being used only in one place


// verifyChainID checks the L1 node is on the expected network, retrying transient
// failures (rate limits, timeouts, an unresponsive node) as warnings until the
// check passes or ctx is cancelled, so a flaky L1 never shuts the node down
// (issue #1385). A network mismatch is a misconfiguration and is returned fatally.
func (c *Client) verifyChainID(ctx context.Context) error {
timer := time.NewTimer(0)
defer timer.Stop()

for {
select {
case <-ctx.Done():
return nil

Check warning on line 172 in l1/l1.go

View check run for this annotation

Codecov / codecov/patch

l1/l1.go#L171-L172

Added lines #L171 - L172 were not covered by tests
case <-timer.C:
err := c.checkChainID(ctx)
if err == nil {
return nil
}

var mismatch *chainIDMismatchError
if errors.As(err, &mismatch) {
return err
}

// Transient: warn and retry, unless we're already shutting down.
if ctx.Err() != nil {
return nil
}
Comment on lines +184 to +187

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This ctx.Err() is not required since ctx.Done() is checked two instructions after

// err may contain the L1 URL (with API key), so keep it to Debug.
c.logger.Warn("Failed to verify L1 chain ID; retrying",
zap.Duration("tryAgainIn", c.resubscribeDelay),
)
c.logger.Debug("L1 chain ID verification failed", zap.Error(err))
Comment on lines +189 to +192

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to logs for the same thing? Why doesn't the warn share the error as well?

timer.Reset(c.resubscribeDelay)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why resubscribe delay is used for separating between requests?

}
}
}

// checkChainID runs a single chain-ID verification attempt (no retry); the
// one-shot CatchUpL1Head migration path relies on it failing fast.
func (c *Client) checkChainID(ctx context.Context) error {
const chainIDCheckTimeout = 30 * time.Second
ctx, cancel := context.WithTimeout(ctx, chainIDCheckTimeout)
Expand All @@ -165,20 +218,19 @@
return nil
}

// NOTE: for now we return an error. If we want to support users who fork
// Starknet to create a "custom" Starknet network, we will need to log a warning instead.
return fmt.Errorf(
"mismatched network id between L1 and L2. L2 network is %s; "+
"is --eth-node pointing to the right network?",
c.network.String(),
)
return &chainIDMismatchError{network: c.network.String()}
}

func (c *Client) Run(ctx context.Context) error {
defer c.l1.Close()
if err := c.checkChainID(ctx); err != nil {
if err := c.verifyChainID(ctx); err != nil {
return err
}
// verifyChainID returns nil on ctx cancellation; don't start any further
// RPC work (which would just fail with context canceled) if we're stopping.
if ctx.Err() != nil {
return nil
}
Comment on lines +226 to +233

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this patch here, why does verifyChainID returns nil on context cancellation rather than returning the context error?


Comment thread
brbrr marked this conversation as resolved.
// catchUpL1HeadUpdates is best-effort: a backward eth_getLogs scan can fail
// on free-tier RPC providers that cap range size or rate-limit. On failure
Expand Down
69 changes: 59 additions & 10 deletions l1/l1_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"math/big"
"net"
"net/http"
"sync/atomic"
"testing"
"testing/synctest"
"time"
Expand Down Expand Up @@ -141,12 +142,10 @@ func TestMismatchedChainID(t *testing.T) {
require.ErrorContains(t, err, "--eth-node")
}

// TestChainIDCheckTimeout asserts that the startup eth_chainId probe gives up
// after chainIDCheckTimeout (30s in production) with a user-actionable error
// when the L1 endpoint accepts the dial but never responds to eth_chainId
// (e.g. --eth-node pointing at an incorrect RPC URL). The test runs inside a
// synctest bubble so the 30s wait advances in virtual time and the test
// completes in microseconds of wallclock.
// TestChainIDCheckTimeout asserts a chain-ID probe gives up after 30s with a
// user-actionable error when the L1 endpoint never answers eth_chainId. It uses
// the one-shot CatchUpL1Head path, which fails fast (Run now retries instead,
// per issue #1385). synctest advances the 30s wait in virtual time.
func TestChainIDCheckTimeout(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
network := networks.Mainnet
Expand All @@ -171,12 +170,14 @@ func TestChainIDCheckTimeout(t *testing.T) {

client := l1.NewClient(subscriber, chain, nopLog)

err := client.Run(t.Context())
err := client.CatchUpL1Head(t.Context())
require.ErrorContains(t, err, "eth_chainId did not respond within")
require.ErrorContains(t, err, "--eth-node")
})
}

// TestChainIDFetchError asserts a non-timeout eth_chainId failure is wrapped and
// surfaced by the fail-fast CatchUpL1Head path (Run retries it instead, #1385).
func TestChainIDFetchError(t *testing.T) {
t.Parallel()

Expand All @@ -200,13 +201,61 @@ func TestChainIDFetchError(t *testing.T) {

client := l1.NewClient(subscriber, chain, nopLog)

ctx, cancel := context.WithTimeout(t.Context(), time.Second)
t.Cleanup(cancel)
err := client.Run(ctx)
err := client.CatchUpL1Head(t.Context())
require.ErrorContains(t, err, "retrieving Ethereum chain ID")
require.ErrorIs(t, err, rpcErr)
}

// TestTransientChainIDErrorDoesNotShutDownNode is the regression guard for issue
// #1385: a transient eth_chainId failure (the rate-limit error from the issue) is
// retried, not fatal. ChainID keeps failing; the node-wide context is cancelled
// on the third attempt. Run must then return no error, having retried more than
// once instead of aborting on the first failure (the old, node-killing behaviour).
func TestTransientChainIDErrorDoesNotShutDownNode(t *testing.T) {
t.Parallel()

network := networks.Mainnet
ctrl := gomock.NewController(t)
nopLog := log.NewNopZapLogger()
chain := blockchain.New(
memory.New(),
&network,
blockchain.WithNewState(statetestutils.UseNewState()),
)

ctx, cancel := context.WithCancel(t.Context())
t.Cleanup(cancel)

subscriber := mocks.NewMockSubscriber(ctrl)
subscriber.EXPECT().Close().Times(1)

const cancelAfter = 3
var chainIDCalls atomic.Int32
rateLimitErr := errors.New("daily request count exceeded, request rate limited")
subscriber.
EXPECT().
ChainID(gomock.Any()).
DoAndReturn(func(context.Context) (*big.Int, error) {
if chainIDCalls.Add(1) == cancelAfter {
cancel() // shut down while L1 is still retrying
}
return nil, rateLimitErr
}).
MinTimes(cancelAfter)

// Once ctx is cancelled, Run returns early after verifyChainID without
// entering catch-up or the watch loop, so no other Subscriber calls occur.

client := l1.NewClient(subscriber, chain, nopLog,
l1.WithResubscribeDelay(0),
l1.WithPollFinalisedInterval(time.Nanosecond),
)

require.NoError(t, client.Run(ctx))
require.GreaterOrEqual(t, chainIDCalls.Load(), int32(cancelAfter),
"a transient chain ID error should be retried, not shut the node down")
}

// TestFinalisedHeightTimeoutDuringCatchUp asserts that the L1 catch-up startup
// scan gives up on a hung eth_getBlockByNumber("finalized") call with a
// user-actionable error pointing at --eth-node, instead of stalling forever.
Expand Down
Loading