-
Notifications
You must be signed in to change notification settings - Fork 237
fix(l1): retry transient chain ID errors instead of shutting down the node #3771
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -143,7 +143,60 @@ | |
| } | ||
| } | ||
|
|
||
| // checkChainID checks that the client is connected to the right L1 client | ||
| // chainIDMismatchError marks an L1/L2 network mismatch: a misconfiguration that | ||
| // retrying cannot fix, so verifyChainID treats it as fatal. (Supporting custom | ||
| // forked Starknet networks would mean warning here instead of erroring.) | ||
| type chainIDMismatchError struct { | ||
| network string | ||
| } | ||
|
|
||
| func (e *chainIDMismatchError) Error() string { | ||
| return fmt.Sprintf( | ||
| "mismatched network id between L1 and L2. L2 network is %s; "+ | ||
| "is --eth-node pointing to the right network?", | ||
| e.network, | ||
| ) | ||
| } | ||
|
|
||
| // verifyChainID checks the L1 node is on the expected network, retrying transient | ||
| // failures (rate limits, timeouts, an unresponsive node) as warnings until the | ||
| // check passes or ctx is cancelled, so a flaky L1 never shuts the node down | ||
| // (issue #1385). A network mismatch is a misconfiguration and is returned fatally. | ||
| func (c *Client) verifyChainID(ctx context.Context) error { | ||
| timer := time.NewTimer(0) | ||
| defer timer.Stop() | ||
|
|
||
| for { | ||
| select { | ||
| case <-ctx.Done(): | ||
| return nil | ||
| case <-timer.C: | ||
| err := c.checkChainID(ctx) | ||
| if err == nil { | ||
| return nil | ||
| } | ||
|
|
||
| var mismatch *chainIDMismatchError | ||
| if errors.As(err, &mismatch) { | ||
| return err | ||
| } | ||
|
|
||
| // Transient: warn and retry, unless we're already shutting down. | ||
| if ctx.Err() != nil { | ||
| return nil | ||
| } | ||
|
Comment on lines
+184
to
+187
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This |
||
| // err may contain the L1 URL (with API key), so keep it to Debug. | ||
| c.logger.Warn("Failed to verify L1 chain ID; retrying", | ||
| zap.Duration("tryAgainIn", c.resubscribeDelay), | ||
| ) | ||
| c.logger.Debug("L1 chain ID verification failed", zap.Error(err)) | ||
|
Comment on lines
+189
to
+192
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need to logs for the same thing? Why doesn't the warn share the error as well? |
||
| timer.Reset(c.resubscribeDelay) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why resubscribe delay is used for separating between requests? |
||
| } | ||
| } | ||
| } | ||
|
|
||
| // checkChainID runs a single chain-ID verification attempt (no retry); the | ||
| // one-shot CatchUpL1Head migration path relies on it failing fast. | ||
| func (c *Client) checkChainID(ctx context.Context) error { | ||
| const chainIDCheckTimeout = 30 * time.Second | ||
| ctx, cancel := context.WithTimeout(ctx, chainIDCheckTimeout) | ||
|
|
@@ -165,20 +218,19 @@ | |
| return nil | ||
| } | ||
|
|
||
| // NOTE: for now we return an error. If we want to support users who fork | ||
| // Starknet to create a "custom" Starknet network, we will need to log a warning instead. | ||
| return fmt.Errorf( | ||
| "mismatched network id between L1 and L2. L2 network is %s; "+ | ||
| "is --eth-node pointing to the right network?", | ||
| c.network.String(), | ||
| ) | ||
| return &chainIDMismatchError{network: c.network.String()} | ||
| } | ||
|
|
||
| func (c *Client) Run(ctx context.Context) error { | ||
| defer c.l1.Close() | ||
| if err := c.checkChainID(ctx); err != nil { | ||
| if err := c.verifyChainID(ctx); err != nil { | ||
| return err | ||
| } | ||
| // verifyChainID returns nil on ctx cancellation; don't start any further | ||
| // RPC work (which would just fail with context canceled) if we're stopping. | ||
| if ctx.Err() != nil { | ||
| return nil | ||
| } | ||
|
Comment on lines
+226
to
+233
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is this patch here, why does |
||
|
|
||
|
brbrr marked this conversation as resolved.
|
||
| // catchUpL1HeadUpdates is best-effort: a backward eth_getLogs scan can fail | ||
| // on free-tier RPC providers that cap range size or rate-limit. On failure | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is no need for a custom error as I can see. It is being used only in one place