diff --git a/CLAUDE.md b/CLAUDE.md index 433abdf9..758ac347 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -109,6 +109,7 @@ The `getWorkflow` output is the ground truth — it shows the trigger's `topics` - All feature branches and PRs must target `staging`, never `main` directly. - The `main` branch is updated only by merging `staging` → `main` after migration checks pass. - Before merging to `main`, run `go run scripts/compare_storage_structure.go main` to check for breaking storage changes. +- **PR titles must follow [Conventional Commits](https://www.conventionalcommits.org/) / semantic-release format** (`feat:`, `fix:`, `chore:`, `docs:`, `refactor:`, `test:`, `ci:`, `perf:`, `BREAKING CHANGE:` footer for majors). Squash merges use the PR title as the commit subject, and `.github/workflows/release-on-pr-close.yml` runs `go-semantic-release` against it to compute the next version. A non-conforming title (e.g. `release: staging → main ...`) yields no version bump and no release. For staging→main release PRs, pick the highest-impact prefix across the bundled changes (e.g. `feat:` if any feature is included). ## Development Guidelines diff --git a/core/taskengine/manual_trigger_loop_test.go b/core/taskengine/manual_trigger_loop_test.go index 34fcc18a..e088ffef 100644 --- a/core/taskengine/manual_trigger_loop_test.go +++ b/core/taskengine/manual_trigger_loop_test.go @@ -427,10 +427,13 @@ func TestLoopNode_ContractWrite_InvalidAddress_PartialFailure(t *testing.T) { step, err := vm.RunNodeWithInputs(node, inputVariables) - // The loop should report failure because the second iteration failed + // Per AvaProtocol/EigenLayer-AVS#511, per-iteration runner failures do not + // fail the loop step. The loop ran to completion, so step.Success is true + // and the per-iteration outcomes are reflected in the data array (nil for + // failed iterations). require.NotNil(t, step, "Execution step should not be nil") - assert.False(t, step.Success, "Loop should report failure when an iteration fails") - assert.Contains(t, step.Error, "invalid address", "Error should mention invalid address") + assert.True(t, step.Success, "Loop step should succeed when it ran to completion, even if an iteration failed") + assert.Empty(t, step.Error, "Loop step error should be empty when loop ran to completion") // The output should still contain results (first iteration data, second nil) loopOutput := step.GetLoop() diff --git a/core/taskengine/vm.go b/core/taskengine/vm.go index 8c9d04c4..851395da 100644 --- a/core/taskengine/vm.go +++ b/core/taskengine/vm.go @@ -4653,6 +4653,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no results := make([]interface{}, len(inputArray)) iterationSteps := make([]*avsproto.Execution_Step, 0, len(inputArray)) // Collect iteration steps for gas aggregation success := true + infraFailure := false var firstError error if concurrent { @@ -4701,6 +4702,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no if err := eq.Submit(task); err != nil { log.WriteString(fmt.Sprintf("\nError submitting iteration %d: %s", iterationIndex, err.Error())) success = false + infraFailure = true if firstError == nil { firstError = err } @@ -4713,11 +4715,13 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no select { case result := <-resultChannel: if result.Error != nil { - success = false if firstError == nil { firstError = result.Error } log.WriteString(fmt.Sprintf("\nError in iteration %d: %s", i, result.Error.Error())) + // Per-iteration runner failure: leave results[i] = nil as a + // partial-success placeholder. Do NOT mark the loop as failed — + // the per-iteration outcome is reflected in the data array. } else { results[i] = result.Data } @@ -4726,6 +4730,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no } case <-time.After(iterationTimeout): success = false + infraFailure = true err := fmt.Errorf("iteration %d timed out after %s", i, iterationTimeout) if firstError == nil { firstError = err @@ -4761,6 +4766,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no if err := eq.Submit(task); err != nil { success = false + infraFailure = true if firstError == nil { firstError = err } @@ -4772,11 +4778,12 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no select { case result := <-resultChannel: if result.Error != nil { - success = false if firstError == nil { firstError = result.Error } log.WriteString(fmt.Sprintf("\nError in iteration %d: %s", i, result.Error.Error())) + // Per-iteration runner failure: leave results[i] = nil as a + // partial-success placeholder. Do NOT mark the loop as failed. } else { results[i] = result.Data } @@ -4785,6 +4792,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no } case <-time.After(iterationTimeout): success = false + infraFailure = true err := fmt.Errorf("iteration %d timed out after %s", i, iterationTimeout) if firstError == nil { firstError = err @@ -4831,12 +4839,18 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no Loop: loopOutput, } - if !success && firstError != nil { + // Only treat infrastructure failures (queue submit errors, iteration timeouts) + // as a hard step failure. Per-iteration runner errors (e.g. a contract call + // reverting in one iteration of a Loop > ContractRead) are reflected as nil + // entries in the results array — the loop ran to completion, so we preserve + // OutputData and return success so the client can inspect partial results. + // See AvaProtocol/EigenLayer-AVS#511. + if infraFailure && firstError != nil { finalizeStep(s, false, nil, firstError.Error(), log.String()) return s, firstError } - finalizeStep(s, true, nil, "", log.String()) + finalizeStep(s, success, nil, "", log.String()) return s, nil } diff --git a/pkg/erc4337/preset/builder.go b/pkg/erc4337/preset/builder.go index 19710943..b2ecf107 100644 --- a/pkg/erc4337/preset/builder.go +++ b/pkg/erc4337/preset/builder.go @@ -1155,17 +1155,62 @@ func sendUserOpCore( // On-chain or cache has advanced past our nonce — use the new value l.Debug("Nonce advanced", "old_nonce", userOp.Nonce.String(), "new_nonce", freshNonce.String()) } else if onChainNonce.Cmp(userOp.Nonce) < 0 { - // On-chain nonce is behind our attempted nonce. Prior UserOps (at lower - // nonces) haven't mined yet, causing the bundler to reject ours with AA25. - // The nonce itself is correct — we just need to wait for prior UserOps to - // mine. Do NOT increment, as that creates an unfillable nonce gap. - freshNonce = new(big.Int).Set(userOp.Nonce) - l.Debug("On-chain nonce behind, waiting for prior UserOps to mine before retry", - "on_chain_nonce", onChainNonce.String(), - "userOp_nonce", userOp.Nonce.String()) - // Wait briefly for prior UserOps to mine; without this delay - // the retry loop burns through attempts in milliseconds. - time.Sleep(2 * time.Second) + // On-chain nonce is behind our attempted nonce. Two possibilities: + // (a) Prior UserOps at lower nonces are genuinely pending in the bundler + // mempool and just haven't mined yet → wait for them. + // (b) The cache is stale: a previous UserOp was dropped from the bundler + // mempool without the cache being invalidated, so there is nothing + // at on_chain..userOp.Nonce-1 to ever mine. Waiting is futile and + // leads to the stuck loop described in issue #510. + // Distinguish the two by inspecting the bundler mempool. If no predecessor + // UserOp exists for this sender at a nonce in [on_chain, userOp.Nonce), + // then case (b) holds — rewind the cache to the on-chain nonce. + // Bound the mempool RPC so a hung bundler can't stall the retry loop. + mempoolCtx, cancelMempool := context.WithTimeout(context.Background(), 5*time.Second) + pendingOps, mempoolErr := bundlerClient.GetPendingUserOpsForSender(mempoolCtx, entrypoint, userOp.Sender) + cancelMempool() + + predecessorPending := false + if mempoolErr == nil { + for _, op := range pendingOps { + // EIP-4337 bundlers return nonces as "0x..." hex strings. + opNonce := new(big.Int) + nonceStr := strings.TrimPrefix(op.Nonce, "0x") + if _, ok := opNonce.SetString(nonceStr, 16); !ok { + l.Debug("Failed to parse pending op nonce, skipping", + "nonce", op.Nonce, "sender", userOp.Sender.Hex()) + continue + } + if opNonce.Cmp(onChainNonce) >= 0 && opNonce.Cmp(userOp.Nonce) < 0 { + predecessorPending = true + break + } + } + } else { + l.Warn("Failed to inspect bundler mempool while diagnosing nonce conflict; assuming predecessor pending", + "error", mempoolErr) + predecessorPending = true + } + + if !predecessorPending { + // Stale cache: nothing in the mempool can ever mine to bridge the gap. + // Rewind to the on-chain nonce so the rebuild uses a value the + // EntryPoint will accept. + l.Warn("Nonce cache desync detected: no predecessor UserOp pending in bundler mempool, rewinding cache to on-chain nonce", + "sender", userOp.Sender.Hex(), + "on_chain_nonce", onChainNonce.String(), + "stale_cached_nonce", userOp.Nonce.String()) + globalNonceManager.ResetNonce(userOp.Sender) + freshNonce = new(big.Int).Set(onChainNonce) + } else { + freshNonce = new(big.Int).Set(userOp.Nonce) + l.Debug("On-chain nonce behind, waiting for prior UserOps to mine before retry", + "on_chain_nonce", onChainNonce.String(), + "userOp_nonce", userOp.Nonce.String()) + // Wait briefly for prior UserOps to mine; without this delay + // the retry loop burns through attempts in milliseconds. + time.Sleep(2 * time.Second) + } } else { // GetNextNonce returned the same nonce we already tried and on-chain has // reached this nonce. A UserOp is pending at this nonce in the mempool.