AvaProtocol · chrisli30 · Apr 9, 2026 · Apr 8, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -109,6 +109,7 @@ The `getWorkflow` output is the ground truth — it shows the trigger's `topics`
 - All feature branches and PRs must target `staging`, never `main` directly.
 - The `main` branch is updated only by merging `staging` → `main` after migration checks pass.
 - Before merging to `main`, run `go run scripts/compare_storage_structure.go main` to check for breaking storage changes.
+- **PR titles must follow [Conventional Commits](https://www.conventionalcommits.org/) / semantic-release format** (`feat:`, `fix:`, `chore:`, `docs:`, `refactor:`, `test:`, `ci:`, `perf:`, `BREAKING CHANGE:` footer for majors). Squash merges use the PR title as the commit subject, and `.github/workflows/release-on-pr-close.yml` runs `go-semantic-release` against it to compute the next version. A non-conforming title (e.g. `release: staging → main ...`) yields no version bump and no release. For staging→main release PRs, pick the highest-impact prefix across the bundled changes (e.g. `feat:` if any feature is included).
 
 ## Development Guidelines
 

diff --git a/core/taskengine/manual_trigger_loop_test.go b/core/taskengine/manual_trigger_loop_test.go
@@ -427,10 +427,13 @@ func TestLoopNode_ContractWrite_InvalidAddress_PartialFailure(t *testing.T) {
 
 	step, err := vm.RunNodeWithInputs(node, inputVariables)
 
-	// The loop should report failure because the second iteration failed
+	// Per AvaProtocol/EigenLayer-AVS#511, per-iteration runner failures do not
+	// fail the loop step. The loop ran to completion, so step.Success is true
+	// and the per-iteration outcomes are reflected in the data array (nil for
+	// failed iterations).
 	require.NotNil(t, step, "Execution step should not be nil")
-	assert.False(t, step.Success, "Loop should report failure when an iteration fails")
-	assert.Contains(t, step.Error, "invalid address", "Error should mention invalid address")
+	assert.True(t, step.Success, "Loop step should succeed when it ran to completion, even if an iteration failed")
+	assert.Empty(t, step.Error, "Loop step error should be empty when loop ran to completion")
 
 	// The output should still contain results (first iteration data, second nil)
 	loopOutput := step.GetLoop()

diff --git a/core/taskengine/vm.go b/core/taskengine/vm.go
@@ -4653,6 +4653,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 	results := make([]interface{}, len(inputArray))
 	iterationSteps := make([]*avsproto.Execution_Step, 0, len(inputArray)) // Collect iteration steps for gas aggregation
 	success := true
+	infraFailure := false
 	var firstError error
 
 	if concurrent {
@@ -4701,6 +4702,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 				if err := eq.Submit(task); err != nil {
 					log.WriteString(fmt.Sprintf("\nError submitting iteration %d: %s", iterationIndex, err.Error()))
 					success = false
+					infraFailure = true
 					if firstError == nil {
 						firstError = err
 					}
@@ -4713,11 +4715,13 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 			select {
 			case result := <-resultChannel:
 				if result.Error != nil {
-					success = false
 					if firstError == nil {
 						firstError = result.Error
 					}
 					log.WriteString(fmt.Sprintf("\nError in iteration %d: %s", i, result.Error.Error()))
+					// Per-iteration runner failure: leave results[i] = nil as a
+					// partial-success placeholder. Do NOT mark the loop as failed —
+					// the per-iteration outcome is reflected in the data array.
 				} else {
 					results[i] = result.Data
 				}
@@ -4726,6 +4730,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 				}
 			case <-time.After(iterationTimeout):
 				success = false
+				infraFailure = true
 				err := fmt.Errorf("iteration %d timed out after %s", i, iterationTimeout)
 				if firstError == nil {
 					firstError = err
@@ -4761,6 +4766,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 
 			if err := eq.Submit(task); err != nil {
 				success = false
+				infraFailure = true
 				if firstError == nil {
 					firstError = err
 				}
@@ -4772,11 +4778,12 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 			select {
 			case result := <-resultChannel:
 				if result.Error != nil {
-					success = false
 					if firstError == nil {
 						firstError = result.Error
 					}
 					log.WriteString(fmt.Sprintf("\nError in iteration %d: %s", i, result.Error.Error()))
+					// Per-iteration runner failure: leave results[i] = nil as a
+					// partial-success placeholder. Do NOT mark the loop as failed.
 				} else {
 					results[i] = result.Data
 				}
@@ -4785,6 +4792,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 				}
 			case <-time.After(iterationTimeout):
 				success = false
+				infraFailure = true
 				err := fmt.Errorf("iteration %d timed out after %s", i, iterationTimeout)
 				if firstError == nil {
 					firstError = err
@@ -4831,12 +4839,18 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 		Loop: loopOutput,
 	}
 
-	if !success && firstError != nil {
+	// Only treat infrastructure failures (queue submit errors, iteration timeouts)
+	// as a hard step failure. Per-iteration runner errors (e.g. a contract call
+	// reverting in one iteration of a Loop > ContractRead) are reflected as nil
+	// entries in the results array — the loop ran to completion, so we preserve
+	// OutputData and return success so the client can inspect partial results.
+	// See AvaProtocol/EigenLayer-AVS#511.
+	if infraFailure && firstError != nil {
 		finalizeStep(s, false, nil, firstError.Error(), log.String())
 		return s, firstError
 	}
 
-	finalizeStep(s, true, nil, "", log.String())
+	finalizeStep(s, success, nil, "", log.String())
 	return s, nil
 }
 

diff --git a/pkg/erc4337/preset/builder.go b/pkg/erc4337/preset/builder.go
@@ -1155,17 +1155,62 @@ func sendUserOpCore(
 					// On-chain or cache has advanced past our nonce — use the new value
 					l.Debug("Nonce advanced", "old_nonce", userOp.Nonce.String(), "new_nonce", freshNonce.String())
 				} else if onChainNonce.Cmp(userOp.Nonce) < 0 {
-					// On-chain nonce is behind our attempted nonce. Prior UserOps (at lower
-					// nonces) haven't mined yet, causing the bundler to reject ours with AA25.
-					// The nonce itself is correct — we just need to wait for prior UserOps to
-					// mine. Do NOT increment, as that creates an unfillable nonce gap.
-					freshNonce = new(big.Int).Set(userOp.Nonce)
-					l.Debug("On-chain nonce behind, waiting for prior UserOps to mine before retry",
-						"on_chain_nonce", onChainNonce.String(),
-						"userOp_nonce", userOp.Nonce.String())
-					// Wait briefly for prior UserOps to mine; without this delay
-					// the retry loop burns through attempts in milliseconds.
-					time.Sleep(2 * time.Second)
+					// On-chain nonce is behind our attempted nonce. Two possibilities:
+					//   (a) Prior UserOps at lower nonces are genuinely pending in the bundler
+					//       mempool and just haven't mined yet → wait for them.
+					//   (b) The cache is stale: a previous UserOp was dropped from the bundler
+					//       mempool without the cache being invalidated, so there is nothing
+					//       at on_chain..userOp.Nonce-1 to ever mine. Waiting is futile and
+					//       leads to the stuck loop described in issue #510.
+					// Distinguish the two by inspecting the bundler mempool. If no predecessor
+					// UserOp exists for this sender at a nonce in [on_chain, userOp.Nonce),
+					// then case (b) holds — rewind the cache to the on-chain nonce.
+					// Bound the mempool RPC so a hung bundler can't stall the retry loop.
+					mempoolCtx, cancelMempool := context.WithTimeout(context.Background(), 5*time.Second)
+					pendingOps, mempoolErr := bundlerClient.GetPendingUserOpsForSender(mempoolCtx, entrypoint, userOp.Sender)
+					cancelMempool()
+
+					predecessorPending := false
+					if mempoolErr == nil {
+						for _, op := range pendingOps {
+							// EIP-4337 bundlers return nonces as "0x..." hex strings.
+							opNonce := new(big.Int)
+							nonceStr := strings.TrimPrefix(op.Nonce, "0x")
+							if _, ok := opNonce.SetString(nonceStr, 16); !ok {
+								l.Debug("Failed to parse pending op nonce, skipping",
+									"nonce", op.Nonce, "sender", userOp.Sender.Hex())
+								continue
+							}
+							if opNonce.Cmp(onChainNonce) >= 0 && opNonce.Cmp(userOp.Nonce) < 0 {
+								predecessorPending = true
+								break
+							}
+						}
+					} else {
+						l.Warn("Failed to inspect bundler mempool while diagnosing nonce conflict; assuming predecessor pending",
+							"error", mempoolErr)
+						predecessorPending = true
+					}
+
+					if !predecessorPending {
+						// Stale cache: nothing in the mempool can ever mine to bridge the gap.
+						// Rewind to the on-chain nonce so the rebuild uses a value the
+						// EntryPoint will accept.
+						l.Warn("Nonce cache desync detected: no predecessor UserOp pending in bundler mempool, rewinding cache to on-chain nonce",
+							"sender", userOp.Sender.Hex(),
+							"on_chain_nonce", onChainNonce.String(),
+							"stale_cached_nonce", userOp.Nonce.String())
+						globalNonceManager.ResetNonce(userOp.Sender)
+						freshNonce = new(big.Int).Set(onChainNonce)
+					} else {
+						freshNonce = new(big.Int).Set(userOp.Nonce)
+						l.Debug("On-chain nonce behind, waiting for prior UserOps to mine before retry",
+							"on_chain_nonce", onChainNonce.String(),
+							"userOp_nonce", userOp.Nonce.String())
+						// Wait briefly for prior UserOps to mine; without this delay
+						// the retry loop burns through attempts in milliseconds.
+						time.Sleep(2 * time.Second)
+					}
 				} else {
 					// GetNextNonce returned the same nonce we already tried and on-chain has
 					// reached this nonce. A UserOp is pending at this nonce in the mempool.