From 27305838afa87211df4b664aebf865d57d92a8e9 Mon Sep 17 00:00:00 2001
From: Will Zimmerman <will@avaprotocol.org>
Date: Wed, 8 Apr 2026 12:33:12 -0700
Subject: [PATCH 1/4] docs: update CLAUDE.md for PR title instructions

---
 CLAUDE.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CLAUDE.md b/CLAUDE.md
index 433abdf9..758ac347 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -109,6 +109,7 @@ The `getWorkflow` output is the ground truth — it shows the trigger's `topics`
 - All feature branches and PRs must target `staging`, never `main` directly.
 - The `main` branch is updated only by merging `staging` → `main` after migration checks pass.
 - Before merging to `main`, run `go run scripts/compare_storage_structure.go main` to check for breaking storage changes.
+- **PR titles must follow [Conventional Commits](https://www.conventionalcommits.org/) / semantic-release format** (`feat:`, `fix:`, `chore:`, `docs:`, `refactor:`, `test:`, `ci:`, `perf:`, `BREAKING CHANGE:` footer for majors). Squash merges use the PR title as the commit subject, and `.github/workflows/release-on-pr-close.yml` runs `go-semantic-release` against it to compute the next version. A non-conforming title (e.g. `release: staging → main ...`) yields no version bump and no release. For staging→main release PRs, pick the highest-impact prefix across the bundled changes (e.g. `feat:` if any feature is included).
 
 ## Development Guidelines
 

From 645a700b769d151f42de261f01abc3ed8f13ddaa Mon Sep 17 00:00:00 2001
From: Will Zimmerman <will@avaprotocol.org>
Date: Wed, 8 Apr 2026 18:11:07 -0700
Subject: [PATCH 2/4] fix: preserve Loop partial-success results on
 per-iteration failure

Loop runner was discarding its partial-success results array on the
error return path: any per-iteration runner failure caused the step to
return a non-nil error, which the response builder treats as "drop
OutputData", so the SDK saw data: null instead of [null, {...}].

Distinguish infrastructure failures (queue submit errors, iteration
timeouts) from per-iteration runner failures. Only the former fail the
loop step; the latter leave results[i] = nil as a partial-success
placeholder and the loop returns success with the array preserved.

Fixes #511.
---
 core/taskengine/manual_trigger_loop_test.go |  8 +++++---
 core/taskengine/vm.go                       | 22 +++++++++++++++++----
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/core/taskengine/manual_trigger_loop_test.go b/core/taskengine/manual_trigger_loop_test.go
index 34fcc18a..3b7fec4f 100644
--- a/core/taskengine/manual_trigger_loop_test.go
+++ b/core/taskengine/manual_trigger_loop_test.go
@@ -427,10 +427,12 @@ func TestLoopNode_ContractWrite_InvalidAddress_PartialFailure(t *testing.T) {
 
 	step, err := vm.RunNodeWithInputs(node, inputVariables)
 
-	// The loop should report failure because the second iteration failed
+	// Per AvaProtocol/EigenLayer-AVS#511, per-iteration runner failures do not
+	// fail the loop step. The loop ran to completion, so step.Success is true
+	// and the per-iteration outcomes are reflected in the data array (nil for
+	// failed iterations).
 	require.NotNil(t, step, "Execution step should not be nil")
-	assert.False(t, step.Success, "Loop should report failure when an iteration fails")
-	assert.Contains(t, step.Error, "invalid address", "Error should mention invalid address")
+	assert.True(t, step.Success, "Loop step should succeed when it ran to completion, even if an iteration failed")
 
 	// The output should still contain results (first iteration data, second nil)
 	loopOutput := step.GetLoop()
diff --git a/core/taskengine/vm.go b/core/taskengine/vm.go
index 8c9d04c4..851395da 100644
--- a/core/taskengine/vm.go
+++ b/core/taskengine/vm.go
@@ -4653,6 +4653,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 	results := make([]interface{}, len(inputArray))
 	iterationSteps := make([]*avsproto.Execution_Step, 0, len(inputArray)) // Collect iteration steps for gas aggregation
 	success := true
+	infraFailure := false
 	var firstError error
 
 	if concurrent {
@@ -4701,6 +4702,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 				if err := eq.Submit(task); err != nil {
 					log.WriteString(fmt.Sprintf("\nError submitting iteration %d: %s", iterationIndex, err.Error()))
 					success = false
+					infraFailure = true
 					if firstError == nil {
 						firstError = err
 					}
@@ -4713,11 +4715,13 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 			select {
 			case result := <-resultChannel:
 				if result.Error != nil {
-					success = false
 					if firstError == nil {
 						firstError = result.Error
 					}
 					log.WriteString(fmt.Sprintf("\nError in iteration %d: %s", i, result.Error.Error()))
+					// Per-iteration runner failure: leave results[i] = nil as a
+					// partial-success placeholder. Do NOT mark the loop as failed —
+					// the per-iteration outcome is reflected in the data array.
 				} else {
 					results[i] = result.Data
 				}
@@ -4726,6 +4730,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 				}
 			case <-time.After(iterationTimeout):
 				success = false
+				infraFailure = true
 				err := fmt.Errorf("iteration %d timed out after %s", i, iterationTimeout)
 				if firstError == nil {
 					firstError = err
@@ -4761,6 +4766,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 
 			if err := eq.Submit(task); err != nil {
 				success = false
+				infraFailure = true
 				if firstError == nil {
 					firstError = err
 				}
@@ -4772,11 +4778,12 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 			select {
 			case result := <-resultChannel:
 				if result.Error != nil {
-					success = false
 					if firstError == nil {
 						firstError = result.Error
 					}
 					log.WriteString(fmt.Sprintf("\nError in iteration %d: %s", i, result.Error.Error()))
+					// Per-iteration runner failure: leave results[i] = nil as a
+					// partial-success placeholder. Do NOT mark the loop as failed.
 				} else {
 					results[i] = result.Data
 				}
@@ -4785,6 +4792,7 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 				}
 			case <-time.After(iterationTimeout):
 				success = false
+				infraFailure = true
 				err := fmt.Errorf("iteration %d timed out after %s", i, iterationTimeout)
 				if firstError == nil {
 					firstError = err
@@ -4831,12 +4839,18 @@ func (v *VM) executeLoopWithQueue(stepID string, taskNode *avsproto.TaskNode, no
 		Loop: loopOutput,
 	}
 
-	if !success && firstError != nil {
+	// Only treat infrastructure failures (queue submit errors, iteration timeouts)
+	// as a hard step failure. Per-iteration runner errors (e.g. a contract call
+	// reverting in one iteration of a Loop > ContractRead) are reflected as nil
+	// entries in the results array — the loop ran to completion, so we preserve
+	// OutputData and return success so the client can inspect partial results.
+	// See AvaProtocol/EigenLayer-AVS#511.
+	if infraFailure && firstError != nil {
 		finalizeStep(s, false, nil, firstError.Error(), log.String())
 		return s, firstError
 	}
 
-	finalizeStep(s, true, nil, "", log.String())
+	finalizeStep(s, success, nil, "", log.String())
 	return s, nil
 }
 

From 2cb9f43804d09c5ee60556c5f551a38b95d2fa6b Mon Sep 17 00:00:00 2001
From: Will Zimmerman <will@avaprotocol.org>
Date: Wed, 8 Apr 2026 18:11:24 -0700
Subject: [PATCH 3/4] fix: rewind paymaster nonce cache when bundler mempool
 has no predecessor (#510)

When on_chain_nonce < userOp.Nonce and the bundler returned AA25, the
conflict handler always assumed a predecessor UserOp would eventually
mine. If the predecessor had been dropped from the bundler mempool, the
cache stayed stale, the rebuild loop kept reusing the same nonce, and
every subsequent paymaster-sponsored UserOp for that wallet failed until
the aggregator was restarted.

Now inspect the bundler mempool: if no pending UserOp exists in
[on_chain, userOp.Nonce, reset the nonce cache and rewind to the
on-chain nonce so the rebuild self-heals.

EOF
)
---
 pkg/erc4337/preset/builder.go | 61 ++++++++++++++++++++++++++++-------
 1 file changed, 50 insertions(+), 11 deletions(-)

diff --git a/pkg/erc4337/preset/builder.go b/pkg/erc4337/preset/builder.go
index 19710943..131e4afa 100644
--- a/pkg/erc4337/preset/builder.go
+++ b/pkg/erc4337/preset/builder.go
@@ -1155,17 +1155,56 @@ func sendUserOpCore(
 					// On-chain or cache has advanced past our nonce — use the new value
 					l.Debug("Nonce advanced", "old_nonce", userOp.Nonce.String(), "new_nonce", freshNonce.String())
 				} else if onChainNonce.Cmp(userOp.Nonce) < 0 {
-					// On-chain nonce is behind our attempted nonce. Prior UserOps (at lower
-					// nonces) haven't mined yet, causing the bundler to reject ours with AA25.
-					// The nonce itself is correct — we just need to wait for prior UserOps to
-					// mine. Do NOT increment, as that creates an unfillable nonce gap.
-					freshNonce = new(big.Int).Set(userOp.Nonce)
-					l.Debug("On-chain nonce behind, waiting for prior UserOps to mine before retry",
-						"on_chain_nonce", onChainNonce.String(),
-						"userOp_nonce", userOp.Nonce.String())
-					// Wait briefly for prior UserOps to mine; without this delay
-					// the retry loop burns through attempts in milliseconds.
-					time.Sleep(2 * time.Second)
+					// On-chain nonce is behind our attempted nonce. Two possibilities:
+					//   (a) Prior UserOps at lower nonces are genuinely pending in the bundler
+					//       mempool and just haven't mined yet → wait for them.
+					//   (b) The cache is stale: a previous UserOp was dropped from the bundler
+					//       mempool without the cache being invalidated, so there is nothing
+					//       at on_chain..userOp.Nonce-1 to ever mine. Waiting is futile and
+					//       leads to the stuck loop described in issue #510.
+					// Distinguish the two by inspecting the bundler mempool. If no predecessor
+					// UserOp exists for this sender at a nonce in [on_chain, userOp.Nonce),
+					// then case (b) holds — rewind the cache to the on-chain nonce.
+					predecessorPending := false
+					if pendingOps, mempoolErr := bundlerClient.GetPendingUserOpsForSender(context.Background(), entrypoint, userOp.Sender); mempoolErr == nil {
+						for _, op := range pendingOps {
+							opNonce := new(big.Int)
+							if _, ok := opNonce.SetString(strings.TrimPrefix(op.Nonce, "0x"), 16); !ok {
+								// Fall back to decimal parse
+								if _, ok := opNonce.SetString(op.Nonce, 10); !ok {
+									continue
+								}
+							}
+							if opNonce.Cmp(onChainNonce) >= 0 && opNonce.Cmp(userOp.Nonce) < 0 {
+								predecessorPending = true
+								break
+							}
+						}
+					} else {
+						l.Warn("Failed to inspect bundler mempool while diagnosing nonce conflict; assuming predecessor pending",
+							"error", mempoolErr)
+						predecessorPending = true
+					}
+
+					if !predecessorPending {
+						// Stale cache: nothing in the mempool can ever mine to bridge the gap.
+						// Rewind to the on-chain nonce so the rebuild uses a value the
+						// EntryPoint will accept.
+						l.Warn("Nonce cache desync detected: no predecessor UserOp pending in bundler mempool, rewinding cache to on-chain nonce",
+							"sender", userOp.Sender.Hex(),
+							"on_chain_nonce", onChainNonce.String(),
+							"stale_cached_nonce", userOp.Nonce.String())
+						globalNonceManager.ResetNonce(userOp.Sender)
+						freshNonce = new(big.Int).Set(onChainNonce)
+					} else {
+						freshNonce = new(big.Int).Set(userOp.Nonce)
+						l.Debug("On-chain nonce behind, waiting for prior UserOps to mine before retry",
+							"on_chain_nonce", onChainNonce.String(),
+							"userOp_nonce", userOp.Nonce.String())
+						// Wait briefly for prior UserOps to mine; without this delay
+						// the retry loop burns through attempts in milliseconds.
+						time.Sleep(2 * time.Second)
+					}
 				} else {
 					// GetNextNonce returned the same nonce we already tried and on-chain has
 					// reached this nonce. A UserOp is pending at this nonce in the mempool.

From 27fa04c48b4493858baeec1892bd5a64a9ae9cac Mon Sep 17 00:00:00 2001
From: Will Zimmerman <will@avaprotocol.org>
Date: Wed, 8 Apr 2026 19:11:39 -0700
Subject: [PATCH 4/4] fix: address PR review feedback for nonce desync recovery

- Bound mempool inspection RPC with a 5s timeout so a hung bundler cannot stall the retry loop.
- Drop misleading decimal-fallback nonce parser; bundlers return hex per EIP-4337. Log skipped unparseable nonces at debug level.
- Assert step.Error is empty in the Loop partial-success test so the no-error contract is locked in.
---
 core/taskengine/manual_trigger_loop_test.go |  1 +
 pkg/erc4337/preset/builder.go               | 18 ++++++++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/core/taskengine/manual_trigger_loop_test.go b/core/taskengine/manual_trigger_loop_test.go
index 3b7fec4f..e088ffef 100644
--- a/core/taskengine/manual_trigger_loop_test.go
+++ b/core/taskengine/manual_trigger_loop_test.go
@@ -433,6 +433,7 @@ func TestLoopNode_ContractWrite_InvalidAddress_PartialFailure(t *testing.T) {
 	// failed iterations).
 	require.NotNil(t, step, "Execution step should not be nil")
 	assert.True(t, step.Success, "Loop step should succeed when it ran to completion, even if an iteration failed")
+	assert.Empty(t, step.Error, "Loop step error should be empty when loop ran to completion")
 
 	// The output should still contain results (first iteration data, second nil)
 	loopOutput := step.GetLoop()
diff --git a/pkg/erc4337/preset/builder.go b/pkg/erc4337/preset/builder.go
index 131e4afa..b2ecf107 100644
--- a/pkg/erc4337/preset/builder.go
+++ b/pkg/erc4337/preset/builder.go
@@ -1165,15 +1165,21 @@ func sendUserOpCore(
 					// Distinguish the two by inspecting the bundler mempool. If no predecessor
 					// UserOp exists for this sender at a nonce in [on_chain, userOp.Nonce),
 					// then case (b) holds — rewind the cache to the on-chain nonce.
+					// Bound the mempool RPC so a hung bundler can't stall the retry loop.
+					mempoolCtx, cancelMempool := context.WithTimeout(context.Background(), 5*time.Second)
+					pendingOps, mempoolErr := bundlerClient.GetPendingUserOpsForSender(mempoolCtx, entrypoint, userOp.Sender)
+					cancelMempool()
+
 					predecessorPending := false
-					if pendingOps, mempoolErr := bundlerClient.GetPendingUserOpsForSender(context.Background(), entrypoint, userOp.Sender); mempoolErr == nil {
+					if mempoolErr == nil {
 						for _, op := range pendingOps {
+							// EIP-4337 bundlers return nonces as "0x..." hex strings.
 							opNonce := new(big.Int)
-							if _, ok := opNonce.SetString(strings.TrimPrefix(op.Nonce, "0x"), 16); !ok {
-								// Fall back to decimal parse
-								if _, ok := opNonce.SetString(op.Nonce, 10); !ok {
-									continue
-								}
+							nonceStr := strings.TrimPrefix(op.Nonce, "0x")
+							if _, ok := opNonce.SetString(nonceStr, 16); !ok {
+								l.Debug("Failed to parse pending op nonce, skipping",
+									"nonce", op.Nonce, "sender", userOp.Sender.Hex())
+								continue
 							}
 							if opNonce.Cmp(onChainNonce) >= 0 && opNonce.Cmp(userOp.Nonce) < 0 {
 								predecessorPending = true