From 38145e5c769043d5aceefb8506a31e91491bf051 Mon Sep 17 00:00:00 2001 From: Wei Lin Date: Sat, 11 Apr 2026 01:45:44 -0700 Subject: [PATCH 1/5] docs: add wallet salt index migration runbook with verified Base apply MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track the operational runbook for the /ava backfill-wallet-salt-index migration in version control. Captures the production rollout pattern that worked for Base on ap-prod1 (2026-04-11): - The container's config dir is /app/config (read-only bind from /home/ava/ap-aggregator-setup//config), filename is aggregator.yaml regardless of chain. - The aggregator container's launch command is /ava aggregator with no explicit --config flag, so the runtime picks up the default. The backfill subcommand does NOT inherit that default and requires --config /app/config/aggregator.yaml passed explicitly. - Use docker run --rm --volumes-from aggregator-base to inherit the stopped container's mounts without needing to know host paths. Includes verified Base numbers (702 rows / 665 stale / 34 canonical / 3 negative-salt skipped), notes on the Skipped — RPC derive error counter actually catching legacy negative-salt rows from a past bug, and a chain-by-chain table with the current migration status. --- .github/workflows/migration-instructions.md | 260 ++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 .github/workflows/migration-instructions.md diff --git a/.github/workflows/migration-instructions.md b/.github/workflows/migration-instructions.md new file mode 100644 index 00000000..20b9546b --- /dev/null +++ b/.github/workflows/migration-instructions.md @@ -0,0 +1,260 @@ +# Backfill Wallet Salt Index — Production Runbook + +One-off migration to populate the `wsalt:::` secondary +index and flip stale wallet rows from upgraded factories. Required after +deploying the fix from PR #520. + +## Prerequisites + +- The new aggregator image (with `/ava backfill-wallet-salt-index` baked in) + must already be running on the production server. Watchtower normally rolls + it out once a release tag lands on `latest`. +- SSH access to the target server (`ap-prod1` for Base/Ethereum, + `ap-staging1` for Sepolia/Base-Sepolia). +- A short maintenance window per chain — the aggregator must be stopped + during the migration because BadgerDB is single-writer. + +## Why this isn't a one-liner + +You can't `docker exec aggregator-base /ava backfill-wallet-salt-index ...` +against the running container — `aggregator-base` already holds the +BadgerDB lock and the second `/ava` process would fail with +`Cannot acquire directory lock`. So the dance is: + +1. Stop the aggregator (releases the lock) +2. Run a one-shot container that **inherits the stopped container's volume + mounts** via `--volumes-from`, so it sees the same config and the same + BadgerDB +3. Restart the aggregator + +`--volumes-from` borrows the mounts but **not** the image — the one-shot +runs whatever image you pass in `docker run`, so as long as +`avaprotocol/ap-avs:latest` resolves to the new image, the backfill +subcommand is available. + +## Step-by-step (Base, on `ap-prod1`) + +```bash +ssh ap-prod1 +``` + +### 1. Find the config filename and inspect mounts + +The container's actual mounts and config path differ per chain. Always +verify before running the migration. + +```bash +docker inspect aggregator-base --format '{{json .Mounts}}' | jq +``` + +Verified output for Base on `ap-prod1` (2026-04-11): + +```json +[ + { + "Type": "bind", + "Source": "/home/ava/ap-aggregator-setup/base/config", + "Destination": "/app/config", + "Mode": "ro" + }, + { + "Type": "bind", + "Source": "/home/ava/ap-aggregator-setup/base/backup", + "Destination": "/tmp/ap-avs-backup", + "Mode": "rw" + }, + { + "Type": "bind", + "Source": "/data/base-avs-aggregator", + "Destination": "/tmp/ap-avs", + "Mode": "rw" + } +] +``` + +The config directory is `/app/config` (read-only), so the YAML lives +inside `/home/ava/ap-aggregator-setup//config/` on the host. Find +the actual filename: + +```bash +ls /home/ava/ap-aggregator-setup/base/config/ +# aggregator.yaml bundler.env +``` + +Then double-check the running aggregator's launch command: + +```bash +docker inspect aggregator-base --format '{{.Config.Cmd}}' +# [aggregator] +``` + +The container runs `/ava aggregator` with **no explicit `--config`**, so +it picks up the default `./config/aggregator.yaml` (the WORKDIR is +`/app`, which resolves to `/app/config/aggregator.yaml`). Use that same +path explicitly when invoking the backfill subcommand below — the +backfill command does NOT inherit the same default and requires an +explicit `--config` flag. + +### 2. Snapshot the BadgerDB volume (cheap insurance) + +Restic backs up `/tmp/ap-avs/db` every 4h, but a fresh snapshot before +mutation is free peace of mind. From inside the existing backup +container: + +```bash +docker exec backup restic backup /tmp/ap-avs/db --tag pre-wallet-salt-backfill +``` + +Or if you'd rather just verify a recent snapshot exists: + +```bash +docker exec backup restic snapshots --tag base | tail -5 +``` + +### 3. Stop the aggregator + +```bash +docker stop aggregator-base +``` + +This releases the BadgerDB lock. Watchtower won't try to restart it +unless a new image tag lands on `latest`, so a sub-minute window is +safe to run unsupervised. + +### 4. Dry-run the backfill + +```bash +docker run --rm --volumes-from aggregator-base avaprotocol/ap-avs:latest \ + backfill-wallet-salt-index --config /app/config/aggregator.yaml --dry-run +``` + +**Inspect the printed summary carefully.** You're looking at: + +- `Total wallet rows scanned` — sanity check this against your expected + population for the chain. +- `Newly marked stale` — for Base, this should be substantial (the + bug fix landing). For Sepolia and Ethereum it'll likely be zero + (factory implementation hasn't been rotated on those chains). +- `Skipped — RPC derive error` — must be 0 or very low. The label is + slightly misleading: this counter also includes wallets with + **negative salts** that the factory's `getAddress(uint256)` ABI + can't accept (`abi: negatively-signed value cannot be packed into + uint parameter`). Those are legacy bug-induced rows that the + migration safely skips. Anything else (real RPC failures or + unexpected ABI errors) deserves investigation before applying. +- `Skipped — missing factory` — legacy rows from before `Factory` was + persisted on the model. Safe to leave alone; new GetWallet calls will + populate them. + +**Verified Base apply (2026-04-11):** + +``` +Total wallet rows scanned: 702 + Canonical (live derive matches): 34 + Stale (live derive differs): 665 + Skipped — RPC derive error: 3 (all were salt=-12 negative-salt rows) +``` + +That ~95% stale ratio is normal for Base and reflects how many times +the Base factory's `accountImplementation` has rotated since the +aggregator started persisting wallet rows. + +### 5. Apply for real + +If the dry-run looks reasonable: + +```bash +docker run --rm --volumes-from aggregator-base avaprotocol/ap-avs:latest \ + backfill-wallet-salt-index --config /app/config/aggregator.yaml +``` + +The migration is **fail-fast**: if any individual `db.Set` or +`MarkWalletStale` operation fails, the whole run aborts with a non-zero +exit code. The migration is idempotent, so you can fix the underlying +issue and re-run from scratch. + +### 6. Restart the aggregator + +```bash +docker start aggregator-base +docker logs --tail 30 aggregator-base +``` + +Confirm it comes up cleanly (look for `Engine started successfully`). + +### 7. Sanity check via the SDK from your laptop + +For Base, the previously affected EOA is +`0xc60e71bd0f2e6d8832Fea1a2d56091C48493C788`. Run: + +```bash +yarn start --avs-target base getWallets 0xc60e71bd0f2e6d8832Fea1a2d56091C48493C788 +``` + +You should see exactly **one** `salt: '0'` entry — the canonical one for +today's factory implementation. For the bug-report EOA above, that's +`0x5d814Cc9E94B2656f59Ee439D44AA1b6ca21434f`. The previous era-1 and +era-2 derivations are gone from the response entirely (filtered by the +`StaleDerivation` guard in `ListWallets`). + +Note: any wallets across **other** salts (`1`, `10`, `1000`, etc.) that +were already `isHidden: true` in the pre-migration response will also +disappear from the response after the migration runs. They were stale +in the same way the salt-0 zombies were — the migration flagged them +all and the new `ListWallets` hard filter drops them. The on-chain +addresses are still in BadgerDB; they're just hidden from the gRPC +response. Recover via direct `socat` REPL access if anything funded +those addresses. + +## Other chains + +Same procedure, different container/host paths. Always run step 1 first +to verify, but for reference: + +| Chain | Server | Container | Verified host config dir | Notes | +|---|---|---|---|---| +| Base | `ap-prod1` | `aggregator-base` | `/home/ava/ap-aggregator-setup/base/config` | ✅ Migrated 2026-04-11. 702 rows, 665 stale, 34 canonical, 3 negative-salt skipped. | +| Ethereum | `ap-prod1` | `aggregator-ethereum` | `/home/ava/ap-aggregator-setup/ethereum/config` (likely) | TBD — verify with `docker inspect`. Factory likely hasn't been rotated, expect ~0 stale. | +| Sepolia | `ap-staging1` | `aggregator-sepolia` | `/home/ava/ap-aggregator-setup/sepolia/config` (likely) | Dry-run on 2026-04-10 showed 220 rows, 0 stale. | +| Base-Sepolia | `ap-staging1` | `aggregator-base-sepolia` | `/home/ava/ap-aggregator-setup/base-sepolia/config` (likely) | TBD. | + +For non-Base chains the migration will likely show zero stale rows +(factory implementation hasn't been rotated), but it's still worth +running so the secondary index is populated for any future +implementation rotation. + +The config filename inside the container is **always** +`/app/config/aggregator.yaml` regardless of chain — the deployment +template uses one canonical filename per host directory, not chain-named +filenames. (Don't trust the chain in the filename, trust `docker +inspect`.) + +## If something goes wrong + +- **Migration aborts mid-run with a write error**: storage hit a transient + issue. The migration is idempotent — fix the underlying cause and + re-run from step 4. Already-written index entries are correct; + re-running just re-validates them. +- **Aggregator won't restart after migration**: check `docker logs + aggregator-base` for the error. The migration only writes new keys + (`wsalt:` prefix) and flips flags on existing wallet records — it + doesn't touch tasks, executions, or any other state. If startup fails, + it's almost certainly unrelated to the migration; restore from the + pre-migration restic snapshot if you need to roll back. +- **getWallets still shows zombie rows after migration**: check the + migration's printed summary for `Newly marked stale` count and + cross-reference against the SDK output. Make sure you bounced the + aggregator container (step 6) — it caches some wallet state in + memory and needs a restart to pick up the marked-stale rows. + +## Background + +See PR #520 for the full bug report. TL;DR: wallet records are keyed by +their derived address (`w::
`), so when a factory's +account implementation is upgraded the same `(owner, factory, salt)` +triple starts deriving to a new address — and because the lookup is +keyed by address, the old row never gets touched. Multiple records +accumulate for the same logical (owner, factory, salt) slot, all +reported as `salt: "0"` in `ListWallets`. The fix adds a +`wsalt:::` secondary index and a backfill +migration to flag the zombies. From 95baaa916c34d7b95d49d7e094471ea54200b1e7 Mon Sep 17 00:00:00 2001 From: Wei Lin Date: Sat, 11 Apr 2026 01:53:11 -0700 Subject: [PATCH 2/5] docs: move wallet salt index runbook to avs-infra repo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The migration runbook is operational documentation that lives more naturally in the avs-infra repo (which holds Terraform configs, deploy scripts, and other ops-flavored docs like Chain_Endpoint_Key_Rotation.md and EIP-7702-Smart-Wallet-Migration.md) than in the application repo. It also was never actually tracked under .github/workflows/ correctly — that path is for GitHub Actions workflow YAML files, not standalone markdown runbooks. Moved to: avs-infra/Wallet_Salt_Index_Migration.md --- .github/workflows/migration-instructions.md | 260 -------------------- 1 file changed, 260 deletions(-) delete mode 100644 .github/workflows/migration-instructions.md diff --git a/.github/workflows/migration-instructions.md b/.github/workflows/migration-instructions.md deleted file mode 100644 index 20b9546b..00000000 --- a/.github/workflows/migration-instructions.md +++ /dev/null @@ -1,260 +0,0 @@ -# Backfill Wallet Salt Index — Production Runbook - -One-off migration to populate the `wsalt:::` secondary -index and flip stale wallet rows from upgraded factories. Required after -deploying the fix from PR #520. - -## Prerequisites - -- The new aggregator image (with `/ava backfill-wallet-salt-index` baked in) - must already be running on the production server. Watchtower normally rolls - it out once a release tag lands on `latest`. -- SSH access to the target server (`ap-prod1` for Base/Ethereum, - `ap-staging1` for Sepolia/Base-Sepolia). -- A short maintenance window per chain — the aggregator must be stopped - during the migration because BadgerDB is single-writer. - -## Why this isn't a one-liner - -You can't `docker exec aggregator-base /ava backfill-wallet-salt-index ...` -against the running container — `aggregator-base` already holds the -BadgerDB lock and the second `/ava` process would fail with -`Cannot acquire directory lock`. So the dance is: - -1. Stop the aggregator (releases the lock) -2. Run a one-shot container that **inherits the stopped container's volume - mounts** via `--volumes-from`, so it sees the same config and the same - BadgerDB -3. Restart the aggregator - -`--volumes-from` borrows the mounts but **not** the image — the one-shot -runs whatever image you pass in `docker run`, so as long as -`avaprotocol/ap-avs:latest` resolves to the new image, the backfill -subcommand is available. - -## Step-by-step (Base, on `ap-prod1`) - -```bash -ssh ap-prod1 -``` - -### 1. Find the config filename and inspect mounts - -The container's actual mounts and config path differ per chain. Always -verify before running the migration. - -```bash -docker inspect aggregator-base --format '{{json .Mounts}}' | jq -``` - -Verified output for Base on `ap-prod1` (2026-04-11): - -```json -[ - { - "Type": "bind", - "Source": "/home/ava/ap-aggregator-setup/base/config", - "Destination": "/app/config", - "Mode": "ro" - }, - { - "Type": "bind", - "Source": "/home/ava/ap-aggregator-setup/base/backup", - "Destination": "/tmp/ap-avs-backup", - "Mode": "rw" - }, - { - "Type": "bind", - "Source": "/data/base-avs-aggregator", - "Destination": "/tmp/ap-avs", - "Mode": "rw" - } -] -``` - -The config directory is `/app/config` (read-only), so the YAML lives -inside `/home/ava/ap-aggregator-setup//config/` on the host. Find -the actual filename: - -```bash -ls /home/ava/ap-aggregator-setup/base/config/ -# aggregator.yaml bundler.env -``` - -Then double-check the running aggregator's launch command: - -```bash -docker inspect aggregator-base --format '{{.Config.Cmd}}' -# [aggregator] -``` - -The container runs `/ava aggregator` with **no explicit `--config`**, so -it picks up the default `./config/aggregator.yaml` (the WORKDIR is -`/app`, which resolves to `/app/config/aggregator.yaml`). Use that same -path explicitly when invoking the backfill subcommand below — the -backfill command does NOT inherit the same default and requires an -explicit `--config` flag. - -### 2. Snapshot the BadgerDB volume (cheap insurance) - -Restic backs up `/tmp/ap-avs/db` every 4h, but a fresh snapshot before -mutation is free peace of mind. From inside the existing backup -container: - -```bash -docker exec backup restic backup /tmp/ap-avs/db --tag pre-wallet-salt-backfill -``` - -Or if you'd rather just verify a recent snapshot exists: - -```bash -docker exec backup restic snapshots --tag base | tail -5 -``` - -### 3. Stop the aggregator - -```bash -docker stop aggregator-base -``` - -This releases the BadgerDB lock. Watchtower won't try to restart it -unless a new image tag lands on `latest`, so a sub-minute window is -safe to run unsupervised. - -### 4. Dry-run the backfill - -```bash -docker run --rm --volumes-from aggregator-base avaprotocol/ap-avs:latest \ - backfill-wallet-salt-index --config /app/config/aggregator.yaml --dry-run -``` - -**Inspect the printed summary carefully.** You're looking at: - -- `Total wallet rows scanned` — sanity check this against your expected - population for the chain. -- `Newly marked stale` — for Base, this should be substantial (the - bug fix landing). For Sepolia and Ethereum it'll likely be zero - (factory implementation hasn't been rotated on those chains). -- `Skipped — RPC derive error` — must be 0 or very low. The label is - slightly misleading: this counter also includes wallets with - **negative salts** that the factory's `getAddress(uint256)` ABI - can't accept (`abi: negatively-signed value cannot be packed into - uint parameter`). Those are legacy bug-induced rows that the - migration safely skips. Anything else (real RPC failures or - unexpected ABI errors) deserves investigation before applying. -- `Skipped — missing factory` — legacy rows from before `Factory` was - persisted on the model. Safe to leave alone; new GetWallet calls will - populate them. - -**Verified Base apply (2026-04-11):** - -``` -Total wallet rows scanned: 702 - Canonical (live derive matches): 34 - Stale (live derive differs): 665 - Skipped — RPC derive error: 3 (all were salt=-12 negative-salt rows) -``` - -That ~95% stale ratio is normal for Base and reflects how many times -the Base factory's `accountImplementation` has rotated since the -aggregator started persisting wallet rows. - -### 5. Apply for real - -If the dry-run looks reasonable: - -```bash -docker run --rm --volumes-from aggregator-base avaprotocol/ap-avs:latest \ - backfill-wallet-salt-index --config /app/config/aggregator.yaml -``` - -The migration is **fail-fast**: if any individual `db.Set` or -`MarkWalletStale` operation fails, the whole run aborts with a non-zero -exit code. The migration is idempotent, so you can fix the underlying -issue and re-run from scratch. - -### 6. Restart the aggregator - -```bash -docker start aggregator-base -docker logs --tail 30 aggregator-base -``` - -Confirm it comes up cleanly (look for `Engine started successfully`). - -### 7. Sanity check via the SDK from your laptop - -For Base, the previously affected EOA is -`0xc60e71bd0f2e6d8832Fea1a2d56091C48493C788`. Run: - -```bash -yarn start --avs-target base getWallets 0xc60e71bd0f2e6d8832Fea1a2d56091C48493C788 -``` - -You should see exactly **one** `salt: '0'` entry — the canonical one for -today's factory implementation. For the bug-report EOA above, that's -`0x5d814Cc9E94B2656f59Ee439D44AA1b6ca21434f`. The previous era-1 and -era-2 derivations are gone from the response entirely (filtered by the -`StaleDerivation` guard in `ListWallets`). - -Note: any wallets across **other** salts (`1`, `10`, `1000`, etc.) that -were already `isHidden: true` in the pre-migration response will also -disappear from the response after the migration runs. They were stale -in the same way the salt-0 zombies were — the migration flagged them -all and the new `ListWallets` hard filter drops them. The on-chain -addresses are still in BadgerDB; they're just hidden from the gRPC -response. Recover via direct `socat` REPL access if anything funded -those addresses. - -## Other chains - -Same procedure, different container/host paths. Always run step 1 first -to verify, but for reference: - -| Chain | Server | Container | Verified host config dir | Notes | -|---|---|---|---|---| -| Base | `ap-prod1` | `aggregator-base` | `/home/ava/ap-aggregator-setup/base/config` | ✅ Migrated 2026-04-11. 702 rows, 665 stale, 34 canonical, 3 negative-salt skipped. | -| Ethereum | `ap-prod1` | `aggregator-ethereum` | `/home/ava/ap-aggregator-setup/ethereum/config` (likely) | TBD — verify with `docker inspect`. Factory likely hasn't been rotated, expect ~0 stale. | -| Sepolia | `ap-staging1` | `aggregator-sepolia` | `/home/ava/ap-aggregator-setup/sepolia/config` (likely) | Dry-run on 2026-04-10 showed 220 rows, 0 stale. | -| Base-Sepolia | `ap-staging1` | `aggregator-base-sepolia` | `/home/ava/ap-aggregator-setup/base-sepolia/config` (likely) | TBD. | - -For non-Base chains the migration will likely show zero stale rows -(factory implementation hasn't been rotated), but it's still worth -running so the secondary index is populated for any future -implementation rotation. - -The config filename inside the container is **always** -`/app/config/aggregator.yaml` regardless of chain — the deployment -template uses one canonical filename per host directory, not chain-named -filenames. (Don't trust the chain in the filename, trust `docker -inspect`.) - -## If something goes wrong - -- **Migration aborts mid-run with a write error**: storage hit a transient - issue. The migration is idempotent — fix the underlying cause and - re-run from step 4. Already-written index entries are correct; - re-running just re-validates them. -- **Aggregator won't restart after migration**: check `docker logs - aggregator-base` for the error. The migration only writes new keys - (`wsalt:` prefix) and flips flags on existing wallet records — it - doesn't touch tasks, executions, or any other state. If startup fails, - it's almost certainly unrelated to the migration; restore from the - pre-migration restic snapshot if you need to roll back. -- **getWallets still shows zombie rows after migration**: check the - migration's printed summary for `Newly marked stale` count and - cross-reference against the SDK output. Make sure you bounced the - aggregator container (step 6) — it caches some wallet state in - memory and needs a restart to pick up the marked-stale rows. - -## Background - -See PR #520 for the full bug report. TL;DR: wallet records are keyed by -their derived address (`w::
`), so when a factory's -account implementation is upgraded the same `(owner, factory, salt)` -triple starts deriving to a new address — and because the lookup is -keyed by address, the old row never gets touched. Multiple records -accumulate for the same logical (owner, factory, salt) slot, all -reported as `salt: "0"` in `ListWallets`. The fix adds a -`wsalt:::` secondary index and a backfill -migration to flag the zombies. From 2c60d1adc4b5aca376217c1e89a0c0dbf3fe3679 Mon Sep 17 00:00:00 2001 From: Wei Lin Date: Sat, 11 Apr 2026 02:30:16 -0700 Subject: [PATCH 3/5] fix: refuse zero-address subject in API key auth flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defense-in-depth against the dummy-target-address bypass we hit while trying to recover funds from a stale Base smart wallet. The two-step auth flow lets a caller request a JWT with sub = whatever wallet address they passed to GetSignatureFormat, and a buggy SDK was passing 0x0000000000000000000000000000000000000000 as a dummy. The IsHexAddress check at verifyAuth's last-line defense passed (zero IS a valid hex address), the user's identity silently degraded to the zero address, and every w:: ownership lookup failed with a confusing "smart wallet not found for owner 0x0000..." error. The SDK side has been fixed in ava-sdk-js commit 2dc8027 (which refuses the API-key path when targetAddress is missing). This commit adds the matching server-side defense at all three points where the zero address could enter the auth flow: 1. GetSignatureFormat — the earliest point. Refuses to issue a signature template with the zero address as Wallet:. 2. GetKey — the mint point. Refuses to mint a JWT bound to the zero address even if a caller manages to construct a valid request. 3. verifyAuth — the last-line defense. Refuses any inbound JWT whose sub claim is the zero address, regardless of how it was minted. Three unit tests cover each defense layer. --- aggregator/auth.go | 29 +++++++++- aggregator/auth_test.go | 124 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+), 1 deletion(-) diff --git a/aggregator/auth.go b/aggregator/auth.go index ddba8f98..cb77d8b8 100644 --- a/aggregator/auth.go +++ b/aggregator/auth.go @@ -122,6 +122,13 @@ func (r *RpcServer) GetKey(ctx context.Context, payload *avsproto.GetKeyReq) (*a return nil, status.Errorf(codes.InvalidArgument, "Invalid wallet address format") } ownerAddress := common.HexToAddress(walletStr) + // Refuse to mint a token bound to the zero address. The two-step + // auth flow lets a caller request a JWT with sub = walletStr, so we + // must reject 0x0…0 here as well as in verifyAuth — defense-in-depth + // against the dummy-target-address bypass. + if ownerAddress == (common.Address{}) { + return nil, status.Errorf(codes.InvalidArgument, "Wallet address cannot be the zero address") + } if strings.Contains(payload.Signature, ".") { // API key directly @@ -256,8 +263,21 @@ func (r *RpcServer) verifyAuth(ctx context.Context) (*model.User, error) { return nil, fmt.Errorf("%s: subject must be a valid EOA address", auth.InvalidAuthenticationKey) } + // Defense-in-depth: refuse the zero address. common.IsHexAddress + // returns true for "0x0000…0000", and a buggy SDK that minted a + // JWT with the zero-address subject (e.g. via the + // dummy-target-address bypass) would otherwise pass validation + // and silently fail every w:: lookup downstream. + // The zero address is never a real EOA — reject it explicitly. + subjectAddr := common.HexToAddress(subject) + if subjectAddr == (common.Address{}) { + r.config.Logger.Error("API key has zero-address subject; refusing authentication", + "subject", subject) + return nil, fmt.Errorf("%s: subject cannot be the zero address", auth.InvalidAuthenticationKey) + } + user := model.User{ - Address: common.HexToAddress(subject), + Address: subjectAddr, } // caching to reduce hitting eth rpc node @@ -312,6 +332,13 @@ func (r *RpcServer) GetSignatureFormat(ctx context.Context, req *avsproto.GetSig if !common.IsHexAddress(walletAddress) { return nil, status.Errorf(codes.InvalidArgument, "Invalid Ethereum wallet address format") } + // Refuse to issue a signature template for the zero address. This is + // the earliest point in the auth flow where we can stop a caller + // (typically a buggy SDK) from accidentally requesting a JWT bound + // to 0x0…0. See verifyAuth for the matching last-line defense. + if common.HexToAddress(walletAddress) == (common.Address{}) { + return nil, status.Errorf(codes.InvalidArgument, "Wallet address cannot be the zero address") + } // Use smart wallet chain ID (r.chainID) instead of global EigenLayer chain ID // This ensures authentication uses the correct chain for smart wallet operations diff --git a/aggregator/auth_test.go b/aggregator/auth_test.go index ff17c243..51f46467 100644 --- a/aggregator/auth_test.go +++ b/aggregator/auth_test.go @@ -22,6 +22,7 @@ import ( "github.com/ethereum/go-ethereum/ethclient" "github.com/golang-jwt/jwt/v5" "google.golang.org/grpc/codes" + grpcmetadata "google.golang.org/grpc/metadata" "google.golang.org/grpc/status" ) @@ -308,3 +309,126 @@ func TestGetSignatureFormat(t *testing.T) { t.Errorf("expected message to contain %s but got %s", expectedChainIDStr, message) } } + +// Defense-in-depth tests: confirm the auth flow refuses the zero +// address at all three points where it could otherwise sneak in +// (GetSignatureFormat → GetKey → verifyAuth). Without these checks +// a buggy SDK can request a JWT bound to 0x0…0 and silently fail every +// w:: ownership lookup downstream — see PR #520 fallout. + +func TestGetSignatureFormat_RejectsZeroAddress(t *testing.T) { + logger, _ := sdklogging.NewZapLogger("development") + + r := RpcServer{ + config: &config.Config{ + JwtSecret: []byte("test123"), + Logger: logger, + }, + chainID: big.NewInt(11155111), + } + + req := &avsproto.GetSignatureFormatReq{ + Wallet: "0x0000000000000000000000000000000000000000", + } + + _, err := r.GetSignatureFormat(context.Background(), req) + if err == nil { + t.Fatalf("expected GetSignatureFormat to reject zero address but it succeeded") + } + statusErr, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got: %v", err) + } + if statusErr.Code() != codes.InvalidArgument { + t.Errorf("expected InvalidArgument, got: %v", statusErr.Code()) + } + if !strings.Contains(statusErr.Message(), "zero address") { + t.Errorf("expected error message to mention 'zero address', got: %q", statusErr.Message()) + } +} + +func TestGetKey_RejectsZeroAddressWallet(t *testing.T) { + logger, _ := sdklogging.NewZapLogger("development") + + r := RpcServer{ + config: &config.Config{ + JwtSecret: []byte("test123"), + Logger: logger, + }, + chainID: big.NewInt(11155111), + } + + chainID := int64(11155111) + issuedTs, _ := time.Parse(time.RFC3339, "2025-01-01T00:00:00Z") + expiredTs, _ := time.Parse(time.RFC3339, "2030-01-01T00:00:00Z") + + // Build a valid-looking message but with the zero address as the + // wallet. We don't even need to sign it correctly — the zero-address + // check fires before the signature verification. + message := fmt.Sprintf(authTemplate, + chainID, + "1", + issuedTs.UTC().Format("2006-01-02T15:04:05.000Z"), + expiredTs.UTC().Format("2006-01-02T15:04:05.000Z"), + "0x0000000000000000000000000000000000000000") + + payload := &avsproto.GetKeyReq{ + Message: message, + Signature: "0xdeadbeef", // doesn't matter, never reached + } + + _, err := r.GetKey(context.Background(), payload) + if err == nil { + t.Fatalf("expected GetKey to reject zero address wallet but it succeeded") + } + statusErr, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got: %v", err) + } + if statusErr.Code() != codes.InvalidArgument { + t.Errorf("expected InvalidArgument, got: %v", statusErr.Code()) + } + if !strings.Contains(statusErr.Message(), "zero address") { + t.Errorf("expected error message to mention 'zero address', got: %q", statusErr.Message()) + } +} + +func TestVerifyAuth_RejectsZeroAddressSubject(t *testing.T) { + logger, _ := sdklogging.NewZapLogger("development") + + r := RpcServer{ + config: &config.Config{ + JwtSecret: []byte("test123"), + Logger: logger, + }, + chainID: big.NewInt(11155111), + } + + // Manually mint a JWT with sub = 0x0…0 and the correct audience. + // This simulates the bypass we observed: the SDK had requested a + // re-minted token via the dummy-zero-address path, and an older + // build of the server happily accepted it. + claims := &jwt.RegisteredClaims{ + ExpiresAt: jwt.NewNumericDate(time.Now().Add(time.Hour)), + Issuer: auth.Issuer, + Subject: "0x0000000000000000000000000000000000000000", + Audience: jwt.ClaimStrings{"11155111"}, + } + token := jwt.NewWithClaims(jwt.SigningMethodHS256, claims) + signedToken, signErr := token.SignedString(r.config.JwtSecret) + if signErr != nil { + t.Fatalf("failed to sign test token: %v", signErr) + } + + // Build a metadata context the way grpc would for an inbound request. + md := grpcmetadata.New(map[string]string{"authkey": signedToken}) + ctx := grpcmetadata.NewIncomingContext(context.Background(), md) + + _, err := r.verifyAuth(ctx) + if err == nil { + t.Fatalf("expected verifyAuth to reject zero-address subject but it succeeded") + } + if !strings.Contains(err.Error(), "zero address") { + t.Errorf("expected error to mention 'zero address', got: %q", err.Error()) + } +} From aa6b3ddfd5c3113f4d36d636addedb1625a2beac Mon Sep 17 00:00:00 2001 From: Chris Li Date: Mon, 13 Apr 2026 15:52:05 -0700 Subject: [PATCH 4/5] refactor: replace PARTIAL_SUCCESS with clear SUCCESS/FAILED/ERROR execution status (#521) Co-authored-by: Wei Lin Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- core/taskengine/engine.go | 21 +---- core/taskengine/executor.go | 21 +---- core/taskengine/executor_test.go | 14 +-- core/taskengine/partial_success_test.go | 54 +++++------ core/taskengine/vm.go | 81 ++++------------ core/taskengine/vm_runner_rest.go | 21 ++--- core/taskengine/vm_scheduler_fix_test.go | 12 +-- .../changes/0001-execution-status-redesign.md | 92 +++++++++++++++++++ .../preset/builder_execution_success_test.go | 50 +++++----- protobuf/avs.pb.go | 36 +++++--- protobuf/avs.proto | 12 ++- 11 files changed, 220 insertions(+), 194 deletions(-) create mode 100644 docs/changes/0001-execution-status-redesign.md diff --git a/core/taskengine/engine.go b/core/taskengine/engine.go index 9499628c..15e7f769 100644 --- a/core/taskengine/engine.go +++ b/core/taskengine/engine.go @@ -3051,7 +3051,7 @@ func (n *Engine) SimulateTask(user *model.User, trigger *avsproto.TaskTrigger, n } // Step 10: Analyze execution results from all steps - _, executionError, failedStepCount, resultStatus := vm.AnalyzeExecutionResult() + executionError, failedStepCount, resultStatus := vm.AnalyzeExecutionResult() // Step 11: Calculate total gas cost for the workflow @@ -3073,20 +3073,7 @@ func (n *Engine) SimulateTask(user *model.User, trigger *avsproto.TaskTrigger, n switch resultStatus { case ExecutionSuccess: n.logger.Info("workflow simulation completed successfully", "task_id", task.Id, "simulation_id", simulationID, "steps", len(execution.Steps)) - case ExecutionPartialSuccess: - // Clean up error message to avoid stack traces in logs - cleanErrorMsg := executionError - stackTraceRegex := regexp.MustCompile(`(?m)^\s*at .*$`) - cleanErrorMsg = stackTraceRegex.ReplaceAllString(cleanErrorMsg, "") - cleanErrorMsg = strings.TrimSpace(cleanErrorMsg) - - n.logger.Warn("workflow simulation completed with partial success", - "error", cleanErrorMsg, - "task_id", task.Id, - "simulation_id", simulationID, - "failed_steps", failedStepCount, - "total_steps", len(vm.ExecutionLogs)) - case ExecutionFailure: + case ExecutionFailed: // Clean up error message to avoid stack traces in logs cleanErrorMsg := executionError stackTraceRegex := regexp.MustCompile(`(?m)^\s*at .*$`) @@ -3103,12 +3090,10 @@ func (n *Engine) SimulateTask(user *model.User, trigger *avsproto.TaskTrigger, n // Handle VM-level errors if they occurred if runErr != nil { - // This should not happen if AnalyzeExecutionResult is working correctly, - // but handle it as a fallback for VM-level errors n.logger.Error("workflow simulation had VM-level error", "vm_error", runErr, "task_id", task.Id, "simulation_id", simulationID) if execution.Error == "" { execution.Error = fmt.Sprintf("VM execution error: %s", runErr.Error()) - execution.Status = avsproto.ExecutionStatus_EXECUTION_STATUS_FAILED + execution.Status = avsproto.ExecutionStatus_EXECUTION_STATUS_ERROR } } diff --git a/core/taskengine/executor.go b/core/taskengine/executor.go index 0e5edc1b..ffb5786f 100644 --- a/core/taskengine/executor.go +++ b/core/taskengine/executor.go @@ -618,7 +618,7 @@ func (x *TaskExecutor) RunTask(task *model.Task, queueData *QueueExecutionData) } // Analyze execution results from all steps (including failed ones) - _, executionError, failedStepCount, resultStatus := vm.AnalyzeExecutionResult() + executionError, failedStepCount, resultStatus := vm.AnalyzeExecutionResult() // Calculate total gas cost for the workflow @@ -652,14 +652,7 @@ func (x *TaskExecutor) RunTask(task *model.Task, queueData *QueueExecutionData) switch resultStatus { case ExecutionSuccess: x.logger.Info("task execution completed successfully", "task_id", task.Id, "execution_id", queueData.ExecutionID, "total_steps", len(vm.ExecutionLogs)) - case ExecutionPartialSuccess: - x.logger.Warn("task execution completed with partial success", - "error", executionError, - "task_id", task.Id, - "execution_id", queueData.ExecutionID, - "failed_steps", failedStepCount, - "total_steps", len(vm.ExecutionLogs)) - case ExecutionFailure: + case ExecutionFailed: x.logger.Error("task execution completed with failures", "error", executionError, "task_id", task.Id, @@ -669,12 +662,10 @@ func (x *TaskExecutor) RunTask(task *model.Task, queueData *QueueExecutionData) } if runTaskErr != nil { - // This should not happen if AnalyzeExecutionResult is working correctly, - // but handle it as a fallback for VM-level errors x.logger.Error("task execution had VM-level error", "vm_error", runTaskErr, "task_id", task.Id, "execution_id", queueData.ExecutionID) if execution.Error == "" { execution.Error = fmt.Sprintf("VM execution error: %s", runTaskErr.Error()) - execution.Status = avsproto.ExecutionStatus_EXECUTION_STATUS_FAILED + execution.Status = avsproto.ExecutionStatus_EXECUTION_STATUS_ERROR } } @@ -732,10 +723,8 @@ func (x *TaskExecutor) RunTask(task *model.Task, queueData *QueueExecutionData) switch resultStatus { case ExecutionSuccess: x.logger.Info("successfully executing task", "task_id", task.Id, "triggermark", queueData) - case ExecutionPartialSuccess: - x.logger.Info("task execution completed with partial success", "task_id", task.Id, "failed_steps", failedStepCount, "triggermark", queueData) - default: // ExecutionFailure or other - x.logger.Warn("task execution completed with step failures", "task_id", task.Id, "failed_steps", failedStepCount) + case ExecutionFailed: + x.logger.Warn("task execution completed with step failures", "task_id", task.Id, "failed_steps", failedStepCount, "triggermark", queueData) } return execution, nil diff --git a/core/taskengine/executor_test.go b/core/taskengine/executor_test.go index 43fe992d..9a32c2e3 100644 --- a/core/taskengine/executor_test.go +++ b/core/taskengine/executor_test.go @@ -3,7 +3,6 @@ package taskengine import ( "net/http" "net/http/httptest" - "strings" "testing" "time" @@ -225,14 +224,15 @@ func TestExecutorRunTaskWithBranchSilentFailureBehavior(t *testing.T) { t.Errorf("Expected no error with silent failure behavior, but got: %v", err) } - // Branch workflows with skipped nodes should report PARTIAL_SUCCESS - if execution.Status != avsproto.ExecutionStatus_EXECUTION_STATUS_PARTIAL_SUCCESS { - t.Errorf("Expected partial success status (branch path with skipped nodes), but got: %v with error: %s", execution.Status, execution.Error) + // Branch workflows with skipped nodes are SUCCESS — the workflow executed its + // chosen path correctly; skipping nodes due to branching is expected behavior. + if execution.Status != avsproto.ExecutionStatus_EXECUTION_STATUS_SUCCESS { + t.Errorf("Expected success status (branch path with skipped nodes is normal), but got: %v with error: %s", execution.Status, execution.Error) } - // Should have a partial execution message explaining the branching - if execution.Error == "" || !strings.Contains(execution.Error, "Partial execution") { - t.Errorf("Expected partial execution message, but got: %s", execution.Error) + // No error when all executed steps succeeded + if execution.Error != "" { + t.Errorf("Expected empty error for successful branch execution, but got: %s", execution.Error) } // Find the branch step regardless of ordering diff --git a/core/taskengine/partial_success_test.go b/core/taskengine/partial_success_test.go index 0268a4ed..59330ce0 100644 --- a/core/taskengine/partial_success_test.go +++ b/core/taskengine/partial_success_test.go @@ -39,12 +39,9 @@ func TestAnalyzeExecutionResult_AllSuccess(t *testing.T) { }, } - success, errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() + errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() // Verify results - if !success { - t.Errorf("Expected success=true, got success=%v", success) - } if errorMessage != "" { t.Errorf("Expected empty error message, got: %s", errorMessage) } @@ -56,8 +53,8 @@ func TestAnalyzeExecutionResult_AllSuccess(t *testing.T) { } } -// TestAnalyzeExecutionResult_PartialSuccess tests the case where some steps succeed and some fail -func TestAnalyzeExecutionResult_PartialSuccess(t *testing.T) { +// TestAnalyzeExecutionResult_SomeStepsFailed tests the case where some steps succeed and some fail +func TestAnalyzeExecutionResult_SomeStepsFailed(t *testing.T) { vm := NewVM() vm.logger = testutil.GetLogger() @@ -89,20 +86,17 @@ func TestAnalyzeExecutionResult_PartialSuccess(t *testing.T) { }, } - success, errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() + errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() // Verify results - if success { - t.Errorf("Expected success=false for partial success, got success=%v", success) - } if errorMessage == "" { - t.Errorf("Expected non-empty error message for partial success") + t.Errorf("Expected non-empty error message when some steps failed") } if failedCount != 1 { t.Errorf("Expected failedCount=1, got failedCount=%d", failedCount) } - if resultStatus != ExecutionPartialSuccess { - t.Errorf("Expected resultStatus=ExecutionPartialSuccess, got resultStatus=%v", resultStatus) + if resultStatus != ExecutionFailed { + t.Errorf("Expected resultStatus=ExecutionFailed, got resultStatus=%v", resultStatus) } // Check that error message contains failure information @@ -139,20 +133,17 @@ func TestAnalyzeExecutionResult_AllFailure(t *testing.T) { }, } - success, errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() + errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() // Verify results - if success { - t.Errorf("Expected success=false for all failures, got success=%v", success) - } if errorMessage == "" { t.Errorf("Expected non-empty error message for all failures") } if failedCount != 3 { t.Errorf("Expected failedCount=3, got failedCount=%d", failedCount) } - if resultStatus != ExecutionFailure { - t.Errorf("Expected resultStatus=ExecutionFailure, got resultStatus=%v", resultStatus) + if resultStatus != ExecutionFailed { + t.Errorf("Expected resultStatus=ExecutionFailed, got resultStatus=%v", resultStatus) } // Check that error message contains failure information @@ -170,25 +161,22 @@ func TestAnalyzeExecutionResult_NoSteps(t *testing.T) { // No execution logs vm.ExecutionLogs = []*avsproto.Execution_Step{} - success, errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() + errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() // Verify results - if success { - t.Errorf("Expected success=false for no steps, got success=%v", success) - } if errorMessage != "no execution steps found" { t.Errorf("Expected specific error message for no steps, got: %s", errorMessage) } if failedCount != 0 { t.Errorf("Expected failedCount=0 for no steps, got failedCount=%d", failedCount) } - if resultStatus != ExecutionFailure { - t.Errorf("Expected resultStatus=ExecutionFailure for no steps, got resultStatus=%v", resultStatus) + if resultStatus != ExecutionFailed { + t.Errorf("Expected resultStatus=ExecutionFailed for no steps, got resultStatus=%v", resultStatus) } } -// TestGetExecutionStatus_PartialSuccess tests the GetExecutionStatus method for partial success -func TestGetExecutionStatus_PartialSuccess(t *testing.T) { +// TestGetExecutionStatus_StepFailures tests the GetExecutionStatus method when some steps fail +func TestGetExecutionStatus_StepFailures(t *testing.T) { // Set up test database and engine db := testutil.TestMustDB() defer storage.Destroy(db.(*storage.BadgerStorage)) @@ -209,13 +197,13 @@ func TestGetExecutionStatus_PartialSuccess(t *testing.T) { }, } - // Create execution with partial success (some steps succeed, some fail) + // Create execution where some steps succeed and some fail execution := &avsproto.Execution{ Id: "test-execution-id", StartAt: time.Now().UnixMilli(), EndAt: time.Now().UnixMilli(), - Status: avsproto.ExecutionStatus_EXECUTION_STATUS_PARTIAL_SUCCESS, // Overall status is partial success - Error: "Partial success: 1 of 3 steps failed: Database Query", + Status: avsproto.ExecutionStatus_EXECUTION_STATUS_FAILED, + Error: "1 of 3 steps failed: Database Query", Index: 0, // First execution Steps: []*avsproto.Execution_Step{ { @@ -269,9 +257,9 @@ func TestGetExecutionStatus_PartialSuccess(t *testing.T) { t.Fatalf("GetExecutionStatus failed: %v", err) } - // Verify that it returns PARTIAL_SUCCESS status - if statusResp.Status != avsproto.ExecutionStatus_EXECUTION_STATUS_PARTIAL_SUCCESS { - t.Errorf("Expected EXECUTION_STATUS_PARTIAL_SUCCESS, got %v", statusResp.Status) + // Verify that it returns FAILED status (some steps failed) + if statusResp.Status != avsproto.ExecutionStatus_EXECUTION_STATUS_FAILED { + t.Errorf("Expected EXECUTION_STATUS_FAILED, got %v", statusResp.Status) } } diff --git a/core/taskengine/vm.go b/core/taskengine/vm.go index ec33e584..cbbfb4f0 100644 --- a/core/taskengine/vm.go +++ b/core/taskengine/vm.go @@ -3208,12 +3208,12 @@ func (v *VM) createExecutionStep(nodeId string, success bool, errorMsg string, l type ExecutionResultStatus int const ( - // ExecutionSuccess indicates all steps completed successfully + // ExecutionSuccess indicates all executed steps succeeded (includes branch/conditional skips) ExecutionSuccess ExecutionResultStatus = iota - // ExecutionPartialSuccess indicates some steps succeeded but at least one failed - ExecutionPartialSuccess - // ExecutionFailure indicates execution failed (all steps failed or critical failure) - ExecutionFailure + // ExecutionFailed indicates one or more node-level steps failed during execution + ExecutionFailed + // ExecutionError indicates a system-level failure (VM could not run the workflow) + ExecutionError ) // getStepDisplayName extracts the display name for a step, preferring the name over ID @@ -3225,77 +3225,36 @@ func getStepDisplayName(step *avsproto.Execution_Step) string { return stepName } -// AnalyzeExecutionResult examines all execution steps and determines overall success/failure/partial status -// Returns (success, errorMessage, failedStepCount, resultStatus) -func (v *VM) AnalyzeExecutionResult() (bool, string, int, ExecutionResultStatus) { +// AnalyzeExecutionResult examines all execution steps and determines overall success/failure status. +// Returns (errorMessage, failedStepCount, resultStatus) +func (v *VM) AnalyzeExecutionResult() (string, int, ExecutionResultStatus) { v.mu.Lock() defer v.mu.Unlock() if len(v.ExecutionLogs) == 0 { - return false, "no execution steps found", 0, ExecutionFailure + return "no execution steps found", 0, ExecutionFailed } var failedStepNames []string - var successfulStepNames []string - var firstErrorMessage string for _, step := range v.ExecutionLogs { - stepName := getStepDisplayName(step) - if !step.Success && step.Error != "" { - if firstErrorMessage == "" { - firstErrorMessage = step.Error - } - failedStepNames = append(failedStepNames, stepName) - } else if step.Success { - successfulStepNames = append(successfulStepNames, stepName) + failedStepNames = append(failedStepNames, getStepDisplayName(step)) } } failedCount := len(failedStepNames) - totalSteps := len(v.ExecutionLogs) - - // Determine execution status and success flag - var resultStatus ExecutionResultStatus - var success bool - var errorMessage string if failedCount == 0 { - // All executed steps succeeded. However, if not all workflow steps executed - // (e.g., due to branch selections or conditional skips), report PARTIAL_SUCCESS - // to reflect that the workflow did not traverse all configured nodes. - executedCount := len(v.ExecutionLogs) - totalWorkflowSteps := 1 + len(v.TaskNodes) // 1 trigger + all nodes - if v.GetTaskId() == "" && len(v.TaskNodes) == 1 { - // single-node immediate execution - totalWorkflowSteps = 1 - } - - if executedCount < totalWorkflowSteps { - resultStatus = ExecutionPartialSuccess - success = false // do not mark full success when nodes were skipped - errorMessage = fmt.Sprintf("Partial execution: %d out of %d steps executed (branch/conditional path)", executedCount, totalWorkflowSteps) - } else { - // All steps that exist in the workflow executed and succeeded - resultStatus = ExecutionSuccess - success = true - errorMessage = "" - } - } else if failedCount > 0 { - // Distinguish between all failed vs some failed (for internal status tracking) - if failedCount == totalSteps { - // All steps failed - resultStatus = ExecutionFailure - } else { - // Some steps succeeded, some failed - partial success for internal tracking - resultStatus = ExecutionPartialSuccess - } - success = false - // Use simple error message format (no prefix) for both cases - errorMessage = formatExecutionErrorMessage("", failedCount, totalSteps, failedStepNames) + // All executed steps succeeded. Branch/conditional skips are normal + // workflow behavior and count as SUCCESS — the workflow did what it + // was configured to do. + return "", 0, ExecutionSuccess } - return success, errorMessage, failedCount, resultStatus + // One or more steps failed (covers both partial and total failure). + errorMessage := formatExecutionErrorMessage("", failedCount, len(v.ExecutionLogs), failedStepNames) + return errorMessage, failedCount, ExecutionFailed } // CalculateTotalGasCost sums up gas costs from all execution steps that involve blockchain operations @@ -3366,10 +3325,10 @@ func convertToExecutionStatus(resultStatus ExecutionResultStatus) avsproto.Execu switch resultStatus { case ExecutionSuccess: return avsproto.ExecutionStatus_EXECUTION_STATUS_SUCCESS - case ExecutionPartialSuccess: - return avsproto.ExecutionStatus_EXECUTION_STATUS_PARTIAL_SUCCESS - case ExecutionFailure: + case ExecutionFailed: return avsproto.ExecutionStatus_EXECUTION_STATUS_FAILED + case ExecutionError: + return avsproto.ExecutionStatus_EXECUTION_STATUS_ERROR default: return avsproto.ExecutionStatus_EXECUTION_STATUS_UNSPECIFIED } diff --git a/core/taskengine/vm_runner_rest.go b/core/taskengine/vm_runner_rest.go index db2ac8ba..c37902ef 100644 --- a/core/taskengine/vm_runner_rest.go +++ b/core/taskengine/vm_runner_rest.go @@ -902,18 +902,17 @@ func (r *RestProcessor) Execute(stepID string, node *avsproto.RestAPINode) (*avs var resultStatus ExecutionResultStatus var statusText, statusBgColor, statusTextColor string if failed { - resultStatus = ExecutionFailure + resultStatus = ExecutionFailed statusText = fmt.Sprintf("but failed at the '%s' step due to %s.", safeName(failedName), firstLine(failedReason)) statusBgColor = "#FEE2E2" // light red statusTextColor = "#991B1B" // dark red - } else if skippedCount > 0 { - resultStatus = ExecutionPartialSuccess - statusText = fmt.Sprintf("but %d nodes were skipped due to Branch condition.", skippedCount) - statusBgColor = "#FEF3C7" // light yellow - statusTextColor = "#92400E" // dark yellow/amber } else { resultStatus = ExecutionSuccess - statusText = "All steps completed successfully" + if skippedCount > 0 { + statusText = fmt.Sprintf("All steps completed successfully (%d nodes skipped by Branch condition).", skippedCount) + } else { + statusText = "All steps completed successfully" + } statusBgColor = "#D1FAE5" // light green statusTextColor = "#065F46" // dark green } @@ -923,9 +922,7 @@ func (r *RestProcessor) Execute(stepID string, node *avsproto.RestAPINode) (*avs switch resultStatus { case ExecutionSuccess: iconSvg = `` - case ExecutionPartialSuccess: - iconSvg = `` - case ExecutionFailure: + case ExecutionFailed: iconSvg = `` } statusHtml := fmt.Sprintf( @@ -944,9 +941,7 @@ func (r *RestProcessor) Execute(stepID string, node *avsproto.RestAPINode) (*avs switch resultStatus { case ExecutionSuccess: subjectStatusText = "successfully completed" - case ExecutionPartialSuccess: - subjectStatusText = "partially executed" - case ExecutionFailure: + case ExecutionFailed: subjectStatusText = "failed to execute" } diff --git a/core/taskengine/vm_scheduler_fix_test.go b/core/taskengine/vm_scheduler_fix_test.go index 51c6d439..03972c96 100644 --- a/core/taskengine/vm_scheduler_fix_test.go +++ b/core/taskengine/vm_scheduler_fix_test.go @@ -246,13 +246,11 @@ func TestSchedulerExecutesNodeAfterBranch(t *testing.T) { require.NoError(t, err) require.NotNil(t, execution) - // Verify execution was partially successful (branch path means not all nodes executed) - // When a branch workflow executes, not all configured nodes run (only one branch path), - // which correctly results in PARTIAL_SUCCESS status - assert.Equal(t, avsproto.ExecutionStatus_EXECUTION_STATUS_PARTIAL_SUCCESS, execution.Status, - "Branch workflows should report PARTIAL_SUCCESS when not all nodes execute") - assert.Contains(t, execution.Error, "Partial execution", "Should report partial execution due to branch path") - assert.Contains(t, execution.Error, "6 out of 7 steps executed", "Should show correct step counts") + // Branch workflows with skipped nodes are SUCCESS — the workflow executed its + // chosen path correctly; skipping nodes due to branching is expected behavior. + assert.Equal(t, avsproto.ExecutionStatus_EXECUTION_STATUS_SUCCESS, execution.Status, + "Branch workflows should report SUCCESS even when not all nodes execute") + assert.Empty(t, execution.Error, "No error expected when all executed steps succeeded") // Debug: print what executed t.Logf("Executed %d steps:", len(execution.Steps)) diff --git a/docs/changes/0001-execution-status-redesign.md b/docs/changes/0001-execution-status-redesign.md new file mode 100644 index 00000000..dd90efe2 --- /dev/null +++ b/docs/changes/0001-execution-status-redesign.md @@ -0,0 +1,92 @@ +# Execution Status Redesign: Replace PARTIAL_SUCCESS with SUCCESS/FAILED/ERROR + +- **Date**: 2026-04-13 +- **Status**: Implemented +- **Branch**: `fix/remove-dead-success-bool` + +## Context + +The `EXECUTION_STATUS_PARTIAL_SUCCESS` enum was used for two unrelated scenarios: + +1. **Branch skips** — the workflow has conditional branches; some nodes were not + executed because the branch condition routed elsewhere. Nothing failed. +2. **Step failures** — one or more nodes actually failed during execution + (e.g., an ERC-20 transfer reverted). + +Clients had to work around this by inspecting every step individually: + +```ts +const isConditionalSkip = + status === ExecutionStatus.PartialSuccess && + steps.every((step) => step.success); +``` + +This workaround should no longer be necessary. + +## Decision + +Three execution statuses, orthogonal to step count: + +| Scenario | Status | `steps.length` vs task node count | `execution.error` | +|---------------------------------|-----------|-----------------------------------|------------------------------------| +| All nodes ran, all succeeded | `SUCCESS` | equal | empty | +| Branch skipped nodes, all OK | `SUCCESS` | less than total | empty | +| Some nodes failed | `FAILED` | any | `"N of M steps failed: node1, …"` | +| All nodes failed | `FAILED` | equal | `"N of N steps failed: node1, …"` | +| No steps executed | `FAILED` | zero | `"no execution steps found"` | +| System-level failure (VM crash) | `ERROR` | zero (or partial if crash mid-run) | `"VM execution error: …"` | + +**How to determine what happened:** + +- **`status`** answers: did the workflow succeed? + - `SUCCESS` — yes, every executed step passed. Branch skips are normal. + - `FAILED` — no, at least one step failed. Check `execution.error` and + individual `step.success` / `step.error` for details. + - `ERROR` — the system could not run the workflow at all (compilation + failure, VM crash). This is not a user-fixable workflow issue. + +- **`steps` array** answers: what ran and what was skipped? + - Compare `steps.length` against the task's total node count to know + how many nodes were skipped by branching. + - Each step has `success`, `error`, and `name` for per-node detail. + +- **`execution.error`** answers: what went wrong? + - Empty string when `status` is `SUCCESS`. + - Contains a summary like `"1 of 5 steps failed: loop1"` when `FAILED`. + - Contains the system error message when `ERROR`. + +## Proto Changes + +```protobuf +enum ExecutionStatus { + EXECUTION_STATUS_UNSPECIFIED = 0; + EXECUTION_STATUS_PENDING = 1; + EXECUTION_STATUS_SUCCESS = 2; + EXECUTION_STATUS_FAILED = 3; + reserved 4; + reserved "EXECUTION_STATUS_PARTIAL_SUCCESS"; + EXECUTION_STATUS_ERROR = 5; +} +``` + +- Enum value `4` is reserved and will not be reused. +- New enum value `ERROR = 5` for system-level failures. + +## SDK/Client Migration + +1. Remove any `PartialSuccess` handling or `isConditionalSkip` workarounds. +2. Treat `SUCCESS` as the only positive outcome. Branch skips no longer + produce a warning status. +3. Treat `FAILED` as the single status for any node-level execution failure, + regardless of whether some or all steps failed. +4. Treat `ERROR` as a system-level problem (not caused by the workflow + configuration itself). + +## Consequences + +- Branch-skip workflows stop surfacing as warnings in the UI. +- The `steps` array is the source of truth for what executed and what + was skipped — no status-level signal needed for coverage. +- Email summaries for branch-skip workflows now show a green success + badge with a note like "3 nodes skipped by Branch condition" instead + of a yellow warning badge. diff --git a/pkg/erc4337/preset/builder_execution_success_test.go b/pkg/erc4337/preset/builder_execution_success_test.go index f88de385..ccc34f7b 100644 --- a/pkg/erc4337/preset/builder_execution_success_test.go +++ b/pkg/erc4337/preset/builder_execution_success_test.go @@ -84,31 +84,37 @@ func TestUserOpWithdrawalSkipsReimbursementWhenBalanceInsufficient(t *testing.T) reserve := big.NewInt(100000000000000) // 0.0001 ETH withdrawalAmount := new(big.Int).Sub(balance, reserve) - calldata, err := aa.PackExecute(secondaryWallet, withdrawalAmount, []byte{}) - require.NoError(t, err, "Failed to pack execute calldata") + // If balance is already below the reserve, the precondition (insufficient + // funds for reimbursement) is already satisfied — skip the withdrawal. + if withdrawalAmount.Sign() > 0 { + calldata, err := aa.PackExecute(secondaryWallet, withdrawalAmount, []byte{}) + require.NoError(t, err, "Failed to pack execute calldata") - paymasterRequest := GetVerifyingPaymasterRequestForDuration( - smartWalletConfig.PaymasterAddress, - 15*time.Minute, - ) + paymasterRequest := GetVerifyingPaymasterRequestForDuration( + smartWalletConfig.PaymasterAddress, + 15*time.Minute, + ) - // Withdrawal should succeed — system skips reimbursement when balance is insufficient - userOp, receipt, err := SendUserOp( - smartWalletConfig, - owner, - calldata, - paymasterRequest, - &primaryWallet, - nil, - nil, // executionFeeWei - nil, - ) - require.NoError(t, err, "Withdrawal should succeed even without reimbursement") - require.NotNil(t, userOp, "UserOp should be built") - if receipt == nil { - t.Skip("UserOp sent but receipt not available (confirmation timeout)") + // Withdrawal should succeed — system skips reimbursement when balance is insufficient + userOp, receipt, err := SendUserOp( + smartWalletConfig, + owner, + calldata, + paymasterRequest, + &primaryWallet, + nil, + nil, // executionFeeWei + nil, + ) + require.NoError(t, err, "Withdrawal should succeed even without reimbursement") + require.NotNil(t, userOp, "UserOp should be built") + if receipt == nil { + t.Skip("UserOp sent but receipt not available (confirmation timeout)") + } + t.Logf("Withdrawal succeeded. TX Hash: %s Gas used: %d", receipt.TxHash.Hex(), receipt.GasUsed) + } else { + t.Logf("Balance already below reserve (%s < %s), skipping withdrawal", balance.String(), reserve.String()) } - t.Logf("Withdrawal succeeded. TX Hash: %s Gas used: %d", receipt.TxHash.Hex(), receipt.GasUsed) // Send the funds back from the secondary wallet to the primary wallet secondaryBalance, err := client.BalanceAt(context.Background(), secondaryWallet, nil) diff --git a/protobuf/avs.pb.go b/protobuf/avs.pb.go index a2411e83..9370ded7 100644 --- a/protobuf/avs.pb.go +++ b/protobuf/avs.pb.go @@ -547,15 +547,21 @@ func (TaskStatus) EnumDescriptor() ([]byte, []int) { return file_avs_proto_rawDescGZIP(), []int{6} } -// Execution Status re-present a run of the task +// ExecutionStatus represents the outcome of a task execution. +// +// SUCCESS – every executed step succeeded (includes branch/conditional skips). +// FAILED – one or more node-level steps failed during execution. +// ERROR – system-level failure; the VM could not run the workflow at all. +// +// Value 4 (formerly PARTIAL_SUCCESS) is reserved and must not be reused. type ExecutionStatus int32 const ( - ExecutionStatus_EXECUTION_STATUS_UNSPECIFIED ExecutionStatus = 0 - ExecutionStatus_EXECUTION_STATUS_PENDING ExecutionStatus = 1 - ExecutionStatus_EXECUTION_STATUS_SUCCESS ExecutionStatus = 2 - ExecutionStatus_EXECUTION_STATUS_FAILED ExecutionStatus = 3 - ExecutionStatus_EXECUTION_STATUS_PARTIAL_SUCCESS ExecutionStatus = 4 + ExecutionStatus_EXECUTION_STATUS_UNSPECIFIED ExecutionStatus = 0 + ExecutionStatus_EXECUTION_STATUS_PENDING ExecutionStatus = 1 + ExecutionStatus_EXECUTION_STATUS_SUCCESS ExecutionStatus = 2 + ExecutionStatus_EXECUTION_STATUS_FAILED ExecutionStatus = 3 + ExecutionStatus_EXECUTION_STATUS_ERROR ExecutionStatus = 5 ) // Enum value maps for ExecutionStatus. @@ -565,14 +571,14 @@ var ( 1: "EXECUTION_STATUS_PENDING", 2: "EXECUTION_STATUS_SUCCESS", 3: "EXECUTION_STATUS_FAILED", - 4: "EXECUTION_STATUS_PARTIAL_SUCCESS", + 5: "EXECUTION_STATUS_ERROR", } ExecutionStatus_value = map[string]int32{ - "EXECUTION_STATUS_UNSPECIFIED": 0, - "EXECUTION_STATUS_PENDING": 1, - "EXECUTION_STATUS_SUCCESS": 2, - "EXECUTION_STATUS_FAILED": 3, - "EXECUTION_STATUS_PARTIAL_SUCCESS": 4, + "EXECUTION_STATUS_UNSPECIFIED": 0, + "EXECUTION_STATUS_PENDING": 1, + "EXECUTION_STATUS_SUCCESS": 2, + "EXECUTION_STATUS_FAILED": 3, + "EXECUTION_STATUS_ERROR": 5, } ) @@ -10290,13 +10296,13 @@ const file_avs_proto_rawDesc = "" + "\n" + "\x06Failed\x10\x02\x12\v\n" + "\aRunning\x10\x04\x12\f\n" + - "\bDisabled\x10\x05*\xb2\x01\n" + + "\bDisabled\x10\x05*\xd0\x01\n" + "\x0fExecutionStatus\x12 \n" + "\x1cEXECUTION_STATUS_UNSPECIFIED\x10\x00\x12\x1c\n" + "\x18EXECUTION_STATUS_PENDING\x10\x01\x12\x1c\n" + "\x18EXECUTION_STATUS_SUCCESS\x10\x02\x12\x1b\n" + - "\x17EXECUTION_STATUS_FAILED\x10\x03\x12$\n" + - " EXECUTION_STATUS_PARTIAL_SUCCESS\x10\x042\xd5\x10\n" + + "\x17EXECUTION_STATUS_FAILED\x10\x03\x12\x1a\n" + + "\x16EXECUTION_STATUS_ERROR\x10\x05\"\x04\b\x04\x10\x04* EXECUTION_STATUS_PARTIAL_SUCCESS2\xd5\x10\n" + "\n" + "Aggregator\x126\n" + "\x06GetKey\x12\x15.aggregator.GetKeyReq\x1a\x13.aggregator.KeyResp\"\x00\x12]\n" + diff --git a/protobuf/avs.proto b/protobuf/avs.proto index c5c2aa96..243802c1 100644 --- a/protobuf/avs.proto +++ b/protobuf/avs.proto @@ -349,13 +349,21 @@ enum TaskStatus { Disabled = 5; } -// Execution Status re-present a run of the task +// ExecutionStatus represents the outcome of a task execution. +// +// SUCCESS – every executed step succeeded (includes branch/conditional skips). +// FAILED – one or more node-level steps failed during execution. +// ERROR – system-level failure; the VM could not run the workflow at all. +// +// Value 4 (formerly PARTIAL_SUCCESS) is reserved and must not be reused. enum ExecutionStatus { EXECUTION_STATUS_UNSPECIFIED = 0; EXECUTION_STATUS_PENDING = 1; EXECUTION_STATUS_SUCCESS = 2; EXECUTION_STATUS_FAILED = 3; - EXECUTION_STATUS_PARTIAL_SUCCESS = 4; + reserved 4; + reserved "EXECUTION_STATUS_PARTIAL_SUCCESS"; + EXECUTION_STATUS_ERROR = 5; } From 5500a5410c0cd634c708db2cb3d3490ac330bf98 Mon Sep 17 00:00:00 2001 From: Wei Lin Date: Mon, 13 Apr 2026 16:04:27 -0700 Subject: [PATCH 5/5] fix: always set ERROR status on VM-level failures and use t.Skip for low-balance test VM-level errors (runTaskErr/runErr) now always override execution status to ERROR, even when AnalyzeExecutionResult already populated the error field. Previously, the ERROR status was skipped if execution.Error was non-empty, causing VM crashes to be misreported as FAILED. Also changed the reimbursement test's low-balance branch from t.Logf to t.Skipf so CI doesn't report a false positive when the precondition is already met. --- core/taskengine/engine.go | 4 +++- core/taskengine/executor.go | 4 +++- pkg/erc4337/preset/builder_execution_success_test.go | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/core/taskengine/engine.go b/core/taskengine/engine.go index 15e7f769..aad95f2b 100644 --- a/core/taskengine/engine.go +++ b/core/taskengine/engine.go @@ -3093,8 +3093,10 @@ func (n *Engine) SimulateTask(user *model.User, trigger *avsproto.TaskTrigger, n n.logger.Error("workflow simulation had VM-level error", "vm_error", runErr, "task_id", task.Id, "simulation_id", simulationID) if execution.Error == "" { execution.Error = fmt.Sprintf("VM execution error: %s", runErr.Error()) - execution.Status = avsproto.ExecutionStatus_EXECUTION_STATUS_ERROR + } else { + execution.Error = fmt.Sprintf("VM execution error: %s (step analysis: %s)", runErr.Error(), execution.Error) } + execution.Status = avsproto.ExecutionStatus_EXECUTION_STATUS_ERROR } return execution, nil diff --git a/core/taskengine/executor.go b/core/taskengine/executor.go index ffb5786f..a0ac0f18 100644 --- a/core/taskengine/executor.go +++ b/core/taskengine/executor.go @@ -665,8 +665,10 @@ func (x *TaskExecutor) RunTask(task *model.Task, queueData *QueueExecutionData) x.logger.Error("task execution had VM-level error", "vm_error", runTaskErr, "task_id", task.Id, "execution_id", queueData.ExecutionID) if execution.Error == "" { execution.Error = fmt.Sprintf("VM execution error: %s", runTaskErr.Error()) - execution.Status = avsproto.ExecutionStatus_EXECUTION_STATUS_ERROR + } else { + execution.Error = fmt.Sprintf("VM execution error: %s (step analysis: %s)", runTaskErr.Error(), execution.Error) } + execution.Status = avsproto.ExecutionStatus_EXECUTION_STATUS_ERROR } // batch update storage for task + execution log diff --git a/pkg/erc4337/preset/builder_execution_success_test.go b/pkg/erc4337/preset/builder_execution_success_test.go index ccc34f7b..ea8437a0 100644 --- a/pkg/erc4337/preset/builder_execution_success_test.go +++ b/pkg/erc4337/preset/builder_execution_success_test.go @@ -113,7 +113,7 @@ func TestUserOpWithdrawalSkipsReimbursementWhenBalanceInsufficient(t *testing.T) } t.Logf("Withdrawal succeeded. TX Hash: %s Gas used: %d", receipt.TxHash.Hex(), receipt.GasUsed) } else { - t.Logf("Balance already below reserve (%s < %s), skipping withdrawal", balance.String(), reserve.String()) + t.Skipf("Balance already below reserve (%s < %s), withdrawal precondition already met", balance.String(), reserve.String()) } // Send the funds back from the secondary wallet to the primary wallet