diff --git a/aggregator/auth.go b/aggregator/auth.go index ddba8f98..cb77d8b8 100644 --- a/aggregator/auth.go +++ b/aggregator/auth.go @@ -122,6 +122,13 @@ func (r *RpcServer) GetKey(ctx context.Context, payload *avsproto.GetKeyReq) (*a return nil, status.Errorf(codes.InvalidArgument, "Invalid wallet address format") } ownerAddress := common.HexToAddress(walletStr) + // Refuse to mint a token bound to the zero address. The two-step + // auth flow lets a caller request a JWT with sub = walletStr, so we + // must reject 0x0…0 here as well as in verifyAuth — defense-in-depth + // against the dummy-target-address bypass. + if ownerAddress == (common.Address{}) { + return nil, status.Errorf(codes.InvalidArgument, "Wallet address cannot be the zero address") + } if strings.Contains(payload.Signature, ".") { // API key directly @@ -256,8 +263,21 @@ func (r *RpcServer) verifyAuth(ctx context.Context) (*model.User, error) { return nil, fmt.Errorf("%s: subject must be a valid EOA address", auth.InvalidAuthenticationKey) } + // Defense-in-depth: refuse the zero address. common.IsHexAddress + // returns true for "0x0000…0000", and a buggy SDK that minted a + // JWT with the zero-address subject (e.g. via the + // dummy-target-address bypass) would otherwise pass validation + // and silently fail every w:: lookup downstream. + // The zero address is never a real EOA — reject it explicitly. + subjectAddr := common.HexToAddress(subject) + if subjectAddr == (common.Address{}) { + r.config.Logger.Error("API key has zero-address subject; refusing authentication", + "subject", subject) + return nil, fmt.Errorf("%s: subject cannot be the zero address", auth.InvalidAuthenticationKey) + } + user := model.User{ - Address: common.HexToAddress(subject), + Address: subjectAddr, } // caching to reduce hitting eth rpc node @@ -312,6 +332,13 @@ func (r *RpcServer) GetSignatureFormat(ctx context.Context, req *avsproto.GetSig if !common.IsHexAddress(walletAddress) { return nil, status.Errorf(codes.InvalidArgument, "Invalid Ethereum wallet address format") } + // Refuse to issue a signature template for the zero address. This is + // the earliest point in the auth flow where we can stop a caller + // (typically a buggy SDK) from accidentally requesting a JWT bound + // to 0x0…0. See verifyAuth for the matching last-line defense. + if common.HexToAddress(walletAddress) == (common.Address{}) { + return nil, status.Errorf(codes.InvalidArgument, "Wallet address cannot be the zero address") + } // Use smart wallet chain ID (r.chainID) instead of global EigenLayer chain ID // This ensures authentication uses the correct chain for smart wallet operations diff --git a/aggregator/auth_test.go b/aggregator/auth_test.go index ff17c243..51f46467 100644 --- a/aggregator/auth_test.go +++ b/aggregator/auth_test.go @@ -22,6 +22,7 @@ import ( "github.com/ethereum/go-ethereum/ethclient" "github.com/golang-jwt/jwt/v5" "google.golang.org/grpc/codes" + grpcmetadata "google.golang.org/grpc/metadata" "google.golang.org/grpc/status" ) @@ -308,3 +309,126 @@ func TestGetSignatureFormat(t *testing.T) { t.Errorf("expected message to contain %s but got %s", expectedChainIDStr, message) } } + +// Defense-in-depth tests: confirm the auth flow refuses the zero +// address at all three points where it could otherwise sneak in +// (GetSignatureFormat → GetKey → verifyAuth). Without these checks +// a buggy SDK can request a JWT bound to 0x0…0 and silently fail every +// w:: ownership lookup downstream — see PR #520 fallout. + +func TestGetSignatureFormat_RejectsZeroAddress(t *testing.T) { + logger, _ := sdklogging.NewZapLogger("development") + + r := RpcServer{ + config: &config.Config{ + JwtSecret: []byte("test123"), + Logger: logger, + }, + chainID: big.NewInt(11155111), + } + + req := &avsproto.GetSignatureFormatReq{ + Wallet: "0x0000000000000000000000000000000000000000", + } + + _, err := r.GetSignatureFormat(context.Background(), req) + if err == nil { + t.Fatalf("expected GetSignatureFormat to reject zero address but it succeeded") + } + statusErr, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got: %v", err) + } + if statusErr.Code() != codes.InvalidArgument { + t.Errorf("expected InvalidArgument, got: %v", statusErr.Code()) + } + if !strings.Contains(statusErr.Message(), "zero address") { + t.Errorf("expected error message to mention 'zero address', got: %q", statusErr.Message()) + } +} + +func TestGetKey_RejectsZeroAddressWallet(t *testing.T) { + logger, _ := sdklogging.NewZapLogger("development") + + r := RpcServer{ + config: &config.Config{ + JwtSecret: []byte("test123"), + Logger: logger, + }, + chainID: big.NewInt(11155111), + } + + chainID := int64(11155111) + issuedTs, _ := time.Parse(time.RFC3339, "2025-01-01T00:00:00Z") + expiredTs, _ := time.Parse(time.RFC3339, "2030-01-01T00:00:00Z") + + // Build a valid-looking message but with the zero address as the + // wallet. We don't even need to sign it correctly — the zero-address + // check fires before the signature verification. + message := fmt.Sprintf(authTemplate, + chainID, + "1", + issuedTs.UTC().Format("2006-01-02T15:04:05.000Z"), + expiredTs.UTC().Format("2006-01-02T15:04:05.000Z"), + "0x0000000000000000000000000000000000000000") + + payload := &avsproto.GetKeyReq{ + Message: message, + Signature: "0xdeadbeef", // doesn't matter, never reached + } + + _, err := r.GetKey(context.Background(), payload) + if err == nil { + t.Fatalf("expected GetKey to reject zero address wallet but it succeeded") + } + statusErr, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got: %v", err) + } + if statusErr.Code() != codes.InvalidArgument { + t.Errorf("expected InvalidArgument, got: %v", statusErr.Code()) + } + if !strings.Contains(statusErr.Message(), "zero address") { + t.Errorf("expected error message to mention 'zero address', got: %q", statusErr.Message()) + } +} + +func TestVerifyAuth_RejectsZeroAddressSubject(t *testing.T) { + logger, _ := sdklogging.NewZapLogger("development") + + r := RpcServer{ + config: &config.Config{ + JwtSecret: []byte("test123"), + Logger: logger, + }, + chainID: big.NewInt(11155111), + } + + // Manually mint a JWT with sub = 0x0…0 and the correct audience. + // This simulates the bypass we observed: the SDK had requested a + // re-minted token via the dummy-zero-address path, and an older + // build of the server happily accepted it. + claims := &jwt.RegisteredClaims{ + ExpiresAt: jwt.NewNumericDate(time.Now().Add(time.Hour)), + Issuer: auth.Issuer, + Subject: "0x0000000000000000000000000000000000000000", + Audience: jwt.ClaimStrings{"11155111"}, + } + token := jwt.NewWithClaims(jwt.SigningMethodHS256, claims) + signedToken, signErr := token.SignedString(r.config.JwtSecret) + if signErr != nil { + t.Fatalf("failed to sign test token: %v", signErr) + } + + // Build a metadata context the way grpc would for an inbound request. + md := grpcmetadata.New(map[string]string{"authkey": signedToken}) + ctx := grpcmetadata.NewIncomingContext(context.Background(), md) + + _, err := r.verifyAuth(ctx) + if err == nil { + t.Fatalf("expected verifyAuth to reject zero-address subject but it succeeded") + } + if !strings.Contains(err.Error(), "zero address") { + t.Errorf("expected error to mention 'zero address', got: %q", err.Error()) + } +} diff --git a/core/taskengine/engine.go b/core/taskengine/engine.go index 9499628c..aad95f2b 100644 --- a/core/taskengine/engine.go +++ b/core/taskengine/engine.go @@ -3051,7 +3051,7 @@ func (n *Engine) SimulateTask(user *model.User, trigger *avsproto.TaskTrigger, n } // Step 10: Analyze execution results from all steps - _, executionError, failedStepCount, resultStatus := vm.AnalyzeExecutionResult() + executionError, failedStepCount, resultStatus := vm.AnalyzeExecutionResult() // Step 11: Calculate total gas cost for the workflow @@ -3073,20 +3073,7 @@ func (n *Engine) SimulateTask(user *model.User, trigger *avsproto.TaskTrigger, n switch resultStatus { case ExecutionSuccess: n.logger.Info("workflow simulation completed successfully", "task_id", task.Id, "simulation_id", simulationID, "steps", len(execution.Steps)) - case ExecutionPartialSuccess: - // Clean up error message to avoid stack traces in logs - cleanErrorMsg := executionError - stackTraceRegex := regexp.MustCompile(`(?m)^\s*at .*$`) - cleanErrorMsg = stackTraceRegex.ReplaceAllString(cleanErrorMsg, "") - cleanErrorMsg = strings.TrimSpace(cleanErrorMsg) - - n.logger.Warn("workflow simulation completed with partial success", - "error", cleanErrorMsg, - "task_id", task.Id, - "simulation_id", simulationID, - "failed_steps", failedStepCount, - "total_steps", len(vm.ExecutionLogs)) - case ExecutionFailure: + case ExecutionFailed: // Clean up error message to avoid stack traces in logs cleanErrorMsg := executionError stackTraceRegex := regexp.MustCompile(`(?m)^\s*at .*$`) @@ -3103,13 +3090,13 @@ func (n *Engine) SimulateTask(user *model.User, trigger *avsproto.TaskTrigger, n // Handle VM-level errors if they occurred if runErr != nil { - // This should not happen if AnalyzeExecutionResult is working correctly, - // but handle it as a fallback for VM-level errors n.logger.Error("workflow simulation had VM-level error", "vm_error", runErr, "task_id", task.Id, "simulation_id", simulationID) if execution.Error == "" { execution.Error = fmt.Sprintf("VM execution error: %s", runErr.Error()) - execution.Status = avsproto.ExecutionStatus_EXECUTION_STATUS_FAILED + } else { + execution.Error = fmt.Sprintf("VM execution error: %s (step analysis: %s)", runErr.Error(), execution.Error) } + execution.Status = avsproto.ExecutionStatus_EXECUTION_STATUS_ERROR } return execution, nil diff --git a/core/taskengine/executor.go b/core/taskengine/executor.go index 0e5edc1b..a0ac0f18 100644 --- a/core/taskengine/executor.go +++ b/core/taskengine/executor.go @@ -618,7 +618,7 @@ func (x *TaskExecutor) RunTask(task *model.Task, queueData *QueueExecutionData) } // Analyze execution results from all steps (including failed ones) - _, executionError, failedStepCount, resultStatus := vm.AnalyzeExecutionResult() + executionError, failedStepCount, resultStatus := vm.AnalyzeExecutionResult() // Calculate total gas cost for the workflow @@ -652,14 +652,7 @@ func (x *TaskExecutor) RunTask(task *model.Task, queueData *QueueExecutionData) switch resultStatus { case ExecutionSuccess: x.logger.Info("task execution completed successfully", "task_id", task.Id, "execution_id", queueData.ExecutionID, "total_steps", len(vm.ExecutionLogs)) - case ExecutionPartialSuccess: - x.logger.Warn("task execution completed with partial success", - "error", executionError, - "task_id", task.Id, - "execution_id", queueData.ExecutionID, - "failed_steps", failedStepCount, - "total_steps", len(vm.ExecutionLogs)) - case ExecutionFailure: + case ExecutionFailed: x.logger.Error("task execution completed with failures", "error", executionError, "task_id", task.Id, @@ -669,13 +662,13 @@ func (x *TaskExecutor) RunTask(task *model.Task, queueData *QueueExecutionData) } if runTaskErr != nil { - // This should not happen if AnalyzeExecutionResult is working correctly, - // but handle it as a fallback for VM-level errors x.logger.Error("task execution had VM-level error", "vm_error", runTaskErr, "task_id", task.Id, "execution_id", queueData.ExecutionID) if execution.Error == "" { execution.Error = fmt.Sprintf("VM execution error: %s", runTaskErr.Error()) - execution.Status = avsproto.ExecutionStatus_EXECUTION_STATUS_FAILED + } else { + execution.Error = fmt.Sprintf("VM execution error: %s (step analysis: %s)", runTaskErr.Error(), execution.Error) } + execution.Status = avsproto.ExecutionStatus_EXECUTION_STATUS_ERROR } // batch update storage for task + execution log @@ -732,10 +725,8 @@ func (x *TaskExecutor) RunTask(task *model.Task, queueData *QueueExecutionData) switch resultStatus { case ExecutionSuccess: x.logger.Info("successfully executing task", "task_id", task.Id, "triggermark", queueData) - case ExecutionPartialSuccess: - x.logger.Info("task execution completed with partial success", "task_id", task.Id, "failed_steps", failedStepCount, "triggermark", queueData) - default: // ExecutionFailure or other - x.logger.Warn("task execution completed with step failures", "task_id", task.Id, "failed_steps", failedStepCount) + case ExecutionFailed: + x.logger.Warn("task execution completed with step failures", "task_id", task.Id, "failed_steps", failedStepCount, "triggermark", queueData) } return execution, nil diff --git a/core/taskengine/executor_test.go b/core/taskengine/executor_test.go index 43fe992d..9a32c2e3 100644 --- a/core/taskengine/executor_test.go +++ b/core/taskengine/executor_test.go @@ -3,7 +3,6 @@ package taskengine import ( "net/http" "net/http/httptest" - "strings" "testing" "time" @@ -225,14 +224,15 @@ func TestExecutorRunTaskWithBranchSilentFailureBehavior(t *testing.T) { t.Errorf("Expected no error with silent failure behavior, but got: %v", err) } - // Branch workflows with skipped nodes should report PARTIAL_SUCCESS - if execution.Status != avsproto.ExecutionStatus_EXECUTION_STATUS_PARTIAL_SUCCESS { - t.Errorf("Expected partial success status (branch path with skipped nodes), but got: %v with error: %s", execution.Status, execution.Error) + // Branch workflows with skipped nodes are SUCCESS — the workflow executed its + // chosen path correctly; skipping nodes due to branching is expected behavior. + if execution.Status != avsproto.ExecutionStatus_EXECUTION_STATUS_SUCCESS { + t.Errorf("Expected success status (branch path with skipped nodes is normal), but got: %v with error: %s", execution.Status, execution.Error) } - // Should have a partial execution message explaining the branching - if execution.Error == "" || !strings.Contains(execution.Error, "Partial execution") { - t.Errorf("Expected partial execution message, but got: %s", execution.Error) + // No error when all executed steps succeeded + if execution.Error != "" { + t.Errorf("Expected empty error for successful branch execution, but got: %s", execution.Error) } // Find the branch step regardless of ordering diff --git a/core/taskengine/partial_success_test.go b/core/taskengine/partial_success_test.go index 0268a4ed..59330ce0 100644 --- a/core/taskengine/partial_success_test.go +++ b/core/taskengine/partial_success_test.go @@ -39,12 +39,9 @@ func TestAnalyzeExecutionResult_AllSuccess(t *testing.T) { }, } - success, errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() + errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() // Verify results - if !success { - t.Errorf("Expected success=true, got success=%v", success) - } if errorMessage != "" { t.Errorf("Expected empty error message, got: %s", errorMessage) } @@ -56,8 +53,8 @@ func TestAnalyzeExecutionResult_AllSuccess(t *testing.T) { } } -// TestAnalyzeExecutionResult_PartialSuccess tests the case where some steps succeed and some fail -func TestAnalyzeExecutionResult_PartialSuccess(t *testing.T) { +// TestAnalyzeExecutionResult_SomeStepsFailed tests the case where some steps succeed and some fail +func TestAnalyzeExecutionResult_SomeStepsFailed(t *testing.T) { vm := NewVM() vm.logger = testutil.GetLogger() @@ -89,20 +86,17 @@ func TestAnalyzeExecutionResult_PartialSuccess(t *testing.T) { }, } - success, errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() + errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() // Verify results - if success { - t.Errorf("Expected success=false for partial success, got success=%v", success) - } if errorMessage == "" { - t.Errorf("Expected non-empty error message for partial success") + t.Errorf("Expected non-empty error message when some steps failed") } if failedCount != 1 { t.Errorf("Expected failedCount=1, got failedCount=%d", failedCount) } - if resultStatus != ExecutionPartialSuccess { - t.Errorf("Expected resultStatus=ExecutionPartialSuccess, got resultStatus=%v", resultStatus) + if resultStatus != ExecutionFailed { + t.Errorf("Expected resultStatus=ExecutionFailed, got resultStatus=%v", resultStatus) } // Check that error message contains failure information @@ -139,20 +133,17 @@ func TestAnalyzeExecutionResult_AllFailure(t *testing.T) { }, } - success, errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() + errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() // Verify results - if success { - t.Errorf("Expected success=false for all failures, got success=%v", success) - } if errorMessage == "" { t.Errorf("Expected non-empty error message for all failures") } if failedCount != 3 { t.Errorf("Expected failedCount=3, got failedCount=%d", failedCount) } - if resultStatus != ExecutionFailure { - t.Errorf("Expected resultStatus=ExecutionFailure, got resultStatus=%v", resultStatus) + if resultStatus != ExecutionFailed { + t.Errorf("Expected resultStatus=ExecutionFailed, got resultStatus=%v", resultStatus) } // Check that error message contains failure information @@ -170,25 +161,22 @@ func TestAnalyzeExecutionResult_NoSteps(t *testing.T) { // No execution logs vm.ExecutionLogs = []*avsproto.Execution_Step{} - success, errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() + errorMessage, failedCount, resultStatus := vm.AnalyzeExecutionResult() // Verify results - if success { - t.Errorf("Expected success=false for no steps, got success=%v", success) - } if errorMessage != "no execution steps found" { t.Errorf("Expected specific error message for no steps, got: %s", errorMessage) } if failedCount != 0 { t.Errorf("Expected failedCount=0 for no steps, got failedCount=%d", failedCount) } - if resultStatus != ExecutionFailure { - t.Errorf("Expected resultStatus=ExecutionFailure for no steps, got resultStatus=%v", resultStatus) + if resultStatus != ExecutionFailed { + t.Errorf("Expected resultStatus=ExecutionFailed for no steps, got resultStatus=%v", resultStatus) } } -// TestGetExecutionStatus_PartialSuccess tests the GetExecutionStatus method for partial success -func TestGetExecutionStatus_PartialSuccess(t *testing.T) { +// TestGetExecutionStatus_StepFailures tests the GetExecutionStatus method when some steps fail +func TestGetExecutionStatus_StepFailures(t *testing.T) { // Set up test database and engine db := testutil.TestMustDB() defer storage.Destroy(db.(*storage.BadgerStorage)) @@ -209,13 +197,13 @@ func TestGetExecutionStatus_PartialSuccess(t *testing.T) { }, } - // Create execution with partial success (some steps succeed, some fail) + // Create execution where some steps succeed and some fail execution := &avsproto.Execution{ Id: "test-execution-id", StartAt: time.Now().UnixMilli(), EndAt: time.Now().UnixMilli(), - Status: avsproto.ExecutionStatus_EXECUTION_STATUS_PARTIAL_SUCCESS, // Overall status is partial success - Error: "Partial success: 1 of 3 steps failed: Database Query", + Status: avsproto.ExecutionStatus_EXECUTION_STATUS_FAILED, + Error: "1 of 3 steps failed: Database Query", Index: 0, // First execution Steps: []*avsproto.Execution_Step{ { @@ -269,9 +257,9 @@ func TestGetExecutionStatus_PartialSuccess(t *testing.T) { t.Fatalf("GetExecutionStatus failed: %v", err) } - // Verify that it returns PARTIAL_SUCCESS status - if statusResp.Status != avsproto.ExecutionStatus_EXECUTION_STATUS_PARTIAL_SUCCESS { - t.Errorf("Expected EXECUTION_STATUS_PARTIAL_SUCCESS, got %v", statusResp.Status) + // Verify that it returns FAILED status (some steps failed) + if statusResp.Status != avsproto.ExecutionStatus_EXECUTION_STATUS_FAILED { + t.Errorf("Expected EXECUTION_STATUS_FAILED, got %v", statusResp.Status) } } diff --git a/core/taskengine/vm.go b/core/taskengine/vm.go index ec33e584..cbbfb4f0 100644 --- a/core/taskengine/vm.go +++ b/core/taskengine/vm.go @@ -3208,12 +3208,12 @@ func (v *VM) createExecutionStep(nodeId string, success bool, errorMsg string, l type ExecutionResultStatus int const ( - // ExecutionSuccess indicates all steps completed successfully + // ExecutionSuccess indicates all executed steps succeeded (includes branch/conditional skips) ExecutionSuccess ExecutionResultStatus = iota - // ExecutionPartialSuccess indicates some steps succeeded but at least one failed - ExecutionPartialSuccess - // ExecutionFailure indicates execution failed (all steps failed or critical failure) - ExecutionFailure + // ExecutionFailed indicates one or more node-level steps failed during execution + ExecutionFailed + // ExecutionError indicates a system-level failure (VM could not run the workflow) + ExecutionError ) // getStepDisplayName extracts the display name for a step, preferring the name over ID @@ -3225,77 +3225,36 @@ func getStepDisplayName(step *avsproto.Execution_Step) string { return stepName } -// AnalyzeExecutionResult examines all execution steps and determines overall success/failure/partial status -// Returns (success, errorMessage, failedStepCount, resultStatus) -func (v *VM) AnalyzeExecutionResult() (bool, string, int, ExecutionResultStatus) { +// AnalyzeExecutionResult examines all execution steps and determines overall success/failure status. +// Returns (errorMessage, failedStepCount, resultStatus) +func (v *VM) AnalyzeExecutionResult() (string, int, ExecutionResultStatus) { v.mu.Lock() defer v.mu.Unlock() if len(v.ExecutionLogs) == 0 { - return false, "no execution steps found", 0, ExecutionFailure + return "no execution steps found", 0, ExecutionFailed } var failedStepNames []string - var successfulStepNames []string - var firstErrorMessage string for _, step := range v.ExecutionLogs { - stepName := getStepDisplayName(step) - if !step.Success && step.Error != "" { - if firstErrorMessage == "" { - firstErrorMessage = step.Error - } - failedStepNames = append(failedStepNames, stepName) - } else if step.Success { - successfulStepNames = append(successfulStepNames, stepName) + failedStepNames = append(failedStepNames, getStepDisplayName(step)) } } failedCount := len(failedStepNames) - totalSteps := len(v.ExecutionLogs) - - // Determine execution status and success flag - var resultStatus ExecutionResultStatus - var success bool - var errorMessage string if failedCount == 0 { - // All executed steps succeeded. However, if not all workflow steps executed - // (e.g., due to branch selections or conditional skips), report PARTIAL_SUCCESS - // to reflect that the workflow did not traverse all configured nodes. - executedCount := len(v.ExecutionLogs) - totalWorkflowSteps := 1 + len(v.TaskNodes) // 1 trigger + all nodes - if v.GetTaskId() == "" && len(v.TaskNodes) == 1 { - // single-node immediate execution - totalWorkflowSteps = 1 - } - - if executedCount < totalWorkflowSteps { - resultStatus = ExecutionPartialSuccess - success = false // do not mark full success when nodes were skipped - errorMessage = fmt.Sprintf("Partial execution: %d out of %d steps executed (branch/conditional path)", executedCount, totalWorkflowSteps) - } else { - // All steps that exist in the workflow executed and succeeded - resultStatus = ExecutionSuccess - success = true - errorMessage = "" - } - } else if failedCount > 0 { - // Distinguish between all failed vs some failed (for internal status tracking) - if failedCount == totalSteps { - // All steps failed - resultStatus = ExecutionFailure - } else { - // Some steps succeeded, some failed - partial success for internal tracking - resultStatus = ExecutionPartialSuccess - } - success = false - // Use simple error message format (no prefix) for both cases - errorMessage = formatExecutionErrorMessage("", failedCount, totalSteps, failedStepNames) + // All executed steps succeeded. Branch/conditional skips are normal + // workflow behavior and count as SUCCESS — the workflow did what it + // was configured to do. + return "", 0, ExecutionSuccess } - return success, errorMessage, failedCount, resultStatus + // One or more steps failed (covers both partial and total failure). + errorMessage := formatExecutionErrorMessage("", failedCount, len(v.ExecutionLogs), failedStepNames) + return errorMessage, failedCount, ExecutionFailed } // CalculateTotalGasCost sums up gas costs from all execution steps that involve blockchain operations @@ -3366,10 +3325,10 @@ func convertToExecutionStatus(resultStatus ExecutionResultStatus) avsproto.Execu switch resultStatus { case ExecutionSuccess: return avsproto.ExecutionStatus_EXECUTION_STATUS_SUCCESS - case ExecutionPartialSuccess: - return avsproto.ExecutionStatus_EXECUTION_STATUS_PARTIAL_SUCCESS - case ExecutionFailure: + case ExecutionFailed: return avsproto.ExecutionStatus_EXECUTION_STATUS_FAILED + case ExecutionError: + return avsproto.ExecutionStatus_EXECUTION_STATUS_ERROR default: return avsproto.ExecutionStatus_EXECUTION_STATUS_UNSPECIFIED } diff --git a/core/taskengine/vm_runner_rest.go b/core/taskengine/vm_runner_rest.go index db2ac8ba..c37902ef 100644 --- a/core/taskengine/vm_runner_rest.go +++ b/core/taskengine/vm_runner_rest.go @@ -902,18 +902,17 @@ func (r *RestProcessor) Execute(stepID string, node *avsproto.RestAPINode) (*avs var resultStatus ExecutionResultStatus var statusText, statusBgColor, statusTextColor string if failed { - resultStatus = ExecutionFailure + resultStatus = ExecutionFailed statusText = fmt.Sprintf("but failed at the '%s' step due to %s.", safeName(failedName), firstLine(failedReason)) statusBgColor = "#FEE2E2" // light red statusTextColor = "#991B1B" // dark red - } else if skippedCount > 0 { - resultStatus = ExecutionPartialSuccess - statusText = fmt.Sprintf("but %d nodes were skipped due to Branch condition.", skippedCount) - statusBgColor = "#FEF3C7" // light yellow - statusTextColor = "#92400E" // dark yellow/amber } else { resultStatus = ExecutionSuccess - statusText = "All steps completed successfully" + if skippedCount > 0 { + statusText = fmt.Sprintf("All steps completed successfully (%d nodes skipped by Branch condition).", skippedCount) + } else { + statusText = "All steps completed successfully" + } statusBgColor = "#D1FAE5" // light green statusTextColor = "#065F46" // dark green } @@ -923,9 +922,7 @@ func (r *RestProcessor) Execute(stepID string, node *avsproto.RestAPINode) (*avs switch resultStatus { case ExecutionSuccess: iconSvg = `` - case ExecutionPartialSuccess: - iconSvg = `` - case ExecutionFailure: + case ExecutionFailed: iconSvg = `` } statusHtml := fmt.Sprintf( @@ -944,9 +941,7 @@ func (r *RestProcessor) Execute(stepID string, node *avsproto.RestAPINode) (*avs switch resultStatus { case ExecutionSuccess: subjectStatusText = "successfully completed" - case ExecutionPartialSuccess: - subjectStatusText = "partially executed" - case ExecutionFailure: + case ExecutionFailed: subjectStatusText = "failed to execute" } diff --git a/core/taskengine/vm_scheduler_fix_test.go b/core/taskengine/vm_scheduler_fix_test.go index 51c6d439..03972c96 100644 --- a/core/taskengine/vm_scheduler_fix_test.go +++ b/core/taskengine/vm_scheduler_fix_test.go @@ -246,13 +246,11 @@ func TestSchedulerExecutesNodeAfterBranch(t *testing.T) { require.NoError(t, err) require.NotNil(t, execution) - // Verify execution was partially successful (branch path means not all nodes executed) - // When a branch workflow executes, not all configured nodes run (only one branch path), - // which correctly results in PARTIAL_SUCCESS status - assert.Equal(t, avsproto.ExecutionStatus_EXECUTION_STATUS_PARTIAL_SUCCESS, execution.Status, - "Branch workflows should report PARTIAL_SUCCESS when not all nodes execute") - assert.Contains(t, execution.Error, "Partial execution", "Should report partial execution due to branch path") - assert.Contains(t, execution.Error, "6 out of 7 steps executed", "Should show correct step counts") + // Branch workflows with skipped nodes are SUCCESS — the workflow executed its + // chosen path correctly; skipping nodes due to branching is expected behavior. + assert.Equal(t, avsproto.ExecutionStatus_EXECUTION_STATUS_SUCCESS, execution.Status, + "Branch workflows should report SUCCESS even when not all nodes execute") + assert.Empty(t, execution.Error, "No error expected when all executed steps succeeded") // Debug: print what executed t.Logf("Executed %d steps:", len(execution.Steps)) diff --git a/docs/changes/0001-execution-status-redesign.md b/docs/changes/0001-execution-status-redesign.md new file mode 100644 index 00000000..dd90efe2 --- /dev/null +++ b/docs/changes/0001-execution-status-redesign.md @@ -0,0 +1,92 @@ +# Execution Status Redesign: Replace PARTIAL_SUCCESS with SUCCESS/FAILED/ERROR + +- **Date**: 2026-04-13 +- **Status**: Implemented +- **Branch**: `fix/remove-dead-success-bool` + +## Context + +The `EXECUTION_STATUS_PARTIAL_SUCCESS` enum was used for two unrelated scenarios: + +1. **Branch skips** — the workflow has conditional branches; some nodes were not + executed because the branch condition routed elsewhere. Nothing failed. +2. **Step failures** — one or more nodes actually failed during execution + (e.g., an ERC-20 transfer reverted). + +Clients had to work around this by inspecting every step individually: + +```ts +const isConditionalSkip = + status === ExecutionStatus.PartialSuccess && + steps.every((step) => step.success); +``` + +This workaround should no longer be necessary. + +## Decision + +Three execution statuses, orthogonal to step count: + +| Scenario | Status | `steps.length` vs task node count | `execution.error` | +|---------------------------------|-----------|-----------------------------------|------------------------------------| +| All nodes ran, all succeeded | `SUCCESS` | equal | empty | +| Branch skipped nodes, all OK | `SUCCESS` | less than total | empty | +| Some nodes failed | `FAILED` | any | `"N of M steps failed: node1, …"` | +| All nodes failed | `FAILED` | equal | `"N of N steps failed: node1, …"` | +| No steps executed | `FAILED` | zero | `"no execution steps found"` | +| System-level failure (VM crash) | `ERROR` | zero (or partial if crash mid-run) | `"VM execution error: …"` | + +**How to determine what happened:** + +- **`status`** answers: did the workflow succeed? + - `SUCCESS` — yes, every executed step passed. Branch skips are normal. + - `FAILED` — no, at least one step failed. Check `execution.error` and + individual `step.success` / `step.error` for details. + - `ERROR` — the system could not run the workflow at all (compilation + failure, VM crash). This is not a user-fixable workflow issue. + +- **`steps` array** answers: what ran and what was skipped? + - Compare `steps.length` against the task's total node count to know + how many nodes were skipped by branching. + - Each step has `success`, `error`, and `name` for per-node detail. + +- **`execution.error`** answers: what went wrong? + - Empty string when `status` is `SUCCESS`. + - Contains a summary like `"1 of 5 steps failed: loop1"` when `FAILED`. + - Contains the system error message when `ERROR`. + +## Proto Changes + +```protobuf +enum ExecutionStatus { + EXECUTION_STATUS_UNSPECIFIED = 0; + EXECUTION_STATUS_PENDING = 1; + EXECUTION_STATUS_SUCCESS = 2; + EXECUTION_STATUS_FAILED = 3; + reserved 4; + reserved "EXECUTION_STATUS_PARTIAL_SUCCESS"; + EXECUTION_STATUS_ERROR = 5; +} +``` + +- Enum value `4` is reserved and will not be reused. +- New enum value `ERROR = 5` for system-level failures. + +## SDK/Client Migration + +1. Remove any `PartialSuccess` handling or `isConditionalSkip` workarounds. +2. Treat `SUCCESS` as the only positive outcome. Branch skips no longer + produce a warning status. +3. Treat `FAILED` as the single status for any node-level execution failure, + regardless of whether some or all steps failed. +4. Treat `ERROR` as a system-level problem (not caused by the workflow + configuration itself). + +## Consequences + +- Branch-skip workflows stop surfacing as warnings in the UI. +- The `steps` array is the source of truth for what executed and what + was skipped — no status-level signal needed for coverage. +- Email summaries for branch-skip workflows now show a green success + badge with a note like "3 nodes skipped by Branch condition" instead + of a yellow warning badge. diff --git a/pkg/erc4337/preset/builder_execution_success_test.go b/pkg/erc4337/preset/builder_execution_success_test.go index f88de385..ea8437a0 100644 --- a/pkg/erc4337/preset/builder_execution_success_test.go +++ b/pkg/erc4337/preset/builder_execution_success_test.go @@ -84,31 +84,37 @@ func TestUserOpWithdrawalSkipsReimbursementWhenBalanceInsufficient(t *testing.T) reserve := big.NewInt(100000000000000) // 0.0001 ETH withdrawalAmount := new(big.Int).Sub(balance, reserve) - calldata, err := aa.PackExecute(secondaryWallet, withdrawalAmount, []byte{}) - require.NoError(t, err, "Failed to pack execute calldata") + // If balance is already below the reserve, the precondition (insufficient + // funds for reimbursement) is already satisfied — skip the withdrawal. + if withdrawalAmount.Sign() > 0 { + calldata, err := aa.PackExecute(secondaryWallet, withdrawalAmount, []byte{}) + require.NoError(t, err, "Failed to pack execute calldata") - paymasterRequest := GetVerifyingPaymasterRequestForDuration( - smartWalletConfig.PaymasterAddress, - 15*time.Minute, - ) + paymasterRequest := GetVerifyingPaymasterRequestForDuration( + smartWalletConfig.PaymasterAddress, + 15*time.Minute, + ) - // Withdrawal should succeed — system skips reimbursement when balance is insufficient - userOp, receipt, err := SendUserOp( - smartWalletConfig, - owner, - calldata, - paymasterRequest, - &primaryWallet, - nil, - nil, // executionFeeWei - nil, - ) - require.NoError(t, err, "Withdrawal should succeed even without reimbursement") - require.NotNil(t, userOp, "UserOp should be built") - if receipt == nil { - t.Skip("UserOp sent but receipt not available (confirmation timeout)") + // Withdrawal should succeed — system skips reimbursement when balance is insufficient + userOp, receipt, err := SendUserOp( + smartWalletConfig, + owner, + calldata, + paymasterRequest, + &primaryWallet, + nil, + nil, // executionFeeWei + nil, + ) + require.NoError(t, err, "Withdrawal should succeed even without reimbursement") + require.NotNil(t, userOp, "UserOp should be built") + if receipt == nil { + t.Skip("UserOp sent but receipt not available (confirmation timeout)") + } + t.Logf("Withdrawal succeeded. TX Hash: %s Gas used: %d", receipt.TxHash.Hex(), receipt.GasUsed) + } else { + t.Skipf("Balance already below reserve (%s < %s), withdrawal precondition already met", balance.String(), reserve.String()) } - t.Logf("Withdrawal succeeded. TX Hash: %s Gas used: %d", receipt.TxHash.Hex(), receipt.GasUsed) // Send the funds back from the secondary wallet to the primary wallet secondaryBalance, err := client.BalanceAt(context.Background(), secondaryWallet, nil) diff --git a/protobuf/avs.pb.go b/protobuf/avs.pb.go index a2411e83..9370ded7 100644 --- a/protobuf/avs.pb.go +++ b/protobuf/avs.pb.go @@ -547,15 +547,21 @@ func (TaskStatus) EnumDescriptor() ([]byte, []int) { return file_avs_proto_rawDescGZIP(), []int{6} } -// Execution Status re-present a run of the task +// ExecutionStatus represents the outcome of a task execution. +// +// SUCCESS – every executed step succeeded (includes branch/conditional skips). +// FAILED – one or more node-level steps failed during execution. +// ERROR – system-level failure; the VM could not run the workflow at all. +// +// Value 4 (formerly PARTIAL_SUCCESS) is reserved and must not be reused. type ExecutionStatus int32 const ( - ExecutionStatus_EXECUTION_STATUS_UNSPECIFIED ExecutionStatus = 0 - ExecutionStatus_EXECUTION_STATUS_PENDING ExecutionStatus = 1 - ExecutionStatus_EXECUTION_STATUS_SUCCESS ExecutionStatus = 2 - ExecutionStatus_EXECUTION_STATUS_FAILED ExecutionStatus = 3 - ExecutionStatus_EXECUTION_STATUS_PARTIAL_SUCCESS ExecutionStatus = 4 + ExecutionStatus_EXECUTION_STATUS_UNSPECIFIED ExecutionStatus = 0 + ExecutionStatus_EXECUTION_STATUS_PENDING ExecutionStatus = 1 + ExecutionStatus_EXECUTION_STATUS_SUCCESS ExecutionStatus = 2 + ExecutionStatus_EXECUTION_STATUS_FAILED ExecutionStatus = 3 + ExecutionStatus_EXECUTION_STATUS_ERROR ExecutionStatus = 5 ) // Enum value maps for ExecutionStatus. @@ -565,14 +571,14 @@ var ( 1: "EXECUTION_STATUS_PENDING", 2: "EXECUTION_STATUS_SUCCESS", 3: "EXECUTION_STATUS_FAILED", - 4: "EXECUTION_STATUS_PARTIAL_SUCCESS", + 5: "EXECUTION_STATUS_ERROR", } ExecutionStatus_value = map[string]int32{ - "EXECUTION_STATUS_UNSPECIFIED": 0, - "EXECUTION_STATUS_PENDING": 1, - "EXECUTION_STATUS_SUCCESS": 2, - "EXECUTION_STATUS_FAILED": 3, - "EXECUTION_STATUS_PARTIAL_SUCCESS": 4, + "EXECUTION_STATUS_UNSPECIFIED": 0, + "EXECUTION_STATUS_PENDING": 1, + "EXECUTION_STATUS_SUCCESS": 2, + "EXECUTION_STATUS_FAILED": 3, + "EXECUTION_STATUS_ERROR": 5, } ) @@ -10290,13 +10296,13 @@ const file_avs_proto_rawDesc = "" + "\n" + "\x06Failed\x10\x02\x12\v\n" + "\aRunning\x10\x04\x12\f\n" + - "\bDisabled\x10\x05*\xb2\x01\n" + + "\bDisabled\x10\x05*\xd0\x01\n" + "\x0fExecutionStatus\x12 \n" + "\x1cEXECUTION_STATUS_UNSPECIFIED\x10\x00\x12\x1c\n" + "\x18EXECUTION_STATUS_PENDING\x10\x01\x12\x1c\n" + "\x18EXECUTION_STATUS_SUCCESS\x10\x02\x12\x1b\n" + - "\x17EXECUTION_STATUS_FAILED\x10\x03\x12$\n" + - " EXECUTION_STATUS_PARTIAL_SUCCESS\x10\x042\xd5\x10\n" + + "\x17EXECUTION_STATUS_FAILED\x10\x03\x12\x1a\n" + + "\x16EXECUTION_STATUS_ERROR\x10\x05\"\x04\b\x04\x10\x04* EXECUTION_STATUS_PARTIAL_SUCCESS2\xd5\x10\n" + "\n" + "Aggregator\x126\n" + "\x06GetKey\x12\x15.aggregator.GetKeyReq\x1a\x13.aggregator.KeyResp\"\x00\x12]\n" + diff --git a/protobuf/avs.proto b/protobuf/avs.proto index c5c2aa96..243802c1 100644 --- a/protobuf/avs.proto +++ b/protobuf/avs.proto @@ -349,13 +349,21 @@ enum TaskStatus { Disabled = 5; } -// Execution Status re-present a run of the task +// ExecutionStatus represents the outcome of a task execution. +// +// SUCCESS – every executed step succeeded (includes branch/conditional skips). +// FAILED – one or more node-level steps failed during execution. +// ERROR – system-level failure; the VM could not run the workflow at all. +// +// Value 4 (formerly PARTIAL_SUCCESS) is reserved and must not be reused. enum ExecutionStatus { EXECUTION_STATUS_UNSPECIFIED = 0; EXECUTION_STATUS_PENDING = 1; EXECUTION_STATUS_SUCCESS = 2; EXECUTION_STATUS_FAILED = 3; - EXECUTION_STATUS_PARTIAL_SUCCESS = 4; + reserved 4; + reserved "EXECUTION_STATUS_PARTIAL_SUCCESS"; + EXECUTION_STATUS_ERROR = 5; }