diff --git a/packages/api/internal/orchestrator/autoresume_test.go b/packages/api/internal/orchestrator/autoresume_test.go index 362ea2a7aa..034698053c 100644 --- a/packages/api/internal/orchestrator/autoresume_test.go +++ b/packages/api/internal/orchestrator/autoresume_test.go @@ -113,6 +113,7 @@ func TestHandleExistingSandboxAutoResume(t *testing.T) { require.NoError(t, err) assert.False(t, alreadyDone) require.NotNil(t, finish) + finish(t.Context(), nil) pausingSandbox, err := o.GetSandbox(t.Context(), sbx.TeamID, sbx.SandboxID) @@ -125,7 +126,7 @@ func TestHandleExistingSandboxAutoResume(t *testing.T) { assert.ErrorIs(t, err, ErrSandboxStillTransitioning) }) - t.Run("pausing sandbox wait failure returns internal error", func(t *testing.T) { + t.Run("concurrently pausing sandbox returns internal error", func(t *testing.T) { t.Parallel() o := newTestAutoResumeOrchestrator() @@ -136,10 +137,15 @@ func TestHandleExistingSandboxAutoResume(t *testing.T) { require.NoError(t, err) assert.False(t, alreadyDone) require.NotNil(t, finish) - finish(t.Context(), errors.New("boom")) pausingSandbox, err := o.GetSandbox(t.Context(), sbx.TeamID, sbx.SandboxID) require.NoError(t, err) + assert.Equal(t, sandbox.StatePausing, pausingSandbox.State) + + go func() { + time.Sleep(50 * time.Millisecond) + finish(t.Context(), errors.New("boom")) + }() _, handled, err := o.HandleExistingSandboxAutoResume(t.Context(), sbx.TeamID, sbx.SandboxID, pausingSandbox, time.Minute) require.Error(t, err) diff --git a/packages/api/internal/orchestrator/client.go b/packages/api/internal/orchestrator/client.go index 918680c77e..93082bb19f 100644 --- a/packages/api/internal/orchestrator/client.go +++ b/packages/api/internal/orchestrator/client.go @@ -212,6 +212,12 @@ func (o *Orchestrator) discoverClusterNode(ctx context.Context, clusterID uuid.U ctx, span := tracer.Start(ctx, "discover-cluster-node") defer span.End() + if o.clusters == nil { + logger.L().Error(ctx, "Cluster pool not initialized during on-demand node discovery", logger.WithClusterID(clusterID)) + + return + } + cluster, found := o.clusters.GetClusterById(clusterID) if !found { logger.L().Error(ctx, "Cluster not found during on-demand node discovery", logger.WithClusterID(clusterID)) diff --git a/packages/api/internal/orchestrator/delete_instance.go b/packages/api/internal/orchestrator/delete_instance.go index 729bb4c450..578161de15 100644 --- a/packages/api/internal/orchestrator/delete_instance.go +++ b/packages/api/internal/orchestrator/delete_instance.go @@ -7,6 +7,8 @@ import ( "github.com/google/uuid" "go.uber.org/zap" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" "github.com/e2b-dev/infra/packages/api/internal/orchestrator/nodemanager" "github.com/e2b-dev/infra/packages/api/internal/sandbox" @@ -17,7 +19,7 @@ import ( sbxlogger "github.com/e2b-dev/infra/packages/shared/pkg/logger/sandbox" ) -func (o *Orchestrator) RemoveSandbox(ctx context.Context, teamID uuid.UUID, sandboxID string, opts sandbox.RemoveOpts) error { +func (o *Orchestrator) RemoveSandbox(ctx context.Context, teamID uuid.UUID, sandboxID string, opts sandbox.RemoveOpts) (err error) { ctx, span := tracer.Start(ctx, "remove-sandbox") defer span.End() @@ -83,15 +85,18 @@ func (o *Orchestrator) RemoveSandbox(ctx context.Context, teamID uuid.UUID, sand return nil } - defer func() { go o.analyticsRemove(context.WithoutCancel(ctx), sbx, opts.Action) }() - defer o.sandboxStore.Remove(ctx, teamID, sandboxID) err = o.removeSandboxFromNode(ctx, sbx, opts.Action) - if err != nil { - logger.L().Error(ctx, "Error pausing sandbox", zap.Error(err), logger.WithSandboxID(sbx.SandboxID)) + if errors.Is(err, ErrSandboxNotFound) { + logger.L().Warn(ctx, "Sandbox not found during removal, treating as not found", zap.Error(err), logger.WithSandboxID(sbx.SandboxID)) + } else if err != nil { + logger.L().Error(ctx, "Error removing sandbox from node", zap.Error(err), logger.WithSandboxID(sbx.SandboxID)) return ErrSandboxOperationFailed } + o.sandboxStore.Remove(context.WithoutCancel(ctx), teamID, sandboxID) + go o.analyticsRemove(context.WithoutCancel(ctx), sbx, opts.Action) + return nil } @@ -154,6 +159,11 @@ func (o *Orchestrator) killSandboxOnNode(ctx context.Context, node *nodemanager. client, ctx := node.GetSandboxDeleteCtx(ctx, sbx.SandboxID, sbx.ExecutionID) _, err := client.Sandbox.Delete(ctx, req) if err != nil { + grpcErr, ok := status.FromError(err) + if ok && grpcErr.Code() == codes.NotFound { + return ErrSandboxNotFound + } + return fmt.Errorf("failed to delete sandbox '%s': %w", sbx.SandboxID, err) } diff --git a/packages/api/internal/orchestrator/pause_instance.go b/packages/api/internal/orchestrator/pause_instance.go index 09b4e90f43..8acc9097a3 100644 --- a/packages/api/internal/orchestrator/pause_instance.go +++ b/packages/api/internal/orchestrator/pause_instance.go @@ -45,6 +45,12 @@ func (o *Orchestrator) pauseSandbox(ctx context.Context, node *nodemanager.Node, return PauseQueueExhaustedError{} } + if errors.Is(err, ErrSandboxNotFound) { + telemetry.ReportCriticalError(ctx, "sandbox not found when pausing", err) + + return ErrSandboxNotFound + } + if err != nil && !errors.Is(err, PauseQueueExhaustedError{}) { telemetry.ReportCriticalError(ctx, "error pausing sandbox", err) @@ -97,6 +103,10 @@ func snapshotInstance(ctx context.Context, node *nodemanager.Node, sbx sandbox.S return PauseQueueExhaustedError{} } + if st.Code() == codes.NotFound { + return ErrSandboxNotFound + } + return fmt.Errorf("failed to pause sandbox '%s': %w", sbx.SandboxID, err) } diff --git a/packages/api/internal/sandbox/storage/memory/operations.go b/packages/api/internal/sandbox/storage/memory/operations.go index 6d0834c5f9..62706396a3 100644 --- a/packages/api/internal/sandbox/storage/memory/operations.go +++ b/packages/api/internal/sandbox/storage/memory/operations.go @@ -172,14 +172,14 @@ func startRemoving(ctx context.Context, sbx *memorySandbox, opts sandbox.RemoveO } } + originalState := sbx._data.State newState := opts.Action.TargetState if transition != nil { - currentState := sbx._data.State sbx.mu.Unlock() - if currentState != newState && !sandbox.AllowedTransitions[currentState][newState] { - return false, nil, &sandbox.InvalidStateTransitionError{CurrentState: currentState, TargetState: newState} + if originalState != newState && !sandbox.AllowedTransitions[originalState][newState] { + return false, nil, &sandbox.InvalidStateTransitionError{CurrentState: originalState, TargetState: newState} } logger.L().Debug(ctx, "State transition already in progress to the same state, waiting", logger.WithSandboxID(sbx.SandboxID()), zap.String("state", string(newState))) @@ -190,9 +190,9 @@ func startRemoving(ctx context.Context, sbx *memorySandbox, opts sandbox.RemoveO // If the transition is to the same state just wait switch { - case currentState == newState: + case originalState == newState: return true, func(context.Context, error) {}, nil - case sandbox.AllowedTransitions[currentState][newState]: + case sandbox.AllowedTransitions[originalState][newState]: return startRemoving(ctx, sbx, sandbox.RemoveOpts{Action: opts.Action}) default: return false, nil, fmt.Errorf("unexpected state transition") @@ -238,11 +238,11 @@ func startRemoving(ctx context.Context, sbx *memorySandbox, opts sandbox.RemoveO } if err != nil { - // Keep the transition in place so the error stays - return + // Revert the state change if the transition failed and it's not a transient transition + sbx._data.State = originalState } - // The transition is completed and the next transition can be started + // Remove the transition so the next transition can be started sbx.transition = nil } diff --git a/packages/api/internal/sandbox/storage/memory/operations_test.go b/packages/api/internal/sandbox/storage/memory/operations_test.go index 7d995e8ce4..38d140056b 100644 --- a/packages/api/internal/sandbox/storage/memory/operations_test.go +++ b/packages/api/internal/sandbox/storage/memory/operations_test.go @@ -229,19 +229,21 @@ func TestStartRemoving_Error(t *testing.T) { assert.False(t, alreadyDone2) assert.Nil(t, finish2) - // From Failed state, no transitions are allowed + // Failed transition should be cleared so subsequent transitions can proceed. alreadyDone3, finish3, err3 := startRemoving(ctx, sbx, sandbox.RemoveOpts{Action: sandbox.StateActionPause}) - require.Error(t, err3) - require.ErrorIs(t, err3, failureErr) + require.NoError(t, err3) assert.False(t, alreadyDone3) - assert.Nil(t, finish3) + require.NotNil(t, finish3) + finish3(ctx, nil) + assert.Equal(t, sandbox.StatePausing, sbx.State()) - // Trying to transition to Killed should also fail + // Follow-up transition should also work. alreadyDone4, finish4, err4 := startRemoving(ctx, sbx, sandbox.RemoveOpts{Action: sandbox.StateActionKill}) - require.Error(t, err4) - require.ErrorIs(t, err4, failureErr) + require.NoError(t, err4) assert.False(t, alreadyDone4) - assert.Nil(t, finish4) + require.NotNil(t, finish4) + finish4(ctx, nil) + assert.Equal(t, sandbox.StateKilling, sbx.State()) } // Test context timeout during wait