diff --git a/cli/azd/cmd/middleware/error.go b/cli/azd/cmd/middleware/error.go index 3afe57ffff7..f2c723dd9a7 100644 --- a/cli/azd/cmd/middleware/error.go +++ b/cli/azd/cmd/middleware/error.go @@ -29,6 +29,7 @@ import ( "github.com/azure/azure-dev/cli/azd/pkg/environment/azdcontext" "github.com/azure/azure-dev/cli/azd/pkg/errorhandler" "github.com/azure/azure-dev/cli/azd/pkg/extensions" + "github.com/azure/azure-dev/cli/azd/pkg/infra/provisioning" "github.com/azure/azure-dev/cli/azd/pkg/infra/provisioning/bicep" "github.com/azure/azure-dev/cli/azd/pkg/input" "github.com/azure/azure-dev/cli/azd/pkg/output" @@ -85,6 +86,11 @@ func shouldSkipAgentHandling(err error) bool { errors.Is(err, consent.ErrElicitationDenied) || errors.Is(err, consent.ErrSamplingDenied) || errors.Is(err, internal.ErrAbortedByUser) || + errors.Is(err, provisioning.ErrDeploymentInterruptedLeaveRunning) || + errors.Is(err, provisioning.ErrDeploymentCanceledByUser) || + errors.Is(err, provisioning.ErrDeploymentCancelTimeout) || + errors.Is(err, provisioning.ErrDeploymentCancelTooLate) || + errors.Is(err, provisioning.ErrDeploymentCancelFailed) || errors.Is(err, environment.ErrNotFound) || errors.Is(err, environment.ErrNameNotSpecified) || diff --git a/cli/azd/docs/provision-cancellation.md b/cli/azd/docs/provision-cancellation.md new file mode 100644 index 00000000000..b8a81199a17 --- /dev/null +++ b/cli/azd/docs/provision-cancellation.md @@ -0,0 +1,64 @@ +# Provision cancellation (Ctrl+C) + +When `azd provision` (or `azd up`) submits a Bicep deployment to Azure, the +deployment runs asynchronously on the Azure side. If the user presses +Ctrl+C while azd is waiting for that deployment to +finish, azd will pause and ask what to do instead of exiting immediately. + +## Behavior + +1. azd stops the live progress reporter and presents an interactive prompt + that includes the Azure portal URL of the running deployment. +2. 
The user picks one of: + - **Leave the Azure deployment running and stop azd** (default). azd + exits with a non-zero status; the Azure deployment continues to + completion. The user can monitor or cancel it from the portal link. + - **Cancel the Azure deployment**. azd submits an ARM cancel request + against the deployment and waits up to 2 minutes for Azure to confirm a + terminal state (`Canceled`, `Failed`, or `Succeeded`). +3. Additional Ctrl+C presses while the prompt is + showing (or while a cancel request is in flight) are ignored so the user + can finish reading and choose deliberately. + +## Outcomes when "Cancel" is selected + +| Outcome | When | +|---------|------| +| Cancellation confirmed | Azure transitions the deployment to `Canceled` within the wait budget. azd exits non-zero with a clear message. | +| Cancel arrived too late | Azure reports the deployment finished (`Succeeded` / `Failed`) before the cancel request took effect. azd surfaces the final state plus the portal URL. | +| Cancel still pending | Azure does not reach a terminal state within the wait budget. azd warns that cancellation is still in progress and prints the portal URL. | +| Cancel request failed | The ARM `Cancel` API itself returned an error. azd prints the error and the portal URL. | + +When the deployment URL is available, azd prints it so the user can follow +up manually from the browser. The URL is omitted if azd was unable to +resolve it (for example, when the ARM service is unreachable). + +## Provider scope + +| Provider | Behavior on Ctrl+C during provision | +|---------|--------------------------------------| +| Bicep (subscription scope) | Interactive prompt (described above). | +| Bicep (resource group scope) | Interactive prompt (described above). | +| Deployment Stacks | Currently treated as "leave running" — the stacks ARM API does not expose a per-deployment cancel surface today. 
| +| Terraform | Unchanged: the Terraform CLI does not expose a safe per-apply cancel; pressing Ctrl+C exits azd and Terraform handles its own teardown. | +## Telemetry + +A `provision.cancellation` attribute is recorded on the provisioning span +with one of: + +- `none` — provisioning finished (successfully or with an error) without an interrupt being handled. +- `leave_running` — user chose to let the Azure deployment continue (also recorded when the prompt cannot be shown or the provider does not support cancel). +- `canceled` — cancel request succeeded and Azure reached `Canceled`. +- `cancel_too_late` — Azure reached `Succeeded` / `Failed` before cancel + took effect. +- `cancel_timed_out` — Azure did not reach a terminal state within the + wait budget. +- `cancel_failed` — the ARM `Cancel` API call itself returned an error. + +## Non-interactive mode + +If azd is running without a TTY (e.g. CI), the prompt cannot be displayed. +In that case azd defaults to **leave running** behavior so that an +unattended deployment is never silently cancelled by an environment +signal. diff --git a/cli/azd/internal/cmd/errors.go b/cli/azd/internal/cmd/errors.go index 32fc26169c2..35c3391f405 100644 --- a/cli/azd/internal/cmd/errors.go +++ b/cli/azd/internal/cmd/errors.go @@ -284,8 +284,20 @@ func classifySentinel(err error) string { return "internal.not_git_repo" case errors.Is(err, azapi.ErrPreviewNotSupported): return "internal.preview_not_supported" + case errors.Is(err, azapi.ErrCancelNotSupported): + return "internal.cancel_not_supported" case errors.Is(err, provisioning.ErrBindMountOperationDisabled): return "internal.bind_mount_disabled" + case errors.Is(err, provisioning.ErrDeploymentInterruptedLeaveRunning): + return "user.canceled.leave_running" + case errors.Is(err, provisioning.ErrDeploymentCanceledByUser): + return "user.canceled.deployment_canceled" + case errors.Is(err, provisioning.ErrDeploymentCancelTimeout): + return "user.canceled.cancel_timed_out" + case errors.Is(err, provisioning.ErrDeploymentCancelTooLate): + return "user.canceled.cancel_too_late" + case errors.Is(err, 
provisioning.ErrDeploymentCancelFailed): + return "user.canceled.cancel_failed" case errors.Is(err, update.ErrNeedsElevation): return "update.elevationRequired" case errors.Is(err, pipeline.ErrRemoteHostIsNotAzDo): diff --git a/cli/azd/internal/tracing/fields/fields.go b/cli/azd/internal/tracing/fields/fields.go index f3817711ee1..c2ced2e0db3 100644 --- a/cli/azd/internal/tracing/fields/fields.go +++ b/cli/azd/internal/tracing/fields/fields.go @@ -430,6 +430,24 @@ var ( } ) +// Provision-related fields +var ( + // ProvisionCancellationKey records how a Ctrl+C interrupt during + // `azd provision` / `azd up` was handled. + // + // Example: "none" (no interrupt observed), "leave_running" (user chose to + // keep the Azure deployment running), "canceled" (Azure confirmed the + // deployment reached the Canceled state), "cancel_timed_out" (cancel was + // submitted but azd stopped waiting for the terminal state), + // "cancel_too_late" (Azure finished the deployment before the cancel took + // effect), "cancel_failed" (the cancel request itself returned an error). + ProvisionCancellationKey = AttributeKey{ + Key: attribute.Key("provision.cancellation"), + Classification: SystemMetadata, + Purpose: FeatureInsight, + } +) + // The value used for ServiceNameKey const ServiceNameAzd = "azd" diff --git a/cli/azd/pkg/azapi/deployments.go b/cli/azd/pkg/azapi/deployments.go index 1e079370a4c..b7650c4c8bc 100644 --- a/cli/azd/pkg/azapi/deployments.go +++ b/cli/azd/pkg/azapi/deployments.go @@ -32,6 +32,11 @@ const ( var ErrPreviewNotSupported = errors.New("preview not supported") +// ErrCancelNotSupported indicates that the deployment provider does not support +// cancelling an in-flight deployment (e.g. deployment stacks). Callers can use +// errors.Is to detect this case and fall back to "leave running" behavior. 
+var ErrCancelNotSupported = errors.New("cancel not supported for this deployment kind") + const emptySubscriptionArmTemplate = `{ "$schema": "https://schema.management.azure.com/schemas/2018-05-01/subscriptionDeploymentTemplate.json#", "contentVersion": "1.0.0.0", @@ -226,6 +231,25 @@ type DeploymentService interface { options map[string]any, progress *async.Progress[DeleteDeploymentProgress], ) error + // CancelSubscriptionDeployment requests Azure to cancel a running + // subscription-scoped deployment. The call returns immediately after the + // cancel request is accepted; callers should poll the deployment to observe + // the terminal state (Canceled, Failed, or Succeeded). + CancelSubscriptionDeployment( + ctx context.Context, + subscriptionId string, + deploymentName string, + ) error + // CancelResourceGroupDeployment requests Azure to cancel a running + // resource-group-scoped deployment. The call returns immediately after the + // cancel request is accepted; callers should poll the deployment to observe + // the terminal state (Canceled, Failed, or Succeeded). + CancelResourceGroupDeployment( + ctx context.Context, + subscriptionId string, + resourceGroupName string, + deploymentName string, + ) error } type DeleteResourceState string diff --git a/cli/azd/pkg/azapi/stack_deployments.go b/cli/azd/pkg/azapi/stack_deployments.go index 7a7b0ba71ec..563e505dc75 100644 --- a/cli/azd/pkg/azapi/stack_deployments.go +++ b/cli/azd/pkg/azapi/stack_deployments.go @@ -660,6 +660,29 @@ func (d *StackDeployments) CalculateTemplateHash( return d.standardDeployments.CalculateTemplateHash(ctx, subscriptionId, template) } +// CancelSubscriptionDeployment is not supported for deployment stacks. The +// deployment stacks ARM API does not expose a per-stack cancel operation; +// stopping a stack mid-deployment requires deleting the stack itself. Returns +// ErrCancelNotSupported so callers can distinguish this from a real failure. 
+func (d *StackDeployments) CancelSubscriptionDeployment( + ctx context.Context, + subscriptionId string, + deploymentName string, +) error { + return ErrCancelNotSupported +} + +// CancelResourceGroupDeployment is not supported for deployment stacks. See +// CancelSubscriptionDeployment for details. +func (d *StackDeployments) CancelResourceGroupDeployment( + ctx context.Context, + subscriptionId string, + resourceGroupName string, + deploymentName string, +) error { + return ErrCancelNotSupported +} + func (d *StackDeployments) createClient(ctx context.Context, subscriptionId string) (*armdeploymentstacks.Client, error) { credential, err := d.credentialProvider.CredentialForSubscription(ctx, subscriptionId) if err != nil { diff --git a/cli/azd/pkg/azapi/standard_deployments.go b/cli/azd/pkg/azapi/standard_deployments.go index efc55ed44cb..8e08df4b181 100644 --- a/cli/azd/pkg/azapi/standard_deployments.go +++ b/cli/azd/pkg/azapi/standard_deployments.go @@ -551,6 +551,47 @@ func (ds *StandardDeployments) DeleteResourceGroupDeployment( return nil } +// CancelSubscriptionDeployment requests Azure to cancel a running +// subscription-scoped deployment. The ARM Cancel call returns immediately once +// the request is accepted; callers should poll the deployment to observe the +// terminal state (Canceled, Failed, or Succeeded). +func (ds *StandardDeployments) CancelSubscriptionDeployment( + ctx context.Context, + subscriptionId string, + deploymentName string, +) error { + deploymentClient, err := ds.createDeploymentsClient(ctx, subscriptionId) + if err != nil { + return fmt.Errorf("creating deployments client: %w", err) + } + + if _, err := deploymentClient.CancelAtSubscriptionScope(ctx, deploymentName, nil); err != nil { + return fmt.Errorf("cancelling subscription deployment: %w", err) + } + return nil +} + +// CancelResourceGroupDeployment requests Azure to cancel a running +// resource-group-scoped deployment. 
The ARM Cancel call returns immediately +// once the request is accepted; callers should poll the deployment to observe +// the terminal state (Canceled, Failed, or Succeeded). +func (ds *StandardDeployments) CancelResourceGroupDeployment( + ctx context.Context, + subscriptionId string, + resourceGroupName string, + deploymentName string, +) error { + deploymentClient, err := ds.createDeploymentsClient(ctx, subscriptionId) + if err != nil { + return fmt.Errorf("creating deployments client: %w", err) + } + + if _, err := deploymentClient.Cancel(ctx, resourceGroupName, deploymentName, nil); err != nil { + return fmt.Errorf("cancelling resource group deployment: %w", err) + } + return nil +} + func (ds *StandardDeployments) WhatIfDeployToSubscription( ctx context.Context, subscriptionId string, diff --git a/cli/azd/pkg/infra/provisioning/bicep/bicep_provider.go b/cli/azd/pkg/infra/provisioning/bicep/bicep_provider.go index d49d347b050..d40751bb2c4 100644 --- a/cli/azd/pkg/infra/provisioning/bicep/bicep_provider.go +++ b/cli/azd/pkg/infra/provisioning/bicep/bicep_provider.go @@ -767,18 +767,46 @@ func (p *BicepProvider) Deploy(ctx context.Context) (*provisioning.DeployResult, // Start the deployment p.console.ShowSpinner(ctx, "Creating/Updating resources", input.Step) + deployCtx, interruptStarted, interruptCh, markDeployCompleted, interruptCleanup := + p.installDeploymentInterruptHandler(ctx, deployment, cancelProgress) + cleanupOnce := sync.OnceFunc(interruptCleanup) + defer cleanupOnce() + deployResult, err := p.deployModule( - ctx, + deployCtx, deployment, planned.RawArmTemplate, planned.Parameters, deploymentTags, optionsMap, ) + + // Try to atomically claim the "completed" state. If the interrupt + // handler already claimed "interrupting", the CAS fails and we must + // wait for the handler's outcome so the user's Ctrl+C is never + // silently dropped. + if !markDeployCompleted() { + // Handler has claimed the interrupt — wait for its outcome. 
+ <-interruptStarted + outcome := <-interruptCh + cleanupOnce() + tracing.SetUsageAttributes( + fields.ProvisionCancellationKey.String(outcome.telemetryValue)) + return nil, applyInterruptOutcome(outcome, err) + } + + // Deploy completed naturally — tear the handler down before + // post-processing to avoid resurfacing the cancel/leave prompt over + // subsequent output. + cleanupOnce() + if err != nil { + tracing.SetUsageAttributes(fields.ProvisionCancellationKey.String("none")) return nil, err } + tracing.SetUsageAttributes(fields.ProvisionCancellationKey.String("none")) + result.Outputs = provisioning.OutputParametersFromArmOutputs( planned.Template.Outputs, azapi.CreateDeploymentOutput(deployResult.Outputs), diff --git a/cli/azd/pkg/infra/provisioning/bicep/interrupt.go b/cli/azd/pkg/infra/provisioning/bicep/interrupt.go new file mode 100644 index 00000000000..fd0e47bc0ba --- /dev/null +++ b/cli/azd/pkg/infra/provisioning/bicep/interrupt.go @@ -0,0 +1,424 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +package bicep + +import ( + "context" + "errors" + "fmt" + "log" + "sync" + "sync/atomic" + "time" + + "github.com/azure/azure-dev/cli/azd/pkg/azapi" + "github.com/azure/azure-dev/cli/azd/pkg/infra" + "github.com/azure/azure-dev/cli/azd/pkg/infra/provisioning" + "github.com/azure/azure-dev/cli/azd/pkg/input" + "github.com/azure/azure-dev/cli/azd/pkg/output" +) + +// Default timeouts for interrupt-driven cancellation. +const ( + // cancelRequestTimeout bounds the time spent waiting for the ARM Cancel + // API call itself to return. + cancelRequestTimeout = 30 * time.Second + // cancelTerminalTimeout bounds the total time we wait for the Azure + // deployment to transition to a terminal state after the cancel request + // has been accepted. + cancelTerminalTimeout = 2 * time.Minute + // cancelPollInterval controls how often we poll the deployment for state + // changes after submitting cancel. 
+ cancelPollInterval = 5 * time.Second +) + +// User-facing labels for the interrupt prompt. Kept as constants so tests can +// reason about the prompt selection without depending on copy edits. +const ( + interruptOptionCancel = "Cancel the Azure deployment" + interruptOptionLeaveRunning = "Leave the Azure deployment running and stop azd" +) + +// interruptOutcome is produced by the interrupt handler and consumed by the +// main deploy goroutine after the ARM operation unblocks. +type interruptOutcome struct { + // err is the typed sentinel error from pkg/infra/provisioning that + // describes how the interrupt was handled. + err error + // telemetryValue is the value to record on the cancellation telemetry + // attribute (see fields.ProvisionCancellationKey). + telemetryValue string +} + +// deployState tracks the lifecycle of the deployment so the interrupt handler +// and the Deploy goroutine can coordinate without races. +type deployState int32 + +const ( + deployStateRunning deployState = iota // ARM deploy is in flight + deployStateInterrupting // handler claimed the Ctrl+C + deployStateCompleted // Deploy returned naturally +) + +// installDeploymentInterruptHandler registers a Ctrl+C handler covering the +// in-flight ARM deployment. It returns: +// +// - deployCtx: a context derived from ctx that the caller MUST pass to the +// ARM deploy call; it will be cancelled as soon as the user presses +// Ctrl+C, which unblocks PollUntilDone and returns control to Deploy. +// - startedCh: closed as soon as the user presses Ctrl+C (before the prompt +// is shown). Callers should check it after the deploy call returns to +// decide whether to block-wait for an interrupt outcome instead of taking +// the normal success path. This is what guarantees that a Ctrl+C arriving +// while the deployment happens to finish naturally cannot be silently +// dropped. +// - outcomeCh: receives the interrupt outcome once the user has chosen. +// The channel is buffered (size 1). 
+// - markCompleted: must be called by Deploy right after deployModule returns +// (before receiving from startedCh/outcomeCh) to atomically claim the "completed" +// state. If the interrupt handler already claimed "interrupting", this +// returns false and the caller must wait for the outcome. +// - cleanup: must be called (via defer) to unregister the interrupt handler +// and release the deploy context. +// +// onInterruptStart, if non-nil, is invoked synchronously at the start of the +// interrupt handler before any prompt is shown. Callers use this hook to stop +// background activity (e.g. the deployment progress reporter) so it doesn't +// stomp on the prompt rendering. +func (p *BicepProvider) installDeploymentInterruptHandler( + ctx context.Context, + deployment infra.Deployment, + onInterruptStart func(), +) ( + deployCtx context.Context, + startedCh <-chan struct{}, + outcomeCh <-chan interruptOutcome, + markCompleted func() bool, + cleanup func(), +) { + deployCtx, cancelDeploy := context.WithCancel(ctx) + ch := make(chan interruptOutcome, 1) + started := make(chan struct{}) + + var state atomic.Int32 // deployState values + + pop := input.PushInterruptHandler(sync.OnceValue(func() bool { + // Try to claim the "interrupting" state. If Deploy already set + // "completed", the prompt is unnecessary — the deployment finished + // naturally and the success path should run instead. + if !state.CompareAndSwap( + int32(deployStateRunning), + int32(deployStateInterrupting), + ) { + return false + } + + // Signal interrupt-in-progress and unblock the ARM deploy call + // immediately so Deploy can transition to "wait for outcome" mode + // rather than racing against a natural completion. + close(started) + cancelDeploy() + + if onInterruptStart != nil { + onInterruptStart() + } + // Stop the in-progress spinner so we can render the prompt cleanly. 
+ p.console.StopSpinner(ctx, "", input.Step) + + outcome := p.runInterruptPrompt(ctx, deployment) + ch <- outcome + // Returning true tells the runtime that we own the shutdown sequence. + // We don't actually os.Exit here — Deploy will return the typed + // sentinel error and the action / error middleware translates that + // into the user-facing exit message. + return true + })) + + markCompleted = func() bool { + return state.CompareAndSwap( + int32(deployStateRunning), + int32(deployStateCompleted), + ) + } + + cleanup = func() { + pop() + cancelDeploy() + } + return deployCtx, started, ch, markCompleted, cleanup +} + +// runInterruptPrompt presents the user with the choice of cancelling the +// running Azure deployment or leaving it to run. It returns the outcome that +// should be propagated back to Deploy. +func (p *BicepProvider) runInterruptPrompt( + ctx context.Context, + deployment infra.Deployment, +) interruptOutcome { + // Best-effort URL fetch — bounded so a slow/unreachable ARM endpoint + // doesn't block the prompt indefinitely. + urlCtx, urlDone := context.WithTimeout( + context.WithoutCancel(ctx), cancelRequestTimeout) + portalUrl, urlErr := deployment.DeploymentUrl(urlCtx) + urlDone() + if urlErr != nil { + // Not fatal — we just won't include the URL in the prompt. + log.Printf("interrupt handler: failed to fetch deployment URL: %v", urlErr) + } + + help := "An Azure deployment is currently in progress." + if portalUrl != "" { + help = fmt.Sprintf("%s\nPortal: %s", help, portalUrl) + } + + choice, err := p.console.Select(ctx, input.ConsoleOptions{ + Message: "azd was interrupted. What would you like to do?", + Help: help, + Options: []string{ + interruptOptionLeaveRunning, + interruptOptionCancel, + }, + DefaultValue: interruptOptionLeaveRunning, + }) + if err != nil { + // If we can't even show the prompt (e.g. non-interactive), fall back + // to the safer "leave running" behavior so the user can decide + // manually via the portal. 
+ log.Printf("interrupt handler: failed to show prompt, defaulting to leave-running: %v", err) + if portalUrl != "" { + p.console.Message(ctx, + output.WithHighLightFormat("The Azure deployment will continue running. Track it here:\n %s", + portalUrl)) + } + return interruptOutcome{ + err: provisioning.ErrDeploymentInterruptedLeaveRunning, + telemetryValue: "leave_running", + } + } + + switch choice { + case 0: // leave running + if portalUrl != "" { + p.console.Message(ctx, + output.WithHighLightFormat("The Azure deployment will continue running. Track it here:\n %s", + portalUrl)) + } + return interruptOutcome{ + err: provisioning.ErrDeploymentInterruptedLeaveRunning, + telemetryValue: "leave_running", + } + case 1: // cancel + return p.cancelAndAwaitTerminal(ctx, deployment, portalUrl) + default: + // Should never happen, but fall back to leave-running. + return interruptOutcome{ + err: provisioning.ErrDeploymentInterruptedLeaveRunning, + telemetryValue: "leave_running", + } + } +} + +// cancelAndAwaitTerminal submits the Azure cancel request and polls the +// deployment until it reaches a terminal provisioning state (Canceled, Failed, +// or Succeeded) or the wait budget is exhausted. +func (p *BicepProvider) cancelAndAwaitTerminal( + ctx context.Context, + deployment infra.Deployment, + portalUrl string, +) interruptOutcome { + p.console.ShowSpinner(ctx, "Canceling Azure deployment", input.Step) + + // Use a fresh context for the cancel API call so it isn't affected by + // the deploy-side cancellation we issue right after. + cancelReqCtx, cancelReqDone := context.WithTimeout( + context.WithoutCancel(ctx), cancelRequestTimeout) + defer cancelReqDone() + + if err := deployment.Cancel(cancelReqCtx); err != nil { + // Some providers (e.g. Deployment Stacks) do not support per-deployment + // cancel. 
Surface that as the safer "leave running" outcome rather + // than a cancel failure so the user gets consistent UX/telemetry with + // the documented provider behavior. + if errors.Is(err, azapi.ErrCancelNotSupported) { + p.console.StopSpinner(ctx, "Cancel is not supported for this deployment kind", input.StepWarning) + if portalUrl != "" { + p.console.Message(ctx, + output.WithHighLightFormat( + "The Azure deployment will continue running. Track it here:\n %s", + portalUrl)) + } + return interruptOutcome{ + err: provisioning.ErrDeploymentInterruptedLeaveRunning, + telemetryValue: "leave_running", + } + } + // If the deployment is already in a terminal state, route through + // the same terminal-outcome reporter so the user sees consistent + // messaging (including the portal URL). + getCtx, getDone := context.WithTimeout( + context.WithoutCancel(ctx), cancelRequestTimeout) + defer getDone() + + if state, getErr := deployment.Get(getCtx); getErr == nil && + isTerminalProvisioningState(state.ProvisioningState) { + return p.terminalToOutcome(ctx, state.ProvisioningState, portalUrl) + } + p.console.StopSpinner(ctx, "Cancel request failed", input.StepFailed) + log.Printf("interrupt handler: cancel request failed: %v", err) + if portalUrl != "" { + p.console.Message(ctx, + output.WithWarningFormat( + "Azure cancel request failed. Track the deployment here:\n %s", portalUrl)) + } + return interruptOutcome{ + err: fmt.Errorf("%w: %w", + provisioning.ErrDeploymentCancelFailed, err), + telemetryValue: "cancel_failed", + } + } + + p.console.StopSpinner(ctx, "", input.Step) + p.console.ShowSpinner(ctx, "Waiting for Azure to confirm cancellation", input.Step) + + // Poll until terminal or until our wait budget elapses. Wait for the + // poll interval BEFORE each Get so that a slow Get cannot push the loop + // into back-to-back ARM calls (and trigger throttling). 
+ pollCtx, pollDone := context.WithTimeout( + context.WithoutCancel(ctx), cancelTerminalTimeout) + defer pollDone() + + ticker := time.NewTicker(cancelPollInterval) + defer ticker.Stop() + + var lastState azapi.DeploymentProvisioningState + for { + select { + case <-pollCtx.Done(): + p.console.StopSpinner(ctx, "Cancellation still in progress on Azure", input.StepWarning) + if portalUrl != "" { + p.console.Message(ctx, + output.WithWarningFormat( + "Azure has not confirmed cancellation within %s. Track the deployment here:\n %s", + cancelTerminalTimeout, portalUrl)) + } + return interruptOutcome{ + err: provisioning.ErrDeploymentCancelTimeout, + telemetryValue: "cancel_timed_out", + } + case <-ticker.C: + } + + state, err := deployment.Get(pollCtx) + if err == nil { + lastState = state.ProvisioningState + if isTerminalProvisioningState(lastState) { + return p.terminalToOutcome(ctx, lastState, portalUrl) + } + } else { + // Don't fail the whole flow on a transient Get error — keep + // polling until either we observe a terminal state or the + // timeout fires. + log.Printf("interrupt handler: poll Get failed (will retry): %v", err) + } + } +} + +// terminalToOutcome maps a terminal provisioning state to the interrupt outcome +// that should be propagated back to Deploy. 
+func (p *BicepProvider) terminalToOutcome( + ctx context.Context, + state azapi.DeploymentProvisioningState, + portalUrl string, +) interruptOutcome { + switch state { + case azapi.DeploymentProvisioningStateCanceled: + p.console.StopSpinner(ctx, "Deployment canceled", input.StepDone) + if portalUrl != "" { + p.console.Message(ctx, + output.WithHighLightFormat( + "Canceled deployment is recorded in the portal:\n %s", portalUrl)) + } + return interruptOutcome{ + err: provisioning.ErrDeploymentCanceledByUser, + telemetryValue: "canceled", + } + case azapi.DeploymentProvisioningStateSucceeded, + azapi.DeploymentProvisioningStateFailed: + p.console.StopSpinner(ctx, + "Deployment finished before cancel could take effect", input.StepWarning) + if portalUrl != "" { + p.console.Message(ctx, + output.WithWarningFormat( + "The Azure deployment reached %q before the cancel "+ + "request took effect. Review:\n %s", + string(state), portalUrl)) + } + return interruptOutcome{ + err: provisioning.ErrDeploymentCancelTooLate, + telemetryValue: "cancel_too_late", + } + case azapi.DeploymentProvisioningStateDeleted: + p.console.StopSpinner(ctx, "Deployment was deleted", input.StepWarning) + if portalUrl != "" { + p.console.Message(ctx, + output.WithWarningFormat( + "The Azure deployment was deleted before the cancel "+ + "request could take effect. Review:\n %s", + portalUrl)) + } + return interruptOutcome{ + err: provisioning.ErrDeploymentCancelTooLate, + telemetryValue: "cancel_too_late", + } + default: + // isTerminalProvisioningState should prevent reaching here, but be + // defensive: stop the spinner and warn the user so the UI is left in + // a clean state, then surface as too-late so the caller exits. + p.console.StopSpinner(ctx, "Deployment reached an unexpected terminal state", input.StepWarning) + if portalUrl != "" { + p.console.Message(ctx, + output.WithWarningFormat( + "The Azure deployment reached unexpected terminal state %q after the cancel request. 
Review:\n %s", + string(state), portalUrl)) + } else { + p.console.Message(ctx, + output.WithWarningFormat( + "The Azure deployment reached unexpected terminal state %q after the cancel request.", + string(state))) + } + return interruptOutcome{ + err: provisioning.ErrDeploymentCancelTooLate, + telemetryValue: "cancel_too_late", + } + } +} + +// isTerminalProvisioningState reports whether an Azure deployment provisioning +// state represents a terminal outcome (no further transitions expected). +func isTerminalProvisioningState(state azapi.DeploymentProvisioningState) bool { + switch state { + case azapi.DeploymentProvisioningStateCanceled, + azapi.DeploymentProvisioningStateFailed, + azapi.DeploymentProvisioningStateSucceeded, + azapi.DeploymentProvisioningStateDeleted: + return true + } + return false +} + +// applyInterruptOutcome decides what to return from BicepProvider.Deploy when +// an interrupt outcome was produced. It composes any pre-existing deploy error +// with the interrupt sentinel so error wrapping (`errors.Is`) keeps working. +func applyInterruptOutcome(outcome interruptOutcome, deployErr error) error { + if deployErr == nil { + return outcome.err + } + // Most likely deployErr is "context canceled" wrapped by the SDK (because + // we cancelled deployCtx to unblock PollUntilDone). Prefer the typed + // interrupt sentinel for the user-visible error chain. + if errors.Is(deployErr, context.Canceled) { + return outcome.err + } + return fmt.Errorf("%w: %w", outcome.err, deployErr) +} diff --git a/cli/azd/pkg/infra/provisioning/bicep/interrupt_test.go b/cli/azd/pkg/infra/provisioning/bicep/interrupt_test.go new file mode 100644 index 00000000000..dcd9c7830a0 --- /dev/null +++ b/cli/azd/pkg/infra/provisioning/bicep/interrupt_test.go @@ -0,0 +1,78 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +package bicep + +import ( + "context" + "errors" + "fmt" + "testing" + + "github.com/azure/azure-dev/cli/azd/pkg/azapi" + "github.com/azure/azure-dev/cli/azd/pkg/infra/provisioning" + "github.com/stretchr/testify/require" +) + +func TestIsTerminalProvisioningState(t *testing.T) { + terminal := []azapi.DeploymentProvisioningState{ + azapi.DeploymentProvisioningStateCanceled, + azapi.DeploymentProvisioningStateFailed, + azapi.DeploymentProvisioningStateSucceeded, + azapi.DeploymentProvisioningStateDeleted, + } + nonTerminal := []azapi.DeploymentProvisioningState{ + azapi.DeploymentProvisioningStateAccepted, + azapi.DeploymentProvisioningStateCanceling, + azapi.DeploymentProvisioningStateRunning, + azapi.DeploymentProvisioningStateDeploying, + azapi.DeploymentProvisioningStateValidating, + azapi.DeploymentProvisioningStateWaiting, + azapi.DeploymentProvisioningStateNotSpecified, + "", + } + for _, s := range terminal { + require.Truef(t, isTerminalProvisioningState(s), "expected %q to be terminal", s) + } + for _, s := range nonTerminal { + require.Falsef(t, isTerminalProvisioningState(s), "expected %q to NOT be terminal", s) + } +} + +func TestApplyInterruptOutcome(t *testing.T) { + leave := interruptOutcome{ + err: provisioning.ErrDeploymentInterruptedLeaveRunning, + telemetryValue: "leave_running", + } + canceled := interruptOutcome{ + err: provisioning.ErrDeploymentCanceledByUser, + telemetryValue: "canceled", + } + + t.Run("nil deploy error returns outcome err", func(t *testing.T) { + require.ErrorIs(t, applyInterruptOutcome(leave, nil), + provisioning.ErrDeploymentInterruptedLeaveRunning) + require.ErrorIs(t, applyInterruptOutcome(canceled, nil), + provisioning.ErrDeploymentCanceledByUser) + }) + + t.Run("context canceled is replaced by outcome err", func(t *testing.T) { + err := applyInterruptOutcome(canceled, context.Canceled) + require.ErrorIs(t, err, provisioning.ErrDeploymentCanceledByUser) + require.NotErrorIs(t, err, context.Canceled) + }) + + 
t.Run("wrapped context canceled is replaced by outcome err", func(t *testing.T) { + wrapped := fmt.Errorf("PollUntilDone: %w", context.Canceled) + err := applyInterruptOutcome(leave, wrapped) + require.ErrorIs(t, err, provisioning.ErrDeploymentInterruptedLeaveRunning) + require.NotErrorIs(t, err, context.Canceled) + }) + + t.Run("non-cancel deploy error is preserved alongside outcome", func(t *testing.T) { + other := errors.New("template validation failed") + err := applyInterruptOutcome(canceled, other) + require.ErrorIs(t, err, provisioning.ErrDeploymentCanceledByUser) + require.ErrorIs(t, err, other) + }) +} diff --git a/cli/azd/pkg/infra/provisioning/cancel.go b/cli/azd/pkg/infra/provisioning/cancel.go new file mode 100644 index 00000000000..9e4df22a2e7 --- /dev/null +++ b/cli/azd/pkg/infra/provisioning/cancel.go @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +package provisioning + +import "errors" + +// Cancellation sentinels surfaced by providers when the user interrupts a +// running deployment with Ctrl+C. These are sentinel errors so the action / +// error middleware can produce a friendly, non-zero exit (with the portal URL +// and a clear message) instead of treating the case as an unexpected failure. +var ( + // ErrDeploymentInterruptedLeaveRunning is returned when the user chose to + // stop azd but allow the in-flight Azure deployment to continue running. + ErrDeploymentInterruptedLeaveRunning = errors.New( + "azd was interrupted; the Azure deployment is still running") + + // ErrDeploymentCanceledByUser is returned when the user requested + // cancellation and Azure confirmed the deployment reached the Canceled + // terminal state. 
+ ErrDeploymentCanceledByUser = errors.New( + "deployment was canceled by user request") + + // ErrDeploymentCancelTimeout is returned when azd asked Azure to cancel the + // deployment but the deployment had not reached a terminal state before + // the local wait budget expired. The cancellation is still in progress on + // Azure. + ErrDeploymentCancelTimeout = errors.New( + "deployment cancel request was submitted but did not complete before timeout") + + // ErrDeploymentCancelTooLate is returned when azd attempted to cancel the + // deployment but Azure had already moved it to a terminal state + // (Succeeded or Failed) before the cancel request could take effect. + ErrDeploymentCancelTooLate = errors.New( + "deployment finished before the cancel request could take effect") + + // ErrDeploymentCancelFailed is returned when the ARM Cancel request itself + // failed (network, permissions, etc.). The returned error includes the + // underlying cause so the caller can inspect it with errors.Is or + // errors.As. + ErrDeploymentCancelFailed = errors.New( + "deployment cancel request failed") +) diff --git a/cli/azd/pkg/infra/scope.go b/cli/azd/pkg/infra/scope.go index 303766d2d95..095dde9be09 100644 --- a/cli/azd/pkg/infra/scope.go +++ b/cli/azd/pkg/infra/scope.go @@ -54,6 +54,11 @@ type Deployment interface { options map[string]any, progress *async.Progress[azapi.DeleteDeploymentProgress], ) error + // Cancel requests Azure to cancel an in-flight deployment. Returns nil if + // the cancel request is accepted (the deployment will transition to the + // Canceling/Canceled state asynchronously). Callers should poll Get() to + // observe the terminal state. + Cancel(ctx context.Context) error // Deploy a given template with a set of parameters. DeployPreview( ctx context.Context, @@ -114,6 +119,12 @@ func (s *ResourceGroupDeployment) Delete( ) } +// Cancel requests Azure to cancel an in-flight resource-group-scoped deployment. 
+func (s *ResourceGroupDeployment) Cancel(ctx context.Context) error { + return s.deploymentService.CancelResourceGroupDeployment( + ctx, s.subscriptionId, s.resourceGroupName, s.name) +} + func (s *ResourceGroupDeployment) DeployPreview( ctx context.Context, template azure.RawArmTemplate, @@ -324,6 +335,11 @@ func (s *SubscriptionDeployment) Delete( return s.deploymentService.DeleteSubscriptionDeployment(ctx, s.subscriptionId, s.name, options, progress) } +// Cancel requests Azure to cancel an in-flight subscription-scoped deployment. +func (s *SubscriptionDeployment) Cancel(ctx context.Context) error { + return s.deploymentService.CancelSubscriptionDeployment(ctx, s.subscriptionId, s.name) +} + // Deploy a given template with a set of parameters. func (s *SubscriptionDeployment) DeployPreview( ctx context.Context, diff --git a/cli/azd/pkg/infra/scope_test.go b/cli/azd/pkg/infra/scope_test.go index 0ba0961b082..5137b806a66 100644 --- a/cli/azd/pkg/infra/scope_test.go +++ b/cli/azd/pkg/infra/scope_test.go @@ -297,6 +297,66 @@ func TestScopeGetResourceOperations(t *testing.T) { }) } +func TestScopeCancel(t *testing.T) { + t.Run("SubscriptionScopeSuccess", func(t *testing.T) { + mockContext := mocks.NewMockContext(t.Context()) + deploymentService := mockazapi.NewDeploymentsServiceFromMockContext(mockContext) + + called := false + mockContext.HttpClient.When(func(request *http.Request) bool { + return request.Method == http.MethodPost && strings.Contains( + request.URL.Path, + "/subscriptions/SUBSCRIPTION_ID/providers/Microsoft.Resources/deployments/DEPLOYMENT_NAME/cancel", + ) + }).RespondFn(func(request *http.Request) (*http.Response, error) { + called = true + return mocks.CreateEmptyHttpResponse(request, http.StatusNoContent) + }) + + scope := newSubscriptionScope(deploymentService, "SUBSCRIPTION_ID", "eastus2") + target := NewSubscriptionDeployment(scope, "DEPLOYMENT_NAME") + require.NoError(t, target.Cancel(*mockContext.Context)) + require.True(t, called, 
"expected ARM cancel endpoint to be called") + }) + + t.Run("ResourceGroupScopeSuccess", func(t *testing.T) { + mockContext := mocks.NewMockContext(t.Context()) + deploymentService := mockazapi.NewDeploymentsServiceFromMockContext(mockContext) + + called := false + mockContext.HttpClient.When(func(request *http.Request) bool { + return request.Method == http.MethodPost && strings.Contains( + request.URL.Path, + //nolint:lll + "/subscriptions/SUBSCRIPTION_ID/resourcegroups/RESOURCE_GROUP/providers/Microsoft.Resources/deployments/DEPLOYMENT_NAME/cancel", + ) + }).RespondFn(func(request *http.Request) (*http.Response, error) { + called = true + return mocks.CreateEmptyHttpResponse(request, http.StatusNoContent) + }) + + scope := newResourceGroupScope(deploymentService, "SUBSCRIPTION_ID", "RESOURCE_GROUP") + target := NewResourceGroupDeployment(scope, "DEPLOYMENT_NAME") + require.NoError(t, target.Cancel(*mockContext.Context)) + require.True(t, called, "expected ARM cancel endpoint to be called") + }) + + t.Run("PropagatesError", func(t *testing.T) { + mockContext := mocks.NewMockContext(t.Context()) + deploymentService := mockazapi.NewDeploymentsServiceFromMockContext(mockContext) + + mockContext.HttpClient.When(func(request *http.Request) bool { + return request.Method == http.MethodPost && strings.Contains(request.URL.Path, "/cancel") + }).RespondFn(func(request *http.Request) (*http.Response, error) { + return mocks.CreateEmptyHttpResponse(request, http.StatusConflict) + }) + + scope := newSubscriptionScope(deploymentService, "SUBSCRIPTION_ID", "eastus2") + target := NewSubscriptionDeployment(scope, "DEPLOYMENT_NAME") + require.Error(t, target.Cancel(*mockContext.Context)) + }) +} + var testArmParameters = azure.ArmParameters{ "location": { Value: "West US", diff --git a/cli/azd/pkg/input/console.go b/cli/azd/pkg/input/console.go index b30f253929b..894caa49c8d 100644 --- a/cli/azd/pkg/input/console.go +++ b/cli/azd/pkg/input/console.go @@ -985,12 +985,57 @@ func 
watchTerminalInterrupt(c *AskerConsole) { signalChan := make(chan os.Signal, 1) signal.Notify(signalChan, os.Interrupt) go func() { - <-signalChan + for range signalChan { + // Reserve the running slot first so re-entrant Ctrl+C signals are + // suppressed even in the brief window where a handler has been + // popped from the stack but is still executing (e.g. a prompt is + // up while Deploy has already torn the registration down). + if !tryStartInterruptHandler() { + // A handler is already running. The first Ctrl+C suppressed here + // only arms the force-exit; a second suppressed Ctrl+C exits with + // 130 (common CLI convention: kubectl, terraform, docker, etc.). + if incrementForceExitCounter() { + _ = c.spinner.Stop() + os.Exit(130) // 128 + SIGINT + } + continue + } - // unhide the cursor if applicable - _ = c.spinner.Stop() + handler := currentInterruptHandler() + if handler == nil { + // No handler registered → default behavior. Release the slot + // before exiting so future signals would behave correctly if + // we did not exit (defensive). + finishInterruptHandler() + _ = c.spinner.Stop() + os.Exit(1) + } - os.Exit(1) + // Run the handler inline on the signal goroutine so any "interrupt + // started" side-effects (e.g. closing a started channel, cancelling + // the deploy context) take effect synchronously after the signal + // is received — no scheduling window where the deploy goroutine + // could complete naturally and silently drop the Ctrl+C. + var handled bool + func() { + defer func() { + if r := recover(); r != nil { + buf := make([]byte, 4096) + n := runtime.Stack(buf, false) + log.Printf( + "interrupt handler panic: %v\n%s", r, buf[:n]) + } + }() + handled = handler() + }() + finishInterruptHandler() + if !handled { + // Handler declined to take ownership of shutdown — fall back + // to default behavior.
+ _ = c.spinner.Stop() + os.Exit(1) + } + } }() } diff --git a/cli/azd/pkg/input/interrupt.go b/cli/azd/pkg/input/interrupt.go new file mode 100644 index 00000000000..6f5909b4729 --- /dev/null +++ b/cli/azd/pkg/input/interrupt.go @@ -0,0 +1,105 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +package input + +import "sync" + +// InterruptHandler is invoked when the user presses Ctrl+C. +// +// Implementations are expected to drive any user interaction (such as +// confirming whether to abort an in-flight Azure operation) and return only +// after they have decided how to respond. The handler runs synchronously on +// the signal-handling goroutine; Ctrl+C signals received while it is running +// do not start a concurrent handler (see forceExitPending for force-exit). +// +// If the returned bool is false, the default azd interrupt behavior runs after +// the handler returns (the spinner is stopped and the process exits with +// code 1). Returning true tells the runtime that the handler took ownership of +// the shutdown sequence. +type InterruptHandler func() (handled bool) + +var ( + interruptMu sync.Mutex + interruptStack []InterruptHandler + interruptRunning bool + // forceExitPending tracks whether one suppressed Ctrl+C has been + // received while a handler is running. A second suppressed press + // triggers a force-exit (standard POSIX convention). + forceExitPending bool +) + +// PushInterruptHandler registers a handler to be invoked on the next SIGINT +// (Ctrl+C). Handlers are stacked: the most recently pushed handler runs first. +// +// The returned function pops the handler from the stack and must be called to +// restore the previous interrupt behavior (typically with `defer`).
+func PushInterruptHandler(h InterruptHandler) func() { + interruptMu.Lock() + interruptStack = append(interruptStack, h) + idx := len(interruptStack) - 1 + interruptMu.Unlock() + + return func() { + interruptMu.Lock() + defer interruptMu.Unlock() + // Only pop this handler if it is still the current top-of-stack + // entry. This enforces strict LIFO semantics and avoids accidentally + // removing unrelated newer handlers if pop functions are called out + // of order. + if len(interruptStack) == idx+1 { + // Clear the slot first so the GC can reclaim the popped handler + // (and anything it captured) even if the underlying array isn't + // reallocated for a while. + interruptStack[idx] = nil + interruptStack = interruptStack[:idx] + } + } +} + +// currentInterruptHandler returns the top-of-stack interrupt handler, or nil +// if no handler is registered. +func currentInterruptHandler() InterruptHandler { + interruptMu.Lock() + defer interruptMu.Unlock() + if len(interruptStack) == 0 { + return nil + } + return interruptStack[len(interruptStack)-1] +} + +// tryStartInterruptHandler returns true if no handler is currently running. +// On success the caller is responsible for calling finishInterruptHandler. +func tryStartInterruptHandler() bool { + interruptMu.Lock() + defer interruptMu.Unlock() + if interruptRunning { + return false + } + interruptRunning = true + forceExitPending = false // reset on new handler start + return true +} + +func finishInterruptHandler() { + interruptMu.Lock() + defer interruptMu.Unlock() + interruptRunning = false + forceExitPending = false +} + +// incrementForceExitCounter records a suppressed Ctrl+C while a handler is +// running. Returns true if this is the second suppressed press, indicating +// a force-exit should occur (standard POSIX convention). 
+func incrementForceExitCounter() bool { + interruptMu.Lock() + defer interruptMu.Unlock() + if !interruptRunning { + return false + } + if forceExitPending { + return true + } + forceExitPending = true + return false +} diff --git a/cli/azd/pkg/input/interrupt_test.go b/cli/azd/pkg/input/interrupt_test.go new file mode 100644 index 00000000000..7e9b534453f --- /dev/null +++ b/cli/azd/pkg/input/interrupt_test.go @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +package input + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestPushInterruptHandler_LIFO(t *testing.T) { + require.Nil(t, currentInterruptHandler()) + + firstCalls := 0 + first := func() bool { + firstCalls++ + return true + } + pop1 := PushInterruptHandler(first) + t.Cleanup(pop1) + + cur := currentInterruptHandler() + require.NotNil(t, cur) + require.True(t, cur()) + require.Equal(t, 1, firstCalls) + + secondCalls := 0 + second := func() bool { + secondCalls++ + return true + } + pop2 := PushInterruptHandler(second) + t.Cleanup(pop2) + + // Top-of-stack should be `second` (most recently pushed). + cur = currentInterruptHandler() + require.NotNil(t, cur) + require.True(t, cur()) + require.Equal(t, 1, firstCalls, "pushing second must not invoke first") + require.Equal(t, 1, secondCalls) + + pop2() + // After popping `second`, current should be `first` again. + cur = currentInterruptHandler() + require.NotNil(t, cur) + require.True(t, cur()) + require.Equal(t, 2, firstCalls) + require.Equal(t, 1, secondCalls, "popping second must not re-invoke it") + + pop1() + require.Nil(t, currentInterruptHandler()) +} + +func TestTryStartInterruptHandler_PreventsConcurrent(t *testing.T) { + require.True(t, tryStartInterruptHandler()) + t.Cleanup(finishInterruptHandler) + + // While the first handler is "running", the second start should be + // rejected so additional Ctrl+C signals are ignored. 
+ require.False(t, tryStartInterruptHandler()) +} + +func TestForceExitCounter(t *testing.T) { + require.True(t, tryStartInterruptHandler()) + t.Cleanup(finishInterruptHandler) + + // First suppressed Ctrl+C while handler is running — not yet force-exit. + require.False(t, incrementForceExitCounter()) + // Second suppressed Ctrl+C — should trigger force-exit. + require.True(t, incrementForceExitCounter()) +} + +func TestForceExitCounter_ResetsOnNewHandler(t *testing.T) { + require.True(t, tryStartInterruptHandler()) + require.False(t, incrementForceExitCounter()) + finishInterruptHandler() + + // After finishing and starting a new handler, the counter resets. + require.True(t, tryStartInterruptHandler()) + t.Cleanup(finishInterruptHandler) + + require.False(t, incrementForceExitCounter(), + "force-exit counter should reset when a new handler starts") +}