@@ -571,41 +571,102 @@ export class App {
571571 // ============================================================
572572
/**
 * Delete a CloudFormation stack, handling the common failure mode where a
 * custom resource (typically `CustomAuthTriggerResource`) fails to clean
 * itself up because its service-token Lambda or the Cognito user pool it
 * references has already been removed.
 *
 * Strategy:
 *   1. Issue the delete and wait for completion.
 *   2. If the wait does not succeed, recursively clean up any nested stacks
 *      that are in `DELETE_FAILED`. This matters because `RetainResources`
 *      on a parent stack cannot skip resources inside a nested stack — the
 *      nested stack itself must be deleted with its own `RetainResources`
 *      targeting the actual problem leaf.
 *   3. Retry the parent delete, retaining every resource (nested stack or
 *      leaf) that is still reported as `DELETE_FAILED`.
 *
 * The initial `DeleteStack` request is not guarded, so a rejection there
 * propagates to the caller. Errors raised by the retry are logged and
 * swallowed so that the surrounding teardown can continue.
 *
 * @param cfnClient - CloudFormation client used for every request.
 * @param stackName - Name (or ID, for nested stacks) of the stack to delete.
 */
private async deleteStackWithRetainOnFailure(cfnClient: CloudFormationClient, stackName: string): Promise<void> {
  await cfnClient.send(new DeleteStackCommand({ StackName: stackName }));
  if (await this.tryWaitForStackDelete(cfnClient, stackName)) return;

  // Nested stacks must be unblocked first; the parent retry below can only
  // retain the nested stack as a whole, not the resources inside it.
  await this.cleanupNestedFailedStacks(cfnClient, stackName);

  const failed = await this.listFailedResources(cfnClient, stackName);
  if (failed.length === 0) {
    // Nothing is flagged DELETE_FAILED, so there is nothing to retain.
    this.logger.info(`Stack ${stackName} delete did not complete within timeout (continuing teardown)`);
    return;
  }

  this.logger.info(`Retrying delete of ${stackName} with retained resources: ${failed.join(', ')}`);
  try {
    await cfnClient.send(new DeleteStackCommand({ StackName: stackName, RetainResources: failed }));
    if (!(await this.tryWaitForStackDelete(cfnClient, stackName))) {
      this.logger.info(`Stack ${stackName} retry did not complete within timeout (continuing teardown)`);
    }
  } catch (e) {
    // Narrow instead of casting so a non-Error rejection still logs something useful.
    const reason = e instanceof Error ? e.message : String(e);
    this.logger.info(`Failed to retry stack ${stackName} delete: ${reason} (continuing teardown)`);
  }
}
608612
/**
 * Walk the resources of `stackName` and, for every nested CloudFormation
 * stack currently in `DELETE_FAILED`, empty its buckets and run the
 * retain-on-failure delete against it. Nested stacks are processed one at a
 * time, in the order the describe call returned them.
 *
 * Because each nested delete can itself retain its problem leaf resources,
 * the parent's retry afterwards only has to retain nested-stack logical IDs
 * that remained stuck.
 *
 * If the resources cannot be described, the failure is logged and the
 * method returns without touching any nested stack.
 *
 * @param cfnClient - CloudFormation client used for every request.
 * @param stackName - Parent stack whose nested stacks should be cleaned up.
 */
private async cleanupNestedFailedStacks(cfnClient: CloudFormationClient, stackName: string): Promise<void> {
  const stuckNestedStackIds: string[] = [];
  try {
    const { StackResources } = await cfnClient.send(new DescribeStackResourcesCommand({ StackName: stackName }));
    for (const resource of StackResources ?? []) {
      const isFailedNestedStack =
        resource.ResourceType === 'AWS::CloudFormation::Stack' && resource.ResourceStatus === 'DELETE_FAILED';
      // The physical ID is what identifies the nested stack in later calls; skip entries without one.
      if (isFailedNestedStack && resource.PhysicalResourceId) {
        stuckNestedStackIds.push(resource.PhysicalResourceId);
      }
    }
  } catch (e) {
    this.logger.info(`Failed to describe resources for ${stackName}: ${(e as Error).message} (continuing teardown)`);
    return;
  }

  for (const nestedStackId of stuckNestedStackIds) {
    this.logger.info(`Recursively cleaning nested stack: ${nestedStackId}`);
    // Buckets are emptied before the delete is attempted.
    await this.emptyStackBuckets(cfnClient, nestedStackId);
    await this.deleteStackWithRetainOnFailure(cfnClient, nestedStackId);
  }
}
638+
/**
 * Collect the logical IDs of every resource in `stackName` whose status is
 * `DELETE_FAILED`. Resources without a logical ID are skipped.
 *
 * NOTE(review): the AWS API reference states `DescribeStackResources`
 * returns only the first 100 resources — confirm the stacks handled here
 * stay under that, or switch to `ListStackResources`.
 *
 * @param cfnClient - CloudFormation client used for the describe call.
 * @param stackName - Stack to inspect.
 * @returns The failed logical IDs, or an empty array when none are failed
 *          or when the describe call itself fails (the error is logged).
 */
private async listFailedResources(cfnClient: CloudFormationClient, stackName: string): Promise<string[]> {
  try {
    const response = await cfnClient.send(new DescribeStackResourcesCommand({ StackName: stackName }));
    const failedLogicalIds: string[] = [];
    for (const resource of response.StackResources ?? []) {
      if (resource.ResourceStatus === 'DELETE_FAILED' && resource.LogicalResourceId) {
        failedLogicalIds.push(resource.LogicalResourceId);
      }
    }
    return failedLogicalIds;
  } catch (e) {
    this.logger.info(`Failed to list failed resources for ${stackName}: ${(e as Error).message} (continuing teardown)`);
    return [];
  }
}
656+
/**
 * Block until the stack delete finishes, using the SDK waiter with a
 * `maxWaitTime` of 300.
 *
 * Any waiter rejection is swallowed here, whether it is a timeout or a
 * failed delete; the caller decides how to recover.
 *
 * @param cfnClient - CloudFormation client handed to the waiter.
 * @param stackName - Stack whose deletion is awaited.
 * @returns `true` when the waiter resolves, `false` when it rejects.
 */
private async tryWaitForStackDelete(cfnClient: CloudFormationClient, stackName: string): Promise<boolean> {
  let deleted = false;
  try {
    await waitUntilStackDeleteComplete({ client: cfnClient, maxWaitTime: 300 }, { StackName: stackName });
    deleted = true;
  } catch {
    // Intentionally ignored: a rejected wait is reported through the return value.
  }
  return deleted;
}
669+
609670 /**
610671 * Empty all S3 buckets owned by the given CloudFormation stack.
611672 * CloudFormation cannot delete a bucket with objects, so we must empty them first.
0 commit comments