Skip to content

Commit baaaec0

Browse files
fix(ops): harden ephemeral-stack cleanup script (defense in depth)
Security/robustness review of scripts/cleanup-ephemeral-stacks.sh (#72):

- Fail CLOSED on unparseable CreationTime. Previously a parse failure fell back to epoch 0, making every matching stack look ~billions of seconds old and eligible for deletion — the age gate failed open. Now it SKIPs.
- Validate --max-age-hours is a non-negative integer before arithmetic (rejects injected/garbage input).
- Print account + caller ARN (sts:GetCallerIdentity) before any action so the operator can confirm blast radius; hard-fail if identity can't be resolved.
- Tolerate a single delete-stack failure instead of aborting the whole loop under set -e (would otherwise orphan later stacks); track and report a Failed count, and only increment Deleted on a delete actually initiated.
- Remove dead --force-eni flag (parsed but never used; shellcheck SC2034).
- Annotate the JMESPath --query backticks as intentional (shellcheck SC2016).

shellcheck: clean (exit 0). semgrep --config=auto: 0 findings.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 971b1da commit baaaec0

1 file changed

Lines changed: 42 additions & 10 deletions

File tree

scripts/cleanup-ephemeral-stacks.sh

Lines changed: 42 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,6 @@
1111
# --dry-run Show what would be deleted without acting
1212
# --max-age-hours N Delete stacks older than N hours (default: 4)
1313
# --prefix PREFIX Only target stacks matching this prefix (default: all ABCA stacks)
14-
# --force-eni Force-detach ENIs even if stack deletion hasn't started yet
1514
#
1615
# Safety:
1716
# - Never touches stacks with termination protection enabled
@@ -23,23 +22,40 @@ set -euo pipefail
2322
MAX_AGE_HOURS=${MAX_AGE_HOURS:-4}
2423
DRY_RUN=false
2524
PREFIX=""
26-
FORCE_ENI=false
2725
REGION="${AWS_DEFAULT_REGION:-us-east-1}"
2826

2927
while [[ $# -gt 0 ]]; do
3028
case $1 in
3129
--dry-run) DRY_RUN=true; shift ;;
3230
--max-age-hours) MAX_AGE_HOURS="$2"; shift 2 ;;
3331
--prefix) PREFIX="$2"; shift 2 ;;
34-
--force-eni) FORCE_ENI=true; shift ;;
3532
*) echo "Unknown option: $1" >&2; exit 1 ;;
3633
esac
3734
done
3835

36+
# Validate numeric input — guards the age arithmetic against injection/garbage.
37+
if ! [[ "$MAX_AGE_HOURS" =~ ^[0-9]+$ ]]; then
38+
echo "Error: --max-age-hours must be a non-negative integer (got: '$MAX_AGE_HOURS')" >&2
39+
exit 1
40+
fi
41+
3942
# Convert the validated hour count to seconds. Force base 10 with `10#`: the
# digits-only check above still admits leading zeros, and bash arithmetic would
# otherwise read them as octal ("08" is a hard error, "010" silently becomes 8).
MAX_AGE_SECONDS=$((10#$MAX_AGE_HOURS * 3600))
4043
NOW=$(date +%s)
4144

45+
# Surface the blast radius before touching anything. Confirms the operator is
46+
# pointed at the account/identity they think they are (defense in depth).
47+
CALLER_IDENTITY=$(aws sts get-caller-identity \
48+
--region "$REGION" \
49+
--query '[Account,Arn]' --output text 2>/dev/null) || {
50+
echo "Error: unable to resolve AWS identity (sts:GetCallerIdentity failed). Check credentials." >&2
51+
exit 1
52+
}
53+
ACCOUNT_ID=$(echo "$CALLER_IDENTITY" | cut -f1)
54+
CALLER_ARN=$(echo "$CALLER_IDENTITY" | cut -f2)
55+
4256
echo "=== Ephemeral Stack Cleanup ==="
57+
echo " Account: $ACCOUNT_ID"
58+
echo " Identity: $CALLER_ARN"
4359
echo " Region: $REGION"
4460
echo " Max age: ${MAX_AGE_HOURS}h"
4561
echo " Dry run: $DRY_RUN"
@@ -62,6 +78,7 @@ fi
6278

6379
DELETED=0
6480
SKIPPED=0
81+
FAILED=0
6582

6683
while IFS=$'\t' read -r STACK_NAME CREATION_TIME; do
6784
# Apply prefix filter
@@ -99,8 +116,15 @@ while IFS=$'\t' read -r STACK_NAME CREATION_TIME; do
99116
continue
100117
fi
101118

102-
# Check age
103-
CREATED_EPOCH=$(date -d "$CREATION_TIME" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%S" "${CREATION_TIME%%.*}" +%s 2>/dev/null || echo 0)
119+
# Check age. Parse the CreationTime to epoch seconds (GNU date, then BSD date).
120+
# FAIL CLOSED: if both parsers fail we cannot trust the age, so SKIP rather than
121+
# risk deleting a stack we can't prove is old enough.
122+
# GNU date honours the zone designator in the timestamp. The BSD fallback parses
# the string only after the fractional seconds and zone suffix are stripped, so
# it needs -u to read the remainder as UTC (the AWS CLI's default timestamp
# zone) rather than local time; otherwise the computed age is skewed by the
# host's UTC offset and a fresh stack could pass the age gate.
CREATED_EPOCH=$(date -d "$CREATION_TIME" +%s 2>/dev/null || date -j -u -f "%Y-%m-%dT%H:%M:%S" "${CREATION_TIME%%.*}" +%s 2>/dev/null || echo "")
123+
if ! [[ "$CREATED_EPOCH" =~ ^[0-9]+$ ]]; then
124+
echo " SKIP (unparseable creation time '$CREATION_TIME'): $STACK_NAME"
125+
((SKIPPED++)) || true
126+
continue
127+
fi
104128
AGE_SECONDS=$((NOW - CREATED_EPOCH))
105129

106130
if [[ $AGE_SECONDS -lt $MAX_AGE_SECONDS ]]; then
@@ -129,7 +153,8 @@ while IFS=$'\t' read -r STACK_NAME CREATION_TIME; do
129153

130154
if [[ -n "$SG_IDS" && "$SG_IDS" != "None" ]]; then
131155
for SG_ID in $SG_IDS; do
132-
# Find ENIs attached to this security group
156+
# Find ENIs attached to this security group.
157+
# shellcheck disable=SC2016 # backticks are JMESPath literal syntax for --query, must NOT expand
133158
ENIS=$(aws ec2 describe-network-interfaces \
134159
--region "$REGION" \
135160
--filters "Name=group-id,Values=$SG_ID" \
@@ -172,19 +197,26 @@ while IFS=$'\t' read -r STACK_NAME CREATION_TIME; do
172197
fi
173198

174199
# --- Delete the stack ---
200+
# Only count a deletion we actually initiated. Tolerate a single failure
201+
# (e.g. AccessDenied, transient throttling) without aborting the whole run —
202+
# set -e would otherwise kill the loop mid-pass and orphan later stacks.
175203
echo " Deleting stack $STACK_NAME..."
176-
aws cloudformation delete-stack \
204+
if aws cloudformation delete-stack \
177205
--region "$REGION" \
178-
--stack-name "$STACK_NAME" 2>/dev/null
179-
180-
((DELETED++)) || true
206+
--stack-name "$STACK_NAME" 2>/dev/null; then
207+
((DELETED++)) || true
208+
else
209+
echo " ERROR: delete-stack failed for $STACK_NAME (continuing)" >&2
210+
((FAILED++)) || true
211+
fi
181212

182213
done <<< "$STACKS"
183214

184215
echo ""
185216
echo "=== Summary ==="
186217
echo " Deleted: $DELETED"
187218
echo " Skipped: $SKIPPED"
219+
echo " Failed: $FAILED"
188220

189221
if [[ "$DELETED" -gt 0 && "$DRY_RUN" == "false" ]]; then
190222
echo ""

0 commit comments

Comments
 (0)