-
Notifications
You must be signed in to change notification settings - Fork 25
275 lines (254 loc) · 11.9 KB
/
Copy pathinteg.yml
File metadata and controls
275 lines (254 loc) · 11.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
name: integ

# Phase-0 deploy-then-verify integration tests (issue #236). integ-runner
# stands up a trimmed Task API stack in the shared account reached through
# secrets.AWS_ROLE_TO_ASSUME, runs the create-and-persist smoke assertions,
# and then tears the stack down again.
#
# The trigger model mirrors deploy.yml:
#   1. build.yml completes and workflow_run picks it up in the trusted
#      base-repo context (secrets/OIDC are available even for fork PRs);
#   2. we resolve whether the PR touches cdk/** or agent/**;
#   3. an admin approves the `integ` environment gate;
#   4. deploy/assert/destroy runs against the shared account;
#   5. an `integ-smoke` commit status is posted back to the PR head, where it
#      shows up as a (required) check that blocks merge.
#
# The local dev path is unchanged: run `mise //cdk:integ` with your own AWS
# credentials.
#
# The nightly schedule (previously 07:00 UTC) was dropped on purpose — the
# per-PR path plus manual dispatch is the agreed coverage, not an oversight.
on:
  # zizmor: ignore[dangerous-triggers] — intentional; workflow_run is required so
  # fork PRs can run against the shared account (a fork `pull_request` job gets no
  # secrets/OIDC). Mitigations: build-success guard, path-filter, `integ`
  # environment approval gate (admin reviews fork test code before it runs with
  # the privileged role), least-privilege role, status-only tokens per job.
  workflow_run:
    workflows:
      - build
    types:
      - completed
  workflow_dispatch: {}
# Serialize integ runs: every run deploys the same hardcoded
# `backgroundagent-integ` stack into the shared account, so overlapping
# deploys would collide. A run that is already in progress is never cancelled
# by a newer one (cancel-in-progress: false).
#
# NOTE(review): the group is set at workflow scope, so it also holds back the
# `resolve` job of later runs, and GitHub keeps at most one *pending* run per
# concurrency group — a queued run can be replaced by a newer one before it
# posts any `integ-smoke` status. Confirm this is acceptable for a required
# check.
concurrency:
  group: cdk-integ
  cancel-in-progress: false

# No default token permissions; each job below opts in to what it needs.
permissions: {}
jobs:
  # Decides whether this PR needs the integ run (touches cdk/** or agent/**) and
  # posts the gating `integ-smoke` status. Always runs on a successful build so
  # docs/cli-only PRs get an immediate green (skipped) status and never deadlock
  # the required check.
  #
  # Outputs consumed by the later jobs:
  #   applicable - 'true' when the integ job should run, otherwise 'false'
  #   head_sha   - commit to check out and to post statuses against
  #   head_repo  - owner/name of the repository that holds head_sha
  resolve:
    # Manual dispatch is restricted to main (defence in depth — the `integ`
    # environment approval is the primary gate). PR runs come via workflow_run.
    if: >-
      (github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/main') ||
      (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
    runs-on: ubuntu-latest
    permissions:
      statuses: write
      pull-requests: read
    outputs:
      applicable: ${{ steps.decide.outputs.applicable }}
      head_sha: ${{ steps.decide.outputs.head_sha }}
      head_repo: ${{ steps.decide.outputs.head_repo }}
    steps:
      - name: Resolve applicability and post pending status
        id: decide
        env:
          GH_TOKEN: ${{ github.token }}
          REPO: ${{ github.repository }}
          EVENT_NAME: ${{ github.event_name }}
          # Empty for workflow_dispatch.
          PR_NUMBER_FROM_EVENT: ${{ github.event.workflow_run.pull_requests[0].number }}
          WF_HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
          WF_HEAD_REPO: ${{ github.event.workflow_run.head_repository.full_name }}
          # Link shown next to the commit status; passed through the
          # environment so the script body contains no template expansion.
          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        run: |
          set -euo pipefail

          # workflow_dispatch: no PR context — run against the dispatched ref
          # (the job's own checkout defaults). Mark applicable, skip status post.
          if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            echo "applicable=true" >> "$GITHUB_OUTPUT"
            echo "head_sha=${GITHUB_SHA}" >> "$GITHUB_OUTPUT"
            echo "head_repo=${REPO}" >> "$GITHUB_OUTPUT"
            echo "Manual dispatch — running integ against ${GITHUB_SHA}."
            exit 0
          fi

          HEAD_SHA="$WF_HEAD_SHA"
          echo "head_sha=${HEAD_SHA}" >> "$GITHUB_OUTPUT"
          echo "head_repo=${WF_HEAD_REPO}" >> "$GITHUB_OUTPUT"

          # Track whether we've posted an integ-smoke status. If the job dies
          # (failed API call, runner crash) before reaching one, the EXIT trap
          # posts `error` so the required check resolves instead of hanging
          # pending forever and silently blocking merge.
          STATUS_POSTED=""

          # Raw status POST shared by post_status and the EXIT trap.
          # $1=state $2=description
          send_status() {
            gh api -X POST "repos/$REPO/statuses/$HEAD_SHA" \
              -f context=integ-smoke \
              -f state="$1" \
              -f description="$2" \
              -f target_url="$RUN_URL" \
              >/dev/null
          }

          # Post a status and remember that we did. If the POST fails, errexit
          # aborts before the flag is set, so the EXIT trap still fires.
          # $1=state $2=description
          post_status() {
            send_status "$1" "$2"
            STATUS_POSTED="yes"
          }

          # Installed only after the workflow_dispatch early exit above, so it
          # runs in the workflow_run (PR) context only. Posts `error` when the
          # script is exiting non-zero, has a SHA, and has not posted yet.
          on_exit() {
            rc=$?
            if [[ $rc -ne 0 && -z "$STATUS_POSTED" && -n "${HEAD_SHA:-}" ]]; then
              # Best effort: a failure here must not mask the original error.
              send_status error "resolve step failed before gating" 2>/dev/null || true
            fi
          }
          trap on_exit EXIT

          # Prefer the PR number carried by the event; otherwise ask the API
          # which PR the head commit belongs to. Prints nothing when no PR is
          # found (lookup errors are deliberately treated the same way).
          resolve_pr_number() {
            if [[ -n "$PR_NUMBER_FROM_EVENT" ]]; then
              echo "$PR_NUMBER_FROM_EVENT"
              return
            fi
            gh api "repos/$REPO/commits/$HEAD_SHA/pulls" --jq '.[0].number // empty' 2>/dev/null || true
          }

          PR_NUMBER=$(resolve_pr_number)
          if [[ -z "$PR_NUMBER" ]]; then
            echo "::warning::No PR resolved for $HEAD_SHA — nothing to gate; skipping."
            echo "applicable=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Fork-PR safety: only run fork-authored code after a maintainer has
          # applied the `safe-to-test` label (defence in depth on top of the
          # `integ` environment approval). If it's absent, leave the status
          # pending and don't run — re-trigger once the label is added.
          if [[ "$WF_HEAD_REPO" != "$REPO" ]]; then
            if ! LABELS=$(gh api "repos/$REPO/issues/$PR_NUMBER/labels" --paginate --jq '.[].name'); then
              echo "::error::Failed to read labels for PR #$PR_NUMBER."
              exit 1
            fi
            # Here-string instead of `echo | grep -q`: under pipefail, grep -q
            # exiting on the first match can SIGPIPE the writer and turn a
            # match into a non-zero pipeline status.
            if ! grep -qx 'safe-to-test' <<<"$LABELS"; then
              post_status pending "awaiting safe-to-test label on fork PR"
              echo "applicable=false" >> "$GITHUB_OUTPUT"
              echo "Fork PR #$PR_NUMBER lacks safe-to-test label — not running."
              exit 0
            fi
          fi

          # Path-filter must happen here (not on.pull_request.paths) because the
          # trigger is workflow_run. Fail loud on API error: a failed or truncated
          # /files response must NOT fall through to a false-green skip. With
          # `set -e`, an assignment inside an `if !` condition does not trip
          # errexit, so we handle the failure explicitly and let the EXIT trap
          # post `error`.
          if ! CHANGED=$(gh api "repos/$REPO/pulls/$PR_NUMBER/files" --paginate --jq '.[].filename'); then
            echo "::error::Failed to list changed files for PR #$PR_NUMBER."
            exit 1
          fi

          # GitHub returns at most 3000 files for a PR. A listing that reaches
          # the cap may be truncated, so it cannot prove that cdk/** and
          # agent/** are untouched — run integ rather than skip green.
          MAX_LISTED_FILES=3000
          CHANGED_COUNT=$(grep -c '' <<<"$CHANGED")

          if (( CHANGED_COUNT >= MAX_LISTED_FILES )); then
            post_status pending "awaiting admin approval / running"
            echo "applicable=true" >> "$GITHUB_OUTPUT"
            echo "::warning::PR #$PR_NUMBER lists $CHANGED_COUNT changed files (possibly truncated) — integ applies to be safe."
          elif grep -Eq '^(cdk|agent)/' <<<"$CHANGED"; then
            # Here-string (no pipeline) so a match can never be reported as a
            # miss under pipefail and fall through to the green skip below.
            post_status pending "awaiting admin approval / running"
            echo "applicable=true" >> "$GITHUB_OUTPUT"
            echo "PR #$PR_NUMBER touches cdk/** or agent/** — integ applies."
          else
            post_status success "skipped — no cdk/** or agent/** changes"
            echo "applicable=false" >> "$GITHUB_OUTPUT"
            echo "PR #$PR_NUMBER has no cdk/** or agent/** changes — integ skipped (green)."
          fi

  # The admin-gated deploy -> assert -> destroy. The `integ` environment's
  # required reviewer is the approval gate; while it waits, the integ-smoke
  # status stays pending and merge stays blocked.
  integ:
    needs: resolve
    if: needs.resolve.outputs.applicable == 'true'
    name: CDK integ smoke (Task API)
    runs-on: ubuntu-latest
    environment: integ
    timeout-minutes: 45
    permissions:
      id-token: write
      contents: read
    env:
      CI: "true"
      MISE_EXPERIMENTAL: "1"
    steps:
      - name: Checkout PR head (incl. forks)
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          # Approving the `integ` environment authorizes this fork-authored test
          # code to run with the privileged role — the approver MUST review
          # cdk/test/integ/** changes before approving.
          repository: ${{ needs.resolve.outputs.head_repo }}
          ref: ${{ needs.resolve.outputs.head_sha }}
          persist-credentials: false

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a  # v4.3.1
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
          # Fall back to us-east-1 if the repo variable is unset, so the action
          # never runs region-less (which would fail credential resolution).
          aws-region: ${{ vars.AWS_REGION || 'us-east-1' }}

      - name: Install mise
        uses: jdx/mise-action@1648a7812b9aeae629881980618f079932869151  # v4.0.1
        with:
          cache: true

      - name: Setup Node.js
        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
        with:
          node-version: 22.x

      - name: Install dependencies
        run: yarn install --immutable

      - name: Run integ tests (deploy → assert → destroy)
        run: mise //cdk:integ

      # Safety net: integ-runner forces teardown on success and failure, but if
      # the run is cancelled or crashes mid-deploy the stack can be stranded in
      # the shared account. Delete it directly via CloudFormation so we never
      # leak billable resources.
      #
      # NOTE: `cdk destroy backgroundagent-integ` would NOT work here — it
      # synthesizes the main app (src/main.ts), which does not contain the integ
      # stack, so it exits 0 having deleted nothing. Target the stack by its
      # literal CloudFormation name instead. delete-stack is idempotent (no-op if
      # already gone), so `|| true` only guards transient API errors.
      - name: Ensure stack torn down
        if: always()
        env:
          AWS_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
          AWS_DEFAULT_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
        run: |
          set -euo pipefail
          aws cloudformation delete-stack --stack-name backgroundagent-integ || true
          # No `|| true` on the wait: a DELETE_FAILED must surface loudly so we
          # never silently leak billable resources in the shared account.
          aws cloudformation wait stack-delete-complete --stack-name backgroundagent-integ

  # Post the final integ-smoke status back to the PR head so the check flips from
  # pending to success/failure. Skipped for workflow_dispatch (no PR to gate).
  # `always()` keeps this job running when `integ` failed, was cancelled or was
  # skipped; any result other than `success` is reported as `failure`.
  report:
    needs: [resolve, integ]
    if: >-
      always() &&
      needs.resolve.result == 'success' &&
      needs.resolve.outputs.applicable == 'true' &&
      github.event_name == 'workflow_run'
    runs-on: ubuntu-latest
    permissions:
      statuses: write
    steps:
      - name: Post final integ-smoke status
        env:
          GH_TOKEN: ${{ github.token }}
          REPO: ${{ github.repository }}
          HEAD_SHA: ${{ needs.resolve.outputs.head_sha }}
          INTEG_RESULT: ${{ needs.integ.result }}
          # Link shown next to the commit status; passed through the
          # environment so the script body contains no template expansion.
          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        run: |
          set -euo pipefail
          if [[ "$INTEG_RESULT" == "success" ]]; then
            STATE=success
            DESC="deploy → assert → destroy passed"
          else
            STATE=failure
            DESC="integ run ${INTEG_RESULT}"
          fi
          gh api -X POST "repos/$REPO/statuses/$HEAD_SHA" \
            -f context=integ-smoke \
            -f state="$STATE" \
            -f description="$DESC" \
            -f target_url="$RUN_URL" \
            >/dev/null