Skip to content

Commit c840426

Browse files
authored
Merge pull request #4818 from cardstack/cs-11106-preview-deployments-for-grafana-prs
CS-11106: per-PR preview deployments for grafana dashboards
2 parents 22f887b + e39be9b commit c840426

6 files changed

Lines changed: 1056 additions & 31 deletions

File tree

.github/workflows/observability-diff.yml

Lines changed: 45 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,17 @@ jobs:
8383
fetch-depth: 0
8484

8585
- name: Fetch PR base ref
86-
# Belt-and-suspenders: even with `fetch-depth: 0`, ensure the
87-
# PR base ref is available as `origin/<base>` so diff.sh's
88-
# `git rev-parse --verify` succeeds. Cheap on already-fetched
89-
# history; idempotent if the ref is already up to date.
86+
# Ensure the PR base ref is available as `origin/<base>` so
87+
# diff.sh's `git rev-parse --verify` and `git diff <base>...HEAD`
88+
# both succeed. NOTE: no `--depth=1` here. A shallow fetch makes
89+
# origin/<base> an orphan with no parent history; when main has
90+
# advanced past the PR's actual merge base (any non-trivial PR
91+
# in flight long enough for main to move), `git diff <base>...HEAD`
92+
# then fails with "no merge base". Unbounded fetch is cheap
93+
# against an already-full repo (`fetch-depth: 0` on checkout).
9094
env:
9195
PR_BASE_REF: ${{ github.event.pull_request.base.ref }}
92-
run: git fetch --no-tags --depth=1 origin "${PR_BASE_REF}"
96+
run: git fetch --no-tags origin "${PR_BASE_REF}"
9397

9498
- name: Install grafanactl
9599
uses: ./.github/actions/install-grafanactl
@@ -184,17 +188,43 @@ jobs:
184188
const diffBytes = Buffer.byteLength(diff, 'utf8');
185189
const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
186190
187-
let body;
191+
// Find existing sticky comment. Paginate so we don't miss
192+
// the marker on PRs with >100 comments (would otherwise
193+
// create duplicates on every push).
194+
const comments = await github.paginate(
195+
github.rest.issues.listComments,
196+
{
197+
owner: context.repo.owner,
198+
repo: context.repo.repo,
199+
issue_number: context.issue.number,
200+
per_page: 100,
201+
}
202+
);
203+
const existing = comments.find(
204+
(c) => c.body && c.body.startsWith(marker)
205+
);
206+
188207
if (!diff.trim()) {
189-
body = [
190-
marker,
191-
'## Observability diff (vs staging)',
192-
'',
193-
'No dashboard / folder changes detected against the staging Grafana.',
194-
'',
195-
`_(Run: ${runUrl})_`,
196-
].join('\n');
197-
} else if (diffBytes > maxBytes) {
208+
// No drift between the PR's committed state and live staging.
209+
// Don't post a "nothing changed" comment — but if a prior push
210+
// left a sticky behind (the PR used to touch dashboards and
211+
// now doesn't), delete it so the PR's comment stream matches
212+
// its current state.
213+
if (existing) {
214+
await github.rest.issues.deleteComment({
215+
owner: context.repo.owner,
216+
repo: context.repo.repo,
217+
comment_id: existing.id,
218+
});
219+
core.info(`Deleted stale sticky comment ${existing.id}`);
220+
} else {
221+
core.info('No diff vs staging — nothing to post.');
222+
}
223+
return;
224+
}
225+
226+
let body;
227+
if (diffBytes > maxBytes) {
198228
// Truncate by bytes, not characters. Walk back from a
199229
// character boundary so we don't slice mid-codepoint and
200230
// produce invalid UTF-8.
@@ -225,22 +255,6 @@ jobs:
225255
].join('\n');
226256
}
227257
228-
// Find existing sticky comment. Paginate so we don't miss
229-
// the marker on PRs with >100 comments (would otherwise
230-
// create duplicates on every push).
231-
const comments = await github.paginate(
232-
github.rest.issues.listComments,
233-
{
234-
owner: context.repo.owner,
235-
repo: context.repo.repo,
236-
issue_number: context.issue.number,
237-
per_page: 100,
238-
}
239-
);
240-
const existing = comments.find(
241-
(c) => c.body && c.body.startsWith(marker)
242-
);
243-
244258
if (existing) {
245259
await github.rest.issues.updateComment({
246260
owner: context.repo.owner,
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
name: Observability — preview sweep
2+
3+
# Daily sweep that removes stale preview folders + dashboards from the
4+
# staging Grafana. Safety net for observability-preview.yml's cleanup-on-
5+
# close job: a missed `closed` webhook, a job that errored mid-delete, or a
6+
# preview pushed against a PR that was later deleted from GitHub all leave
7+
# orphan `pr<n>-*` resources behind. This workflow notices and cleans up.
8+
#
9+
# Decision rule per PR number (derived from the `pr<n>-` UID prefix):
10+
# - PR is open → keep
11+
# - PR is closed/merged → delete
12+
# - PR doesn't exist on GitHub → delete (PR was deleted from repo state)
13+
#
14+
# CS-11106.
15+
16+
on:
17+
schedule:
18+
# Daily at 06:23 UTC — chosen to avoid the top-of-hour rush. No
19+
# particular significance to the minute; deliberately not on :00 so
20+
# GHA's "scheduled workflows often run minutes late" behavior doesn't
21+
# batch this with other on-the-hour crons.
22+
- cron: "23 6 * * *"
23+
workflow_dispatch:
24+
25+
permissions:
26+
contents: read
27+
id-token: write
28+
# `gh api` calls in the identify step need read access to the repo's
29+
# pull requests. Without this, every PR-state lookup would return 403
30+
# and the sweep would either delete previews for genuinely open PRs
31+
# (legacy behavior) or — after the indeterminate-state guard below —
32+
# abort the sweep entirely.
33+
pull-requests: read
34+
35+
concurrency:
36+
# The sweeper is idempotent, but running two at once would do redundant
37+
# API calls against Grafana and GitHub. Serialize; don't cancel an
38+
# in-flight run if the cron fires while a manual dispatch is still going.
39+
group: observability-preview-sweep
40+
cancel-in-progress: false
41+
42+
env:
43+
AWS_REGION: us-east-1
44+
AWS_ROLE_ARN: arn:aws:iam::680542703984:role/boxel-observability-apply
45+
GRAFANACTL_VERSION: "0.1.10"
46+
47+
jobs:
48+
sweep:
49+
name: Sweep stale previews from staging
50+
runs-on: ubuntu-latest
51+
52+
steps:
53+
- name: Checkout
54+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
55+
56+
- name: Install grafanactl
57+
uses: ./.github/actions/install-grafanactl
58+
with:
59+
version: ${{ env.GRAFANACTL_VERSION }}
60+
61+
- name: Configure AWS credentials
62+
uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 # v6.1.0
63+
with:
64+
role-to-assume: ${{ env.AWS_ROLE_ARN }}
65+
aws-region: ${{ env.AWS_REGION }}
66+
67+
- name: Source GRAFANA_TOKEN
68+
run: |
69+
GRAFANA_TOKEN="$(aws ssm get-parameter \
70+
--name /staging/grafana/grafanactl_token \
71+
--with-decryption \
72+
--query 'Parameter.Value' \
73+
--output text)"
74+
echo "::add-mask::$GRAFANA_TOKEN"
75+
echo "GRAFANA_TOKEN=$GRAFANA_TOKEN" >> "$GITHUB_ENV"
76+
77+
- name: Identify stale previews
78+
id: identify
79+
working-directory: packages/observability
80+
env:
81+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
82+
REPO: ${{ github.repository }}
83+
run: |
84+
set -eo pipefail
85+
remote="$(mktemp -d)"
86+
cfg="$(./scripts/render-config.sh staging)"
87+
trap 'rm -rf "$remote"; rm -f "$cfg"' EXIT
88+
89+
grafanactl --config "$cfg" --context staging \
90+
resources pull dashboards folders --path "$remote" >/dev/null
91+
92+
# Extract the set of unique PR numbers present in preview UIDs.
93+
# `grep ... || true` because `set -eo pipefail` would otherwise
94+
# abort the step when grep finds no matches — that's the legitimate
95+
# "no previews live in staging" path, not an error.
96+
pr_numbers="$(find "$remote" -type f -name '*.json' \
97+
-exec jq -r '.metadata.name // ""' {} \; \
98+
| { grep -E '^pr[0-9]+-' || true; } \
99+
| sed -E 's/^pr([0-9]+)-.*/\1/' \
100+
| sort -u)"
101+
102+
if [[ -z "$pr_numbers" ]]; then
103+
echo "no preview resources present — nothing to sweep"
104+
echo "stale_prs=" >> "$GITHUB_OUTPUT"
105+
exit 0
106+
fi
107+
108+
echo "found previews for PR(s): $(echo "$pr_numbers" | tr '\n' ' ')"
109+
110+
# For each PR number, ask GitHub whether it's still open. Use
111+
# `gh api` directly (rather than `gh pr view`) so we can
112+
# distinguish a real 404 ("PR was deleted from the repo —
113+
# safe to sweep") from any other failure ("transient API
114+
# error / rate limit / auth glitch — DO NOT sweep, fail the
115+
# step so a human can investigate"). Treating every nonzero
116+
# exit as "PR doesn't exist → delete its preview" would let
117+
# a GitHub outage wipe previews for legitimately open PRs.
118+
stale=""
119+
while IFS= read -r pr; do
120+
[[ -n "$pr" ]] || continue
121+
api_out="$(mktemp)"
122+
api_err="$(mktemp)"
123+
if gh api "repos/${REPO}/pulls/${pr}" \
124+
--jq '.state' \
125+
> "$api_out" 2> "$api_err"; then
126+
state="$(cat "$api_out")"
127+
rm -f "$api_out" "$api_err"
128+
case "$state" in
129+
open)
130+
echo " PR #${pr}: open — keep"
131+
;;
132+
closed)
133+
echo " PR #${pr}: closed — sweep"
134+
stale="${stale}${pr} "
135+
;;
136+
*)
137+
echo "::error::PR #${pr}: unexpected state '${state}' — aborting sweep"
138+
exit 1
139+
;;
140+
esac
141+
else
142+
# `gh api` reports the HTTP status on stderr (captured above) for
143+
# non-2xx responses. A 404 is the unambiguous "PR was
144+
# deleted from the repo" signal; anything else (network,
145+
# rate-limit, 5xx, 403) is indeterminate and we refuse
146+
# to sweep on it.
147+
err_body="$(cat "$api_err")"
148+
rm -f "$api_out" "$api_err"
149+
if grep -q 'HTTP 404' <<< "$err_body"; then
150+
echo " PR #${pr}: 404 — sweep (PR deleted from repo)"
151+
stale="${stale}${pr} "
152+
else
153+
echo "::error::PR #${pr}: indeterminate gh-api failure — aborting sweep"
154+
echo "::error::${err_body}"
155+
exit 1
156+
fi
157+
fi
158+
done <<< "$pr_numbers"
159+
160+
echo "stale_prs=${stale}" >> "$GITHUB_OUTPUT"
161+
162+
- name: Sweep
163+
if: steps.identify.outputs.stale_prs != ''
164+
working-directory: packages/observability
165+
env:
166+
STALE_PRS: ${{ steps.identify.outputs.stale_prs }}
167+
run: |
168+
set -eo pipefail
169+
for pr in $STALE_PRS; do
170+
echo "::group::Cleanup PR #${pr}"
171+
./scripts/cleanup-preview.sh --pr "$pr" --env staging
172+
echo "::endgroup::"
173+
done
174+
175+
- name: Summary
176+
if: always()
177+
env:
178+
STALE_PRS: ${{ steps.identify.outputs.stale_prs }}
179+
run: |
180+
{
181+
echo "## Preview sweep"
182+
echo ""
183+
if [[ -z "${STALE_PRS:-}" ]]; then
184+
echo "No stale previews."
185+
else
186+
echo "Swept previews for PR(s): \`${STALE_PRS}\`"
187+
fi
188+
} >> "$GITHUB_STEP_SUMMARY"

0 commit comments

Comments
 (0)