name: Observability — preview sweep

# Daily sweep that removes stale preview folders + dashboards from the
# staging Grafana. Safety net for observability-preview.yml's cleanup-on-
# close job: a missed `closed` webhook, a job that errored mid-delete, or a
# preview pushed against a PR that was later deleted from GitHub all leave
# orphan `pr<n>-*` resources behind. This workflow notices and cleans up.
#
# Decision rule per PR number (derived from the `pr<n>-` UID prefix):
#   - PR is open                 → keep
#   - PR is closed/merged        → delete
#   - PR doesn't exist on GitHub → delete (PR was deleted from repo state)
#   - state can't be determined  → abort the run; nothing is deleted
#
# CS-11106.

on:
  schedule:
    # Daily at 06:23 UTC — chosen to avoid the top-of-hour rush. No
    # particular significance to the minute; deliberately not on :00 so
    # GHA's "scheduled workflows often run minutes late" behavior doesn't
    # batch this with other on-the-hour crons.
    - cron: "23 6 * * *"
  workflow_dispatch:

permissions:
  contents: read
  id-token: write
  # The `gh api repos/<repo>/pulls/<n>` lookups in the "Identify stale
  # previews" step need read access to the repo's pull requests. Without
  # it those lookups fail, and the identify step aborts the run rather
  # than guess at PR state.
  pull-requests: read

concurrency:
  # The sweeper is idempotent, but running two at once would do redundant
  # API calls against Grafana and GitHub. Serialize; don't cancel an
  # in-flight run if the cron fires while a manual dispatch is still going.
  group: observability-preview-sweep
  cancel-in-progress: false

env:
  AWS_REGION: us-east-1
  AWS_ROLE_ARN: arn:aws:iam::680542703984:role/boxel-observability-apply
  GRAFANACTL_VERSION: "0.1.10"

jobs:
  sweep:
    name: Sweep stale previews from staging
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Install grafanactl
        uses: ./.github/actions/install-grafanactl
        with:
          version: ${{ env.GRAFANACTL_VERSION }}

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 # v6.1.0
        with:
          role-to-assume: ${{ env.AWS_ROLE_ARN }}
          aws-region: ${{ env.AWS_REGION }}

      # Reads the Grafana token from SSM and exports it (masked) to later
      # steps through GITHUB_ENV.
      - name: Source GRAFANA_TOKEN
        run: |
          set -eo pipefail
          GRAFANA_TOKEN="$(aws ssm get-parameter \
            --name /staging/grafana/grafanactl_token \
            --with-decryption \
            --query 'Parameter.Value' \
            --output text)"
          # Fail here, with a clear message, instead of letting later
          # steps run with an empty credential.
          if [[ -z "$GRAFANA_TOKEN" ]]; then
            echo "::error::SSM returned an empty Grafana token"
            exit 1
          fi
          echo "::add-mask::$GRAFANA_TOKEN"
          echo "GRAFANA_TOKEN=$GRAFANA_TOKEN" >> "$GITHUB_ENV"

      # Pulls dashboards + folders from staging, derives the PR numbers
      # encoded in `pr<n>-*` resource names, and asks GitHub for each PR's
      # state. Emits `stale_prs`: a space-separated list of PR numbers whose
      # previews should be deleted (empty when there is nothing to do).
      - name: Identify stale previews
        id: identify
        working-directory: packages/observability
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
        run: |
          set -eo pipefail

          # One scratch directory holds everything this step creates, and
          # the trap is registered before any further work so a failure in
          # render-config.sh or grafanactl can't leave files behind.
          workdir="$(mktemp -d)"
          cfg=""
          trap 'rm -rf "$workdir"; [[ -z "$cfg" ]] || rm -f "$cfg"' EXIT
          remote="${workdir}/remote"
          api_err="${workdir}/gh-api.err"
          mkdir "$remote"

          # NOTE(review): render-config.sh is assumed to print the path of
          # a rendered config file (it is removed on exit) — confirm.
          cfg="$(./scripts/render-config.sh staging)"

          grafanactl --config "$cfg" --context staging \
            resources pull dashboards folders --path "$remote" >/dev/null

          # Extract the set of unique PR numbers present in preview UIDs.
          # `grep ... || true` because `set -eo pipefail` would otherwise
          # abort the step when grep finds no matches — that's the legitimate
          # "no previews live in staging" path, not an error.
          pr_numbers="$(find "$remote" -type f -name '*.json' \
              -exec jq -r '.metadata.name // ""' {} \; \
            | { grep -E '^pr[0-9]+-' || true; } \
            | sed -E 's/^pr([0-9]+)-.*/\1/' \
            | sort -u)"

          if [[ -z "$pr_numbers" ]]; then
            echo "no preview resources present — nothing to sweep"
            echo "stale_prs=" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          echo "found previews for PR(s): $(echo "$pr_numbers" | tr '\n' ' ')"

          # Pre-flight: prove the token can read this repo's pull requests
          # at all before trusting any per-PR 404. GitHub can answer 404
          # for resources the caller isn't allowed to see, so without this
          # check a credentials or repo-name problem could look like
          # "every PR was deleted" and wipe previews for open PRs.
          if ! gh api "repos/${REPO}/pulls?state=all&per_page=1" \
                 --jq 'length' >/dev/null 2> "$api_err"; then
            echo "::error::cannot list pull requests for ${REPO} — aborting sweep"
            echo "::error::$(cat "$api_err")"
            exit 1
          fi

          # For each PR number, ask GitHub whether it's still open. Use
          # `gh api` directly (rather than `gh pr view`) so we can
          # distinguish a real 404 ("PR was deleted from the repo —
          # safe to sweep") from any other failure ("transient API
          # error / rate limit / auth glitch — DO NOT sweep, fail the
          # step so a human can investigate"). Treating every nonzero
          # exit as "PR doesn't exist → delete its preview" would let
          # a GitHub outage wipe previews for legitimately open PRs.
          stale=()
          while IFS= read -r pr; do
            [[ -n "$pr" ]] || continue
            if state="$(gh api "repos/${REPO}/pulls/${pr}" \
                          --jq '.state' 2> "$api_err")"; then
              case "$state" in
                open)
                  echo "  PR #${pr}: open — keep"
                  ;;
                closed)
                  # Merged PRs also report state "closed".
                  echo "  PR #${pr}: closed — sweep"
                  stale+=("$pr")
                  ;;
                *)
                  echo "::error::PR #${pr}: unexpected state '${state}' — aborting sweep"
                  exit 1
                  ;;
              esac
            else
              # `gh api` reports the HTTP status on stderr for non-2xx
              # responses. With repo access already proven by the
              # pre-flight above, a 404 here means the PR itself is gone;
              # anything else (network, rate-limit, 5xx, 403) is
              # indeterminate and we refuse to sweep on it.
              err_body="$(cat "$api_err")"
              if grep -q 'HTTP 404' <<< "$err_body"; then
                echo "  PR #${pr}: 404 — sweep (PR deleted from repo)"
                stale+=("$pr")
              else
                echo "::error::PR #${pr}: indeterminate gh-api failure — aborting sweep"
                echo "::error::${err_body}"
                exit 1
              fi
            fi
          done <<< "$pr_numbers"

          # Space-separated, no trailing separator; empty when nothing is stale.
          echo "stale_prs=${stale[*]}" >> "$GITHUB_OUTPUT"

      # Deletes the preview resources for every PR in `stale_prs`. Each PR
      # is attempted even if an earlier one fails, so a single bad cleanup
      # can't strand the rest; the step still fails at the end if any
      # cleanup failed. Emits `failed_prs` for the summary.
      - name: Sweep
        id: sweep
        if: steps.identify.outputs.stale_prs != ''
        working-directory: packages/observability
        env:
          STALE_PRS: ${{ steps.identify.outputs.stale_prs }}
        run: |
          set -eo pipefail
          failed=()
          # Unquoted on purpose: STALE_PRS is a space-separated list of
          # digits-only PR numbers produced by the identify step.
          for pr in $STALE_PRS; do
            echo "::group::Cleanup PR #${pr}"
            if ! ./scripts/cleanup-preview.sh --pr "$pr" --env staging; then
              failed+=("$pr")
            fi
            # Always close the log group, including after a failed cleanup.
            echo "::endgroup::"
          done

          echo "failed_prs=${failed[*]}" >> "$GITHUB_OUTPUT"
          if (( ${#failed[@]} > 0 )); then
            echo "::error::cleanup failed for PR(s): ${failed[*]}"
            exit 1
          fi

      # Always runs, so it must report what actually happened rather than
      # what was planned: an aborted identify step or a failed sweep must
      # not be summarized as "No stale previews." / "Swept previews".
      - name: Summary
        if: always()
        env:
          STALE_PRS: ${{ steps.identify.outputs.stale_prs }}
          FAILED_PRS: ${{ steps.sweep.outputs.failed_prs }}
          IDENTIFY_OUTCOME: ${{ steps.identify.outcome }}
          SWEEP_OUTCOME: ${{ steps.sweep.outcome }}
        run: |
          {
            echo "## Preview sweep"
            echo ""
            if [[ "${IDENTIFY_OUTCOME:-}" != "success" ]]; then
              # Sweep only runs after a successful identify step, so
              # nothing was deleted on this path.
              echo "Sweep aborted: stale previews could not be determined (identify step: \`${IDENTIFY_OUTCOME:-unknown}\`). Nothing was deleted."
            elif [[ -z "${STALE_PRS:-}" ]]; then
              echo "No stale previews."
            elif [[ "${SWEEP_OUTCOME:-}" == "success" ]]; then
              echo "Swept previews for PR(s): \`${STALE_PRS}\`"
            else
              echo "Sweep incomplete (sweep step: \`${SWEEP_OUTCOME:-unknown}\`)."
              echo ""
              echo "Stale PR(s) identified: \`${STALE_PRS}\`"
              if [[ -n "${FAILED_PRS:-}" ]]; then
                echo ""
                echo "Cleanup failed for PR(s): \`${FAILED_PRS}\`"
              fi
            fi
          } >> "$GITHUB_STEP_SUMMARY"