|
| 1 | +name: Runner Liveness |
| 2 | + |
| 3 | +# #509 slice 1 — liveness alert for the self-hosted runner pool. |
| 4 | +# |
| 5 | +# Every gating CI job runs on `[self-hosted, …]`, so when the pool goes offline |
| 6 | +# every gate queues forever with no fallback and no alarm (the multi-day outage |
| 7 | +# in #509 was invisible until someone noticed by hand). This workflow is |
| 8 | +# GitHub-HOSTED (`ubuntu-latest`), so it keeps firing even when the self-hosted |
| 9 | +# pool is down, and turns that silent failure into a durable tracking issue. |
| 10 | +# |
| 11 | +# Signals (in order of reliability): |
| 12 | +# 1. Queued-run age — authoritative. Needs only `actions: read`, works |
| 13 | +# regardless of whether runners are registered at repo or org level, and |
| 14 | +# directly measures the symptom (jobs stuck in `queued`). |
| 15 | +# 2. Runner list — best-effort. `GET /repos/{repo}/actions/runners` needs the |
| 16 | +# `administration` permission, which is NOT a grantable `GITHUB_TOKEN` |
| 17 | +# scope, so the default token gets 403 (and an org-level pool returns empty |
| 18 | +# anyway). A failed/empty lookup is logged and SKIPPED rather than raised as |
| 19 | +# a false alarm; wire a PAT into `GH_TOKEN` later if a hard runner count is |
| 20 | +# wanted. The queued-age signal above is the real alarm. |
| 21 | + |
| 22 | +on: |
| 23 | + schedule: |
| 24 | + - cron: '*/15 * * * *' # probe four times an hour |
| 25 | + workflow_dispatch: |
| 26 | + |
| 27 | +permissions: |
| 28 | + actions: read # lets the probe list workflow runs for the queued-age check |
| 29 | + issues: write # lets the job open, comment on and close the tracking issue |
| 30 | + # The runner-list call would need `administration`, which GITHUB_TOKEN cannot |
| 31 | + # be granted; that check is best-effort and skips itself when the lookup fails. |
| 32 | + |
| 33 | +concurrency: |
| 34 | + cancel-in-progress: false # do not cancel a probe that is already in flight |
| 35 | + group: runner-liveness |
| 36 | + |
| 37 | +env: |
| 38 | + TRACKING_TITLE: '🚨 CI runner pool liveness alert' # title given to a newly opened tracking issue |
| 39 | + TRACKING_LABEL: 'runner-down' # label used to find (and tag) the tracking issue |
| 40 | + QUEUE_THRESHOLD_MINUTES: '30' # queued-run age, in minutes, above which the probe alarms |
| 41 | + |
| 42 | +jobs: |
| 43 | + liveness: |
| 44 | + runs-on: ubuntu-latest # GitHub-hosted on purpose: must keep running while the self-hosted pool is down |
| 45 | + steps: |
| 46 | + - name: Probe runner pool and queued runs |
| 47 | + id: probe |
| 48 | + env: |
| 49 | + GH_TOKEN: ${{ github.token }} |
| 50 | + REPO: ${{ github.repository }} |
| 51 | + run: | |
| 52 | + set -euo pipefail |
| 53 | + problems=() # one line per detected fault; empty means the pool looks healthy |
| 54 | +
|
| 55 | + # --- Runner list (best-effort: a failed lookup is skipped, never raised as an alarm) --- |
| 56 | + if runners=$(gh api "repos/$REPO/actions/runners" 2>/dev/null); then |
| 57 | + total=$(jq '.total_count // 0' <<<"$runners") # a missing/null count is treated as 0 so the integer test below stays valid |
| 58 | + online=$(jq '[.runners[]? | select(.status=="online")] | length' <<<"$runners") |
| 59 | + echo "runners: total=$total online=$online" |
| 60 | + if [ "${total:-0}" -gt 0 ] && [ "${online:-0}" -eq 0 ]; then |
| 61 | + problems+=("All ${total} registered runner(s) are offline.") |
| 62 | + fi |
| 63 | + else |
| 64 | + echo "note: could not list repo runners (org-level pool or missing administration:read) — skipping runner check" |
| 65 | + fi |
| 66 | +
|
| 67 | + # --- Queued-run age (authoritative: a failed lookup must fail this step via set -e, never read as "no queued runs") --- |
| 68 | + threshold="${QUEUE_THRESHOLD_MINUTES}" |
| 69 | + now=$(date -u +%s) |
| 70 | + queued=$(gh api "repos/$REPO/actions/runs?status=queued&per_page=100") # no fallback: on error the step aborts, later steps are skipped and an open tracking issue is not auto-closed |
| 71 | + oldest_age=0 |
| 72 | + oldest_id="" |
| 73 | + while read -r id started; do # started = run_started_at (resets on re-run), falling back to created_at |
| 74 | + [ -n "$id" ] || continue |
| 75 | + age=$(( (now - $(date -u -d "$started" +%s)) / 60 )) |
| 76 | + if [ "$age" -gt "$oldest_age" ]; then |
| 77 | + oldest_age=$age |
| 78 | + oldest_id=$id |
| 79 | + fi |
| 80 | + done < <(jq -r '.workflow_runs[]? | "\(.id) \(.run_started_at // .created_at)"' <<<"$queued") |
| 81 | + echo "oldest queued run: id=${oldest_id:-none} age=${oldest_age}m (threshold ${threshold}m)" |
| 82 | + if [ "$oldest_age" -gt "$threshold" ]; then |
| 83 | + problems+=("Run ${oldest_id} has been queued ${oldest_age}m (> ${threshold}m) — runners are not picking up jobs.") |
| 84 | + fi |
| 85 | +
|
| 86 | + # --- Result: publish status=up|down, plus a bullet-list summary when down --- |
| 87 | + if [ ${#problems[@]} -gt 0 ]; then |
| 88 | + { |
| 89 | + echo "status=down" |
| 90 | + echo "summary<<SUMEOF" # multi-line output: everything up to the SUMEOF line becomes `summary` |
| 91 | + printf -- '- %s\n' "${problems[@]}" |
| 92 | + echo "SUMEOF" |
| 93 | + } >> "$GITHUB_OUTPUT" |
| 94 | + else |
| 95 | + echo "status=up" >> "$GITHUB_OUTPUT" |
| 96 | + fi |
| 97 | +
|
| 98 | + - name: Open or update tracking issue |
| 99 | + if: steps.probe.outputs.status == 'down' |
| 100 | + env: |
| 101 | + GH_TOKEN: ${{ github.token }} |
| 102 | + REPO: ${{ github.repository }} |
| 103 | + SUMMARY: ${{ steps.probe.outputs.summary }} |
| 104 | + RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} |
| 105 | + run: | |
| 106 | + set -euo pipefail |
| 107 | + stamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") # UTC time of this probe, embedded in the issue text |
| 108 | + body=$(cat <<EOF |
| 109 | + <!-- runner-liveness-tracker --> |
| 110 | + The scheduled runner-liveness probe detected a problem at ${stamp}: |
| 111 | +
|
| 112 | + ${SUMMARY} |
| 113 | +
|
| 114 | + This issue auto-updates on each probe and auto-closes when the pool recovers. |
| 115 | + Diagnose: \`gh api repos/${REPO}/actions/runners --jq '[.runners[] | select(.status=="online")] | length'\` (0 = no repo-level runner online; needs admin access). |
| 116 | + Probe run: ${RUN_URL} |
| 117 | + EOF |
| 118 | + ) |
| 119 | + existing=$(gh issue list --repo "$REPO" --label "$TRACKING_LABEL" --state open --json number --jq '.[0].number // empty') # first open issue carrying the tracking label, or empty |
| 120 | + if [ -n "$existing" ]; then |
| 121 | + gh issue comment "$existing" --repo "$REPO" --body "$body" # an issue is already open: append this probe's findings as a comment |
| 122 | + echo "updated tracking issue #$existing" |
| 123 | + else |
| 124 | + gh label create "$TRACKING_LABEL" --repo "$REPO" --color B60205 --description "CI runner pool liveness" 2>/dev/null || true # best-effort: any error from label creation is ignored |
| 125 | + gh issue create --repo "$REPO" --title "$TRACKING_TITLE" --label "$TRACKING_LABEL" --body "$body" |
| 126 | + fi |
| 127 | +
|
| 128 | + - name: Close tracking issue on recovery |
| 129 | + if: steps.probe.outputs.status == 'up' |
| 130 | + env: |
| 131 | + REPO: ${{ github.repository }} |
| 132 | + GH_TOKEN: ${{ github.token }} |
| 133 | + run: | |
| 134 | + set -euo pipefail |
| 135 | + recovered_at=$(date -u +"%Y-%m-%dT%H:%M:%SZ") # UTC time quoted in the closing comment |
| 136 | + for issue in $(gh issue list --repo "$REPO" --label "$TRACKING_LABEL" --state open --json number --jq '.[].number'); do # every open issue carrying the tracking label |
| 137 | + gh issue comment "$issue" --repo "$REPO" --body "✅ Runner pool healthy again as of ${recovered_at} (probe run ${GITHUB_RUN_ID}). Auto-closing." |
| 138 | + gh issue close "$issue" --repo "$REPO" --reason completed |
| 139 | + echo "closed recovered tracking issue #$issue" |
| 140 | + done |
| 141 | +
|
| 142 | + - name: Fail the run when the pool is down |
| 143 | + if: steps.probe.outputs.status == 'down' |
| 144 | + run: | |
| 145 | + printf '%s\n' "::error title=Runner pool down::Self-hosted runner pool liveness check failed — see the '$TRACKING_LABEL' tracking issue." # emitted as an error annotation on the run |
| 146 | + exit 1 # mark the probe run itself as failed so the outage is visible in the Actions list |
0 commit comments