From 5985218c222be15b4e7bf289562de91cdc8c3a71 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Sun, 14 Jun 2026 14:21:20 +0200 Subject: [PATCH] ci: add runner-liveness alert for the self-hosted pool (#509 slice 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every gating job runs on `[self-hosted, …]`, so when the pool goes offline every gate queues forever with no fallback and no alarm — the multi-day outage in #509 was invisible until noticed by hand. This GitHub-hosted workflow (ubuntu-latest, so it fires even when the pool is down) polls on a 15-min schedule + dispatch and raises a durable tracking issue instead of a transient red badge. Signals: (1) queued-run age > QUEUE_THRESHOLD_MINUTES (default 30) is the authoritative alarm — needs only actions:read and is agnostic to repo-vs-org runner registration; (2) the runner-list check is best-effort and self-skips, since listing self-hosted runners needs the `administration` scope that GITHUB_TOKEN cannot be granted. On a problem it opens or updates an idempotent `runner-down`-labelled issue (one tracker, comment-updated); on recovery it comments and auto-closes. Validated with actionlint (incl. shellcheck on the run blocks). Smoke-test via workflow_dispatch after merge. Out of scope (separate PRs): routing fast core gates to ubuntu-latest (runner policy + billing); the operational runbook. Trace: skip Refs: #509, #436 Co-Authored-By: Claude Opus 4.8 --- .github/workflows/runner-liveness.yml | 146 ++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 .github/workflows/runner-liveness.yml diff --git a/.github/workflows/runner-liveness.yml b/.github/workflows/runner-liveness.yml new file mode 100644 index 0000000..c2b16f4 --- /dev/null +++ b/.github/workflows/runner-liveness.yml @@ -0,0 +1,146 @@ +name: Runner Liveness + +# #509 slice 1 — liveness alert for the self-hosted runner pool. 
+# +# Every gating CI job runs on `[self-hosted, …]`, so when the pool goes offline +# every gate queues forever with no fallback and no alarm (the multi-day outage +# in #509 was invisible until someone noticed by hand). This workflow is +# GitHub-HOSTED (`ubuntu-latest`), so it keeps firing even when the self-hosted +# pool is down, and turns that silent failure into a durable tracking issue. +# +# Signals (in order of reliability): +# 1. Queued-run age — authoritative. Needs only `actions: read`, works +# regardless of whether runners are registered at repo or org level, and +# directly measures the symptom (jobs stuck in `queued`). +# 2. Runner list — best-effort. `GET /repos/{repo}/actions/runners` needs the +# `administration` permission, which is NOT a grantable `GITHUB_TOKEN` +# scope, so the default token gets 403 (and an org-level pool returns empty +# anyway). A failed/empty lookup is logged and SKIPPED rather than raised as +# a false alarm; wire a PAT into `GH_TOKEN` later if a hard runner count is +# wanted. The queued-age signal above is the real alarm. + +on: + schedule: + - cron: "*/15 * * * *" # every 15 minutes + workflow_dispatch: + +permissions: + actions: read # list workflow runs (queued-age check — the authoritative signal) + issues: write # open / update / close the tracking issue + # NB: listing self-hosted runners needs the `administration` scope, which is + # not grantable to GITHUB_TOKEN — that check is best-effort and self-skips. 
+
+concurrency:
+  group: runner-liveness
+  cancel-in-progress: false
+
+env:
+  QUEUE_THRESHOLD_MINUTES: "30"
+  TRACKING_LABEL: "runner-down"
+  TRACKING_TITLE: "🚨 CI runner pool liveness alert"
+
+jobs:
+  liveness:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Probe runner pool and queued runs
+        id: probe
+        env:
+          GH_TOKEN: ${{ github.token }}
+          REPO: ${{ github.repository }}
+        run: |
+          set -euo pipefail
+          problems=()
+
+          # --- Runner list (best-effort) -------------------------------------
+          if runners=$(gh api "repos/$REPO/actions/runners" 2>/dev/null); then
+            total=$(jq '.total_count // 0' <<<"$runners")  # absent count reads as 0, not "null"
+            online=$(jq '[.runners[]? | select(.status=="online")] | length' <<<"$runners")
+            echo "runners: total=$total online=$online"
+            if [ "${total:-0}" -gt 0 ] && [ "${online:-0}" -eq 0 ]; then
+              problems+=("All ${total} registered runner(s) are offline.")
+            fi
+          else
+            echo "note: could not list repo runners (org-level pool or missing administration:read) — skipping runner check"
+          fi
+
+          # --- Queued-run age (authoritative) --------------------------------
+          threshold="${QUEUE_THRESHOLD_MINUTES}"
+          now=$(date -u +%s)
+          # Fail closed: if this lookup errors, set -e aborts the step so an API failure is never reported as status=up.
+          queued=$(gh api "repos/$REPO/actions/runs?status=queued&per_page=100")
+          oldest_age=0
+          oldest_id=""
+          while read -r id created; do
+            [ -n "$id" ] || continue
+            age=$(( (now - $(date -u -d "$created" +%s)) / 60 ))  # minutes since the run was created
+            if [ "$age" -gt "$oldest_age" ]; then
+              oldest_age=$age
+              oldest_id=$id
+            fi
+          done < <(jq -r '.workflow_runs[]? | "\(.id) \(.created_at)"' <<<"$queued")
+          echo "oldest queued run: id=${oldest_id:-none} age=${oldest_age}m (threshold ${threshold}m)"
+          if [ "$oldest_age" -gt "$threshold" ]; then
+            problems+=("Run ${oldest_id} has been queued ${oldest_age}m (> ${threshold}m) — runners are not picking up jobs.")
+          fi
+
+          # --- Result --------------------------------------------------------
+          if [ ${#problems[@]} -gt 0 ]; then
+            {
+              echo "status=down"
+              echo "summary<<EOF"  # multi-line step output: name<<DELIM … DELIM
+              printf -- '- %s\n' "${problems[@]}"
+              echo "EOF"
+            } >> "$GITHUB_OUTPUT"
+          else
+            echo "status=up" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Open or update tracking issue
+        if: steps.probe.outputs.status == 'down'
+        env:
+          GH_TOKEN: ${{ github.token }}
+          REPO: ${{ github.repository }}
+          SUMMARY: ${{ steps.probe.outputs.summary }}
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+        run: |
+          set -euo pipefail
+          stamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+          body=$(cat <<EOF
+          The scheduled runner-liveness probe detected a problem at ${stamp}:
+
+          ${SUMMARY}
+
+          This issue auto-updates on each probe and auto-closes when the pool recovers.
+          Diagnose: \`gh api repos/${REPO}/actions/runners --jq '[.runners[] | select(.status=="online")] | length'\` (0 = no runner online; needs admin access to the repo).
+          Probe run: ${RUN_URL}
+          EOF
+          )
+          existing=$(gh issue list --repo "$REPO" --label "$TRACKING_LABEL" --state open --json number --jq '.[0].number // empty')
+          if [ -n "$existing" ]; then
+            gh issue comment "$existing" --repo "$REPO" --body "$body"
+            echo "updated tracking issue #$existing"
+          else
+            gh label create "$TRACKING_LABEL" --repo "$REPO" --color B60205 --description "CI runner pool liveness" 2>/dev/null || true
+            gh issue create --repo "$REPO" --title "$TRACKING_TITLE" --label "$TRACKING_LABEL" --body "$body"
+          fi
+
+      - name: Close tracking issue on recovery
+        if: steps.probe.outputs.status == 'up'
+        env:
+          GH_TOKEN: ${{ github.token }}
+          REPO: ${{ github.repository }}
+        run: |
+          set -euo pipefail
+          stamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+          for n in $(gh issue list --repo "$REPO" --label "$TRACKING_LABEL" --state open --json number --jq '.[].number'); do
+            gh issue comment "$n" --repo "$REPO" --body "✅ Runner pool healthy again as of ${stamp} (probe run ${{ github.run_id }}). Auto-closing."
+            gh issue close "$n" --repo "$REPO" --reason completed
+            echo "closed recovered tracking issue #$n"
+          done
+
+      - name: Fail the run when the pool is down
+        if: steps.probe.outputs.status == 'down'
+        run: |
+          echo "::error title=Runner pool down::Self-hosted runner pool liveness check failed — see the '$TRACKING_LABEL' tracking issue."
+          exit 1