Watchdog: scan (cron) #142
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: "Watchdog: Stuck Jobs" | |
| run-name: "Watchdog: scan ${{ github.event_name == 'workflow_dispatch' && '(manual)' || '(cron)' }}" | |
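| # Periodic safety net for the impl pipeline. Catches failure modes that | |
| # the regular workflows miss. | |
| # | |
| # On open `implementation/*` PRs: | |
| # 1. PRs labeled `ai-review-failed` without `ai-review-rescued` | |
| # (impl-review-retry.yml normally handles these; watchdog covers | |
| # the case where that listener missed the labeled event). | |
| # 2. PRs with `ai-attempt-N` + `quality:*` but no decision label | |
| # (review handed off to repair, repair crashed, nothing else fired). | |
| # 3. PRs labeled `ai-approved` that were never merged | |
| # (impl-merge missed the labeled event). | |
| # 4. PRs labeled `ai-rejected` without any `ai-attempt-N` label | |
| # (the first repair was never started). | |
| # 5. PRs carrying no labels other than `watchdog:*` markers | |
| # (impl-review never started). | |
| # On open spec-ready issues: | |
| # 6. Issues with `generate:<lib>` or `impl:<lib>:failed` and no open PR | |
| # for that (spec, lib) pair. | |
| # | |
| # Case 1 is acted on as soon as it is seen; cases 2-6 only once the PR or | |
| # issue has been untouched for the staleness window. | |
| # | |
| # Retries are bounded per-cause via marker labels: | |
| # - `ai-review-rescued` (review failures, case 1) | |
| # - `watchdog:repair-rescued-<N>` (repair failures, one per attempt#; cases 2 and 4) | |
| # - `watchdog:merge-rescued` (stuck merges, case 3) | |
| # - `watchdog:review-bootstrap` (reviews that never started, case 5) | |
| # - `watchdog:retried-<lib>` (per-library generation failures, case 6) | |
| # When a marker is already present, the watchdog skips and emits a | |
| # warning so a human can pick the case up. | |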
| on: | |
| schedule: | |
| # Every six hours, at minute 0. | |
| - cron: '0 */6 * * *' | |
| workflow_dispatch: | |
| inputs: | |
| stale_hours: | |
| description: "Hours of inactivity before a job is considered stuck" | |
| required: false | |
| type: string | |
| default: '4' | |
| dry_run: | |
| description: "Log decisions without dispatching workflows" | |
| type: boolean | |
| default: false | |
| # pull-requests/issues write: label edits made by the scan step. | |
| # actions write: `gh workflow run` dispatches. | |
| permissions: | |
| contents: read | |
| pull-requests: write | |
| issues: write | |
| actions: write | |
| # One scan at a time; a newer run waits rather than cancelling the one in flight. | |
| concurrency: | |
| group: watchdog-stuck-jobs | |
| cancel-in-progress: false | |
| jobs: | |
| scan: | |
| runs-on: ubuntu-latest | |
| # Without an explicit limit, a hung `gh` call could hold the concurrency | |
| # group for the 360-minute default and delay the following scans. | |
| timeout-minutes: 30 | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 | |
| with: | |
| fetch-depth: 1 | |
| # The scan step authenticates through GH_TOKEN; git itself does not | |
| # need the token left behind in the local git config. | |
| persist-credentials: false | |
| - name: Scan and dispatch | |
| env: | |
| GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} | |
| GH_REPO: ${{ github.repository }} | |
| STALE_HOURS: ${{ inputs.stale_hours || '4' }} | |
| DRY_RUN: ${{ inputs.dry_run || 'false' }} | |
| run: | | |
| set -euo pipefail | |
| # STALE_HOURS comes from a free-form workflow_dispatch input. Require plain | |
| # digits before it reaches $(( )): arithmetic expansion evaluates its | |
| # operand as an expression, so any other text must never get that far. | |
| if ! [[ "$STALE_HOURS" =~ ^[0-9]+$ ]]; then | |
| echo "::error::stale_hours must be a non-negative integer (got '$STALE_HOURS')" | |
| exit 1 | |
| fi | |
| # 10# forces base 10 so zero-padded values such as "08" are not read as octal. | |
| STALE_SEC=$(( 10#$STALE_HOURS * 3600 )) | |
| NOW=$(date -u +%s) | |
| # dispatch <log-label> <workflow> [gh workflow run args...] | |
| # Runs `gh workflow run` with everything after the log label, or only | |
| # logs the command line when DRY_RUN is "true". | |
| dispatch() { | |
| local tag="$1" | |
| shift | |
| if [[ "$DRY_RUN" != "true" ]]; then | |
| echo "::notice::$tag → dispatching" | |
| gh workflow run "$@" | |
| return | |
| fi | |
| echo "::notice::[dry-run] $tag → gh workflow run $*" | |
| } | |
| # ensure_label <name> <color> <description> | |
| # Best-effort creation of a repository label; a failing `gh label create` | |
| # is ignored. In dry-run mode nothing is created and only a notice is | |
| # logged. Always returns 0. | |
| ensure_label() { | |
| if [[ "$DRY_RUN" != "true" ]]; then | |
| gh label create "$1" --color "$2" --description "$3" 2>/dev/null || true | |
| return 0 | |
| fi | |
| echo "::notice::[dry-run] would ensure label: $1" | |
| return 0 | |
| } | |
| # spec_id_for_issue <issue-number> | |
| # Prints the spec_id taken from the issue title prefix `[spec-id] ...` | |
| # (set by spec-create.yml when the spec lands): the text between the | |
| # leading `[` and the first `]`. Prints nothing when the title has no | |
| # such prefix or the lookup fails. Always returns 0. | |
| spec_id_for_issue() { | |
| local title | |
| local prefix_re='^\[([^]]+)' | |
| title=$(gh issue view "$1" --json title --jq '.title') || true | |
| if [[ "$title" =~ $prefix_re ]]; then | |
| printf '%s\n' "${BASH_REMATCH[1]}" | |
| fi | |
| return 0 | |
| } | |
| ##### A) Scan open implementation PRs ############################## | |
| # `gh pr list --search "head:..."` does not support prefix matching, | |
| # so list all open PRs and filter client-side by branch name. | |
| PRS_JSON=$(gh pr list --state open --limit 200 \ | |
| --json number,labels,headRefName,updatedAt \ | |
| --jq '[.[] | select(.headRefName | startswith("implementation/"))]') | |
| echo "Scanning $(jq 'length' <<<"$PRS_JSON") open implementation PR(s)" | |
| # has_label <name> | |
| # Succeeds when <name> is one of the space-separated tokens in $labels. | |
| # -F matches the name literally, so characters such as `.` in a label | |
| # are not interpreted as regex metacharacters. | |
| has_label() { | |
| grep -qF -- " $1 " <<<" $labels " | |
| } | |
| # mark_pr <marker> <description> [extra `gh pr edit` args...] | |
| # Ensures the marker label exists and attaches it to PR $num. | |
| # Returns 0 when the marker is in place (always in dry-run) and 1, after | |
| # a warning, when the edit failed. Callers dispatch only on success: the | |
| # marker is what bounds retries, so a rescue that could not be recorded | |
| # would otherwise be repeated on every scan. | |
| mark_pr() { | |
| local marker="$1" description="$2" | |
| shift 2 | |
| ensure_label "$marker" "5319e7" "$description" | |
| if [[ "$DRY_RUN" == "true" ]]; then | |
| return 0 | |
| fi | |
| if ! gh pr edit "$num" --add-label "$marker" "$@" 2>/dev/null; then | |
| echo "::warning::PR #$num: could not apply $marker — dispatch skipped so retries stay bounded" | |
| return 1 | |
| fi | |
| } | |
| while IFS= read -r row; do | |
| num=$(jq -r '.number' <<<"$row") | |
| branch=$(jq -r '.headRefName' <<<"$row") | |
| updated=$(jq -r '.updatedAt' <<<"$row") | |
| labels=$(jq -r '[.labels[].name] | join(" ")' <<<"$row") | |
| updated_sec=$(date -u -d "$updated" +%s) | |
| age=$(( NOW - updated_sec )) | |
| # Branch layout: implementation/<spec_id>/<library> | |
| spec_id=$(echo "$branch" | cut -d'/' -f2) | |
| library=$(echo "$branch" | cut -d'/' -f3) | |
| # Case 1: review failed, not yet rescued → re-dispatch review | |
| # (no staleness check: handled as soon as it is seen) | |
| if has_label "ai-review-failed"; then | |
| if has_label "ai-review-rescued"; then | |
| echo "::warning::PR #$num: ai-review-failed persists after rescue — needs manual attention" | |
| elif mark_pr "ai-review-rescued" "Review re-dispatched once after ai-review-failed" --remove-label "ai-review-failed"; then | |
| dispatch "PR #$num: review (failed)" impl-review.yml -f pr_number="$num" | |
| fi | |
| continue | |
| fi | |
| # Case 2: stalled repair handoff | |
| # has ai-attempt-N + quality:M, no ai-approved/ai-rejected, | |
| # PR untouched for stale_hours → re-dispatch impl-repair | |
| if grep -qE " ai-attempt-[0-9]+ " <<<" $labels " \ | |
| && grep -qE " quality:[0-9]+ " <<<" $labels " \ | |
| && ! grep -qE " (ai-approved|ai-rejected) " <<<" $labels " \ | |
| && (( age > STALE_SEC )); then | |
| # Highest attempt number present on the PR | |
| attempt=$(grep -oP "ai-attempt-\K[0-9]+" <<<" $labels " | sort -nr | head -1) | |
| marker="watchdog:repair-rescued-$attempt" | |
| if has_label "$marker"; then | |
| echo "::warning::PR #$num: repair attempt $attempt already rescued — needs manual attention" | |
| elif mark_pr "$marker" "Watchdog re-dispatched repair attempt $attempt"; then | |
| dispatch "PR #$num: repair (stalled, attempt=$attempt)" impl-repair.yml \ | |
| -f pr_number="$num" \ | |
| -f specification_id="$spec_id" \ | |
| -f library="$library" \ | |
| -f attempt="$attempt" | |
| fi | |
| continue | |
| fi | |
| # Case 3: ai-approved but never merged — impl-merge `pull_request: labeled` | |
| # event was lost. Re-dispatch impl-merge.yml manually. | |
| if has_label "ai-approved" && (( age > STALE_SEC )); then | |
| marker="watchdog:merge-rescued" | |
| if has_label "$marker"; then | |
| echo "::warning::PR #$num: ai-approved merge already rescued — needs manual attention" | |
| elif mark_pr "$marker" "Watchdog re-dispatched impl-merge once"; then | |
| dispatch "PR #$num: merge (ai-approved stuck)" impl-merge.yml \ | |
| -f pr_number="$num" | |
| fi | |
| continue | |
| fi | |
| # Case 4: ai-rejected but no ai-attempt-N label — impl-repair `labeled` | |
| # trigger inside impl-review.yml never fired. Re-dispatch attempt 1. | |
| if has_label "ai-rejected" \ | |
| && ! grep -qE " ai-attempt-[0-9]+ " <<<" $labels " \ | |
| && (( age > STALE_SEC )); then | |
| marker="watchdog:repair-rescued-1" | |
| if has_label "$marker"; then | |
| echo "::warning::PR #$num: ai-rejected initial-repair already rescued — needs manual attention" | |
| elif mark_pr "$marker" "Watchdog re-dispatched repair attempt 1"; then | |
| dispatch "PR #$num: repair (initial, ai-rejected, no attempt label)" impl-repair.yml \ | |
| -f pr_number="$num" \ | |
| -f specification_id="$spec_id" \ | |
| -f library="$library" \ | |
| -f attempt="1" | |
| fi | |
| continue | |
| fi | |
| # Case 5: implementation PR with only watchdog:* markers (or none) — | |
| # impl-review never started. Happens when impl-generate.yml's | |
| # repository_dispatch to impl-review didn't land. | |
| # | |
| # We treat "labels are exactly watchdog:* markers, no review/repair | |
| # labels" as the same situation as "no labels at all" — without | |
| # that, our own previously-added marker would hide the PR from | |
| # detection. To keep the one-shot semantics ("rescue once, then | |
| # warn"), we check the marker explicitly. | |
| non_marker_labels=$(echo " $labels " | tr ' ' '\n' | grep -v '^$' | grep -vE '^watchdog:' || true) | |
| if [[ -z "$non_marker_labels" ]] && (( age > STALE_SEC )); then | |
| marker="watchdog:review-bootstrap" | |
| if has_label "$marker"; then | |
| echo "::warning::PR #$num: review never started even after watchdog bootstrap — needs manual attention" | |
| elif mark_pr "$marker" "Watchdog bootstrapped initial review"; then | |
| dispatch "PR #$num: review (never started, no labels)" impl-review.yml \ | |
| -f pr_number="$num" | |
| fi | |
| continue | |
| fi | |
| done < <(jq -c '.[]' <<<"$PRS_JSON") | |
| ##### B) Scan spec-ready issues for stuck generation ############### | |
| ISS_JSON=$(gh issue list --state open --label spec-ready --limit 300 \ | |
| --json number,labels,updatedAt) | |
| echo "Scanning $(jq 'length' <<<"$ISS_JSON") spec-ready issue(s)" | |
| # retry_generation <lib> <trigger-label> <subject> <reason> | |
| # Re-dispatches bulk-generate.yml for library <lib> of issue $num, at most | |
| # once per (issue, lib): | |
| # <trigger-label> label that caused the call, used in the "skipping" warning | |
| # <subject> text used in the "already retried" warning | |
| # <reason> short cause shown in the dispatch notice | |
| # Reads and updates the per-issue loop state: num, labels, spec_id, | |
| # spec_id_resolved, handled_libs. | |
| retry_generation() { | |
| local lib="$1" trigger="$2" subject="$3" reason="$4" | |
| local marker="watchdog:retried-$lib" | |
| local open_pr | |
| # An issue can carry both generate:<lib> and impl:<lib>:failed. The label | |
| # snapshot in $labels predates any marker added during this pass, so | |
| # remember handled libraries to avoid dispatching the same one twice. | |
| case " $handled_libs " in | |
| *" $lib "*) return 0 ;; | |
| esac | |
| handled_libs="$handled_libs $lib" | |
| # -F: match the marker literally (library names may contain regex characters) | |
| if grep -qF -- " $marker " <<<" $labels "; then | |
| echo "::warning::Issue #$num: $subject already retried by watchdog — needs manual attention" | |
| return 0 | |
| fi | |
| # Resolve the spec_id lazily and only once per issue | |
| if ! $spec_id_resolved; then | |
| spec_id=$(spec_id_for_issue "$num"); spec_id_resolved=true | |
| fi | |
| if [ -z "$spec_id" ]; then | |
| echo "::warning::Issue #$num: cannot resolve spec_id from title; skipping $trigger" | |
| return 0 | |
| fi | |
| open_pr=$(gh pr list --state open \ | |
| --search "head:implementation/$spec_id/$lib" \ | |
| --json number --jq 'length') | |
| if [[ "$open_pr" != "0" ]]; then | |
| return 0 | |
| fi | |
| ensure_label "$marker" "5319e7" "Watchdog retried $lib once" | |
| # The marker is what bounds retries: if it cannot be attached, do not | |
| # dispatch, otherwise the same retry could fire on every scan. | |
| if [[ "$DRY_RUN" != "true" ]] && ! gh issue edit "$num" --add-label "$marker" 2>/dev/null; then | |
| echo "::warning::Issue #$num: could not apply $marker — dispatch skipped so retries stay bounded" | |
| return 0 | |
| fi | |
| dispatch "Issue #$num: generate ($reason) $lib" \ | |
| bulk-generate.yml \ | |
| -f specification_id="$spec_id" \ | |
| -f library="$lib" | |
| } | |
| while IFS= read -r row; do | |
| num=$(jq -r '.number' <<<"$row") | |
| updated=$(jq -r '.updatedAt' <<<"$row") | |
| labels=$(jq -r '[.labels[].name] | join(" ")' <<<"$row") | |
| updated_sec=$(date -u -d "$updated" +%s) | |
| age=$(( NOW - updated_sec )) | |
| (( age > STALE_SEC )) || continue | |
| spec_id="" | |
| spec_id_resolved=false | |
| handled_libs="" | |
| # Split into an array instead of relying on unquoted expansion, so a | |
| # label containing `*` or `?` is never glob-expanded against the workspace. | |
| read -r -a label_tokens <<<"$labels" | |
| for label in "${label_tokens[@]}"; do | |
| case "$label" in | |
| generate:*) | |
| lib="${label#generate:}" | |
| retry_generation "$lib" "generate:$lib" "generate:$lib" "stuck pending" | |
| ;; | |
| impl:*:failed) | |
| lib="${label#impl:}" | |
| lib="${lib%:failed}" | |
| retry_generation "$lib" "impl:$lib:failed" "$lib" "failed" | |
| ;; | |
| esac | |
| done | |
| done < <(jq -c '.[]' <<<"$ISS_JSON") | |
| echo "::notice::Watchdog scan complete" | |