diff --git a/.github/wiki b/.github/wiki
index 2a1fdf635..250cdc07e 160000
--- a/.github/wiki
+++ b/.github/wiki
@@ -1 +1 @@
-Subproject commit 2a1fdf63550711a077bf945b0c5d98ed7cc8b773
+Subproject commit 250cdc07e4dadfe5d310d18243532f6c641cb720
diff --git a/.github/workflows/retry-transient-failures.yml b/.github/workflows/retry-transient-failures.yml
new file mode 100644
index 000000000..a27ec859c
--- /dev/null
+++ b/.github/workflows/retry-transient-failures.yml
@@ -0,0 +1,171 @@
+# Watches completed runs of the workflows listed below and requests one rerun
+# of their failed jobs when every failed job log matches a transient
+# GitHub-side checkout or transport error signature.
+name: Retry Transient Workflow Failures
+
+on:
+  workflow_run:
+    workflows:
+      - Project Board Automation
+      - Changelog Automation
+      - Pull Request Label Sync
+      - Generate Reports and Deploy to GitHub Pages
+      - Rigorous Pull Request Review
+      - Run PHPUnit Tests
+      - Maintain Wiki
+      - Maintain Wiki Publication
+      - Update Wiki Preview
+      - Update Wiki
+    types:
+      - completed
+
+# `actions: write` lets the job token call the rerun-failed-jobs endpoint.
+permissions:
+  actions: write
+  contents: read
+
+# Group by the inspected run so retry decisions for one run never overlap.
+concurrency:
+  group: retry-transient-run-${{ github.event.workflow_run.id }}
+  cancel-in-progress: false
+
+jobs:
+  retry:
+    if: ${{ github.event.workflow_run.conclusion == 'failure' }}
+    name: Retry Failed Jobs When GitHub Infrastructure Looks Transient
+    runs-on: ubuntu-latest
+
+    steps:
+      - id: retry
+        uses: actions/github-script@v8
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            // Log signatures treated as transient GitHub-side failures.
+            const transientPatterns = [
+              /RPC failed; HTTP 5\d\d/i,
+              /expected flush after ref listing/i,
+              /expected 'packfile'/i,
+              /remote:\s+Internal Server Error/i,
+              /requested URL returned error:\s*5\d\d/i,
+              /fatal:\s+unable to access 'https:\/\/github\.com\/.*': The requested URL returned error:\s*5\d\d/i,
+            ];
+
+            // Read run metadata from the event payload rather than interpolating
+            // workflow expressions into the script source, so a workflow name
+            // containing a backtick cannot change the script.
+            const { owner, repo } = context.repo;
+            const {
+              id: runId,
+              run_attempt: runAttempt,
+              name: workflowName,
+            } = context.payload.workflow_run;
+            // The first run is attempt 1, so a limit of 2 permits exactly one rerun.
+            const maxRunAttempts = 2;
+
+            // One-line explanation appended to the summary for each final status.
+            const actionByStatus = {
+              'rerun-requested': '- Action: Requested a rerun of failed jobs because every failed job matched transient GitHub-side error signatures.',
+              'skipped-run-attempt-limit': '- Action: Skipped rerun because the run already reached the configured retry limit.',
+              'skipped-no-failed-jobs': '- Action: Skipped rerun because the workflow reported failure without failed jobs to inspect.',
+              'skipped-no-transient-match': '- Action: Skipped rerun because at least one failed job did not match the transient GitHub-side signatures.',
+            };
+
+            // Render the Markdown summary for a status and the given job name lists.
+            const buildSummary = ({ status, failedJobs = [], matchedJobs = [] }) => {
+              const formatJobs = (jobs) => jobs.map((job) => `\`${job}\``).join(', ');
+              const lines = [
+                '## Transient Failure Retry Summary',
+                '',
+                `- Workflow: \`${workflowName}\``,
+                `- Run ID: \`${runId}\``,
+                `- Run attempt: \`${runAttempt}\``,
+                `- Retry status: \`${status}\``,
+              ];
+
+              if (failedJobs.length > 0) {
+                lines.push(`- Failed jobs inspected: ${formatJobs(failedJobs)}`);
+              }
+
+              if (matchedJobs.length > 0) {
+                lines.push(`- Jobs with transient GitHub failure signatures: ${formatJobs(matchedJobs)}`);
+              }
+
+              if (Object.hasOwn(actionByStatus, status)) {
+                lines.push(actionByStatus[status]);
+              }
+
+              return lines.join('\n');
+            };
+
+            // Publish the decision as the `status` and `summary` step outputs.
+            const report = (status, details = {}) => {
+              core.setOutput('status', status);
+              core.setOutput('summary', buildSummary({ status, ...details }));
+            };
+
+            if (runAttempt >= maxRunAttempts) {
+              report('skipped-run-attempt-limit');
+
+              return;
+            }
+
+            // Paginate so runs with more than 100 jobs are inspected completely.
+            const jobs = await github.paginate(github.rest.actions.listJobsForWorkflowRun, {
+              owner,
+              repo,
+              run_id: runId,
+              per_page: 100,
+            });
+
+            const failedJobs = jobs.filter((job) => job.conclusion === 'failure');
+            const failedJobNames = failedJobs.map((job) => job.name);
+
+            if (failedJobs.length === 0) {
+              report('skipped-no-failed-jobs');
+
+              return;
+            }
+
+            const matchedJobs = [];
+
+            for (const job of failedJobs) {
+              let logText;
+
+              try {
+                // Download through the authenticated client: the token passed in
+                // `github-token` is not exported as process.env.GITHUB_TOKEN, so a
+                // hand-built Authorization header would carry `Bearer undefined`.
+                const { data } = await github.rest.actions.downloadJobLogsForWorkflowRun({
+                  owner,
+                  repo,
+                  job_id: job.id,
+                });
+
+                logText = typeof data === 'string' ? data : Buffer.from(data).toString('utf8');
+              } catch (error) {
+                throw new Error(`Failed to download logs for job ${job.name}: ${error.message}`, { cause: error });
+              }
+
+              // Every failed job must look transient; one genuine failure blocks the rerun.
+              if (!transientPatterns.some((pattern) => pattern.test(logText))) {
+                report('skipped-no-transient-match', { failedJobs: failedJobNames, matchedJobs });
+
+                return;
+              }
+
+              matchedJobs.push(job.name);
+            }
+
+            await github.request('POST /repos/{owner}/{repo}/actions/runs/{run_id}/rerun-failed-jobs', {
+              owner,
+              repo,
+              run_id: runId,
+            });
+
+            report('rerun-requested', { failedJobs: failedJobNames, matchedJobs });
+
+      - name: Write step summary
+        env:
+          RETRY_SUMMARY: ${{ steps.retry.outputs.summary }}
+        run: printf '%s\n' "$RETRY_SUMMARY" >> "$GITHUB_STEP_SUMMARY"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 10391011b..3d98e6ff9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
+- Retry failed GitHub Actions jobs once when failed workflow logs match transient GitHub-side checkout or transport errors (#175)
 - Teach the review and pull-request agent skills to treat workflow-managed wiki pointer updates as expected state and to prefer fresh follow-up issues plus PRs over reviving closed deleted branches (#147)
 - Require GitHub issue write 
readback verification in the github-issues skill (#165) - Standardize cache flags and nested cache-dir propagation across cache-aware commands (#162) diff --git a/README.md b/README.md index 580671da1..f6494e45d 100644 --- a/README.md +++ b/README.md @@ -201,7 +201,11 @@ and logged failures emit native workflow error annotations, including file and line metadata when commands provide it. The packaged tests, reports, wiki, and changelog workflows also append concise Markdown outcomes to `GITHUB_STEP_SUMMARY` so maintainers can scan versions, URLs, preview refs, -verification status, and release results without expanding full logs. +verification status, and release results without expanding full logs. This +repository also keeps a bounded retry workflow that reruns failed jobs once +when failed job logs match transient GitHub-side checkout or transport errors +such as HTTP 500 fetch failures, while leaving genuine logic and quality +failures untouched. When the packaged changelog workflow is synchronized into a consumer repository, pull requests are expected to add a notable changelog entry before diff --git a/docs/usage/github-actions.rst b/docs/usage/github-actions.rst index fd67f4519..19e946b6b 100644 --- a/docs/usage/github-actions.rst +++ b/docs/usage/github-actions.rst @@ -249,3 +249,31 @@ Maintenance Workflows * Promotes all ``Release Prepared`` work into ``Released`` when the release-preparation pull request is merged and the GitHub release is published. * Uses the built-in workflow token for project updates. * **Label Sync**: Synchronizes repository labels with ecosystem standards. + +Transient Failure Retry +----------------------- + +This repository also keeps a local ``retry-transient-failures.yml`` workflow +that watches completed workflow runs and decides whether a failed run looks +like a transient GitHub-side infrastructure problem rather than a logic bug in +the workflow itself. 
+ +**Behavior:** + +* Runs only after one of the repository's core workflows finishes with a + failure. +* Inspects failed job logs for transient GitHub-side signatures such as + checkout or fetch HTTP 5xx failures, Git transport RPC errors, and related + internal-server-error patterns. +* Requests a rerun of failed jobs only when every failed job matches those + transient signatures. +* Stops after one rerun attempt, so repeated failures still surface clearly + to maintainers. +* Appends a deterministic summary describing whether a rerun was requested or + skipped. + +**Non-goals:** + +* It does not retry PHPUnit failures, lint failures, changelog validation, + or other logic or quality-signal regressions. +* It does not introduce unbounded rerun loops.