Integration · Backup Restore #5
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # instant.dev/api — Weekly backup/restore integration test | |
| # | |
| # What this runs: | |
| # The `integration_backup`-tagged Go tests in api/e2e/ | |
| # (backup_restore_integration_test.go). Tests invoke | |
| # ../../infra/scripts/restore-drill.sh against the cluster pointed to | |
| # by KUBECONFIG_TEST_CLUSTER and assert RTO/RPO + cleanup + alert YAML. | |
| # | |
| # Cluster safety: | |
| # This workflow MUST NEVER run against the prod cluster. The drill | |
| # script itself enforces this on its end (refuses to run when the | |
| # context name is `do-nyc3-instant-prod`). The workflow uses a SEPARATE | |
| # secret, KUBECONFIG_TEST_CLUSTER, which the operator points at a | |
| # non-prod context. | |
| # | |
| # Why weekly: | |
| # The drill creates a throwaway namespace + pod, which holds slots | |
| # for ~2 minutes. Running on every PR would burn cluster capacity for | |
| # marginal extra signal. Weekly catches: | |
| # - the alert YAML / Prom rule has drifted from the published | |
| # 36h+60h thresholds | |
| # - the script's cleanup path is broken | |
| # - the actual RTO/RPO crosses the SLA | |
| # Manual trigger via workflow_dispatch for ad-hoc operator validation. | |
| # | |
| # Companion runbook: infra/BACKUP-RESTORE-RUNBOOK.md | |
name: Integration · Backup Restore

# The workflow token only needs read access to repository contents.
permissions:
  contents: read

# Every run shares a single concurrency group, and a drill that is already
# in progress is never cancelled in favour of a newer run.
concurrency:
  cancel-in-progress: false
  group: integration-backup

on:
  # Manual trigger for ad-hoc operator validation.
  workflow_dispatch: {}
  schedule:
    # Sundays at 04:00 UTC — 1h after the nightly backup CronJob windows,
    # so the most recent artifact is fresh and the RPO assertion is
    # exercised against a genuinely new backup.
    - cron: '0 4 * * 0'
jobs:
  # Restore drill: runs the `integration_backup`-tagged Go tests in api/e2e,
  # pointing them at the drill script from the sibling infra repository and
  # at the kubeconfig stored in the KUBECONFIG_TEST_CLUSTER secret.
  backup-restore-drill:
    name: Restore drill (test cluster)
    runs-on: ubuntu-latest
    # Job-level cap; the Go test step below carries its own 25m timeout.
    timeout-minutes: 30
    # Opt-in switch: the job is skipped unless the configuration variable
    # INTEGRATION_BACKUP_ENABLED is exactly the string 'true'.
    if: ${{ vars.INTEGRATION_BACKUP_ENABLED == 'true' }}
    steps:
      # Both repositories are checked out side by side under the workspace
      # (./api and ./infra) so the tests can reach the drill script.
      - name: Check out api
        uses: actions/checkout@v6
        with:
          path: api

      - name: Check out infra (sibling repo with restore-drill.sh)
        uses: actions/checkout@v6
        with:
          repository: ${{ github.repository_owner }}/infra
          path: infra
          # Falls back to the default workflow token when REPO_ACCESS_TOKEN
          # is not configured.
          token: ${{ secrets.REPO_ACCESS_TOKEN || secrets.GITHUB_TOKEN }}

      - name: Install kubectl
        uses: azure/setup-kubectl@v5

      - name: Set up Go
        uses: actions/setup-go@v6
        with:
          # Take the Go version from the api module rather than pinning it here.
          go-version-file: api/go.mod

      - name: Materialise drill kubeconfig
        env:
          KUBECONFIG_TEST_CLUSTER: ${{ secrets.KUBECONFIG_TEST_CLUSTER }}
        run: |
          # Fail closed: without the secret there is no known-safe cluster.
          if [ -z "$KUBECONFIG_TEST_CLUSTER" ]; then
            echo "::error::KUBECONFIG_TEST_CLUSTER secret is empty — refusing to run drill against unknown cluster"
            exit 1
          fi
          # The secret is base64-decoded into a runner-temp kubeconfig that
          # only the runner user can read.
          mkdir -p "$RUNNER_TEMP/kube"
          printf '%s' "$KUBECONFIG_TEST_CLUSTER" | base64 -d > "$RUNNER_TEMP/kube/config"
          chmod 0600 "$RUNNER_TEMP/kube/config"
          # Defensive: refuse to proceed if the kubeconfig context name
          # contains 'prod' — second backstop beyond the drill script's
          # own gate. The name is lowercased first so that 'PROD' or
          # 'Instant-Prod' cannot slip past a case-sensitive match;
          # '*prod*' also covers 'production'.
          ctx=$(KUBECONFIG="$RUNNER_TEMP/kube/config" kubectl config current-context)
          ctx_lower=$(printf '%s' "$ctx" | tr '[:upper:]' '[:lower:]')
          case "$ctx_lower" in
            *prod*)
              echo "::error::KUBECONFIG_TEST_CLUSTER context name is '$ctx' — looks like prod, refusing to run drill"
              exit 1
              ;;
          esac
          echo "Drill context: $ctx"

      - name: Run integration_backup tests
        env:
          # Kubeconfig written by the previous step.
          KUBECONFIG_DRILL: ${{ runner.temp }}/kube/config
          DRILL_SCRIPT_PATH: ${{ github.workspace }}/infra/scripts/restore-drill.sh
        working-directory: api
        run: |
          go test -tags integration_backup -v -timeout 25m ./e2e/...

      - name: Surface alert-config drift (non-cluster tests)
        # Runs regardless of the outcome of the earlier steps.
        if: always()
        env:
          DRILL_SCRIPT_PATH: ${{ github.workspace }}/infra/scripts/restore-drill.sh
        working-directory: api
        run: |
          # Re-run only the static-asset tests with no KUBECONFIG_DRILL —
          # these are pure-parse tests and run even when the cluster
          # arm above SKIPPed.
          go test -tags integration_backup -run 'TestBackupRestore_NRAlert|TestBackupRestore_PromRule' -v ./e2e/...