# RunPod Train Reconcile (Pulumi) #1217
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: RunPod Train Reconcile (Pulumi)

on:
  # Manual run; the stack to reconcile can be overridden per invocation.
  workflow_dispatch:
    inputs:
      stack:
        description: "Pulumi stack (org/stack)"
        required: true
        default: "dieg0code/train"
  schedule:
    # Every 30 minutes.
    - cron: "*/30 * * * *"

# All runs share one concurrency group so two reconciles never operate on the
# stack at the same time. A newer run waits for the in-flight one instead of
# cancelling it: this workflow runs `pulumi destroy` and `pulumi stack rm`,
# and interrupting either midway can leave the stack in an inconsistent state.
concurrency:
  group: runpod-train-reconcile
  cancel-in-progress: false
jobs:
  # Looks up the pod id recorded in the Pulumi stack, asks the RunPod REST API
  # for that pod's status, and destroys + removes the stack once the pod is in
  # a terminal state (or no longer exists). Otherwise it leaves everything
  # alone until the next run.
  reconcile:
    runs-on: ubuntu-latest
    timeout-minutes: 20
    permissions:
      contents: read
    env:
      PULUMI_SKIP_UPDATE_CHECK: "true"
      RUNPOD_API_TOKEN: ${{ secrets.RUNPOD_API_TOKEN }}
      PULUMI_ACCESS_TOKEN: ${{ secrets.PULUMI_ACCESS_TOKEN }}
      # Manual input wins, then the RUNPOD_TRAIN_STACK configuration variable,
      # then the hard-coded default.
      STACK_NAME: ${{ github.event.inputs.stack || vars.RUNPOD_TRAIN_STACK || 'dieg0code/train' }}
    steps:
      # Fail fast with a readable message instead of a confusing auth error
      # further down.
      - name: Validate required secrets
        run: |
          test -n "${RUNPOD_API_TOKEN}" || (echo "Missing RUNPOD_API_TOKEN" && exit 1)
          test -n "${PULUMI_ACCESS_TOKEN}" || (echo "Missing PULUMI_ACCESS_TOKEN" && exit 1)

      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Setup Pulumi CLI
        uses: pulumi/setup-pulumi@v2

      # jq is used by the "Query pod status" step to parse the API response.
      - name: Install jq
        run: sudo apt-get update && sudo apt-get install -y jq

      - name: Install stack dependencies
        working-directory: infra/runpod-train
        run: pip install -r requirements.txt

      - name: Login to Pulumi Cloud
        run: pulumi login

      # NOTE(review): `--create` makes an empty stack when none exists, so the
      # stack removed by "Remove stack after destroy" is recreated (empty) on
      # the next run. Confirm that is intended.
      - name: Select stack
        working-directory: infra/runpod-train
        run: pulumi stack select "${STACK_NAME}" --create

      - name: Ensure provider token exists
        working-directory: infra/runpod-train
        run: pulumi config set runpod:token "${RUNPOD_API_TOKEN}" --secret

      # Publishes the stack's `podId` output as `steps.pod.outputs.pod_id`.
      # Any failure of `pulumi stack output` (including a missing output) is
      # deliberately mapped to an empty id, which the later steps treat as
      # "nothing to reconcile".
      - name: Read pod id from stack
        id: pod
        working-directory: infra/runpod-train
        run: |
          POD_ID=$(pulumi stack output podId 2>/dev/null || true)
          echo "pod_id=${POD_ID}" >> "$GITHUB_OUTPUT"
          echo "pod_id=${POD_ID}"

      # Informational only: the remaining steps are skipped by their own `if:`
      # conditions when the pod id is empty.
      - name: Exit when no active pod is tracked
        if: steps.pod.outputs.pod_id == ''
        run: echo "No pod tracked in stack; nothing to reconcile."

      # Sets the outputs `desired`, `runtime` and `terminal` ('true'/'false').
      - name: Query pod status
        if: steps.pod.outputs.pod_id != ''
        id: status
        env:
          POD_ID: ${{ steps.pod.outputs.pod_id }}
        run: |
          # pipefail so a jq parse error is not hidden behind `tr`.
          set -euo pipefail

          body_file="$(mktemp)"
          trap 'rm -f "${body_file}"' EXIT

          # Capture the HTTP status separately from the body so API errors are
          # not mistaken for "pod still running". --max-time/--retry keep a
          # hung or flaky call from consuming the job timeout.
          http_code=$(curl -sS \
            --max-time 30 \
            --retry 3 \
            -o "${body_file}" \
            -w '%{http_code}' \
            -H "Authorization: Bearer ${RUNPOD_API_TOKEN}" \
            "https://rest.runpod.io/v1/pods/${POD_ID}")

          desired="UNKNOWN"
          runtime="UNKNOWN"
          terminal="false"

          case "${http_code}" in
            2??)
              desired=$(jq -r '.desiredStatus // "unknown"' "${body_file}" | tr '[:lower:]' '[:upper:]')
              runtime=$(jq -r '.runtime.status // "unknown"' "${body_file}" | tr '[:lower:]' '[:upper:]')
              case "${desired}" in
                STOPPED | TERMINATED | EXITED | FAILED | CANCELLED) terminal="true" ;;
              esac
              case "${runtime}" in
                EXITED | FAILED | TERMINATED | STOPPED) terminal="true" ;;
              esac
              ;;
            404)
              # Assumes RunPod answers 404 for a pod that no longer exists
              # (TODO confirm against the API reference). A vanished pod has
              # nothing left to wait for, so the stack can be cleaned up.
              desired="NOT_FOUND"
              runtime="NOT_FOUND"
              terminal="true"
              ;;
            *)
              # Auth failures, rate limits, server errors: status is unknown,
              # so fail visibly rather than report the pod as running.
              echo "::error::RunPod API returned HTTP ${http_code} for pod ${POD_ID}"
              exit 1
              ;;
          esac

          echo "desired=${desired}" >> "$GITHUB_OUTPUT"
          echo "runtime=${runtime}" >> "$GITHUB_OUTPUT"
          echo "terminal=${terminal}" >> "$GITHUB_OUTPUT"
          echo "desiredStatus=${desired} runtime.status=${runtime} terminal=${terminal}"

      - name: Destroy terminal pod
        if: >
          steps.pod.outputs.pod_id != '' &&
          steps.status.outputs.terminal == 'true'
        working-directory: infra/runpod-train
        run: pulumi destroy --yes --skip-preview

      - name: Remove stack after destroy
        if: >
          steps.pod.outputs.pod_id != '' &&
          steps.status.outputs.terminal == 'true'
        working-directory: infra/runpod-train
        run: pulumi stack rm "${STACK_NAME}" --yes

      - name: Keep running pod
        if: >
          steps.pod.outputs.pod_id != '' &&
          steps.status.outputs.terminal != 'true'
        run: |
          echo "Pod still running. Reconcile will check again on next run."