-
Notifications
You must be signed in to change notification settings - Fork 0
234 lines (206 loc) · 9.28 KB
/
train-runpod-start.yml
File metadata and controls
234 lines (206 loc) · 9.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
# Manually dispatched workflow that configures a Pulumi stack and brings up a
# RunPod training pod. Every input below is copied into the stack config or
# used by one of the pre-launch safety guards in the `start` job.
name: RunPod Train Start (Pulumi)

on:
  workflow_dispatch:
    inputs:
      stack:
        description: "Pulumi stack (org/stack)"
        type: string
        required: true
        default: "dieg0code/train"
      pod_name:
        description: "RunPod pod name"
        type: string
        required: true
        default: "ataxx-zero-train"
      gpu_type_id:
        description: "RunPod GPU type id"
        type: string
        required: true
        default: "NVIDIA GeForce RTX 4090"
      cloud_type:
        description: "RunPod cloud type"
        type: string
        required: true
        default: "SECURE"
      terminate_after:
        description: "Hard max runtime for cost safety (RunPod duration, e.g. 8h)"
        type: string
        required: true
        default: "8h"
      image_name:
        description: "Container image for pod"
        type: string
        required: true
        default: "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04"
      repository:
        description: "GitHub repository owner/name"
        type: string
        required: true
        default: "dieg0code/ataxx-zero"
      git_ref:
        description: "Git ref (sha or branch)"
        type: string
        required: true
        default: "main"
      hf_repo_id:
        description: "HF repository for checkpoints"
        type: string
        required: true
        default: "dieg0code/ataxx-zero"
      hf_run_id:
        description: "HF namespace for this model lineage"
        type: string
        required: true
        default: "policy_spatial_v1"
      # Safety override. Typed as boolean so the dispatch form shows a checkbox
      # instead of accepting arbitrary text; it still reaches shell steps as
      # the string "true" or "false" when interpolated into `env`.
      allow_when_any_pod_active:
        description: "Set true only if you intentionally allow other active RunPod pods in the account"
        type: boolean
        required: true
        default: false
      # Folded scalar (>-): the lines below are joined with single spaces into
      # one line with no trailing newline.
      train_args:
        description: "Arguments passed to train.py"
        type: string
        required: true
        default: >-
          --no-onnx --quiet --devices 1 --strategy auto --num-workers 4
          --keep-local-ckpts 2 --keep-log-versions 1 --hf
          --iterations 40 --episodes 70 --sims 600 --epochs 5
          --batch-size 512 --lr 9e-4 --weight-decay 1e-4 --save-every 3
          --opp-self 0.45 --opp-heuristic 0.50 --opp-random 0.05
          --opp-heu-easy 0.00 --opp-heu-normal 0.25 --opp-heu-hard 0.75
          --model-swap-prob 0.5 --selfplay-workers 8 --monitor-log-every 3
      # Safety override, typed as boolean for the same reason as
      # allow_when_any_pod_active.
      allow_replace_running:
        description: "Set true only if you intentionally want to replace an active tracked pod"
        type: boolean
        required: true
        default: false
# Serialize every start run regardless of the ref it was dispatched from.
# The pre-launch guards in this workflow check RunPod state and only then
# create a pod; two runs interleaving between check and create could both
# launch. A ref-scoped group would let dispatches from different branches run
# side by side, so the group name is deliberately constant.
# cancel-in-progress stays false so a run that is already provisioning is
# never interrupted half-way.
concurrency:
  group: runpod-train-start
  cancel-in-progress: false
jobs:
  # Single job: validate credentials, configure the Pulumi stack, run the
  # cost-safety guards, then create the pod.
  start:
    runs-on: ubuntu-latest
    # Upper bound for the provisioning job itself.
    timeout-minutes: 30
    # The job only needs to read the repository contents.
    permissions:
      contents: read
    env:
      # Credentials, exported to every step of the job.
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
      PULUMI_ACCESS_TOKEN: ${{ secrets.PULUMI_ACCESS_TOKEN }}
      RUNPOD_API_TOKEN: ${{ secrets.RUNPOD_API_TOKEN }}
      # Non-secret settings.
      PULUMI_SKIP_UPDATE_CHECK: 'true'
      STACK_NAME: ${{ inputs.stack }}
    steps:
- name: Validate required secrets
  # Stops the job at the first credential that is unset or empty, before any
  # tooling is installed.
  run: |
    for secret_name in RUNPOD_API_TOKEN HF_TOKEN PULUMI_ACCESS_TOKEN; do
      # ${!secret_name} is bash indirect expansion: the value of the variable
      # whose name is stored in secret_name.
      if [ -z "${!secret_name}" ]; then
        echo "Missing ${secret_name}"
        exit 1
      fi
    done
# Toolchain setup: repository contents, Python, the Pulumi CLI, jq, and the
# Python requirements of the Pulumi program.
- name: Checkout
  uses: actions/checkout@v4

- name: Setup Python
  uses: actions/setup-python@v5
  with:
    # Quoted so the version stays a string rather than a float.
    python-version: '3.11'

# NOTE(review): confirm pulumi/setup-pulumi is still the maintained way to
# install the CLI.
- name: Setup Pulumi CLI
  uses: pulumi/setup-pulumi@v2

# jq is used by the pod guard steps to parse RunPod API responses.
- name: Install jq
  run: |
    sudo apt-get update
    sudo apt-get install -y jq

- name: Install stack dependencies
  working-directory: infra/runpod-train
  run: pip install --requirement requirements.txt
- name: Verify HF token and repo access before spending GPU
  env:
    HF_REPO_ID_INPUT: ${{ inputs.hf_repo_id }}
  # Authenticates against the Hugging Face Hub and makes sure the model repo
  # exists, so a bad token or repo id fails here rather than on the paid pod.
  run: |
    python -m pip install --quiet --disable-pip-version-check huggingface_hub
    python - <<'PY'
    import os

    from huggingface_hub import HfApi

    # Read both values up front, then validate them in the same order.
    hf_token = os.environ["HF_TOKEN"].strip()
    target_repo = os.environ["HF_REPO_ID_INPUT"].strip()

    if not hf_token:
        raise SystemExit("HF_TOKEN is empty")
    if not target_repo:
        raise SystemExit("hf_repo_id input is empty")

    client = HfApi(token=hf_token)

    # whoami() exercises the token.
    identity = client.whoami()
    user_label = next(
        (identity.get(field) for field in ("name", "fullname") if identity.get(field)),
        "unknown",
    )
    print(f"HF auth ok for: {user_label}")

    # exist_ok=True makes this a no-op when the repo already exists; note that
    # it creates the model repo when it does not.
    client.create_repo(repo_id=target_repo, repo_type="model", exist_ok=True)
    print(f"HF repo access ok: {target_repo}")
    PY
# PULUMI_ACCESS_TOKEN comes from the job-level environment.
- name: Login to Pulumi Cloud
  run: |
    pulumi login

# --create makes the stack on first use instead of failing when it is missing.
- name: Select stack
  working-directory: infra/runpod-train
  run: |
    pulumi stack select --create "${STACK_NAME}"
- name: Set stack config
  working-directory: infra/runpod-train
  # Dispatch inputs are handed to the script through environment variables
  # rather than being expanded into the script text with ${{ }}. Expanding
  # them inline would let a quote, $(...) or backtick inside any input
  # (train_args is free-form) break the command line or run arbitrary
  # commands on the runner.
  env:
    INPUT_POD_NAME: ${{ inputs.pod_name }}
    INPUT_GPU_TYPE_ID: ${{ inputs.gpu_type_id }}
    INPUT_CLOUD_TYPE: ${{ inputs.cloud_type }}
    INPUT_TERMINATE_AFTER: ${{ inputs.terminate_after }}
    INPUT_IMAGE_NAME: ${{ inputs.image_name }}
    INPUT_REPOSITORY: ${{ inputs.repository }}
    INPUT_GIT_REF: ${{ inputs.git_ref }}
    INPUT_TRAIN_ARGS: ${{ inputs.train_args }}
    INPUT_HF_REPO_ID: ${{ inputs.hf_repo_id }}
    INPUT_HF_RUN_ID: ${{ inputs.hf_run_id }}
  run: |
    # Secrets are stored encrypted in the stack config.
    pulumi config set runpod:token "${RUNPOD_API_TOKEN}" --secret
    pulumi config set runpod-train:hfToken "${HF_TOKEN}" --secret

    # "--" ends option parsing so a value that begins with "-" (train_args
    # always does) is stored as a value instead of being read as a flag.
    pulumi config set runpod-train:podName -- "${INPUT_POD_NAME}"
    pulumi config set runpod-train:gpuTypeId -- "${INPUT_GPU_TYPE_ID}"
    pulumi config set runpod-train:cloudType -- "${INPUT_CLOUD_TYPE}"
    pulumi config set runpod-train:terminateAfter -- "${INPUT_TERMINATE_AFTER}"
    pulumi config set runpod-train:imageName -- "${INPUT_IMAGE_NAME}"
    pulumi config set runpod-train:repository -- "${INPUT_REPOSITORY}"
    pulumi config set runpod-train:gitRef -- "${INPUT_GIT_REF}"
    pulumi config set runpod-train:trainArgs -- "${INPUT_TRAIN_ARGS}"
    pulumi config set runpod-train:hfRepoId -- "${INPUT_HF_REPO_ID}"
    pulumi config set runpod-train:hfRunId -- "${INPUT_HF_RUN_ID}"
- name: Prevent launch when any RunPod pod is already active
  env:
    ALLOW_WHEN_ANY_POD_ACTIVE: ${{ inputs.allow_when_any_pod_active }}
  # Account-wide cost guard: lists every pod visible to RUNPOD_API_TOKEN and
  # refuses to continue while any of them is in a non-terminal state, unless
  # allow_when_any_pod_active is true. The guard fails closed: if the pod
  # list cannot be fetched or understood, the step fails instead of
  # reporting "no active pods".
  run: |
    # --fail turns HTTP error statuses (bad token, outage) into a non-zero
    # exit, so an error body is never parsed as an empty pod list.
    if ! response=$(curl --fail --silent --show-error \
        -H "Authorization: Bearer ${RUNPOD_API_TOKEN}" \
        "https://rest.runpod.io/v1/pods"); then
      echo "::error::RunPod pod list request failed; refusing to launch without a reliable active-pod check."
      exit 1
    fi
    if [ -z "${response}" ]; then
      echo "::error::RunPod pod list response was empty; refusing to launch without a reliable active-pod check."
      exit 1
    fi

    # pod_list accepts a top-level array or an object wrapping the array
    # under .pods, .data or .items. The array case is tested first because
    # indexing an array with a key is a jq error. Any other shape is an
    # error rather than an empty list.
    # text() replaces null or empty fields with a placeholder: the loop
    # below splits on tabs, and bash collapses adjacent tab separators, so
    # an empty field would shift every later column.
    if ! rows=$(jq -r '
      def pod_list:
        if type == "array" then .
        elif type == "object" then
          if (.pods | type) == "array" then .pods
          elif (.data | type) == "array" then .data
          elif (.items | type) == "array" then .items
          else error("unrecognized pod list shape") end
        else error("unrecognized pod list shape") end;
      def text($fallback):
        if . == null or . == "" then $fallback else tostring end;
      pod_list[] |
      [
        (.id | text("unknown-id")),
        ((.name // .podName) | text("unnamed")),
        ((.desiredStatus // .desired_status) | text("unknown") | ascii_upcase),
        ((.runtime.status // .runtimeStatus) | text("unknown") | ascii_upcase)
      ] | @tsv
    ' <<< "${response}"); then
      echo "::error::Could not interpret the RunPod pod list response; refusing to launch without a reliable active-pod check."
      exit 1
    fi

    active_count=0
    while IFS=$'\t' read -r pod_id pod_name desired runtime; do
      # An empty pod list still feeds one blank line through the here-string.
      [ -z "${pod_id}" ] && continue
      # A pod counts as inactive when either status is terminal.
      case "${desired}" in
        STOPPED|TERMINATED|EXITED|FAILED|CANCELLED) continue ;;
      esac
      case "${runtime}" in
        STOPPED|TERMINATED|EXITED|FAILED|CANCELLED) continue ;;
      esac
      active_count=$((active_count + 1))
      echo "Active pod detected: id=${pod_id} name=${pod_name} desired=${desired} runtime=${runtime}"
    done <<< "${rows}"

    if [ "${active_count}" -eq 0 ]; then
      echo "No active RunPod pods detected; safe to proceed."
      exit 0
    fi
    if [ "${ALLOW_WHEN_ANY_POD_ACTIVE}" = "true" ]; then
      echo "allow_when_any_pod_active=true, proceeding by explicit override."
      exit 0
    fi
    echo "::error::Detected ${active_count} active RunPod pod(s). Aborting to avoid duplicate billing. If intentional, rerun with allow_when_any_pod_active=true."
    exit 1
- name: Prevent duplicate spend from active tracked pod
  working-directory: infra/runpod-train
  env:
    ALLOW_REPLACE_RUNNING: ${{ inputs.allow_replace_running }}
  # Stack-level cost guard: if the Pulumi stack already tracks a pod id, look
  # that pod up in RunPod and refuse to continue while it is still active,
  # unless allow_replace_running is true.
  run: |
    # A missing output (new stack, or no pod created yet) is not an error.
    POD_ID=$(pulumi stack output podId 2>/dev/null || true)
    if [ -z "${POD_ID}" ]; then
      echo "No tracked pod in stack; safe to create."
      exit 0
    fi

    # Capture the HTTP status separately from the body so "pod not found"
    # can be told apart from "pod exists and is running".
    body_file=$(mktemp)
    trap 'rm -f "${body_file}"' EXIT
    http_code=$(curl --silent --show-error \
      --output "${body_file}" --write-out '%{http_code}' \
      -H "Authorization: Bearer ${RUNPOD_API_TOKEN}" \
      "https://rest.runpod.io/v1/pods/${POD_ID}" || true)

    # The tracked pod is gone from RunPod, so it cannot be billing any more.
    # Without this branch the error body would parse as "unknown" statuses
    # and the pod would be reported as still active on every later run.
    if [ "${http_code}" = "404" ]; then
      echo "Tracked pod ${POD_ID} no longer exists in RunPod; safe to create."
      exit 0
    fi

    if [ "${http_code}" = "200" ]; then
      desired=$(jq -r '.desiredStatus // "unknown"' "${body_file}" | tr '[:lower:]' '[:upper:]')
      runtime=$(jq -r '.runtime.status // "unknown"' "${body_file}" | tr '[:lower:]' '[:upper:]')
    else
      # Any other status: state is unknown, which is treated as active
      # below, so only the explicit override lets the run continue.
      echo "::warning::RunPod API returned HTTP ${http_code} for tracked pod ${POD_ID}; treating its state as unknown."
      desired="UNKNOWN"
      runtime="UNKNOWN"
    fi
    echo "Tracked pod ${POD_ID}: desired=${desired} runtime=${runtime}"

    # Same terminal-state list, applied to both fields, as the account-wide
    # guard step in this job, so the two guards agree on what "inactive"
    # means.
    terminal="false"
    case "${desired}" in
      STOPPED|TERMINATED|EXITED|FAILED|CANCELLED) terminal="true" ;;
    esac
    case "${runtime}" in
      STOPPED|TERMINATED|EXITED|FAILED|CANCELLED) terminal="true" ;;
    esac
    if [ "${terminal}" = "true" ]; then
      echo "Tracked pod is terminal; continuing."
      exit 0
    fi

    if [ "${ALLOW_REPLACE_RUNNING}" = "true" ]; then
      echo "allow_replace_running=true, continuing by explicit override."
      exit 0
    fi
    echo "::error::Tracked pod ${POD_ID} is still active. Re-run after it stops, or set allow_replace_running=true explicitly."
    exit 1
# Applies the stack non-interactively: --yes auto-approves and --skip-preview
# goes straight to the update.
- name: Create or refresh training pod
  working-directory: infra/runpod-train
  run: |
    pulumi up --skip-preview --yes

# Prints the podId stack output to the job log.
- name: Print pod id
  working-directory: infra/runpod-train
  run: |
    printf '%s\n' "Pod created:"
    pulumi stack output podId