-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwait_gpus.sh
More file actions
executable file
·35 lines (29 loc) · 1.06 KB
/
wait_gpus.sh
File metadata and controls
executable file
·35 lines (29 loc) · 1.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/bin/bash
# wait_gpus.sh — Wait until all expected GPUs are fully initialized.
# Usage: ./wait_gpus.sh [expected_count] [timeout_seconds]
# expected_count: number of GPUs to wait for (default: 4)
# timeout_seconds: max wait time in seconds (default: 120)
# Exit 0 when all GPUs ready, 1 on timeout.
set -e

#######################################
# Test whether a string is a non-negative decimal integer (digits only).
# Arguments: $1 - string to test
# Returns:   0 if $1 consists solely of one or more digits, 1 otherwise
#######################################
is_uint() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

# Positional arguments with their documented defaults.
EXPECTED_GPUS="${1:-4}"
TIMEOUT="${2:-120}"

# Both values are later used in (( ... )) arithmetic, where arbitrary text is
# evaluated as an expression (e.g. "abc" silently becomes 0). Reject anything
# that is not a plain non-negative integer with a usage error (exit status 2).
if ! is_uint "$EXPECTED_GPUS" || ! is_uint "$TIMEOUT"; then
  printf 'Usage: %s [expected_count] [timeout_seconds]\n' "${0##*/}" >&2
  printf 'Both arguments must be non-negative integers (got: %s, %s)\n' \
    "$EXPECTED_GPUS" "$TIMEOUT" >&2
  exit 2
fi

# Force base 10 so values with leading zeros (e.g. "08") are not parsed as octal.
EXPECTED_GPUS=$(( 10#$EXPECTED_GPUS ))
TIMEOUT=$(( 10#$TIMEOUT ))

# Seconds to sleep between polls.
INTERVAL=3
#######################################
# Print a message prefixed with the current local time.
# Arguments: $1 - message text (any further arguments are ignored)
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout
#######################################
log() {
  local message="$1"
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message"
}
# ---------------------------------------------------------------------------
# Poll `nvidia-smi -L` until at least EXPECTED_GPUS devices are listed or
# TIMEOUT seconds of wall-clock time have passed.
#   - exits 0 as soon as enough GPUs are listed
#   - exits 1 once the deadline has passed (after one last check at the deadline)
# ---------------------------------------------------------------------------
log "⏳ Waiting for ${EXPECTED_GPUS} GPU(s) to initialize (timeout: ${TIMEOUT}s)..."

# Measure elapsed time with bash's SECONDS counter rather than by summing the
# sleep intervals, so time spent inside nvidia-smi counts against the timeout.
START_SECONDS=$SECONDS
ELAPSED=0
COUNT=0 # defined up front so the timeout message is valid on every path

while :; do
  # grep -c prints the number of matching lines but exits 1 when that number
  # is 0; `|| true` keeps `set -e` from aborting the script. If nvidia-smi is
  # missing or fails, grep sees no input and the count is simply 0.
  COUNT=$(nvidia-smi -L 2>/dev/null | grep -c "^GPU" || true)
  ELAPSED=$(( SECONDS - START_SECONDS ))

  if (( COUNT >= EXPECTED_GPUS )); then
    log "✅ All ${EXPECTED_GPUS} GPU(s) ready (${COUNT} detected after ${ELAPSED}s)"
    # Informational listing only: a failure here must not turn the success
    # that was just reported into a non-zero exit under `set -e`.
    nvidia-smi -L || true
    exit 0
  fi

  # Deadline reached: the check above was the final one.
  if (( ELAPSED >= TIMEOUT )); then
    break
  fi

  log " ⏱ ${COUNT}/${EXPECTED_GPUS} GPUs detected... waiting (${ELAPSED}s / ${TIMEOUT}s)"

  # Never sleep past the deadline, so the last check happens at about TIMEOUT
  # seconds instead of the script giving up without looking again.
  REMAINING=$(( TIMEOUT - ELAPSED ))
  sleep "$(( REMAINING < INTERVAL ? REMAINING : INTERVAL ))"
done

log "❌ Timeout: only ${COUNT}/${EXPECTED_GPUS} GPU(s) available after ${TIMEOUT}s"
# Best-effort listing of whatever is visible, to help diagnose the shortfall.
nvidia-smi -L 2>/dev/null || true
exit 1