Skip to content

Commit d2f1254

Browse files
authored
add back correct launch script for new mi325x slurm cluster (#231)
1 parent c23a59f commit d2f1254

1 file changed

Lines changed: 20 additions & 52 deletions

File tree

runners/launch_mi325x-amd.sh

Lines changed: 20 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -1,56 +1,24 @@
1-
#!/usr/bin/bash
1+
#!/usr/bin/env bash
2 2

3-
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
3+
export HF_HUB_CACHE_MOUNT="/nfsdata/sa/hf_hub_cache-${USER: -1}/"
4+
export PORT_OFFSET=${USER: -1}
4 5

5-
HF_HUB_CACHE_MOUNT="/home/kimbosemianalysis/hf_hub_cache/"
6-
PORT=8888
7-
8-
network_name="bmk-net"
9-
server_name="bmk-server"
10-
client_name="bmk-client"
11-
12-
docker network create $network_name
6+
PARTITION="compute"
7+
SQUASH_FILE="/nfsdata/sa/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
13 8

14 9
set -x
15-
docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \
16-
--privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \
17-
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
18-
-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
19-
-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
20-
-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \
21-
-e ISL -e OSL \
22-
--entrypoint=/bin/bash \
23-
$IMAGE \
24-
benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi325x_docker.sh"
25-
26-
set +x
27-
while IFS= read -r line; do
28-
printf '%s\n' "$line"
29-
if [[ "$line" =~ Application\ startup\ complete ]]; then
30-
break
31-
fi
32-
done < <(docker logs -f --tail=0 $server_name 2>&1)
33-
34-
git clone https://github.com/kimbochen/bench_serving.git
35-
36-
set -x
37-
docker run --rm --network=$network_name --name=$client_name \
38-
-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
39-
-e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
40-
--entrypoint=python3 \
41-
$IMAGE \
42-
bench_serving/benchmark_serving.py \
43-
--model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \
44-
--dataset-name=random \
45-
--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \
46-
--num-prompts=$(( $CONC * 10 )) \
47-
--max-concurrency=$CONC \
48-
--request-rate=inf --ignore-eos \
49-
--save-result --percentile-metrics="ttft,tpot,itl,e2el" \
50-
--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json
51-
52-
while [ -n "$(docker ps -aq)" ]; do
53-
docker stop $server_name
54-
docker network rm $network_name
55-
sleep 5
56-
done
10+
salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell
11+
JOB_ID=$(squeue -u $USER -h -o %A | head -n1)
12+
13+
srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE"
14+
srun --jobid=$JOB_ID \
15+
--container-image=$SQUASH_FILE \
16+
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
17+
--container-mount-home \
18+
--container-writable \
19+
--container-remap-root \
20+
--container-workdir=/workspace/ \
21+
--no-container-entrypoint --export=ALL \
22+
bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh
23+
24+
scancel $JOB_ID

0 commit comments

Comments
 (0)