Skip to content

Commit a9be02a

Browse files
committed
add fix to h200 cw
1 parent a067573 commit a9be02a

1 file changed

Lines changed: 5 additions & 2 deletions

File tree

runners/launch_h200-cw.sh

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,8 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
9 9
PARTITION="h200"
10 10
SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
11 11

12 +
SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX)
13 +

12 14
salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
13 15
JOB_ID=$(squeue -u $USER -h -o %A | head -n1)
14 16

@@ -25,10 +27,11 @@ fi
25 27
# This seems to have been introduced in vLLM 0.11.2, but the issue is specific to CoreWeave runners.
26 28
srun --jobid=$JOB_ID \
27 29
--container-image=$CONTAINER_IMAGE \
28 -
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
30 +
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$SHM_PATH:/dev/shm/sagemaker_sessions \
29 31
--container-mount-home \
30 32
--container-workdir=/workspace/ \
31 33
--no-container-entrypoint --export=ALL \
32 -
bash -c "bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh; rm -rf /dev/shm/sagemaker_sessions"
34 +
bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh
33 35

36 +
rmdir $SHM_PATH
34 37
scancel $JOB_ID

0 commit comments

Comments
 (0)