Skip to content

Commit 1bc3973

Browse files
cquil11 and Oseltamivir
authored and committed
fix: H100+H200 CoreWeave fix (#308)
* add shm path mount
* add fix to h200 cw
* change path
* change path back
1 parent b3ad202 commit 1bc3973

2 files changed

Lines changed: 9 additions & 3 deletions

File tree

runners/launch_h100-cw.sh

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,14 +7,17 @@ SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
77
salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
88
JOB_ID=$(squeue -u $USER -h -o %A | head -n1)
99

10+
SAGEMAKER_SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX)
11+
1012
set -x
1113
srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
1214
srun --jobid=$JOB_ID \
1315
--container-image=$SQUASH_FILE \
14-
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
16+
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$SAGEMAKER_SHM_PATH:/dev/shm/sagemaker_sessions \
1517
--container-mount-home \
1618
--container-workdir=/workspace/ \
1719
--no-container-entrypoint --export=ALL,PORT=8888 \
1820
bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_h100_slurm.sh
1921

22+
rmdir $SAGEMAKER_SHM_PATH
2023
scancel $JOB_ID

runners/launch_h200-cw.sh

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,8 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
99
PARTITION="h200"
1010
SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
1111

12+
SAGEMAKER_SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX)
13+
1214
salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
1315
JOB_ID=$(squeue -u $USER -h -o %A | head -n1)
1416

@@ -25,10 +27,11 @@ fi
2527
# This seems to have been introduced in vLLM 0.11.2, but the issue is specific to CoreWeave runners.
2628
srun --jobid=$JOB_ID \
2729
--container-image=$CONTAINER_IMAGE \
28-
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
30+
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$SAGEMAKER_SHM_PATH:/dev/shm/sagemaker_sessions \
2931
--container-mount-home \
3032
--container-workdir=/workspace/ \
3133
--no-container-entrypoint --export=ALL \
32-
bash -c "bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh; rm -rf /dev/shm/sagemaker_sessions"
34+
bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh
3335

36+
rmdir $SAGEMAKER_SHM_PATH
3437
scancel $JOB_ID

0 commit comments

Comments
 (0)