File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -7,14 +7,17 @@ SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
7 7 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
8 8 JOB_ID=$(squeue -u $USER -h -o %A | head -n1)
9 9
10+ SAGEMAKER_SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX)
11+
10 12 set -x
11 13 srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
12 14 srun --jobid=$JOB_ID \
13 15 --container-image=$SQUASH_FILE \
14- --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
16+ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$SAGEMAKER_SHM_PATH:/dev/shm/sagemaker_sessions \
15 17 --container-mount-home \
16 18 --container-workdir=/workspace/ \
17 19 --no-container-entrypoint --export=ALL,PORT=8888 \
18 20 bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_h100_slurm.sh
19 21
22+ rmdir $SAGEMAKER_SHM_PATH
20 23 scancel $JOB_ID
Original file line number Diff line number Diff line change @@ -9,6 +9,8 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
9 9 PARTITION="h200"
10 10 SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
11 11
12+ SAGEMAKER_SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX)
13+
12 14 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
13 15 JOB_ID=$(squeue -u $USER -h -o %A | head -n1)
14 16
25 27 # This seems to have been introduced in vLLM 0.11.2, but the issue is specific to CoreWeave runners.
26 28 srun --jobid=$JOB_ID \
27 29 --container-image=$CONTAINER_IMAGE \
28- --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
30+ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$SAGEMAKER_SHM_PATH:/dev/shm/sagemaker_sessions \
29 31 --container-mount-home \
30 32 --container-workdir=/workspace/ \
31 33 --no-container-entrypoint --export=ALL \
32- bash -c "bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh; rm -rf /dev/shm/sagemaker_sessions"
34+ bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh
33 35
36+ rmdir $SAGEMAKER_SHM_PATH
34 37 scancel $JOB_ID
You can’t perform that action at this time.
0 commit comments