File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -9,6 +9,8 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
# Slurm partition the allocation is requested from.
PARTITION="h200"
# Squash-file path derived from $IMAGE: every '/', ':', '@' and '#' in the
# image reference is replaced with '_' so it can serve as a single file name.
SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"

# Fresh, uniquely named host directory for this run. It is used further down as
# the host side of the container mount for /dev/shm/sagemaker_sessions.
# Abort before allocating anything if it cannot be created: an empty SHM_PATH
# would produce a malformed mount spec and a no-op cleanup.
SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX) || {
  echo "error: mktemp could not create a directory under /mnt/vast" >&2
  exit 1
}

# Reserve $TP GPUs exclusively for 180 minutes; --no-shell returns as soon as
# the allocation is granted instead of opening a shell inside it.
salloc --partition="$PARTITION" --gres="gpu:$TP" --exclusive --time=180 --no-shell
# NOTE(review): this takes the first job squeue lists for $USER, presumably the
# allocation made just above — confirm the user cannot have other jobs queued.
JOB_ID=$(squeue -u "$USER" -h -o %A | head -n1)
1416
# This seems to have been introduced in vLLM 0.11.2, but the issue is specific to CoreWeave runners.
# Run the benchmark script inside the container on the existing allocation.
# Mounts: the checkout at /workspace/, the HF hub cache, and the per-run
# $SHM_PATH directory over /dev/shm/sagemaker_sessions. The mount list is one
# comma-separated argument, so it must not contain spaces.
srun --jobid="$JOB_ID" \
  --container-image="$CONTAINER_IMAGE" \
  --container-mounts="$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$SHM_PATH:/dev/shm/sagemaker_sessions" \
  --container-mount-home \
  --container-workdir=/workspace/ \
  --no-container-entrypoint --export=ALL \
  bash "benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh"

# Remove the per-run directory together with anything the job left in it.
# rmdir would refuse a non-empty directory and leave it behind on shared
# storage. The guard keeps an unset/empty SHM_PATH from reaching rm and from
# aborting the script before the allocation is released.
if [[ -n "${SHM_PATH:-}" ]]; then
  rm -rf -- "$SHM_PATH"
fi
# Release the Slurm allocation.
scancel "$JOB_ID"
You can’t perform that action at this time.
0 commit comments