@@ -25,29 +25,6 @@ set -x
2525# Disabling it can reduce perf but will improve CI stability, i.e. we won't see vLLM/SGLang crashes.
2626# Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register
2727
28-
29- docker run --rm -d --init --network host --name $server_name \
30- --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
31- -v $HF_HUB_CACHE_MOUNT :$HF_HUB_CACHE \
32- -v $GITHUB_WORKSPACE :/workspace/ -w /workspace/ \
33- -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \
34- -e NCCL_GRAPH_REGISTER=0 \
35- -e TORCH_CUDA_ARCH_LIST=" 10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES=" 0,1,2,3,4,5,6,7" \
36- --entrypoint=/bin/bash \
37- $( echo " $IMAGE " | sed ' s/#/\//' ) \
38- benchmarks/" ${EXP_NAME%% _* } _${PRECISION} _b200${FRAMEWORK_SUFFIX} _docker.sh"
39-
40- set +x
41- while IFS= read -r line; do
42- printf ' %s\n' " $line "
43- if [[ " $line " =~ Application\ startup\ complete ]]; then
44- break
45- fi
46- done < <( docker logs -f --tail=0 $server_name 2>&1 )
47-
48- git clone https://github.com/kimbochen/bench_serving.git
49-
50-
5128if [[ " $MODEL " == " nvidia/DeepSeek-R1-0528-FP4" || " $MODEL " == " deepseek-ai/DeepSeek-R1-0528" ]]; then
5229 if [[ " $OSL " == " 8192" ]]; then
5330 NUM_PROMPTS=$(( CONC * 20 ))
5835 NUM_PROMPTS=$(( CONC * 10 ))
5936fi
6037
61- set -x
62- docker run --rm --network host --name $client_name \
38+ docker run --rm --init --network host --name $server_name \
39+ --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
40+ -v $HF_HUB_CACHE_MOUNT :$HF_HUB_CACHE \
6341-v $GITHUB_WORKSPACE :/workspace/ -w /workspace/ \
64- -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
42+ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \
43+ -e NCCL_GRAPH_REGISTER=0 \
44+ -e TORCH_CUDA_ARCH_LIST=" 10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES=" 0,1,2,3,4,5,6,7" \
45+ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \
6546--entrypoint=/bin/bash \
6647$( echo " $IMAGE " | sed ' s/#/\//' ) \
67- -lc " pip install -q datasets pandas && \
68- python3 bench_serving/benchmark_serving.py \
69- --model $MODEL --backend vllm --base-url http://localhost:$PORT \
70- --dataset-name random \
71- --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
72- --num-prompts $NUM_PROMPTS \
73- --max-concurrency $CONC \
74- --request-rate inf --ignore-eos \
75- --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
76- --result-dir /workspace/ --result-filename $RESULT_FILENAME .json"
48+ benchmarks/" ${EXP_NAME%% _* } _${PRECISION} _b200${FRAMEWORK_SUFFIX} _docker.sh"
7749
7850# Try graceful first
7951docker stop -t 90 " $server_name " || true
0 commit comments