fixing b200

cquil11 · cquil11 · commit 970f27aa37b5 · 2025-11-13T18:00:48.000-06:00
diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh
@@ -43,7 +43,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \
 --model $MODEL  --backend vllm --base-url http://localhost:$PORT \
 --dataset-name random \
 --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
---num-prompts $(( $CONC * 10 )) \
+--num-prompts $NUM_PROMPTS \
 --max-concurrency $CONC \
 --request-rate inf --ignore-eos \
 --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh
@@ -54,7 +54,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \
 --model $MODEL  --backend vllm --base-url http://localhost:$PORT \
 --dataset-name random \
 --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
---num-prompts $(( $CONC * 10 )) \
+--num-prompts $NUM_PROMPTS \
 --max-concurrency $CONC \
 --request-rate inf --ignore-eos \
 --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh
@@ -69,7 +69,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \
 --model $MODEL  --backend vllm --base-url http://localhost:$PORT \
 --dataset-name random \
 --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
---num-prompts $(( $CONC * 10 )) \
+--num-prompts $NUM_PROMPTS \
 --max-concurrency $CONC \
 --request-rate inf --ignore-eos \
 --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh
@@ -25,29 +25,6 @@ set -x
 # Disabling it can reduce perf but will improve CI stability. i.e. we won't see vLLM/Sglang crashes.
 # Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register
 
-
-docker run --rm -d --init --network host --name $server_name \
---runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
--v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \
--e NCCL_GRAPH_REGISTER=0 \
--e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
---entrypoint=/bin/bash \
-$(echo "$IMAGE" | sed 's/#/\//') \
-benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh"
-
-set +x
-while IFS= read -r line; do
-    printf '%s\n' "$line"
-    if [[ "$line" =~ Application\ startup\ complete ]]; then
-        break
-    fi
-done < <(docker logs -f --tail=0 $server_name 2>&1)
-
-git clone https://github.com/kimbochen/bench_serving.git
-
-
 if [[ "$MODEL" == "nvidia/DeepSeek-R1-0528-FP4" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then
   if [[ "$OSL" == "8192" ]]; then
     NUM_PROMPTS=$(( CONC * 20 ))
@@ -58,22 +35,17 @@ else
   NUM_PROMPTS=$(( CONC * 10 ))
 fi
 
-set -x
-docker run --rm --network host --name $client_name \
+docker run --rm --init --network host --name $server_name \
+--runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
+-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
+-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \
+-e NCCL_GRAPH_REGISTER=0 \
+-e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
+-e -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \
 --entrypoint=/bin/bash \
 $(echo "$IMAGE" | sed 's/#/\//') \
--lc "pip install -q datasets pandas && \
-python3 bench_serving/benchmark_serving.py \
---model $MODEL  --backend vllm --base-url http://localhost:$PORT \
---dataset-name random \
---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
---num-prompts $NUM_PROMPTS \
---max-concurrency $CONC \
---request-rate inf --ignore-eos \
---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
---result-dir /workspace/ --result-filename $RESULT_FILENAME.json"
+benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh"
 
 # Try graceful first
 docker stop -t 90 "$server_name" || true