#!/usr/bin/env bash
#
# Start a benchmark server container, wait until it logs that startup is
# complete, run the bench_serving client against it, then stop the server and
# check that the GPUs have been released.
#
# Required environment:
#   IMAGE               container image reference; the first '#' is rewritten to '/'
#   MODEL               model id, forwarded to the server and passed to the client
#   EXP_NAME, PRECISION select benchmarks/<EXP_NAME prefix>_<PRECISION>_b200[_trt]_docker.sh
#   CONC                max client concurrency; also scales the number of prompts
#   ISL, OSL            random input / output lengths passed to the client
#   RANDOM_RANGE_RATIO  passed to the client's --random-range-ratio
#   RESULT_FILENAME     client saves /workspace/<RESULT_FILENAME>.json in its container
#   GITHUB_WORKSPACE    host directory mounted at /workspace/ in both containers
#   HF_HUB_CACHE        in-container path where the host HF cache is mounted
# Optional environment:
#   FRAMEWORK           "trt" selects the *_trt_docker.sh server script
#   HF_TOKEN, TP, EP_SIZE, MAX_MODEL_LEN
#                       forwarded by name to the server container
#
# Exit status: 1 if a required variable is missing, the GPUs are busy at start,
# or the server never reports readiness; otherwise the client's exit status.

set -euo pipefail

for required_var in IMAGE MODEL EXP_NAME PRECISION CONC ISL OSL \
  RANDOM_RANGE_RATIO RESULT_FILENAME GITHUB_WORKSPACE HF_HUB_CACHE; do
  if [[ -z "${!required_var:-}" ]]; then
    printf '[ERROR] required environment variable %s is not set\n' \
      "$required_var" >&2
    exit 1
  fi
done

readonly HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/"
readonly PORT=8888
readonly server_name="bmk-server"
readonly client_name="bmk-client"

if [[ "${FRAMEWORK:-}" == "trt" ]]; then
  FRAMEWORK_SUFFIX="_trt"
else
  FRAMEWORK_SUFFIX=""
fi

# IMAGE uses '#' where the registry path needs '/'; rewrite the first one only.
# The backslash keeps '#' literal instead of anchoring the match at the start.
image="${IMAGE/\#//}"
server_script="benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh"

#######################################
# Report whether any compute process is listed on the GPUs.
# Returns: 0 if nvidia-smi lists at least one PID, non-zero otherwise
#          (including when the query itself fails).
#######################################
gpu_busy() {
  local pids
  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader) || return 1
  [[ "$pids" =~ [0-9] ]]
}

#######################################
# Stop and remove the server container. Every step is best-effort: the
# container is started with --rm, so it may already be gone by the time
# `docker wait` / `docker rm` run, and those errors are expected.
#######################################
stop_server() {
  # Try graceful first
  docker stop -t 90 "$server_name" || true
  # Wait until it's really dead
  docker wait "$server_name" >/dev/null 2>&1 || true
  # Force remove if anything lingers
  docker rm -f "$server_name" >/dev/null 2>&1 || true
}

nvidia-smi

# GPUs must be idle
if gpu_busy; then
  echo "[ERROR] GPU busy from previous run" >&2
  nvidia-smi || true
  exit 1
fi

# Use --init flag to run an init process (PID 1) inside container for better
# signal handling and zombie process cleanup.
# Ref: https://www.paolomainardi.com/posts/docker-run-init/
#
# NCCL_GRAPH_REGISTER tries to automatically enable user buffer registration
# with CUDA Graphs. Disabling it can reduce perf but will improve CI stability,
# i.e. we won't see vLLM/SGLang crashes.
# Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register
server_args=(
  --rm -d --init --network host --name "$server_name"
  --runtime nvidia --gpus all --ipc host --privileged
  --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864
  -v "${HF_HUB_CACHE_MOUNT}:${HF_HUB_CACHE}"
  -v "${GITHUB_WORKSPACE}:/workspace/" -w /workspace/
  # Bare names are forwarded from this script's environment without appearing
  # in the `set -x` trace (keeps HF_TOKEN out of the log).
  -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN
  -e ISL -e OSL -e "PORT=${PORT}" -e EP_SIZE
  -e NCCL_GRAPH_REGISTER=0
  -e TORCH_CUDA_ARCH_LIST=10.0 -e CUDA_DEVICE_ORDER=PCI_BUS_ID
  -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  --entrypoint=/bin/bash
)

set -x
docker run "${server_args[@]}" "$image" "$server_script"
set +x

# From here on the server exists; make sure it is torn down if we exit early.
trap stop_server EXIT

# Stream the server log until the readiness marker shows up. If the log stream
# ends first (container exited), the marker was never seen and we must not
# benchmark against a dead server.
server_ready=0
while IFS= read -r line; do
  printf '%s\n' "$line"
  if [[ "$line" == *"Application startup complete"* ]]; then
    server_ready=1
    break
  fi
done < <(docker logs -f --tail=0 "$server_name" 2>&1)

if [[ "$server_ready" != 1 ]]; then
  echo "[ERROR] server log ended before 'Application startup complete'" >&2
  exit 1
fi

# Reuse an existing checkout so a re-run in the same workspace does not fail.
if [[ ! -f bench_serving/benchmark_serving.py ]]; then
  git clone https://github.com/kimbochen/bench_serving.git
fi

# Number of prompts is a per-model multiple of the concurrency.
case "$MODEL" in
  nvidia/DeepSeek-R1-0528-FP4 | deepseek-ai/DeepSeek-R1-0528)
    if [[ "$OSL" == "8192" ]]; then
      prompts_per_conc=20
    else
      prompts_per_conc=50
    fi
    ;;
  *)
    prompts_per_conc=10
    ;;
esac
NUM_PROMPTS=$(( CONC * prompts_per_conc ))

# Build the client command as an array and shell-quote it, so values survive
# being embedded in the `bash -lc` string without word-splitting.
bench_cmd=(
  python3 bench_serving/benchmark_serving.py
  --model "$MODEL" --backend vllm --base-url "http://localhost:${PORT}"
  --dataset-name random
  --random-input-len "$ISL" --random-output-len "$OSL"
  --random-range-ratio "$RANDOM_RANGE_RATIO"
  --num-prompts "$NUM_PROMPTS"
  --max-concurrency "$CONC"
  --request-rate inf --ignore-eos
  --save-result --percentile-metrics ttft,tpot,itl,e2el
  --result-dir /workspace/ --result-filename "${RESULT_FILENAME}.json"
)
printf -v bench_cmd_str '%q ' "${bench_cmd[@]}"

client_args=(
  --rm --network host --name "$client_name"
  -v "${GITHUB_WORKSPACE}:/workspace/" -w /workspace/
  -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/
  --entrypoint=/bin/bash
)

# Capture the client's status instead of aborting, so teardown and the GPU
# diagnostics below always run; the status is reported at the very end.
client_rc=0
set -x
docker run "${client_args[@]}" "$image" \
  -lc "pip install -q datasets pandas && ${bench_cmd_str}" || client_rc=$?

stop_server
trap - EXIT

# Give a moment for GPU processes to fully terminate
sleep 2
# Verify GPUs are now idle; if not, print diag and (optionally) reset
if gpu_busy; then
  echo "[WARN] After stop, GPU still busy:"
  nvidia-smi || true
  # Last resort if driver allows and GPUs appear idle otherwise:
  #nvidia-smi --gpu-reset -i 0,1,2,3,4,5,6,7 2>/dev/null || true
fi

nvidia-smi

exit "$client_rc"