|
#!/usr/bin/env bash
#
# Launches a b200 benchmark run in Docker: checks that the GPUs are idle,
# derives NUM_PROMPTS, runs the benchmark script inside a container, then
# tears the container down and re-checks the GPUs.
#
# Environment read directly by this script: FRAMEWORK, MODEL, OSL, CONC,
# IMAGE, EXP_NAME, PRECISION, HF_HUB_CACHE, GITHUB_WORKSPACE. Further
# variables are forwarded to the container by name via `docker run -e`.

# Host directory that is bind-mounted into the container at $HF_HUB_CACHE.
HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/"

#######################################
# Prints the benchmark-script filename suffix for a framework.
# Arguments: $1 - framework name (may be empty)
# Outputs:   "_trt" when the framework is "trt", nothing otherwise
# Returns:   0
#######################################
framework_suffix() {
  if [[ "$1" == "trt" ]]; then
    printf '_trt'
  fi
}
FRAMEWORK_SUFFIX=$(framework_suffix "${FRAMEWORK:-}")

# Port handed to the container as PORT.
PORT=8888

# Last path component of MODEL (e.g. "org/name" -> "name").
# NOTE(review): MODEL_NAME is not referenced anywhere else in the visible
# script; confirm whether it is still needed.
MODEL_NAME=$(basename -- "$MODEL")

# Container name shared by `docker run --name` and the teardown commands.
server_name="bmk-server"
| 11 | + |
# Log the GPU state before the run so failures can be diagnosed later.
nvidia-smi

#######################################
# Reports whether any compute process is currently listed on a GPU.
# Returns: 0 if `nvidia-smi` prints at least one compute-app PID,
#          non-zero otherwise. The status is grep's, so a failing
#          `nvidia-smi` is also reported as "not busy".
#######################################
gpu_has_compute_procs() {
  nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'
}

# GPUs must be idle before a new benchmark starts; abort otherwise.
if gpu_has_compute_procs; then
  # Diagnostics go to stderr so they are not mixed into regular output.
  echo "[ERROR] GPU busy from previous run" >&2
  nvidia-smi >&2
  exit 1
fi
| 18 | + |
# Trace every command from here on.
set -x
# Notes on the `docker run` invocation further down:
#
# Use --init flag to run an init process (PID 1) inside container for better signal handling and zombie process cleanup
# Ref: https://www.paolomainardi.com/posts/docker-run-init/

# NCCL_GRAPH_REGISTER tries to automatically enable user buffer registration with CUDA Graphs.
# Disabling it can reduce perf but will improve CI stability, i.e. we won't see vLLM/SGLang crashes.
# Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register

#######################################
# Prints the number of prompts to send for a benchmark run.
# The count is CONC times a multiplier: 20 for the DeepSeek-R1-0528 models
# with OSL 8192, 50 for those models with any other OSL, 10 for all other
# models.
# Arguments: $1 - model identifier
#            $2 - output sequence length
#            $3 - concurrency; empty/unset is treated as 0
# Outputs:   the prompt count on stdout
# Returns:   0 on success, 1 if the concurrency is not a non-negative integer
#######################################
compute_num_prompts() {
  local model=$1 osl=$2 conc=${3:-0}
  local multiplier=10

  # Reject anything but digits: $(( )) would otherwise evaluate arbitrary
  # text as an arithmetic expression.
  if [[ ! "$conc" =~ ^[0-9]+$ ]]; then
    echo "[ERROR] CONC must be a non-negative integer, got '${conc}'" >&2
    return 1
  fi

  case "$model" in
    "nvidia/DeepSeek-R1-0528-FP4" | "deepseek-ai/DeepSeek-R1-0528")
      if [[ "$osl" == "8192" ]]; then
        multiplier=20
      else
        multiplier=50
      fi
      ;;
  esac

  # 10# forces base 10 so a zero-padded value such as "08" is not parsed as
  # (invalid) octal.
  printf '%s\n' "$(( 10#$conc * multiplier ))"
}

NUM_PROMPTS=$(compute_num_prompts "$MODEL" "$OSL" "$CONC") || exit 1
export NUM_PROMPTS
| 36 | + |
#######################################
# Runs the benchmark script inside the container, in the foreground.
# Globals (read): server_name, HF_HUB_CACHE_MOUNT, HF_HUB_CACHE,
#                 GITHUB_WORKSPACE, PORT, IMAGE, EXP_NAME, PRECISION,
#                 FRAMEWORK_SUFFIX
# Returns: the exit status of `docker run`
#######################################
run_benchmark_container() {
  # Replace the first '#' in IMAGE with '/' to obtain the image reference.
  # The bracket form keeps '#' from being read as the "anchor at start"
  # operator of ${var/pattern/replacement}.
  local image=${IMAGE/[#]//}

  # Script name: the part of EXP_NAME before its first '_', then precision,
  # the fixed "b200" tag and the optional framework suffix.
  local bench_script="benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh"

  # An array keeps every argument a single word even if a value contains
  # whitespace or glob characters.
  local -a docker_args=(
    --rm --init --network host --name "$server_name"
    --runtime nvidia --gpus all --ipc host --privileged
    --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864
    # Bind mounts: the host HF hub cache and the checked-out workspace.
    -v "$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE"
    -v "$GITHUB_WORKSPACE:/workspace/" -w /workspace/
    # `-e NAME` without a value forwards NAME from this script's environment.
    -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN
    -e ISL -e OSL -e "PORT=$PORT" -e EP_SIZE -e DP_ATTENTION
    -e NCCL_GRAPH_REGISTER=0
    -e TORCH_CUDA_ARCH_LIST=10.0 -e CUDA_DEVICE_ORDER=PCI_BUS_ID
    -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME
    -e RANDOM_RANGE_RATIO -e NUM_PROMPTS
    --entrypoint=/bin/bash
  )

  docker run "${docker_args[@]}" "$image" "$bench_script"
}

# NOTE(review): the exit status of `docker run` is not captured, so a failed
# benchmark does not by itself fail this script. Confirm whether that is
# intended.
run_benchmark_container
| 48 | + |
#######################################
# Best-effort teardown of a container; a failing step never aborts the script.
# Arguments: $1 - container name
#######################################
teardown_container() {
  local name=$1

  # Ask for a clean shutdown first, allowing up to 90 seconds.
  docker stop -t 90 "$name" || true
  # Block until the container has actually exited.
  docker wait "$name" >/dev/null 2>&1 || true
  # Remove whatever is left, forcibly if necessary.
  docker rm -f "$name" >/dev/null 2>&1 || true
}

teardown_container "$server_name"
| 55 | + |
#######################################
# Polls once per second until no compute process is listed on any GPU.
# Returns immediately when the GPUs are already idle, so the wait is only
# paid while processes are still terminating.
# Arguments: $1 - maximum number of seconds to wait (default 2)
# Returns:   0 once idle, 1 if a compute process is still listed after the
#            timeout
#######################################
wait_for_gpu_idle() {
  local timeout_s=${1:-2}
  local waited=0

  while nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; do
    if (( waited >= timeout_s )); then
      return 1
    fi
    sleep 1
    waited=$(( waited + 1 ))
  done
  return 0
}

# Give GPU processes a moment to fully terminate, then verify the GPUs are
# idle; if not, print diagnostics (a reset is left as a manual last resort).
if ! wait_for_gpu_idle 2; then
  echo "[WARN] After stop, GPU still busy:" >&2
  nvidia-smi >&2
  # Last resort if driver allows and GPUs appear idle otherwise:
  #nvidia-smi --gpu-reset -i 0,1,2,3,4,5,6,7 2>/dev/null || true
fi

# Log the final GPU state.
nvidia-smi
0 commit comments