Skip to content

Commit 3c3d199

Browse files
authored
Add b200 DGXC node to b200 runners list (#245)
1 parent 93bfd04 commit 3c3d199

3 files changed

Lines changed: 68 additions & 1 deletion

File tree

.github/configs/runners.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ b200:
2424
- 'b200-nvd_1'
2525
- 'b200-nvd_2'
2626
- 'b200-nvd_3'
27+
- 'b200-dgxc_1'
28+
- 'b200-dgxc_2'
2729
mi300x:
2830
- 'mi300x-amd_0'
2931
- 'mi300x-amd_1'

runners/launch_b200-dgxc.sh

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#!/usr/bin/bash
2+
3+
HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/"
4+
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
5+
PORT=8888
6+
7+
# Derive the model's short name (the last path component of $MODEL)
8+
MODEL_NAME=$(basename "$MODEL")
9+
10+
server_name="bmk-server"
11+
12+
nvidia-smi
13+
14+
# GPUs must be idle: abort if nvidia-smi still lists any compute process
15+
if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then
16+
echo "[ERROR] GPU busy from previous run"; nvidia-smi; exit 1
17+
fi
18+
19+
set -x
20+
# Use the --init flag to run an init process (PID 1) inside the container for better signal handling and zombie-process cleanup
21+
# Ref: https://www.paolomainardi.com/posts/docker-run-init/
22+
23+
# NCCL_GRAPH_REGISTER tries to automatically enable user buffer registration with CUDA Graphs.
24+
# Disabling it can reduce perf but improves CI stability, i.e. we won't see vLLM/SGLang crashes.
25+
# Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register
26+
27+
if [[ "$MODEL" == "nvidia/DeepSeek-R1-0528-FP4" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then
28+
if [[ "$OSL" == "8192" ]]; then
29+
export NUM_PROMPTS=$(( CONC * 20 ))
30+
else
31+
export NUM_PROMPTS=$(( CONC * 50 ))
32+
fi
33+
else
34+
export NUM_PROMPTS=$(( CONC * 10 ))
35+
fi
36+
37+
docker run --rm --init --network host --name $server_name \
38+
--runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
39+
-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
40+
-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
41+
-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \
42+
-e NCCL_GRAPH_REGISTER=0 \
43+
-e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
44+
-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \
45+
--entrypoint=/bin/bash \
46+
$(echo "$IMAGE" | sed 's/#/\//') \
47+
benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh"
48+
49+
# Try a graceful stop first (allow up to 90 seconds before the container is killed)
50+
docker stop -t 90 "$server_name" || true
51+
# Wait until it's really dead
52+
docker wait "$server_name" >/dev/null 2>&1 || true
53+
# Force remove if anything lingers
54+
docker rm -f "$server_name" >/dev/null 2>&1 || true
55+
56+
# Give a moment for GPU processes to fully terminate
57+
sleep 2
58+
# Verify GPUs are now idle; if not, print diagnostics (the optional GPU reset below is left commented out)
59+
if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then
60+
echo "[WARN] After stop, GPU still busy:"; nvidia-smi
61+
# Last resort, if the driver allows it and the GPUs otherwise appear idle:
62+
#nvidia-smi --gpu-reset -i 0,1,2,3,4,5,6,7 2>/dev/null || true
63+
fi
64+
65+
nvidia-smi

runners/launch_b200-nvd.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ docker run --rm --init --network host --name $server_name \
3939
--runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
4040
-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
4141
-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
42-
-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \
42+
-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \
4343
-e NCCL_GRAPH_REGISTER=0 \
4444
-e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
4545
-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \

0 commit comments

Comments
 (0)