-
Notifications
You must be signed in to change notification settings - Fork 186
Expand file tree
/
Copy pathdsv4_fp4_b200_vllm.sh
More file actions
executable file
·120 lines (97 loc) · 3.15 KB
/
dsv4_fp4_b200_vllm.sh
File metadata and controls
executable file
·120 lines (97 loc) · 3.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env bash
# DeepSeek-V4-Pro B200 single-node vLLM recipe derived from the B200 pareto
# sweep. TP mode (dp-attn=false) runs without expert parallel; DP mode
# (dp-attn=true) enables expert parallel (EP_SIZE=TP value = DP size).
source "$(dirname "$0")/../benchmark_lib.sh"
check_env_vars \
MODEL \
TP \
DP_ATTENTION \
CONC \
ISL \
OSL \
MAX_MODEL_LEN \
RANDOM_RANGE_RATIO \
RESULT_FILENAME
if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi
nvidia-smi
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
# DeepSeek-V4-Pro weights are large; engine startup can exceed the default
# 600s. Give it an hour to load.
export VLLM_ENGINE_READY_TIMEOUT_S=3600
PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
if [ "${DP_ATTENTION}" = "true" ]; then
PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
fi
EP_ARGS=()
if [ "${EP_SIZE:-1}" -gt 1 ]; then
EP_ARGS=(--enable-expert-parallel)
fi
GMU_ARGS=()
MOE_ARGS=()
if [ "${DP_ATTENTION}" = "true" ]; then
MOE_ARGS=(--moe-backend deep_gemm_mega_moe)
fi
if [ "${ISL}" -eq 8192 ] && [ "${CONC}" -le 128 ]; then
MAX_NUM_BATCHED_TOKENS=${ISL}
else
MAX_NUM_BATCHED_TOKENS=2048
fi
MAX_CUDAGRAPH_CAPTURE_SIZE=2048
BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
if [ "${EVAL_ONLY}" = "true" ]; then
EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN")
export EVAL_MAX_MODEL_LEN
SERVE_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
else
SERVE_MAX_MODEL_LEN="$BENCHMARK_MAX_MODEL_LEN"
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
set -x
vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
--trust-remote-code \
--kv-cache-dtype fp8 \
--block-size 256 \
--no-enable-prefix-caching \
"${PARALLEL_ARGS[@]}" \
"${EP_ARGS[@]}" \
"${GMU_ARGS[@]}" \
"${MOE_ARGS[@]}" \
--compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
--attention_config.use_fp4_indexer_cache=True \
--tokenizer-mode deepseek_v4 \
--tool-call-parser deepseek_v4 \
--enable-auto-tool-choice \
--reasoning-parser deepseek_v4 \
--max-cudagraph-capture-size "$MAX_CUDAGRAPH_CAPTURE_SIZE" \
--max-model-len "$SERVE_MAX_MODEL_LEN" \
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
pip install -q datasets pandas
run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$((CONC * 10))" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/ \
--trust-remote-code
# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi
# Stop GPU monitoring
stop_gpu_monitor
set +x