Skip to content
6 changes: 5 additions & 1 deletion .github/workflows/benchmark-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ on:
conc-list:
type: string
default: '[4, 8, 16, 32, 64]'
mtp-mode:
type: string
default: 'off'

env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
Expand All @@ -51,6 +54,7 @@ env:
IMAGE: ${{ inputs.image }}
FRAMEWORK: ${{ inputs.framework }}
PRECISION: ${{ inputs.precision }}
MTP_MODE: ${{ inputs.mtp-mode }}

jobs:
benchmark:
Expand Down Expand Up @@ -97,7 +101,7 @@ jobs:
- name: Launch job script
env:
RUNNER_NAME: ${{ runner.name }}
RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ runner.name }}
RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_conc${{ env.CONC }}${{ env.MTP_MODE != 'off' && format('-mtp-{0}', runner.name) || format('_{0}', runner.name) }}
run: |
bash ./runners/launch_${RUNNER_NAME%%_*}.sh
if [ -f "$RESULT_FILENAME.json" ]; then
Expand Down
43 changes: 42 additions & 1 deletion .github/workflows/dsr1-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ jobs:
max-model-len: ${{ inputs.max-model-len }}
random-range-ratio: ${{ inputs.random-range-ratio }}
tp-list: '[8]'
conc-list: '[4, 8, 16, 32, 64, 128]'
mtp-mode: 'off'

bmk-mi300x-fp8:
if: ${{ inputs.use_mi300x }}
Expand Down Expand Up @@ -186,14 +188,53 @@ jobs:
image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2'
model: 'nvidia/DeepSeek-R1-0528-FP4-v2'
framework: 'trt'
precision: fp4
precision: 'fp4'
exp-name: ${{ inputs.exp-name }}
isl: ${{ inputs.isl }}
osl: ${{ inputs.osl }}
max-model-len: ${{ inputs.max-model-len }}
random-range-ratio: ${{ inputs.random-range-ratio }}
tp-list: '[4, 8]'
conc-list: '[4, 8, 16, 32, 64, 128, 256]' # DPA4EP4 is already at 30 tok/s/user and DPA8EP8 at 35 tok/s/user; conc 512 would be too much, so we skip it
mtp-mode: 'off'

# Job: FP4 benchmark of nvidia/DeepSeek-R1-0528-FP4-v2 with the 'trt' framework
# on the b200-trt runner, with mtp-mode set to 'on'.
# Skipped unless the `use_b200` workflow input is truthy. All benchmarking work
# is delegated to the reusable workflow benchmark-tmpl.yml; this job only
# supplies its inputs.
# NOTE(review): leading indentation is not visible in this view; the keys below
# `with:` are presumed to be nested under it — confirm against the real file.
bmk-b200-trt-fp4-mtp:
if: ${{ inputs.use_b200 }}
uses: ./.github/workflows/benchmark-tmpl.yml
# Pass the caller's secrets through to the reusable workflow.
secrets: inherit
with:
runner: b200-trt
image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2'
model: 'nvidia/DeepSeek-R1-0528-FP4-v2'
framework: 'trt'
precision: 'fp4'
# Workload shape is forwarded unchanged from this workflow's own inputs.
exp-name: ${{ inputs.exp-name }}
isl: ${{ inputs.isl }}
osl: ${{ inputs.osl }}
max-model-len: ${{ inputs.max-model-len }}
random-range-ratio: ${{ inputs.random-range-ratio }}
# JSON-encoded lists passed as strings (tensor-parallel sizes and concurrencies).
tp-list: '[4, 8]'
conc-list: '[4, 8, 16, 32, 64, 128, 256]'
# Quoted so YAML keeps the value a string instead of coercing it to a boolean.
mtp-mode: 'on'

# Job: FP8 benchmark of deepseek-ai/DeepSeek-R1-0528 with the 'trt' framework
# on the b200-trt runner, with mtp-mode set to 'on'.
# Skipped unless the `use_b200` workflow input is truthy. All benchmarking work
# is delegated to the reusable workflow benchmark-tmpl.yml; this job only
# supplies its inputs.
# NOTE(review): leading indentation is not visible in this view; the keys below
# `with:` are presumed to be nested under it — confirm against the real file.
bmk-b200-trt-fp8-mtp:
if: ${{ inputs.use_b200 }}
uses: ./.github/workflows/benchmark-tmpl.yml
# Pass the caller's secrets through to the reusable workflow.
secrets: inherit
with:
runner: b200-trt
image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2'
model: 'deepseek-ai/DeepSeek-R1-0528'
framework: 'trt'
precision: 'fp8'
# Workload shape is forwarded unchanged from this workflow's own inputs.
exp-name: ${{ inputs.exp-name }}
isl: ${{ inputs.isl }}
osl: ${{ inputs.osl }}
max-model-len: ${{ inputs.max-model-len }}
random-range-ratio: ${{ inputs.random-range-ratio }}
# JSON-encoded lists passed as strings (tensor-parallel sizes and concurrencies).
tp-list: '[8]'
conc-list: '[4, 8, 16, 32, 64, 128]'
# Quoted so YAML keeps the value a string instead of coercing it to a boolean.
mtp-mode: 'on'

bmk-mi355x-fp4:
if: ${{ inputs.use_mi355x }}
Expand Down
12 changes: 11 additions & 1 deletion .github/workflows/runner-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,15 @@ on:
- '70b_test'
- 'dsr1_test'
- 'gptoss_test'

mtp-mode:
description: 'MTP Mode'
required: true
type: choice
options:
- 'off'
- 'on'
default: 'off'

jobs:
runner-test:
Expand All @@ -117,7 +126,8 @@ jobs:
max-model-len: 2048
random-range-ratio: 0.8
tp-list: '[8]'
conc-list: '[1]'
conc-list: '[4,8,16,32,64]'
mtp-mode: ${{ inputs.mtp-mode }}

collect-test-results:
needs: runner-test
Expand Down
2 changes: 2 additions & 0 deletions benchmarks/dsr1_fp4_b200_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ else
fi
echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"

ps aux

set -x
PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
--tensor-parallel-size=$TP --data-parallel-size=1 \
Expand Down
166 changes: 166 additions & 0 deletions benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
#!/usr/bin/env bash
#
# Benchmark a model served by TensorRT-LLM (trtllm-serve, PyTorch backend) with
# MTP speculative decoding enabled.
#
# Steps:
#   1. Download the model with `hf download`.
#   2. Pick EP size, MoE backend, attention-DP and MTP depth from TP/ISL/OSL/CONC.
#   3. Write the extra LLM API options file and launch trtllm-serve in the background.
#   4. Stream the server log until startup completes (or fail fast on an error
#      line or on server exit).
#   5. Run bench_serving against the server and save $RESULT_FILENAME.json.

# === Required Env Vars ===
# HF_TOKEN
# HF_HUB_CACHE
# IMAGE
# MODEL
# ISL
# OSL
# MAX_MODEL_LEN
# RANDOM_RANGE_RATIO
# TP
# CONC
# RESULT_FILENAME
# PORT_OFFSET

# Fail early on the variables this script expands itself; an empty value would
# otherwise surface much later as a broken arithmetic expression or a bad CLI flag.
for required_var in MODEL ISL OSL MAX_MODEL_LEN RANDOM_RANGE_RATIO TP CONC RESULT_FILENAME PORT_OFFSET; do
    if [[ -z "${!required_var:-}" ]]; then
        echo "ERROR: required env var $required_var is not set" >&2
        exit 1
    fi
done

echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"

hf download "$MODEL"

# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
# Defaults: no expert parallelism, TRTLLM MoE backend, no attention DP, MTP depth 3.
# Above the per-workload concurrency thresholds below, expert parallelism is
# enabled first; at higher concurrency attention DP with the CUTLASS backend is
# used and the MTP depth drops to 1.
EP_SIZE="1"
MOE_BACKEND="TRTLLM"
DP_ATTENTION=false
MTP=3

if [[ "$TP" == "4" ]]; then
    if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
        if [[ $CONC -ge 16 ]]; then
            EP_SIZE="$TP"
        fi
        if [[ $CONC -ge 128 ]]; then
            DP_ATTENTION=true
            MOE_BACKEND="CUTLASS"
            MTP=1
        fi
    elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
        if [[ $CONC -ge 32 ]]; then
            EP_SIZE="$TP"
        fi
        if [[ $CONC -ge 128 ]]; then
            DP_ATTENTION=true
            MOE_BACKEND="CUTLASS"
            MTP=1
        fi
    elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
        if [[ $CONC -ge 32 ]]; then
            EP_SIZE="$TP"
            DP_ATTENTION=true
            MOE_BACKEND="CUTLASS"
            MTP=1
        fi
    fi
elif [[ "$TP" == "8" ]]; then
    if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
        if [[ $CONC -ge 16 ]]; then
            EP_SIZE="$TP"
        fi
        if [[ $CONC -ge 64 ]]; then
            DP_ATTENTION=true
            MOE_BACKEND="CUTLASS"
            MTP=1
        fi
    elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
        if [[ $CONC -ge 8 ]]; then
            EP_SIZE="$TP"
        fi
        if [[ $CONC -ge 128 ]]; then
            DP_ATTENTION=true
            MOE_BACKEND="CUTLASS"
            MTP=1
        fi
    elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
        if [[ $CONC -ge 32 ]]; then
            EP_SIZE="$TP"
            DP_ATTENTION=true
            MOE_BACKEND="CUTLASS"
            MTP=1
        fi
    fi
fi

echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION', MTP='$MTP'"

SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
PORT=$(( 8888 + PORT_OFFSET ))
EXTRA_CONFIG_FILE="dsr1-fp4.yml"

# Extra LLM API options for trtllm-serve. The heredoc delimiter is unquoted so
# that $DP_ATTENTION, $MOE_BACKEND and ${MTP} are expanded into the YAML.
# Nested keys must stay indented under their parent section.
cat > "$EXTRA_CONFIG_FILE" << EOF
cuda_graph_config:
    enable_padding: true
    max_batch_size: 512
enable_attention_dp: $DP_ATTENTION
print_iter_log: true
kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
stream_interval: 10
moe_config:
    backend: $MOE_BACKEND
speculative_config:
    decoding_type: MTP
    num_nextn_predict_layers: ${MTP}
EOF

if [[ "$DP_ATTENTION" == "true" ]]; then
    cat << EOF >> "$EXTRA_CONFIG_FILE"
attention_dp_config:
    batching_wait_iters: 0
    enable_balance: true
    timeout_iters: 60
EOF
fi

# With attention DP the concurrency is divided by TP to get the batch size.
if [[ "$DP_ATTENTION" == "true" ]]; then
    MAX_BATCH_SIZE=$(( CONC / TP ))
    # Integer division yields 0 when CONC < TP; never pass a zero batch size.
    if (( MAX_BATCH_SIZE < 1 )); then
        MAX_BATCH_SIZE=1
    fi
else
    MAX_BATCH_SIZE=$CONC
fi

# (MTP+1) tokens per sequence in the batch, plus ISL and 64 extra tokens,
# rounded up to a multiple of 64.
MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 ))

set -x
# Launch TRT-LLM server
mpirun -n 1 --oversubscribe --allow-run-as-root \
    trtllm-serve "$MODEL" --port="$PORT" \
    --trust_remote_code \
    --backend=pytorch \
    --max_batch_size="$MAX_BATCH_SIZE" \
    --max_seq_len="$MAX_MODEL_LEN" \
    --max_num_tokens="$MAX_NUM_TOKENS" \
    --tp_size="$TP" --ep_size="$EP_SIZE" \
    --extra_llm_api_options="$EXTRA_CONFIG_FILE" \
    > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!

set +x
# Stream the server log until startup completes.
#   - `-n +1` replays the log from its first line, so output written before
#     tail attaches is not missed (the log file was freshly created above).
#   - `--pid` makes tail exit once the server process dies, which ends the
#     loop instead of blocking forever on a server that crashed silently.
SERVER_READY=false
while IFS= read -r line; do
    printf '%s\n' "$line"
    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
        # Give the server a moment to flush the rest of the failure output.
        sleep 5
        tail -n100 "$SERVER_LOG"
        echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
        exit 1
    fi
    if [[ "$line" == *"Application startup complete"* ]]; then
        SERVER_READY=true
        break
    fi
done < <(tail -F --pid="$SERVER_PID" -n +1 "$SERVER_LOG")

if [[ "$SERVER_READY" != "true" ]]; then
    echo "ERROR: server process exited before startup completed" >&2
    tail -n100 "$SERVER_LOG"
    echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
    exit 1
fi

# Reuse an existing checkout so a re-run in the same directory does not fail.
if [[ ! -d bench_serving ]]; then
    git clone https://github.com/kimbochen/bench_serving.git
fi
set -x
python3 bench_serving/benchmark_serving.py \
    --model "$MODEL" --backend openai \
    --base-url "http://0.0.0.0:$PORT" \
    --dataset-name random \
    --random-input-len "$ISL" --random-output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts $(( CONC * 10 )) --max-concurrency "$CONC" \
    --request-rate inf --ignore-eos \
    --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
    --result-dir /workspace/ \
    --use-chat-template \
    --result-filename "$RESULT_FILENAME.json"
10 changes: 8 additions & 2 deletions benchmarks/dsr1_fp4_b200_trt_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,21 @@ attention_dp_config:
EOF
fi

set -x
if [[ "$DP_ATTENTION" == "true" ]]; then
MAX_BATCH_SIZE=$((CONC/TP))
else
MAX_BATCH_SIZE=$CONC
fi

MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 ))
MAX_NUM_TOKENS=$(( (MAX_BATCH_SIZE+ISL+64+63)/64*64 ))

set -x
# Launch TRT-LLM server
mpirun -n 1 --oversubscribe --allow-run-as-root \
trtllm-serve $MODEL --port=$PORT \
--trust_remote_code \
--backend=pytorch \
--max_batch_size=$MAX_BATCH_SIZE \
--max_seq_len=$MAX_MODEL_LEN \
--max_num_tokens=$MAX_NUM_TOKENS \
--tp_size=$TP --ep_size=$EP_SIZE \
Expand Down
1 change: 1 addition & 0 deletions benchmarks/dsr1_fp8_b200_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ else
SCHEDULER_RECV_INTERVAL=10
fi
echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"
ps aux

set -x
PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
Expand Down
Loading
Loading