Skip to content

Commit a397f9a

Browse files
Ankur-singh and cquil11
authored and committed
Add benchmark script for GPTOSS FP4 B200 TRT-LLM (#256)
* Add benchmark script for GPTOSS FP4 B200 TRT-LLM
* make changes to perf changelog

---------

Co-authored-by: Cameron Quilici <cjquilici@gmail.com>
1 parent 724c370 commit a397f9a

2 files changed

Lines changed: 95 additions & 0 deletions

File tree

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#!/usr/bin/env bash
#
# Launches a TRT-LLM server (trtllm-serve) for a GPT-OSS FP4 model and then
# runs the serving benchmark against it.

# === Required Env Vars ===
# MODEL               model given to trtllm-serve and to the benchmark (--model)
# PORT                --port for both the server and the benchmark
# TP                  server --tp_size
# EP_SIZE             server --ep_size
# DP_ATTENTION        written as enable_attention_dp in the generated config;
#                     "true" additionally appends an attention_dp_config section
# CONC                cuda_graph_config.max_batch_size and benchmark
#                     --max-concurrency
# ISL                 benchmark --input-len
# OSL                 benchmark --output-len
# MAX_MODEL_LEN       server --max_seq_len
# RANDOM_RANGE_RATIO  benchmark --random-range-ratio
# NUM_PROMPTS         benchmark --num-prompts
# RESULT_FILENAME     benchmark --result-filename

# Fail fast, before any config is written or the server is started: an unset
# variable would otherwise silently expand to an empty string in the generated
# YAML and on the trtllm-serve command line.
for required_var in MODEL PORT TP EP_SIZE DP_ATTENTION CONC ISL OSL \
  MAX_MODEL_LEN RANDOM_RANGE_RATIO NUM_PROMPTS RESULT_FILENAME; do
  if [[ -z "${!required_var:-}" ]]; then
    echo "Error: required environment variable '$required_var' is not set" >&2
    exit 1
  fi
done
unset required_var

# Unique file that receives the server's stdout and stderr.
SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) || {
  echo "Error: could not create the server log file" >&2
  exit 1
}
18+
19+
# GPTOSS TRTLLM Deployment Guide:
# https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md

MOE_BACKEND="TRTLLM"
echo "MOE_BACKEND set to '$MOE_BACKEND'"

# YAML file passed to trtllm-serve through --extra_llm_api_options.
EXTRA_CONFIG_FILE="gptoss-fp4.yml"
export TRTLLM_ENABLE_PDL=1
export NCCL_GRAPH_REGISTER=0

# The heredoc delimiter is unquoted on purpose so that $CONC, $DP_ATTENTION and
# $MOE_BACKEND are expanded into the file. Nested keys must stay indented:
# at column 0 they would become top-level options and leave their parent
# sections empty.
cat > "$EXTRA_CONFIG_FILE" << EOF
cuda_graph_config:
  enable_padding: true
  max_batch_size: $CONC
enable_attention_dp: $DP_ATTENTION
kv_cache_config:
  dtype: fp8
  enable_block_reuse: false
  free_gpu_memory_fraction: 0.85
print_iter_log: true
stream_interval: 20
num_postprocess_workers: 4
moe_config:
  backend: $MOE_BACKEND
EOF

# The attention-DP section is only appended when attention DP is enabled.
if [[ "$DP_ATTENTION" == "true" ]]; then
  cat << EOF >> "$EXTRA_CONFIG_FILE"
attention_dp_config:
  enable_balance: true
EOF
fi

# Print the final config so it is visible in the job output.
echo "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"
54+
55+
# Trace every command from here on so the exact launch and benchmark
# invocations appear in the job output.
set -x

MAX_NUM_TOKENS=20000

# Launch TRT-LLM server
# Runs in the background; stdout and stderr both go to $SERVER_LOG.
# All expansions are quoted so empty or whitespace-containing values cannot
# be dropped or split into extra arguments.
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve "$MODEL" --port="$PORT" \
  --trust_remote_code \
  --backend=pytorch \
  --max_batch_size 512 \
  --max_seq_len="$MAX_MODEL_LEN" \
  --max_num_tokens="$MAX_NUM_TOKENS" \
  --tp_size="$TP" --ep_size="$EP_SIZE" \
  --extra_llm_api_options="$EXTRA_CONFIG_FILE" > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Source benchmark utilities
source "$(dirname "$0")/benchmark_lib.sh"

# Without the helpers nothing below can work; stop the background server
# instead of leaving it running behind a failed job.
for helper in wait_for_server_ready run_benchmark_serving; do
  if ! command -v "$helper" > /dev/null; then
    echo "Error: '$helper' is not available after sourcing benchmark_lib.sh" >&2
    kill "$SERVER_PID" 2> /dev/null
    exit 1
  fi
done
unset helper

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# NOTE(review): presumably dependencies of the benchmark client - confirm.
pip install -q datasets pandas

run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend openai \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$NUM_PROMPTS" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/

perf-changelog.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,8 @@
8686
description: |
8787
- Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.6.post1
8888
PR: https://github.com/InferenceMAX/InferenceMAX/pull/330
89+
- config-keys:
90+
- gptoss-fp4-b200-trt
91+
description: |
92+
- Add benchmark script for GPTOSS FP4 B200 TRT-LLM
93+
PR: https://github.com/InferenceMAX/InferenceMAX/pull/256

0 commit comments

Comments
 (0)