#!/usr/bin/env bash
#
# Benchmark driver: writes an extra LLM-API YAML config, launches
# `trtllm-serve` under mpirun in the background, waits for it to come up,
# then runs the serving benchmark against it.

# === Required Env Vars ===
# MODEL
# PORT
# TP
# EP_SIZE
# DP_ATTENTION
# CONC
# ISL
# OSL
# MAX_MODEL_LEN
# RANDOM_RANGE_RATIO
# NUM_PROMPTS
# RESULT_FILENAME

#######################################
# Warn (without aborting) about env vars that are unset or empty.
# Arguments: names of the variables to check
# Outputs:   one warning line per missing variable on stderr
# Returns:   0 always
#######################################
warn_missing_env_vars() {
  local name
  for name in "$@"; do
    # ${!name} is bash indirect expansion: the value of the variable
    # whose name is stored in $name.
    if [[ -z "${!name:-}" ]]; then
      printf 'WARNING: required env var %s is unset or empty\n' "$name" >&2
    fi
  done
  return 0
}

# Kept non-fatal on purpose so existing invocations behave as before.
warn_missing_env_vars MODEL PORT TP EP_SIZE DP_ATTENTION CONC ISL OSL \
  MAX_MODEL_LEN RANDOM_RANGE_RATIO NUM_PROMPTS RESULT_FILENAME

# Server stdout/stderr is redirected into this file when the server is
# launched. Abort early if it cannot be created: an empty SERVER_LOG would
# make the later redirect fail.
SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) || {
  echo "ERROR: could not create server log file" >&2
  exit 1
}

# GPTOSS TRTLLM Deployment Guide:
# https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md

MOE_BACKEND="TRTLLM"
echo "MOE_BACKEND set to '$MOE_BACKEND'"

# Written relative to the current working directory and later passed to
# the server via --extra_llm_api_options.
EXTRA_CONFIG_FILE="gptoss-fp4.yml"

# Exported so the server process started later inherits them.
# NOTE(review): exact effect of these toggles is not visible here - see the
# deployment guide linked above.
export TRTLLM_ENABLE_PDL=1
export NCCL_GRAPH_REGISTER=0

#######################################
# (Re)generate the extra LLM-API options YAML.
# Globals:   EXTRA_CONFIG_FILE (read; file is overwritten)
#            CONC, DP_ATTENTION, MOE_BACKEND (read)
# Arguments: none
# Outputs:   writes the YAML document to $EXTRA_CONFIG_FILE
#######################################
write_extra_config() {
  # Unquoted EOF on purpose: $CONC, $DP_ATTENTION and $MOE_BACKEND must be
  # expanded. Nested keys are indented so they belong to their parent
  # mapping instead of becoming top-level keys.
  cat > "$EXTRA_CONFIG_FILE" << EOF
cuda_graph_config:
  enable_padding: true
  max_batch_size: $CONC
enable_attention_dp: $DP_ATTENTION
kv_cache_config:
  dtype: fp8
  enable_block_reuse: false
  free_gpu_memory_fraction: 0.85
print_iter_log: true
stream_interval: 20
num_postprocess_workers: 4
moe_config:
  backend: $MOE_BACKEND
EOF

  # Only the exact string "true" enables the attention-DP balance section.
  if [[ "$DP_ATTENTION" == "true" ]]; then
    cat >> "$EXTRA_CONFIG_FILE" << EOF
attention_dp_config:
  enable_balance: true
EOF
  fi
}

write_extra_config

echo "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"

# Trace every command from here on so the exact launch line is in the logs.
set -x

MAX_NUM_TOKENS=20000

# Launch TRT-LLM server
# Runs in the background; stdout and stderr both go to $SERVER_LOG.
# All expansions are quoted so empty values or values containing spaces
# cannot split into extra arguments or break the redirect.
# Note: --max_batch_size is fixed at 512 here, independent of $CONC.
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve "$MODEL" --port="$PORT" \
  --trust_remote_code \
  --backend=pytorch \
  --max_batch_size 512 \
  --max_seq_len="$MAX_MODEL_LEN" \
  --max_num_tokens="$MAX_NUM_TOKENS" \
  --tp_size="$TP" --ep_size="$EP_SIZE" \
  --extra_llm_api_options="$EXTRA_CONFIG_FILE" > "$SERVER_LOG" 2>&1 &

# PID of the background mpirun job started above.
SERVER_PID=$!

# Source benchmark utilities
# benchmark_lib.sh is expected to sit next to this script; presumably it
# defines wait_for_server_ready and run_benchmark_serving (not visible
# here). Stop immediately if it cannot be loaded, since every later step
# depends on it.
source "$(dirname "$0")/benchmark_lib.sh" || {
  echo "ERROR: failed to source benchmark_lib.sh" >&2
  exit 1
}

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# NOTE(review): presumably dependencies of the benchmark client - confirm.
pip install -q datasets pandas

# Drive the benchmark against the local server; results are written under
# /workspace/ using $RESULT_FILENAME.
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend openai \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$NUM_PROMPTS" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/