#!/usr/bin/env bash

# Generates a TensorRT-LLM extra-options YAML file and launches `trtllm-serve`
# (PyTorch backend) under mpirun for a GPT-OSS FP4 run.

# === Required Env Vars ===
# HF_TOKEN
# HF_HUB_CACHE
# IMAGE
# MODEL
# ISL
# OSL
# MAX_MODEL_LEN
# RANDOM_RANGE_RATIO
# TP
# CONC
# RESULT_FILENAME
# PORT
#
# Only MODEL, PORT, MAX_MODEL_LEN, TP and CONC are referenced directly by the
# code below; the remaining variables are not read in this section.

# GPTOSS TRTLLM Deployment Guide:
# https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md

# Fail fast with a clear message instead of writing a config file with empty
# values or launching the server with empty arguments.
: "${MODEL:?MODEL must be set}"
: "${PORT:?PORT must be set}"
: "${MAX_MODEL_LEN:?MAX_MODEL_LEN must be set}"
: "${TP:?TP must be set}"
: "${CONC:?CONC must be set}"

# CONC is compared numerically below, so reject anything that is not a
# non-negative integer.
if ! [[ "$CONC" =~ ^[0-9]+$ ]]; then
  printf 'CONC must be a non-negative integer, got: %s\n' "$CONC" >&2
  exit 1
fi

# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on CONC =========
# Defaults (concurrency < 256): no expert parallelism, no DP attention.
EP_SIZE="1"
MOE_BACKEND="TRTLLM"
DP_ATTENTION=false

# Higher concurrencies (CONC >= 256): use DP attention with expert-parallel MoE,
# with the expert-parallel size equal to the tensor-parallel size.
# NOTE(review): the original comment here also stated "MoE Backend = CUTLASS",
# but MOE_BACKEND is never changed from TRTLLM. Behaviour is left as-is;
# confirm which backend is intended for this regime.
if [[ "$CONC" -ge 256 ]]; then
  EP_SIZE="$TP"
  DP_ATTENTION=true
fi

echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'"

EXTRA_CONFIG_FILE="gptoss-fp4.yml"
export TRTLLM_ENABLE_PDL=1
export NCCL_GRAPH_REGISTER=0

# Write the extra LLM API options. The heredoc delimiter is unquoted on purpose
# so that $CONC, $DP_ATTENTION and $MOE_BACKEND are expanded. Nested keys must
# stay indented under their parent section for the YAML to be valid.
cat > "$EXTRA_CONFIG_FILE" << EOF
cuda_graph_config:
    enable_padding: true
    max_batch_size: $CONC
enable_attention_dp: $DP_ATTENTION
kv_cache_config:
    dtype: fp8
    enable_block_reuse: false
    free_gpu_memory_fraction: 0.85
print_iter_log: true
stream_interval: 20
num_postprocess_workers: 4
moe_config:
    backend: $MOE_BACKEND
EOF

# The attention-DP section is only appended when DP attention is enabled.
if [[ "$DP_ATTENTION" == "true" ]]; then
  cat >> "$EXTRA_CONFIG_FILE" << EOF
attention_dp_config:
    enable_balance: true
EOF
fi

echo "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"

# Trace the launch command so the exact arguments appear in the log.
set -x

MAX_NUM_TOKENS=20000

# Launch TRT-LLM server
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve "$MODEL" --port="$PORT" \
  --trust_remote_code \
  --backend=pytorch \
  --max_batch_size 512 \
  --max_seq_len="$MAX_MODEL_LEN" \
  --max_num_tokens="$MAX_NUM_TOKENS" \
  --tp_size="$TP" --ep_size="$EP_SIZE" \
  --extra_llm_api_options="$EXTRA_CONFIG_FILE"