Skip to content

Commit beb0c9a

Browse files
committed
add gptoss trt docker
1 parent 3194358 commit beb0c9a

1 file changed

Lines changed: 78 additions & 0 deletions

File tree

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env bash
#
# Launch a TensorRT-LLM server (trtllm-serve) following the GPT-OSS deployment
# guide linked below. The script:
#   1. picks EP size / attention-DP settings from the requested concurrency,
#   2. writes the extra LLM API options to a YAML file in the current directory,
#   3. starts trtllm-serve in the foreground under mpirun.

# === Required Env Vars ===
# HF_TOKEN
# HF_HUB_CACHE
# IMAGE
# MODEL
# ISL
# OSL
# MAX_MODEL_LEN
# RANDOM_RANGE_RATIO
# TP
# CONC
# RESULT_FILENAME
# PORT
#
# Only MODEL, MAX_MODEL_LEN, TP, CONC and PORT are read directly by this
# script; the remaining variables are presumably consumed by the surrounding
# harness — verify against the caller.

# GPTOSS TRTLLM Deployment Guide:
# https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md

set -euo pipefail

# Fail fast on missing inputs instead of launching the server with an empty
# model name, port or parallelism setting.
: "${MODEL:?MODEL must be set}"
: "${MAX_MODEL_LEN:?MAX_MODEL_LEN must be set}"
: "${TP:?TP must be set}"
: "${CONC:?CONC must be set}"
: "${PORT:?PORT must be set}"

# CONC is compared numerically below; a non-numeric value would otherwise be
# evaluated as an (unset) variable name and silently treated as 0.
if ! [[ "$CONC" =~ ^[0-9]+$ ]]; then
  printf 'CONC must be a non-negative integer, got: %s\n' "$CONC" >&2
  exit 1
fi

# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
EP_SIZE="1"
MOE_BACKEND="TRTLLM"
DP_ATTENTION=false

# Higher concurrencies (CONC >= 256): use DP attention with expert-parallel
# MoE, i.e. EP size equal to the TP size.
# NOTE(review): the original comment here also stated "MoE Backend = CUTLASS",
# but MOE_BACKEND is not changed in this branch and stays "TRTLLM" — confirm
# which backend is intended for high concurrency.
if [[ "$CONC" -ge 256 ]]; then
  EP_SIZE="$TP"
  DP_ATTENTION=true
fi

echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'"

EXTRA_CONFIG_FILE="gptoss-fp4.yml"
export TRTLLM_ENABLE_PDL=1
export NCCL_GRAPH_REGISTER=0

# Extra LLM API options. The nested keys must be indented under their parent
# section, otherwise they are parsed as (unknown) top-level options.
cat > "$EXTRA_CONFIG_FILE" << EOF
cuda_graph_config:
    enable_padding: true
    max_batch_size: $CONC
enable_attention_dp: $DP_ATTENTION
kv_cache_config:
    dtype: fp8
    enable_block_reuse: false
    free_gpu_memory_fraction: 0.85
print_iter_log: true
stream_interval: 20
num_postprocess_workers: 4
moe_config:
    backend: $MOE_BACKEND
EOF

# The attention-DP section is only appended when attention DP is enabled.
if [[ "$DP_ATTENTION" == "true" ]]; then
  cat >> "$EXTRA_CONFIG_FILE" << EOF
attention_dp_config:
    enable_balance: true
EOF
fi

echo "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"

# Trace the remaining commands so the exact server invocation is logged.
set -x

MAX_NUM_TOKENS=20000

# Launch TRT-LLM server
# NOTE(review): the server-level --max_batch_size is fixed at 512 while the
# CUDA graph max_batch_size above follows CONC — confirm this is intentional.
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve "$MODEL" --port="$PORT" \
  --trust_remote_code \
  --backend=pytorch \
  --max_batch_size 512 \
  --max_seq_len="$MAX_MODEL_LEN" \
  --max_num_tokens="$MAX_NUM_TOKENS" \
  --tp_size="$TP" --ep_size="$EP_SIZE" \
  --extra_llm_api_options="$EXTRA_CONFIG_FILE"

0 commit comments

Comments
 (0)