@@ -33,13 +33,29 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-export AMDGCN_USE_BUFFER_OPS=0
+# --- AITER backend optimizations (env-var tuning) ---
 export VLLM_ROCM_USE_AITER=1
+export VLLM_USE_ROCM_AITER_MXFP4=1
+export VLLM_USE_ROCM_AITER_PAGED_ATTN=1
+export VLLM_ROCM_USE_AITER_LINEAR=1
 export VLLM_ROCM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_USE_AITER_FP4_ASM_GEMM=1
+export VLLM_ROCM_USE_AITER_TRITON_GEMM=0
+export VLLM_ROCM_MOE_PADDING=0
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+export AITER_BF16_FP8_BOUND=0
+export AITER_USE_OPUS_MOE_SORTING=1
+export AITER_USE_NT=0
+export AMDGCN_USE_BUFFER_OPS=1
+export CK_MXFP4_MOE_DIM_ALIGNMENT=64
+export GPU_MAX_HW_QUEUES=4
+
 ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
 FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"
 
+# --- Speculative decoding (06/02 — n-gram prompt lookup, lossless) ---
+SPEC_DECODE="--speculative-config {\"method\":\"ngram\",\"num_speculative_tokens\":3,\"prompt_lookup_min\":2,\"prompt_lookup_max\":64}"
+
 SERVER_LOG=/workspace/server.log
 
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -53,10 +69,13 @@ set -x
 vllm serve $MODEL --port $PORT \
     $ATTN_BACKEND $FUSE_ROPE_KVCACHE \
     --tensor-parallel-size=$TP \
-    --gpu-memory-utilization 0.95 \
+    --gpu-memory-utilization 0.97 \
     --max-model-len $MAX_MODEL_LEN \
+    --max-num-seqs 256 \
+    --max-num-batched-tokens 16384 \
     --block-size=64 \
-    --no-enable-prefix-caching > $SERVER_LOG 2>&1 &
+    --no-enable-prefix-caching \
+    $SPEC_DECODE > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
0 commit comments