Skip to content

Commit d2b331c

Browse files
committed
AMD - gpt-oss vllm mxfp4: AITER tuning + n-gram spec decode + server parameter tuning
1 parent 7d4063d commit d2b331c

2 files changed

Lines changed: 30 additions & 3 deletions

File tree

benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x.sh

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,29 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
3333
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
3434
fi
3535

36-
export AMDGCN_USE_BUFFER_OPS=0
36+
# --- ROCm / AITER backend optimizations (env-var tuning) ---
3737
export VLLM_ROCM_USE_AITER=1
38+
export VLLM_USE_ROCM_AITER_MXFP4=1
39+
export VLLM_USE_ROCM_AITER_PAGED_ATTN=1
40+
export VLLM_ROCM_USE_AITER_LINEAR=1
3841
export VLLM_ROCM_USE_AITER_TRITON_ROPE=1
42+
export VLLM_ROCM_USE_AITER_FP4_ASM_GEMM=1
43+
export VLLM_ROCM_USE_AITER_TRITON_GEMM=0
44+
export VLLM_ROCM_MOE_PADDING=0
3945
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
46+
export AITER_BF16_FP8_BOUND=0
47+
export AITER_USE_OPUS_MOE_SORTING=1
48+
export AITER_USE_NT=0
49+
export AMDGCN_USE_BUFFER_OPS=1
50+
export CK_MXFP4_MOE_DIM_ALIGNMENT=64
51+
export GPU_MAX_HW_QUEUES=4
52+
4053
ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
4154
FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"
4255

56+
# --- Speculative decoding (06/02 — n-gram prompt lookup, lossless) ---
57+
SPEC_DECODE="--speculative-config {\"method\":\"ngram\",\"num_speculative_tokens\":3,\"prompt_lookup_min\":2,\"prompt_lookup_max\":64}"
58+
4359
SERVER_LOG=/workspace/server.log
4460

4561
if [ "${EVAL_ONLY}" = "true" ]; then
@@ -53,10 +69,13 @@ set -x
5369
vllm serve $MODEL --port $PORT \
5470
$ATTN_BACKEND $FUSE_ROPE_KVCACHE \
5571
--tensor-parallel-size=$TP \
56-
--gpu-memory-utilization 0.95 \
72+
--gpu-memory-utilization 0.97 \
5773
--max-model-len $MAX_MODEL_LEN \
74+
--max-num-seqs 256 \
75+
--max-num-batched-tokens 16384 \
5876
--block-size=64 \
59-
--no-enable-prefix-caching > $SERVER_LOG 2>&1 &
77+
--no-enable-prefix-caching \
78+
$SPEC_DECODE > $SERVER_LOG 2>&1 &
6079

6180
SERVER_PID=$!
6281

perf-changelog.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3430,3 +3430,11 @@
34303430
- "Image: vllm/vllm-openai:v0.20.1"
34313431
- "Same 1k/1k and 8k/1k search space as gb300, plus a new tp8-1p1d at low concurrencies for both ISLs"
34323432
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1652
3433+
3434+
- config-keys:
3435+
- gptoss-fp4-mi355x-vllm
3436+
description:
3437+
- "Enable n-gram speculative decoding (prompt-lookup, num_speculative_tokens=3) for 3.26x decode throughput improvement"
3438+
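- "Add full AITER env-var tuning: MXFP4, FP4 ASM GEMM, paged attention, AITER linear, opus MoE sorting (Triton GEMM disabled, MoE padding off, AMDGCN_USE_BUFFER_OPS flipped 0 -> 1), on top of the existing unified attention backend and inductor graph partition flags"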
3439+
- "Set gpu-memory-utilization=0.97, max-num-seqs=256, max-num-batched-tokens=16384, GPU_MAX_HW_QUEUES=4"
3440+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1657

0 commit comments

Comments
 (0)