11export VLLM_USE_V1=1
2- export VLLM_USE_TRITON_FLASH_ATTN=0
2+ export VLLM_USE_TRITON_FLASH_ATTN=1 # use triton mha
33# export VLLM_LOGGING_LEVEL=DEBUG
44export VLLM_RPC_TIMEOUT=1800000
55export VLLM_ROCM_USE_AITER=1
66export VLLM_ROCM_USE_AITER_MHA=0
7- export VLLM_ROCM_USE_AITER_MLA=1
7+ export VLLM_ROCM_USE_AITER_MLA=0 # use triton mha
88export VLLM_ROCM_USE_AITER_MOE=1
99export VLLM_ROCM_USE_TRITON_ROPE=1 # add for acc
1010export VLLM_DISABLE_COMPILE_CACHE=1
1111# FIXME: fp4 asm gemm is disabled for now because of an issue when running it
1212export VLLM_ROCM_USE_AITER_FP4_ASM_GEMM=0
13- # export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0 # for now disable
13+ export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0 # disable for acc
1414
1515export TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1
1616export TRITON_HIP_USE_ASYNC_COPY=1
@@ -37,11 +37,12 @@ vllm serve $model_path \
3737 --trust-remote-code \
3838 --no-enable-prefix-caching \
3939 --disable-log-requests \
40- --compilation-config ' {"cudagraph_mode": "FULL_AND_PIECEWISE"} ' \
41- --gpu_memory_utilization 0.8 \
40+ --enforce-eager \
41+ --gpu_memory_utilization 0.7 \
4242 --async-scheduling \
43+ --block-size 16 \
4344 --load-format fastsafetensors \
4445 --seed 123 2>&1 | tee log.server.log &
4546
46- # --enforce-eager \
47+ # --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
4748 # --enable-expert-parallel \
0 commit comments