Skip to content

Commit 3a7d71e

Browse files
feat: add VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 as in single node example
Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com>
1 parent 603efa6 commit 3a7d71e

1 file changed

Lines changed: 1 addition & 1 deletion

File tree

benchmarks/multi_node/amd_utils/models_vllm.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ MiniMax-M2.5:
35 35
# Set token budget to 4k to avoid using that shape, instead of disabling AITER_MOE.
36 36
prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
37 37
decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
38-
env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600"
38+
env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600 VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1"
39 39
hf_dir: "models--MiniMaxAI--MiniMax-M2.5"
40 40

41 41
gpt-oss-120b:

0 commit comments

Comments
 (0)