MAD/scripts/vllm/configs/default.yaml at develop · ROCm/MAD · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# All entries: input length 1024 (inp), output length 1024 (out),
# max_concurrency swept over 1, 8, 32 and 128.

## DeepSeek-R1-0528: the deepseek-ai checkpoint and the amd MXFP4-Preview one.
## `model` is a folded scalar: it parses to a single string with the two
## names separated by one space and no trailing newline.
- benchmark: serving
  model: >-
    deepseek-ai/DeepSeek-R1-0528
    amd/DeepSeek-R1-0528-MXFP4-Preview
  tp: 8
  inp: 1024
  out: 1024
  dtype: auto
  # One plain string of space-separated values, not a YAML list.
  max_concurrency: 1 8 32 128
  env:
    VLLM_ROCM_USE_AITER: 1

## gpt-oss-120b. The w4a8 variant (presumably amd/gpt-oss120b-w-mxfp4-a-fp8)
## is gfx950 only.
## NOTE(review): nothing in this entry restricts it to gfx950 - confirm the
## restriction is enforced by whatever consumes this file.
- benchmark: serving
  # Folded scalar: parses to both names joined by a single space.
  model: >-
    openai/gpt-oss-120b
    amd/gpt-oss120b-w-mxfp4-a-fp8
  tp: 8
  inp: 1024
  out: 1024
  dtype: auto
  # One plain string of space-separated values, not a YAML list.
  max_concurrency: 1 8 32 128
  env:
    VLLM_ROCM_USE_AITER: 1
  bench_args:
    # Spelling of the value kept exactly as written.
    --lmeval_apply_chat_template: True

## For Llama, Mixtral, etc. on gfx942, use float16 for better GEMM performance
## (see the arch_overrides section at the end of the entry).
- benchmark: serving
  # Folded scalar: parses to the four names joined by single spaces.
  model: >-
    meta-llama/Llama-3.1-405B-Instruct
    amd/Llama-3.1-405B-Instruct-FP8-KV
    meta-llama/Llama-3.3-70B-Instruct
    amd/Llama-3.3-70B-Instruct-FP8-KV
  tp: 8
  inp: 1024
  out: 1024
  # Default dtype; replaced by the gfx942 override below.
  dtype: auto
  # One plain string of space-separated values, not a YAML list.
  max_concurrency: 1 8 32 128
  env:
    VLLM_ROCM_USE_AITER: 1
  arch_overrides:
    gfx942:
      dtype: float16

## Llama 3.1 405B / 3.3 70B MXFP4-Preview checkpoints (gfx950 only).
## NOTE(review): nothing in this entry restricts it to gfx950 - confirm the
## restriction is enforced by whatever consumes this file.
- benchmark: serving
  # Folded scalar: parses to both names joined by a single space.
  model: >-
    amd/Llama-3.1-405B-Instruct-MXFP4-Preview
    amd/Llama-3.3-70B-Instruct-MXFP4-Preview
  tp: 8
  inp: 1024
  out: 1024
  dtype: auto
  # One plain string of space-separated values, not a YAML list.
  max_concurrency: 1 8 32 128
  env:
    VLLM_ROCM_USE_AITER: 1

## Llama 4 Maverick (FP8 checkpoint); dtype is overridden to float16 on gfx942.
- benchmark: serving
  # Single model, so the value fits on the key's own line.
  model: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
  tp: 8
  inp: 1024
  out: 1024
  # Default dtype; replaced by the gfx942 override below.
  dtype: auto
  # One plain string of space-separated values, not a YAML list.
  max_concurrency: 1 8 32 128
  env:
    VLLM_ROCM_USE_AITER: 1
  arch_overrides:
    gfx942:
      dtype: float16

## Mixtral 8x22B, mistralai and amd FP8-KV checkpoints; dtype is overridden to
## float16 on gfx942.
- benchmark: serving
  # Folded scalar: parses to both names joined by a single space.
  model: >-
    mistralai/Mixtral-8x22B-Instruct-v0.1
    amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
  tp: 8
  inp: 1024
  out: 1024
  # Default dtype; replaced by the gfx942 override below.
  dtype: auto
  # One plain string of space-separated values, not a YAML list.
  max_concurrency: 1 8 32 128
  env:
    VLLM_ROCM_USE_AITER: 1
  arch_overrides:
    gfx942:
      dtype: float16