-
Notifications
You must be signed in to change notification settings - Fork 46
Expand file tree
/
Copy pathdefault.yaml
More file actions
84 lines (78 loc) · 1.67 KB
/
default.yaml
File metadata and controls
84 lines (78 loc) · 1.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Serving benchmarks: 1k input / 1k output tokens, max concurrency 1, 8, 32, 128
- benchmark: serving
  # Space-separated model IDs, one per line; `>-` folds them into a single
  # "id id" string with no trailing newline.
  model: >-
    deepseek-ai/DeepSeek-R1-0528
    amd/DeepSeek-R1-0528-MXFP4-Preview
  tp: 8
  inp: 1024
  out: 1024
  dtype: auto
  # Space-separated list kept as one string scalar; quoted so it is plainly a
  # string and never read as a number.
  max_concurrency: '1 8 32 128'
  # Environment variables for this entry (children of env).
  env:
    VLLM_ROCM_USE_AITER: 1
## gpt-oss: the w4a8 variant (MXFP4 weights, FP8 activations) is gfx950-only
- benchmark: serving
  # Space-separated model IDs, one per line; `>-` folds them into a single
  # "id id" string with no trailing newline.
  model: >-
    openai/gpt-oss-120b
    amd/gpt-oss120b-w-mxfp4-a-fp8
  tp: 8
  inp: 1024
  out: 1024
  dtype: auto
  # Space-separated list kept as one string scalar; quoted so it is plainly a
  # string and never read as a number.
  max_concurrency: '1 8 32 128'
  # Environment variables for this entry (children of env).
  env:
    VLLM_ROCM_USE_AITER: 1
  # NOTE(review): bench_args is placed as a sibling of env (an entry-level
  # key), not as an environment variable -- confirm against the consumer.
  bench_args:
    --lmeval_apply_chat_template: true
## For Llama, Mixtral, etc. on gfx942, use float16 for better GEMM performance
- benchmark: serving
  # Space-separated model IDs, one per line; `>-` folds them into a single
  # "id id id id" string with no trailing newline.
  model: >-
    meta-llama/Llama-3.1-405B-Instruct
    amd/Llama-3.1-405B-Instruct-FP8-KV
    meta-llama/Llama-3.3-70B-Instruct
    amd/Llama-3.3-70B-Instruct-FP8-KV
  tp: 8
  inp: 1024
  out: 1024
  dtype: auto
  # Space-separated list kept as one string scalar; quoted so it is plainly a
  # string and never read as a number.
  max_concurrency: '1 8 32 128'
  # Environment variables for this entry (children of env).
  env:
    VLLM_ROCM_USE_AITER: 1
  # Per-architecture settings keyed by GPU arch name. The gfx942 entry carries
  # dtype float16; presumably it replaces the entry-level dtype on that
  # architecture -- confirm against the consumer.
  arch_overrides:
    gfx942:
      dtype: float16
## Llama 3.x MXFP4 (gfx950 only)
- benchmark: serving
  # Space-separated model IDs, one per line; `>-` folds them into a single
  # "id id" string with no trailing newline.
  model: >-
    amd/Llama-3.1-405B-Instruct-MXFP4-Preview
    amd/Llama-3.3-70B-Instruct-MXFP4-Preview
  tp: 8
  inp: 1024
  out: 1024
  dtype: auto
  # Space-separated list kept as one string scalar; quoted so it is plainly a
  # string and never read as a number.
  max_concurrency: '1 8 32 128'
  # Environment variables for this entry (children of env).
  env:
    VLLM_ROCM_USE_AITER: 1
- benchmark: serving
  # Single model ID; written with `>-` so further IDs can be appended one per
  # line and still fold into one space-separated string.
  model: >-
    meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
  tp: 8
  inp: 1024
  out: 1024
  dtype: auto
  # Space-separated list kept as one string scalar; quoted so it is plainly a
  # string and never read as a number.
  max_concurrency: '1 8 32 128'
  # Environment variables for this entry (children of env).
  env:
    VLLM_ROCM_USE_AITER: 1
  # Per-architecture settings keyed by GPU arch name. The gfx942 entry carries
  # dtype float16; presumably it replaces the entry-level dtype on that
  # architecture -- confirm against the consumer.
  arch_overrides:
    gfx942:
      dtype: float16
- benchmark: serving
  # Space-separated model IDs, one per line; `>-` folds them into a single
  # "id id" string with no trailing newline.
  model: >-
    mistralai/Mixtral-8x22B-Instruct-v0.1
    amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
  tp: 8
  inp: 1024
  out: 1024
  dtype: auto
  # Space-separated list kept as one string scalar; quoted so it is plainly a
  # string and never read as a number.
  max_concurrency: '1 8 32 128'
  # Environment variables for this entry (children of env).
  env:
    VLLM_ROCM_USE_AITER: 1
  # Per-architecture settings keyed by GPU arch name. The gfx942 entry carries
  # dtype float16; presumably it replaces the entry-level dtype on that
  # architecture -- confirm against the consumer.
  arch_overrides:
    gfx942:
      dtype: float16