@@ -3,27 +3,25 @@ llm_perf_core:
33# ===============================================================================
44# Test Conditions Index
55# ===============================================================================
6- # 1: All GPUs common tests
7- # 2: L20, L40S, H100, H20, H200, GB200, B200, B300, GB300, RTX-6000D, RTX-6000-Server test cases
8- # 3: A100, L20, L40S, H100, H20, H200
9- # 4: A100, L40S, H100, H20, H200
10- # 5: A100, H100, H20, H200 test cases
11- # 6: L40S, H100, H200, H20 test cases
12- # 7: H100, H200, H20 test cases
13- # 8: L20, L40S, H100, H200, H20 test cases
14- # 9: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases
15- # 10: GB200, B200, B300, GB300, RTX6000-Server test cases
16- # 11: B200, GB200, B300, GB300 test cases
17- # 12: B200, B300 test cases
18- # 13: H100, H20, H200, B200, B300 test cases
19- # 14: H100, H20, H200, B200, B300, RTX-6000 Server test cases
20- # 15: RTX-6000D, RTX-6000 Server test cases
21- # 16: RTX6000-Server test cases
6+ # 1: All GPUs common tests (L20, L40S, H100, H20, H200, GB200, B200, B300, GB300, RTX-6000D, RTX-6000-Server)
7+ # 2: L20, L40S, H100, H20, H200
8+ # 3: L40S, H100, H20, H200
9+ # 4: H100, H20, H200 test cases
10+ # 5: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases
11+ # 6: GB200, B200, B300, GB300, RTX6000-Server test cases
12+ # 7: B200, GB200, B300, GB300 test cases
13+ # 8: B200, B300 test cases
14+ # 9: H100, H20, H200, B200, B300 test cases
15+ # 10: H100, H20, H200, B200, B300, RTX-6000 Server test cases
16+ # 11: RTX-6000D, RTX-6000 Server test cases
17+ # 12: RTX6000-Server test cases
2218# ===============================================================================
2319
2420
2521# 1: All GPUs common tests
2622- condition :
23+ terms :
24+ supports_fp8 : true
2725 ranges :
2826 system_gpu_count :
2927 gte : 2
@@ -38,19 +36,12 @@ llm_perf_core:
3836 - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:128,128]
3937 - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-con:250]
4038 - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:512,32]
41-
42-
43- # 2: L20, L40S, H100, H20, H200, GB200, B200, B300, GB300, RTX-6000D, RTX-6000-Server test cases
44- - condition :
45- terms :
46- supports_fp8 : true
47- tests :
4839 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500]
4940 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000]
5041 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128]
5142
5243
53- # 3: A100, L20, L40S, H100, H20, H200
44+ # 2: L20, L40S, H100, H20, H200
5445- condition :
5546 ranges :
5647 system_gpu_count :
@@ -110,49 +101,35 @@ llm_perf_core:
110101 - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:500,2000-reqs:10-con:1-gpus:2]
111102 # Mistral-Small-3.1-24B-Instruct-2503
112103 - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:4096-maxnt:20000-input_output_len:20000,2000-reqs:500-con:200-gpus:2] TIMEOUT(120)
104+ # pytorch backend
105+ - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32]
106+ - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000]
107+ - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-loras:1-reqs:100-con:2-gpus:1]
108+ - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250]
109+ - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:2000,2000-con:250]
110+ # Ministral-8B FP8
111+ - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1]
112+ - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1]
113+ - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-maxnt:5000-input_output_len:5000,500-reqs:500-con:250]
114+ - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-input_output_len:500,2000-reqs:500-con:250]
115+ # mixtral_8x7b_v0.1_fp8 pytorch backend
116+ - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2]
117+ - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:2]
113118
114119
115- # 4: A100, L40S, H100, H20, H200
120+ # 3: L40S, H100, H20, H200
116121- condition :
117122 ranges :
118123 system_gpu_count :
119124 gte : 4
120125 compute_capability :
121- lt : 10.0
126+ gt : 8.0
127+ lte : 9.0
122128 tests :
123129 - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:4]
124130 - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-streaming-bfloat16-input_output_len:512,32-gpus:4]
125131 - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4]
126132 - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:512,32-gpus:4]
127-
128- # 5: A100, H100, H20, H200 test cases
129- - condition :
130- ranges :
131- system_gpu_count :
132- gte : 8
133- compute_capability :
134- lt : 10.0
135- gpu_memory :
136- gt : 80000
137- tests :
138- - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-gpus:2]
139- - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-streaming-float16-input_output_len:128,128-gpus:2]
140- - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,200-reqs:64-gpus:8]
141- - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:2000,200-reqs:64-gpus:8]
142- - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
143- - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]
144- - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-bfloat16-input_output_len:200,2000-reqs:64-con:200-gpus:8]
145-
146-
147- # 6: L40S, H100, H200, H20 test cases
148- - condition :
149- ranges :
150- system_gpu_count :
151- gte : 8
152- compute_capability :
153- gt : 8.0
154- lte : 9.0
155- tests :
156133 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000-gpus:4]
157134 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-gpus:4]
158135 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
@@ -167,48 +144,34 @@ llm_perf_core:
167144 - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-gpus:8]
168145
169146
170- # 7 : H100, H200, H20 test cases
147+ # 4 : H100, H20, H200 test cases
171148- condition :
172149 ranges :
173150 system_gpu_count :
174151 gte : 8
175152 compute_capability :
176153 gte : 9.0
177154 lte : 9.0
155+ gpu_memory :
156+ gt : 80000
178157 tests :
179- # deepseek_v3_lite_fp8
158+ - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-gpus:2]
159+ - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-streaming-float16-input_output_len:128,128-gpus:2]
160+ - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,200-reqs:64-gpus:8]
161+ - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:2000,200-reqs:64-gpus:8]
162+ - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
163+ - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]
164+ - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-bfloat16-input_output_len:200,2000-reqs:64-con:200-gpus:8]
165+ # deepseek_v3_lite_fp8
180166 - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:2000,500]
181167 - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500]
182168 - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:500,2000]
183169 - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:3000,500-reqs:200]
184170 - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-streaming-float8-input_output_len:128,128]
185171 - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:4096-maxnt:20000-input_output_len:20000,2000-reqs:500-con:200] TIMEOUT(120)
186172
187- # 8: L20, L40S, H100, H200, H20 test cases
188- - condition :
189- ranges :
190- system_gpu_count :
191- gte : 2
192- compute_capability :
193- gt : 8.0
194- lte : 9.0
195- tests :
196- # pytorch backend
197- - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32]
198- - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000]
199- - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-loras:1-reqs:100-con:2-gpus:1]
200- - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250]
201- - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:2000,2000-con:250]
202- # Ministral-8B FP8
203- - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1]
204- - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1]
205- - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-maxnt:5000-input_output_len:5000,500-reqs:500-con:250]
206- - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-input_output_len:500,2000-reqs:500-con:250]
207- # mixtral_8x7b_v0.1_fp8 pytorch backend
208- - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2]
209- - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:2]
210173
211- # 9 : H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases
174+ # 5 : H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases
212175- condition :
213176 ranges :
214177 system_gpu_count :
@@ -224,7 +187,7 @@ llm_perf_core:
224187 - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8]
225188
226189
227- # 10 : GB200, B200, B300, GB300, RTX6000-Server test cases
190+ # 6 : GB200, B200, B300, GB300, RTX6000-Server test cases
228191- condition :
229192 ranges :
230193 system_gpu_count :
@@ -274,7 +237,7 @@ llm_perf_core:
274237 - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:512-ep:4-gpus:4]
275238
276239
277- # 11 : B200, GB200 B300, GB300 test cases
240+ # 7 : B200, GB200, B300, GB300 test cases
278241- condition :
279242 ranges :
280243 system_gpu_count :
@@ -290,7 +253,7 @@ llm_perf_core:
290253 - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:2000-ep:4-tp:4-gpus:4] TIMEOUT(120)
291254 - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:32-maxnt:32768-input_output_len:8192,1024-reqs:20-con:1-ep:1-tp:4-gpus:4] TIMEOUT(120)
292255
293- # 12 : B200, B300 test cases
256+ # 8 : B200, B300 test cases
294257- condition :
295258 ranges :
296259 system_gpu_count :
@@ -319,7 +282,6 @@ llm_perf_core:
319282 # deepseek_r1_0528_fp4
320283 - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-reqs:20000-ep:8-tp:8-gpus:8] TIMEOUT(120)
321284 - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120)
322- - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:20000-ep:4-tp:4-gpus:4] TIMEOUT(120)
323285
324286 # gpt_oss_120b_fp4
325287 - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:1280-con:256-ep:8-tp:8-gpus:8]
@@ -329,7 +291,7 @@ llm_perf_core:
329291 - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:8-con:1-ep:8-tp:8-gpus:8]
330292 - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:100-con:32-ep:8-tp:8-gpus:8]
331293
332- # 13 : H100, H20, H200, B200, B300 test cases
294+ # 9 : H100, H20, H200, B200, B300 test cases
333295- condition :
334296 ranges :
335297 system_gpu_count :
@@ -392,11 +354,9 @@ llm_perf_core:
392354 # for chunked prefill cases
393355 - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:5000,500-reqs:200-ep:8-tp:8-gpus:8] TIMEOUT(120)
394356 - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:256-maxnt:1024-kv_frac:0.85-input_output_len:2000,2000-reqs:200-ep:8-tp:8-gpus:8] TIMEOUT(120)
395- # qwen3_235b_a22b_fp8
396- - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(60)
397357
398358
399- # 14 : H100, H20, H200, B200, B300, RTX-6000 Server test cases
359+ # 10 : H100, H20, H200, B200, B300, RTX-6000 Server test cases
400360- condition :
401361 ranges :
402362 system_gpu_count :
@@ -408,7 +368,7 @@ llm_perf_core:
408368 - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8]
409369
410370
411- # 15 : RTX-6000D, RTX-6000 Server test cases
371+ # 11 : RTX-6000D, RTX-6000 Server test cases
412372- condition :
413373 ranges :
414374 system_gpu_count :
@@ -442,7 +402,7 @@ llm_perf_core:
442402 - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-kv_cache_dtype:fp8-tp:2-gpus:2]
443403
444404
445- # 16 : RTX6000-Server test cases
405+ # 12 : RTX6000-Server test cases
446406- condition :
447407 ranges :
448408 system_gpu_count :
0 commit comments