Skip to content

Commit 24ea7de

Browse files
kimbo@semianalysis.com authored and cquil11 committed
revert remove: llama 70b
1 parent 1105aea commit 24ea7de

22 files changed

Lines changed: 1439 additions & 0 deletions

.github/workflows/70b-tmpl.yml

Lines changed: 230 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,230 @@
1+
name: Template - LLaMA 70B
2+
3+
on:
4+
workflow_call:
5+
inputs:
6+
exp-name:
7+
required: true
8+
type: string
9+
isl:
10+
required: true
11+
type: string
12+
osl:
13+
required: true
14+
type: string
15+
max-model-len:
16+
required: true
17+
type: string
18+
random-range-ratio:
19+
required: true
20+
type: string
21+
22+
use_h100:
23+
type: boolean
24+
required: true
25+
use_h200:
26+
type: boolean
27+
required: true
28+
use_b200:
29+
type: boolean
30+
required: true
31+
use_mi300x:
32+
type: boolean
33+
required: true
34+
use_mi325x:
35+
type: boolean
36+
required: true
37+
use_mi355x:
38+
type: boolean
39+
required: true
40+
41+
jobs:
42+
bmk-h100-fp8:
43+
if: ${{ inputs.use_h100 }}
44+
uses: ./.github/workflows/benchmark-tmpl.yml
45+
secrets: inherit
46+
with:
47+
runner: h100
48+
image: 'vllm/vllm-openai:v0.10.2'
49+
model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
50+
framework: 'vllm'
51+
precision: 'fp8'
52+
exp-name: ${{ inputs.exp-name }}
53+
isl: ${{ inputs.isl }}
54+
osl: ${{ inputs.osl }}
55+
max-model-len: ${{ inputs.max-model-len }}
56+
random-range-ratio: ${{ inputs.random-range-ratio }}
57+
tp-list: '[2, 4, 8]'
58+
59+
bmk-h200-fp8:
60+
if: ${{ inputs.use_h200 }}
61+
uses: ./.github/workflows/benchmark-tmpl.yml
62+
secrets: inherit
63+
with:
64+
runner: h200
65+
image: 'vllm/vllm-openai:v0.10.2'
66+
model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
67+
framework: 'vllm'
68+
precision: 'fp8'
69+
exp-name: ${{ inputs.exp-name }}
70+
isl: ${{ inputs.isl }}
71+
osl: ${{ inputs.osl }}
72+
max-model-len: ${{ inputs.max-model-len }}
73+
random-range-ratio: ${{ inputs.random-range-ratio }}
74+
tp-list: '[1, 2, 4, 8]'
75+
76+
bmk-h200-trt-fp8:
77+
if: ${{ inputs.use_h200 }}
78+
uses: ./.github/workflows/benchmark-tmpl.yml
79+
secrets: inherit
80+
with:
81+
runner: h200-trt
82+
image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2'
83+
model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
84+
framework: 'trt'
85+
precision: 'fp8'
86+
exp-name: ${{ inputs.exp-name }}
87+
isl: ${{ inputs.isl }}
88+
osl: ${{ inputs.osl }}
89+
max-model-len: ${{ inputs.max-model-len }}
90+
random-range-ratio: ${{ inputs.random-range-ratio }}
91+
tp-list: '[1, 2, 4, 8]'
92+
conc-list: '[4, 8, 16, 32, 64, 128]' # H200 can achieve TPS/User >= 30 with larger concurrency till 128
93+
94+
bmk-b200-fp8:
95+
if: ${{ inputs.use_b200 }}
96+
uses: ./.github/workflows/benchmark-tmpl.yml
97+
secrets: inherit
98+
with:
99+
runner: b200
100+
image: 'vllm/vllm-openai:v0.10.2'
101+
model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
102+
framework: 'vllm'
103+
precision: 'fp8'
104+
exp-name: ${{ inputs.exp-name }}
105+
isl: ${{ inputs.isl }}
106+
osl: ${{ inputs.osl }}
107+
max-model-len: ${{ inputs.max-model-len }}
108+
random-range-ratio: ${{ inputs.random-range-ratio }}
109+
tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has
110+
111+
bmk-b200-trt-fp8:
112+
if: ${{ inputs.use_b200 }}
113+
uses: ./.github/workflows/benchmark-tmpl.yml
114+
secrets: inherit
115+
with:
116+
runner: b200-trt
117+
image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2'
118+
model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
119+
framework: 'trt'
120+
precision: 'fp8'
121+
exp-name: ${{ inputs.exp-name }}
122+
isl: ${{ inputs.isl }}
123+
osl: ${{ inputs.osl }}
124+
max-model-len: ${{ inputs.max-model-len }}
125+
random-range-ratio: ${{ inputs.random-range-ratio }}
126+
tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has
127+
conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 at concurrency up to 256; this list currently stops at 128
128+
129+
bmk-mi300x-fp8:
130+
if: ${{ inputs.use_mi300x }}
131+
uses: ./.github/workflows/benchmark-tmpl.yml
132+
secrets: inherit
133+
with:
134+
runner: mi300x
135+
image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'
136+
model: 'amd/Llama-3.3-70B-Instruct-FP8-KV'
137+
framework: 'vllm'
138+
precision: 'fp8'
139+
exp-name: ${{ inputs.exp-name }}
140+
isl: ${{ inputs.isl }}
141+
osl: ${{ inputs.osl }}
142+
max-model-len: ${{ inputs.max-model-len }}
143+
random-range-ratio: ${{ inputs.random-range-ratio }}
144+
tp-list: '[1, 2, 4, 8]'
145+
146+
bmk-mi325x-fp8:
147+
if: ${{ inputs.use_mi325x }}
148+
uses: ./.github/workflows/benchmark-tmpl.yml
149+
secrets: inherit
150+
with:
151+
runner: mi325x
152+
image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'
153+
model: 'amd/Llama-3.3-70B-Instruct-FP8-KV'
154+
framework: 'vllm'
155+
precision: 'fp8'
156+
exp-name: ${{ inputs.exp-name }}
157+
isl: ${{ inputs.isl }}
158+
osl: ${{ inputs.osl }}
159+
max-model-len: ${{ inputs.max-model-len }}
160+
random-range-ratio: ${{ inputs.random-range-ratio }}
161+
tp-list: '[1, 2, 4, 8]'
162+
163+
bmk-mi355x-fp8:
164+
if: ${{ inputs.use_mi355x }}
165+
uses: ./.github/workflows/benchmark-tmpl.yml
166+
secrets: inherit
167+
with:
168+
runner: mi355x
169+
image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'
170+
model: 'amd/Llama-3.3-70B-Instruct-FP8-KV'
171+
framework: 'vllm'
172+
precision: 'fp8'
173+
exp-name: ${{ inputs.exp-name }}
174+
isl: ${{ inputs.isl }}
175+
osl: ${{ inputs.osl }}
176+
max-model-len: ${{ inputs.max-model-len }}
177+
random-range-ratio: ${{ inputs.random-range-ratio }}
178+
tp-list: '[1, 2, 4, 8]'
179+
180+
bmk-b200-fp4:
181+
if: ${{ inputs.use_b200 }}
182+
uses: ./.github/workflows/benchmark-tmpl.yml
183+
secrets: inherit
184+
with:
185+
runner: b200
186+
image: 'vllm/vllm-openai:v0.10.2'
187+
model: 'nvidia/Llama-3.3-70B-Instruct-FP4'
188+
framework: 'vllm'
189+
precision: 'fp4'
190+
exp-name: ${{ inputs.exp-name }}
191+
isl: ${{ inputs.isl }}
192+
osl: ${{ inputs.osl }}
193+
max-model-len: ${{ inputs.max-model-len }}
194+
random-range-ratio: ${{ inputs.random-range-ratio }}
195+
tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has
196+
197+
bmk-b200-trt-fp4:
198+
if: ${{ inputs.use_b200 }}
199+
uses: ./.github/workflows/benchmark-tmpl.yml
200+
secrets: inherit
201+
with:
202+
runner: b200-trt
203+
image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2'
204+
model: 'nvidia/Llama-3.3-70B-Instruct-FP4'
205+
framework: 'trt'
206+
precision: 'fp4'
207+
exp-name: ${{ inputs.exp-name }}
208+
isl: ${{ inputs.isl }}
209+
osl: ${{ inputs.osl }}
210+
max-model-len: ${{ inputs.max-model-len }}
211+
random-range-ratio: ${{ inputs.random-range-ratio }}
212+
tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has
213+
conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 128
214+
215+
bmk-mi355x-fp4:
216+
if: ${{ inputs.use_mi355x }}
217+
uses: ./.github/workflows/benchmark-tmpl.yml
218+
secrets: inherit
219+
with:
220+
runner: mi355x
221+
image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'
222+
model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview'
223+
framework: 'vllm'
224+
precision: 'fp4'
225+
exp-name: ${{ inputs.exp-name }}
226+
isl: ${{ inputs.isl }}
227+
osl: ${{ inputs.osl }}
228+
max-model-len: ${{ inputs.max-model-len }}
229+
random-range-ratio: ${{ inputs.random-range-ratio }}
230+
tp-list: '[1, 2, 4, 8]'

.github/workflows/full-sweep-tmpl.yml

Lines changed: 75 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,31 @@ on:
3737
default: false
3838

3939
jobs:
40+
_70b-1k1k:
41+
if: ${{ inputs.run_1k1k }}
42+
uses: ./.github/workflows/70b-tmpl.yml
43+
secrets: inherit
44+
with:
45+
exp-name: '70b_1k1k'
46+
isl: 1024
47+
osl: 1024
48+
max-model-len: 2048
49+
random-range-ratio: 0.8
50+
use_h100: ${{ inputs.use_h100 }}
51+
use_h200: ${{ inputs.use_h200 }}
52+
use_b200: ${{ inputs.use_b200 }}
53+
use_mi300x: ${{ inputs.use_mi300x }}
54+
use_mi325x: ${{ inputs.use_mi325x }}
55+
use_mi355x: ${{ inputs.use_mi355x }}
56+
57+
collect-70b-1k1k-results:
58+
needs: _70b-1k1k
59+
if: ${{ inputs.run_1k1k && always() }}
60+
uses: ./.github/workflows/collect-results.yml
61+
secrets: inherit
62+
with:
63+
exp-name: '70b_1k1k'
64+
4065
dsr1-1k1k:
4166
if: ${{ inputs.run_1k1k }}
4267
uses: ./.github/workflows/dsr1-tmpl.yml
@@ -87,6 +112,31 @@ jobs:
87112
with:
88113
exp-name: 'gptoss_1k1k'
89114

115+
_70b-8k1k:
116+
if: ${{ inputs.run_8k1k }}
117+
uses: ./.github/workflows/70b-tmpl.yml
118+
secrets: inherit
119+
with:
120+
exp-name: '70b_8k1k'
121+
isl: 8192
122+
osl: 1024
123+
max-model-len: 9216
124+
random-range-ratio: 0.8
125+
use_h100: ${{ inputs.use_h100 }}
126+
use_h200: ${{ inputs.use_h200 }}
127+
use_b200: ${{ inputs.use_b200 }}
128+
use_mi300x: ${{ inputs.use_mi300x }}
129+
use_mi325x: ${{ inputs.use_mi325x }}
130+
use_mi355x: ${{ inputs.use_mi355x }}
131+
132+
collect-70b-8k1k-results:
133+
needs: _70b-8k1k
134+
if: ${{ inputs.run_8k1k && always() }}
135+
uses: ./.github/workflows/collect-results.yml
136+
secrets: inherit
137+
with:
138+
exp-name: '70b_8k1k'
139+
90140
dsr1-8k1k:
91141
if: ${{ inputs.run_8k1k }}
92142
uses: ./.github/workflows/dsr1-tmpl.yml
@@ -137,6 +187,31 @@ jobs:
137187
with:
138188
exp-name: 'gptoss_8k1k'
139189

190+
_70b-1k8k:
191+
if: ${{ inputs.run_1k8k }}
192+
uses: ./.github/workflows/70b-tmpl.yml
193+
secrets: inherit
194+
with:
195+
exp-name: '70b_1k8k'
196+
isl: 1024
197+
osl: 8192
198+
max-model-len: 9216
199+
random-range-ratio: 0.8
200+
use_h100: ${{ inputs.use_h100 }}
201+
use_h200: ${{ inputs.use_h200 }}
202+
use_b200: ${{ inputs.use_b200 }}
203+
use_mi300x: ${{ inputs.use_mi300x }}
204+
use_mi325x: ${{ inputs.use_mi325x }}
205+
use_mi355x: ${{ inputs.use_mi355x }}
206+
207+
collect-70b-1k8k-results:
208+
needs: _70b-1k8k
209+
if: ${{ inputs.run_1k8k && always() }}
210+
uses: ./.github/workflows/collect-results.yml
211+
secrets: inherit
212+
with:
213+
exp-name: '70b_1k8k'
214+
140215
dsr1-1k8k:
141216
if: ${{ inputs.run_1k8k }}
142217
uses: ./.github/workflows/dsr1-tmpl.yml

.github/workflows/runner-model-sweep-test.yml

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ jobs:
3333
- 'h100-cw_0'
3434
- 'h100-cw_1'
3535
config:
36+
- { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
3637
- { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }
3738

3839
name: '${{ matrix.runner }}'
@@ -69,6 +70,7 @@ jobs:
6970
- 'h200-nv_2'
7071
- 'h200-nv_3'
7172
config:
73+
- { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
7274
- { image: 'lmsysorg/sglang:v0.5.2rc2-cu126', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
7375
- { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }
7476

@@ -106,6 +108,7 @@ jobs:
106108
- 'h200-nv_2'
107109
- 'h200-nv_3'
108110
config:
111+
- { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' }
109112
- { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' }
110113
- { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' }
111114

@@ -137,6 +140,8 @@ jobs:
137140
- 'b200-nvd_2'
138141
- 'b200-nvd_3'
139142
config:
143+
- { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
144+
- { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' }
140145
- { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
141146
- { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' }
142147
- { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }
@@ -169,6 +174,8 @@ jobs:
169174
- 'b200-nb_0'
170175
- 'b200-nb_1'
171176
config:
177+
- { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' }
178+
- { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', framework: 'trt', precision: 'fp4', exp-name: '70b_test' }
172179
- { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' }
173180
- { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'trt', precision: 'fp4', exp-name: 'dsr1_test' }
174181
- { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' }
@@ -204,6 +211,7 @@ jobs:
204211
- 'mi300x-cr_0'
205212
- 'mi300x-oci_0'
206213
config:
214+
- { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
207215
- { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
208216
- { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }
209217

@@ -236,6 +244,7 @@ jobs:
236244
- 'mi325x-tw_2'
237245
- 'mi325x-tw_3'
238246
config:
247+
- { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
239248
- { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
240249
- { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }
241250

@@ -267,6 +276,8 @@ jobs:
267276
- 'mi355x-amd_2'
268277
- 'mi355x-amd_3'
269278
config:
279+
- { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
280+
- { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' }
270281
- { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
271282
- { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'amd/DeepSeek-R1-0528-MXFP4-Preview', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' }
272283
- { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }

0 commit comments

Comments
 (0)