Skip to content

Commit bf03096

Browse files
author
root
committed
Update new fixed-AR-MTP CI workflow for kimik2.5_int4, kimik2.5_fp4, and minimaxm2.5_fp8 models
Signed-off-by: root <root@gbt350-odcdh5-wbb3.png-odc.dcgpu>
1 parent 5481fbf commit bf03096

43 files changed

Lines changed: 47839 additions & 21 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/configs/amd-master.yaml

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -589,6 +589,98 @@ kimik2.5-int4-mi355x-vllm:
589589
search-space:
590590
- { tp: 8, conc-start: 4, conc-end: 64 }
591591

592+
kimik2.5-mxfp4-mi355x-vllm-eagle3:
593+
image: vllm/vllm-openai-rocm:v0.21.0
594+
model: amd/Kimi-K2.5-MXFP4
595+
model-prefix: kimik2.5
596+
runner: mi355x
597+
precision: fp4
598+
framework: vllm
599+
multinode: false
600+
scenarios:
601+
fixed-seq-len:
602+
- isl: 1024
603+
osl: 1024
604+
search-space:
605+
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
606+
- isl: 8192
607+
osl: 1024
608+
search-space:
609+
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
610+
611+
kimik2.5-int4-mi355x-vllm-eagle3:
612+
image: vllm/vllm-openai-rocm:v0.21.0
613+
model: moonshotai/Kimi-K2.5
614+
model-prefix: kimik2.5
615+
runner: mi355x
616+
precision: int4
617+
framework: vllm
618+
multinode: false
619+
scenarios:
620+
fixed-seq-len:
621+
- isl: 1024
622+
osl: 1024
623+
search-space:
624+
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
625+
- isl: 8192
626+
osl: 1024
627+
search-space:
628+
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
629+
630+
kimik2.5-int4-mi355x-vllm-fixed-ar-mtp:
631+
image: vllm/vllm-openai-rocm:v0.21.0
632+
model: moonshotai/Kimi-K2.5
633+
model-prefix: kimik2.5
634+
runner: mi355x
635+
precision: int4
636+
framework: vllm
637+
multinode: false
638+
scenarios:
639+
fixed-ar-mtp:
640+
- isl: 1024
641+
osl: 1024
642+
draft-model: nvidia/Kimi-K2.5-Thinking-Eagle3
643+
num-speculative-tokens: 3
644+
rejection-sample-method: synthetic
645+
synthetic-acceptance-rates: [0.778774, 0.57543, 0.412793]
646+
search-space:
647+
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
648+
- isl: 8192
649+
osl: 1024
650+
draft-model: nvidia/Kimi-K2.5-Thinking-Eagle3
651+
num-speculative-tokens: 3
652+
rejection-sample-method: synthetic
653+
synthetic-acceptance-rates: [0.778774, 0.57543, 0.412793]
654+
search-space:
655+
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
656+
657+
kimik2.5-fp4-mi355x-vllm-fixed-ar-mtp:
658+
image: vllm/vllm-openai-rocm:v0.21.0
659+
model: amd/Kimi-K2.5-MXFP4
660+
model-prefix: kimik2.5
661+
runner: mi355x
662+
precision: fp4
663+
framework: vllm
664+
multinode: false
665+
scenarios:
666+
fixed-ar-mtp:
667+
- isl: 1024
668+
osl: 1024
669+
draft-model: lightseekorg/kimi-k2.5-eagle3
670+
num-speculative-tokens: 3
671+
rejection-sample-method: synthetic
672+
synthetic-acceptance-rates: [0.778774, 0.57543, 0.412793]
673+
search-space:
674+
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
675+
- isl: 8192
676+
osl: 1024
677+
draft-model: lightseekorg/kimi-k2.5-eagle3
678+
num-speculative-tokens: 3
679+
rejection-sample-method: synthetic
680+
synthetic-acceptance-rates: [0.778774, 0.57543, 0.412793]
681+
search-space:
682+
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
683+
592684
kimik2.5-int4-mi325x-vllm:
593685
image: vllm/vllm-openai-rocm:v0.21.0
594686
model: moonshotai/Kimi-K2.5
@@ -724,6 +816,25 @@ minimaxm2.5-fp8-mi355x-vllm:
724816
- { tp: 4, ep: 4, conc-start: 4, conc-end: 512 }
725817
- { tp: 8, ep: 8, conc-start: 2, conc-end: 2 }
726818

819+
minimaxm2.5-fp8-mi355x-vllm-eagle3:
820+
image: vllm/vllm-openai-rocm:v0.21.0
821+
model: MiniMaxAI/MiniMax-M2.5
822+
model-prefix: minimaxm2.5
823+
runner: mi355x
824+
precision: fp8
825+
framework: vllm
826+
multinode: false
827+
scenarios:
828+
fixed-seq-len:
829+
- isl: 1024
830+
osl: 1024
831+
search-space:
832+
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
833+
- isl: 8192
834+
osl: 1024
835+
search-space:
836+
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
837+
727838
# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below;
728839
# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so
729840
# its fixed-seq-len sweep is unaffected.

.github/workflows/benchmark-tmpl.yml

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ on:
5353
run-eval:
5454
type: boolean
5555
required: true
56-
default: false
5756
eval-only:
5857
description: "Run only evals (skip throughput benchmark)"
5958
type: boolean
@@ -68,10 +67,30 @@ on:
6867
required: false
6968
type: string
7069
scenario-type:
71-
description: "Scenario type (fixed-seq-len or agentic-coding)"
70+
description: "Scenario type (fixed-seq-len, agentic-coding, or fixed-ar-mtp)"
7271
required: false
7372
type: string
7473
default: 'fixed-seq-len'
74+
draft-model:
75+
description: "Draft model for fixed-AR MTP scenarios"
76+
required: false
77+
type: string
78+
default: ''
79+
num-speculative-tokens:
80+
description: "Number of speculative tokens for fixed-AR MTP scenarios"
81+
required: false
82+
type: string
83+
default: ''
84+
rejection-sample-method:
85+
description: "Speculative rejection sampling method"
86+
required: false
87+
type: string
88+
default: ''
89+
synthetic-acceptance-rates:
90+
description: "JSON array of synthetic acceptance rates for fixed-AR MTP scenarios"
91+
required: false
92+
type: string
93+
default: ''
7594
offloading:
7695
description: "KV offload backend for agentic scenarios (none/cpu/ssd)"
7796
required: false
@@ -111,6 +130,10 @@ env:
111130
SCENARIO_TYPE: ${{ inputs.scenario-type }}
112131
SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || '' }}
113132
IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }}
133+
DRAFT_MODEL: ${{ inputs.draft-model }}
134+
NUM_SPECULATIVE_TOKENS: ${{ inputs.num-speculative-tokens }}
135+
REJECTION_SAMPLE_METHOD: ${{ inputs.rejection-sample-method }}
136+
SYNTHETIC_ACCEPTANCE_RATES: ${{ inputs.synthetic-acceptance-rates }}
114137
OFFLOADING: ${{ inputs.offloading }}
115138
TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }}
116139
DURATION: ${{ inputs.duration }}

.github/workflows/e2e-tests.yml

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ jobs:
5151
multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }}
5252
agentic-config: ${{ steps.get-jobs.outputs.agentic-config }}
5353
multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }}
54+
fixed-ar-mtp-config: ${{ steps.get-jobs.outputs.fixed-ar-mtp-config }}
5455
steps:
5556
- name: Checkout code (ref)
5657
if: ${{ inputs.ref && inputs.ref != '' }}
@@ -71,12 +72,14 @@ jobs:
7172
${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
7273
AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x]))")
7374
MULTI_AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' in x]))")
74-
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
75+
# Fixed-AR-MTP entries are dispatched to the single-node throughput job (tp/ep/conc inputs, run-eval: false),
# so drop multi-node ('prefill') and eval-only entries here, mirroring the SINGLE filter.
FIXED_AR_MTP=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'fixed-ar-mtp' and 'prefill' not in x and not x.get('eval-only', False)]))")
76+
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') not in ('agentic-coding', 'fixed-ar-mtp') and not x.get('eval-only', False)]))")
7577
MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
7678
EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and x.get('run-eval', False)]))")
7779
MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))")
7880
echo "agentic-config=$AGENTIC" >> $GITHUB_OUTPUT
7981
echo "multi-node-agentic-config=$MULTI_AGENTIC" >> $GITHUB_OUTPUT
82+
echo "fixed-ar-mtp-config=$FIXED_AR_MTP" >> $GITHUB_OUTPUT
8083
echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT
8184
echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT
8285
echo "eval-config=$EVALS" >> $GITHUB_OUTPUT
@@ -190,7 +193,7 @@ jobs:
190193
osl: '0'
191194
max-model-len: '0'
192195
spec-decoding: 'none'
193-
disagg: 'false'
196+
disagg: ${{ 'false' }}
194197
run-eval: false
195198
scenario-type: agentic-coding
196199
ref: ${{ inputs.ref }}
@@ -235,6 +238,41 @@ jobs:
235238
scenario-type: agentic-coding
236239
ref: ${{ inputs.ref }}
237240

241+
test-sweep-fixed-ar-mtp:
242+
needs: get-jobs
243+
if: ${{ needs.get-jobs.outputs.fixed-ar-mtp-config != '[]' }}
244+
uses: ./.github/workflows/benchmark-tmpl.yml
245+
name: Fixed-AR-MTP throughput /
246+
strategy:
247+
fail-fast: false
248+
matrix:
249+
config: ${{ fromJson(needs.get-jobs.outputs.fixed-ar-mtp-config) }}
250+
secrets: inherit
251+
with:
252+
exp-name: ${{ matrix.config.exp-name }}
253+
isl: ${{ matrix.config.isl }}
254+
osl: ${{ matrix.config.osl }}
255+
max-model-len: ${{ matrix.config.max-model-len }}
256+
runner: ${{ matrix.config.runner }}
257+
image: ${{ matrix.config.image }}
258+
model: ${{ matrix.config.model }}
259+
model-prefix: ${{ matrix.config.model-prefix }}
260+
framework: ${{ matrix.config.framework }}
261+
precision: ${{ matrix.config.precision }}
262+
tp: ${{ matrix.config.tp }}
263+
ep: ${{ matrix.config.ep }}
264+
dp-attn: ${{ matrix.config.dp-attn }}
265+
conc: ${{ matrix.config.conc }}
266+
spec-decoding: ${{ matrix.config.spec-decoding }}
267+
disagg: ${{ matrix.config.disagg }}
268+
run-eval: false
269+
scenario-type: fixed-ar-mtp
270+
draft-model: ${{ matrix.config.draft-model }}
271+
num-speculative-tokens: ${{ matrix.config.num-speculative-tokens }}
272+
rejection-sample-method: ${{ matrix.config.rejection-sample-method }}
273+
synthetic-acceptance-rates: ${{ toJson(matrix.config.synthetic-acceptance-rates) }}
274+
ref: ${{ inputs.ref }}
275+
238276
test-sweep-single-node:
239277
needs: get-jobs
240278
if: ${{ needs.get-jobs.outputs.single-node-config != '[]' }}
@@ -297,8 +335,8 @@ jobs:
297335
ref: ${{ inputs.ref }}
298336

299337
collect-results:
300-
needs: [test-sweep-multi-node, test-sweep-single-node, test-sweep-agentic, test-sweep-multi-node-agentic]
301-
if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped' || needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }}
338+
needs: [test-sweep-multi-node, test-sweep-single-node, test-sweep-fixed-ar-mtp, test-sweep-agentic, test-sweep-multi-node-agentic]
339+
if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped' || needs.test-sweep-fixed-ar-mtp.result != 'skipped' || needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }}
302340
uses: ./.github/workflows/collect-results.yml
303341
secrets: inherit
304342
with:

0 commit comments

Comments
 (0)