Skip to content

Commit 433f2ef

Browse files
committed
add logic for event driven runs
new single workflow that runs on merge to main, new perf-changelog.yaml to track performance changes, new logic to parse changelog, removed cron job in full sweep schedulers
1 parent 0291997 commit 433f2ef

11 files changed

Lines changed: 784 additions & 131 deletions

.github/workflows/full-sweep-1k1k-scheduler.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ name: "Full Sweep Scheduler - 1k1k"
22

33
on:
44
workflow_dispatch:
5-
schedule:
6-
- cron: "0 0 * * *"
75

86
jobs:
97
get-dsr1-configs:

.github/workflows/full-sweep-1k8k-scheduler.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ name: "Full Sweep Scheduler - 1k8k"
22

33
on:
44
workflow_dispatch:
5-
schedule:
6-
- cron: "0 0 * * *"
75

86
jobs:
97
get-dsr1-configs:

.github/workflows/full-sweep-8k1k-scheduler.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ name: "Full Sweep Scheduler - 8k1k"
22

33
on:
44
workflow_dispatch:
5-
schedule:
6-
- cron: "0 0 * * *"
75

86
jobs:
97
get-dsr1-configs:

.github/workflows/run-sweep.yml

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
name: "Run Sweep"
2+
run-name: Run Sweep - ${{ github.event.pull_request.title || github.ref_name }}
3+
4+
concurrency:
5+
group: sweep-${{ github.event.pull_request.number || github.ref }}
6+
cancel-in-progress: true
7+
8+
on:
9+
push:
10+
branches:
11+
- main
12+
paths:
13+
- "perf-changelog.yaml"
14+
pull_request:
15+
branches:
16+
- main
17+
types:
18+
- ready_for_review
19+
- synchronize
20+
- labeled
21+
paths:
22+
- "perf-changelog.yaml"
23+
24+
jobs:
25+
setup:
26+
runs-on: ubuntu-latest
27+
if: >-
28+
(github.event_name == 'pull_request' && !github.event.pull_request.draft && contains(github.event.pull_request.labels.*.name, 'sweep-enabled')) ||
29+
(github.event_name != 'pull_request' && !contains(github.event.head_commit.message, '[skip-sweep]'))
30+
outputs:
31+
search-space-config: ${{ steps.setup.outputs.search-space-config }}
32+
steps:
33+
- name: Checkout code
34+
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
35+
with:
36+
fetch-depth: 0
37+
38+
- id: setup
39+
run: |
40+
pip install pydantic
41+
42+
if [ "${{ github.event_name }}" == "pull_request" ]; then
43+
BASE_REF="origin/${{ github.base_ref }}"
44+
HEAD_REF="${{ github.event.pull_request.head.sha }}"
45+
else
46+
BASE_REF="${{ github.event.before }}"
47+
HEAD_REF="${{ github.event.after }}"
48+
fi
49+
50+
CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/process_changelog.py \
51+
--changelog-file ${GITHUB_WORKSPACE}/perf-changelog.yaml \
52+
--base-ref "$BASE_REF" \
53+
--head-ref "$HEAD_REF")
54+
55+
echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT
56+
57+
sweep-multi-node-1k1k:
58+
needs: setup
59+
# Job outputs are strings: parse with fromJson before indexing, then re-serialize so an empty list compares equal to '[]' and the job is skipped.
if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k']) != '[]' }}
60+
uses: ./.github/workflows/benchmark-multinode-tmpl.yml
61+
name: multi-node 1k1k /
62+
strategy:
63+
fail-fast: false
64+
matrix:
65+
config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k'] }}
66+
secrets: inherit
67+
with: &multi-node-inputs
68+
isl: ${{ matrix.config.isl }}
69+
osl: ${{ matrix.config.osl }}
70+
max-model-len: ${{ matrix.config.max-model-len }}
71+
runner: ${{ matrix.config.runner }}
72+
image: ${{ matrix.config.image }}
73+
model: ${{ matrix.config.model }}
74+
model-prefix: ${{ matrix.config.model-prefix }}
75+
framework: ${{ matrix.config.framework }}
76+
precision: ${{ matrix.config.precision }}
77+
exp-name: ${{ matrix.config.exp-name }}
78+
conc-list: ${{ toJson(matrix.config.conc) }}
79+
spec-decoding: ${{ matrix.config.spec-decoding }}
80+
disagg: ${{ matrix.config.disagg }}
81+
82+
prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
83+
prefill-tp: ${{ matrix.config.prefill.tp }}
84+
prefill-ep: ${{ matrix.config.prefill.ep }}
85+
prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
86+
prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}
87+
88+
decode-num-worker: ${{ matrix.config.decode.num-worker }}
89+
decode-tp: ${{ matrix.config.decode.tp }}
90+
decode-ep: ${{ matrix.config.decode.ep }}
91+
decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
92+
decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
93+
94+
sweep-multi-node-1k8k:
95+
needs: setup
96+
# Job outputs are strings: parse with fromJson before indexing, then re-serialize so an empty list compares equal to '[]' and the job is skipped.
if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k']) != '[]' }}
97+
uses: ./.github/workflows/benchmark-multinode-tmpl.yml
98+
name: multi-node 1k8k /
99+
strategy:
100+
fail-fast: false
101+
matrix:
102+
config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k'] }}
103+
secrets: inherit
104+
with: *multi-node-inputs
105+
106+
sweep-multi-node-8k1k:
107+
needs: setup
108+
# Job outputs are strings: parse with fromJson before indexing, then re-serialize so an empty list compares equal to '[]' and the job is skipped.
if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != '[]' }}
109+
uses: ./.github/workflows/benchmark-multinode-tmpl.yml
110+
name: multi-node 8k1k /
111+
strategy:
112+
fail-fast: false
113+
matrix:
114+
config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k'] }}
115+
secrets: inherit
116+
with: *multi-node-inputs
117+
118+
sweep-single-node-1k1k:
119+
needs: setup
120+
# Job outputs are strings: parse with fromJson before indexing, then re-serialize so an empty list compares equal to '[]' and the job is skipped.
if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k1k']) != '[]' }}
121+
uses: ./.github/workflows/benchmark-tmpl.yml
122+
name: single-node 1k1k /
123+
strategy:
124+
fail-fast: false
125+
matrix:
126+
config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['1k1k'] }}
127+
secrets: inherit
128+
with: &single-node-inputs
129+
exp-name: ${{ matrix.config.exp-name }}
130+
isl: ${{ matrix.config.isl }}
131+
osl: ${{ matrix.config.osl }}
132+
max-model-len: ${{ matrix.config.max-model-len }}
133+
runner: ${{ matrix.config.runner }}
134+
image: ${{ matrix.config.image }}
135+
model: ${{ matrix.config.model }}
136+
model-prefix: ${{ matrix.config.model-prefix }}
137+
framework: ${{ matrix.config.framework }}
138+
precision: ${{ matrix.config.precision }}
139+
tp: ${{ matrix.config.tp }}
140+
ep: ${{ matrix.config.ep }}
141+
dp-attn: ${{ matrix.config.dp-attn }}
142+
conc: ${{ matrix.config.conc }}
143+
spec-decoding: ${{ matrix.config.spec-decoding }}
144+
disagg: ${{ matrix.config.disagg }}
145+
146+
sweep-single-node-1k8k:
147+
needs: setup
148+
# Job outputs are strings: parse with fromJson before indexing, then re-serialize so an empty list compares equal to '[]' and the job is skipped.
if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k8k']) != '[]' }}
149+
uses: ./.github/workflows/benchmark-tmpl.yml
150+
name: single-node 1k8k /
151+
strategy:
152+
fail-fast: false
153+
matrix:
154+
config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['1k8k'] }}
155+
secrets: inherit
156+
with: *single-node-inputs
157+
158+
sweep-single-node-8k1k:
159+
needs: setup
160+
# Job outputs are strings: parse with fromJson before indexing, then re-serialize so an empty list compares equal to '[]' and the job is skipped.
if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != '[]' }}
161+
uses: ./.github/workflows/benchmark-tmpl.yml
162+
name: single-node 8k1k /
163+
strategy:
164+
fail-fast: false
165+
matrix:
166+
config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['8k1k'] }}
167+
secrets: inherit
168+
with: *single-node-inputs
169+
170+
collect-results:
171+
needs:
172+
[
173+
sweep-single-node-1k1k,
174+
sweep-single-node-1k8k,
175+
sweep-single-node-8k1k,
176+
sweep-multi-node-1k1k,
177+
sweep-multi-node-1k8k,
178+
sweep-multi-node-8k1k,
179+
setup,
180+
]
181+
if: ${{ always() && needs.setup.result != 'skipped' }}
182+
uses: ./.github/workflows/collect-results.yml
183+
secrets: inherit
184+
185+
upload-changelog-metadata:
186+
needs: [setup, collect-results]
187+
if: ${{ needs.setup.result != 'skipped' }}
188+
runs-on: ubuntu-latest
189+
steps:
190+
- name: Extract and save changelog metadata
191+
env:
192+
CONFIG_JSON: ${{ needs.setup.outputs.search-space-config }}
193+
run: |
194+
echo "$CONFIG_JSON" | jq '.changelog_metadata' > changelog_metadata.json
195+
196+
- name: Upload changelog artifact
197+
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
198+
with:
199+
name: changelog-metadata
200+
path: changelog_metadata.json
201+
202+
calc-success-rate:
203+
needs: collect-results
204+
if: ${{ always() && needs.collect-results.result != 'skipped'}}
205+
runs-on: ubuntu-latest
206+
207+
env:
208+
RESULTS_DIR: "results/"
209+
STATS_FILENAME: "run_stats"
210+
GITHUB_TOKEN: ${{ secrets.REPO_PAT }}
211+
212+
steps:
213+
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
214+
with:
215+
token: ${{ secrets.REPO_PAT }}
216+
fetch-depth: 0
217+
218+
- name: Download results artifacts
219+
uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
220+
with:
221+
path: ${{ env.RESULTS_DIR }}
222+
pattern: results_*
223+
224+
- name: Install python dependencies
225+
run: pip install PyGithub
226+
227+
- name: Calculate success rate
228+
run: python3 utils/calc_success_rate.py $STATS_FILENAME
229+
230+
- uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
231+
with:
232+
name: "run-stats"
233+
path: ${{ env.STATS_FILENAME }}.json

perf-changelog.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
- config-keys:
2+
- gptoss-fp4-mi300x-vllm
3+
description: |
4+
Updating vllm version for mi300x
5+
- config-keys:
6+
- gptoss-fp4-mi300x-vllm
7+
description: |
8+
Updating vllm version for mi325x

utils/constants.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
MASTER_CONFIGS = [".github/configs/amd-master.yaml",
2+
".github/configs/nvidia-master.yaml"]
3+
RUNNER_CONFIG = ".github/configs/runners.yaml"
4+
GENERATE_SWEEPS_PY_SCRIPT = "utils/matrix_logic/generate_sweep_configs.py"

0 commit comments

Comments
 (0)