Skip to content

Commit 13803ac

Browse files
committed
adding more workflows
1 parent be3b40f commit 13803ac

3 files changed

Lines changed: 303 additions & 0 deletions

File tree

.github/workflows/1k8k-sweep.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@ concurrency:
44
group: benchmark-lock-1k8k
55
cancel-in-progress: false
66

7+
# Workflow triggers. A mapping may define `on` only once, so the nightly
# schedule is declared in the same map as the manual trigger instead of in a
# second top-level `on:` key.
on:
  schedule:
    # Nightly at 23:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 23 * * *'
  # pull_request:
  workflow_dispatch:

.github/workflows/test.yml

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# Test workflow: runs the benchmark sweep for a single master-config entry.
name: Test - Full Sweep

# All runs share the `benchmark-lock` concurrency group and a newer run never
# cancels one that is already in progress.
concurrency:
  cancel-in-progress: false
  group: benchmark-lock
6+
7+
on:
  # NOTE(review): the `inputs.*` values below are only supplied by
  # `workflow_dispatch`; on `pull_request` runs they are presumably empty, so
  # every consumer needs a fallback -- confirm.
  pull_request:
  workflow_dispatch:
    inputs:
      name:
        description: "Name of benchmark from master configs"
        required: true
        type: string
        default: 70b-fp4-mi355x-vllm

      # Sequence-length selectors; when none is ticked no `--seq-lens` filter
      # is passed and the generator script includes every sequence length.
      run_1k1k:
        description: "Run ISL/OSL 1k/1k"
        type: boolean
        required: true
      run_1k8k:
        description: "Run ISL/OSL 1k/8k"
        type: boolean
        required: true
      run_8k1k:
        description: "Run ISL/OSL 8k/1k"
        type: boolean
        required: true

      runner:
        description: "Specific runner node to run on"
        required: false
        type: choice
        # The blank entry is listed first and is the default so that leaving
        # the dropdown untouched yields '' and the sweep falls back to the
        # runner named in the master config. Without it the first real runner
        # would be submitted for every manual run.
        default: ""
        options:
          - ""
          - "h100-cr_0"
          - "h100-cr_1"
          - "h100-cw_0"
          - "h100-cw_1"
          - "h200-cw_0"
          - "h200-cw_1"
          - "h200-nb_0"
          - "h200-nb_1"
          - "h200-nb_2"
          - "h200-nb_3"
          - "h200-nv_0"
          - "h200-nv_1"
          - "h200-nv_2"
          - "h200-nv_3"
          - "b200-nv_0"
          - "b200-nv_1"
          - "b200-nb_0"
          - "b200-nb_1"
          - "b200-nvd_0"
          - "b200-nvd_1"
          - "b200-nvd_2"
          - "b200-nvd_3"
          - "b200-tg_0"
          - "mi300x-amd_0"
          - "mi300x-amd_1"
          - "mi300x-amd_2"
          - "mi300x-amd_3"
          - "mi300x-amd_4"
          - "mi300x-cr_0"
          - "mi300x-oci_0"
          - "mi325x-amd_0"
          - "mi325x-tw_0"
          - "mi325x-tw_1"
          - "mi325x-tw_2"
          - "mi325x-tw_3"
          - "mi355x-amd_0"
          - "mi355x-amd_1"
          - "mi355x-amd_2"
          - "mi355x-amd_3"
74+
75+
jobs:
76+
# Expands the selected master-config entry into the JSON matrix consumed by
# the sweep job (exposed as the `search-space-config` output).
get-jobs:
  runs-on: ubuntu-latest
  outputs:
    search-space-config: ${{ steps.get-jobs.outputs.search-space-config }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - id: get-jobs
      env:
        # User-controlled inputs are handed over as environment variables
        # instead of being interpolated into the script text, so a crafted
        # value cannot inject shell commands. The key falls back to the
        # declared input default when no input is supplied, which would
        # otherwise leave `--key` without an argument.
        BENCHMARK_KEY: ${{ inputs.name || '70b-fp4-mi355x-vllm' }}
        # Space-separated list of the ticked sequence lengths (may be empty).
        SEQ_LENS: ${{ format('{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') }}
      run: |
        # Only pass --seq-lens when at least one sequence length was selected;
        # without the flag the script includes every sequence length.
        seq_len_args=()
        read -r -a seq_lens <<< "$SEQ_LENS"
        if [ "${#seq_lens[@]}" -gt 0 ]; then
          seq_len_args=(--seq-lens "${seq_lens[@]}")
        fi
        CONFIG_JSON=$(python3 "${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py" \
          --config-files "${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml" "${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml" \
          --key "$BENCHMARK_KEY" \
          "${seq_len_args[@]}")
        echo "search-space-config=$CONFIG_JSON" >> "$GITHUB_OUTPUT"
91+
92+
test-sweep:
93+
needs: get-jobs
94+
uses: ./.github/workflows/benchmark-tmpl.yml
95+
name: test sweep - ${{ inputs.name }}
96+
strategy:
97+
fail-fast: false
98+
matrix:
99+
config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }}
100+
secrets: inherit
101+
with:
102+
exp-name: "dsr1_1k1k"
103+
isl: ${{ matrix.config.isl }}
104+
osl: ${{ matrix.config.osl }}
105+
max-model-len: ${{ matrix.config.max-model-len }}
106+
runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }}
107+
image: ${{ matrix.config.image }}
108+
model: ${{ matrix.config.model }}
109+
framework: ${{ matrix.config.framework }}
110+
precision: ${{ matrix.config.precision }}
111+
tp: ${{ matrix.config.tp }}
112+
ep: ${{ matrix.config.ep || 1 }}
113+
dp-attn: ${{ matrix.config.dp-attn || false }}
114+
conc: ${{ matrix.config.conc }}
115+
116+
# Aggregates the sweep's result artifacts into a run-stats file. Runs even
# when some sweep entries failed (`always()`).
calc-success-rate:
  needs: test-sweep
  if: ${{ always() }}
  runs-on: ubuntu-latest

  env:
    RESULTS_DIR: "results/"
    STATS_FILENAME: "run_stats"
    GITHUB_TOKEN: ${{ secrets.REPO_PAT }}

  steps:
    # Same checkout major version as the `get-jobs` job in this workflow.
    - uses: actions/checkout@v4
      with:
        token: ${{ secrets.REPO_PAT }}
        fetch-depth: 0

    - name: Download results artifacts
      uses: actions/download-artifact@v4
      with:
        path: ${{ env.RESULTS_DIR }}
        pattern: results_*

    - name: Install python dependencies
      run: pip install PyGithub

    - name: Calculate success rate
      run: python3 utils/calc_success_rate.py "$STATS_FILENAME"

    - uses: actions/upload-artifact@v4
      with:
        name: "run-stats"
        path: ${{ env.STATS_FILENAME }}.json
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
import json
2+
import yaml
3+
import sys
4+
import argparse
5+
6+
# Lookup from the sequence-length labels accepted by --seq-lens to the
# (isl, osl) pair each label stands for.
seq_len_stoi = dict(
    [
        ("1k1k", (1 << 10, 1 << 10)),
        ("1k8k", (1 << 10, 1 << 13)),
        ("8k1k", (1 << 13, 1 << 10)),
    ]
)
11+
12+
def _concurrency_levels(conc_start, conc_end, step_size, key):
    """Return the concurrency values swept from conc_start up to conc_end.

    Values grow geometrically by ``step_size``; the last value is clamped to
    ``conc_end`` so the upper bound is always part of the sweep. An empty list
    is returned when ``conc_start`` is greater than ``conc_end``.

    Raises:
        ValueError: if multiplying by ``step_size`` does not increase the
            current value (e.g. step size <= 1 or a non-positive start), which
            would otherwise make the sweep loop forever.
    """
    levels = []
    conc = conc_start
    while conc <= conc_end:
        levels.append(conc)
        if conc == conc_end:
            break
        next_conc = conc * step_size
        if next_conc <= conc:
            # No forward progress is possible: fail loudly instead of hanging.
            raise ValueError(
                f"Concurrency sweep for key '{key}' cannot advance from {conc} "
                f"towards {conc_end} with step size {step_size}; use a positive "
                f"'conc-start' and a step size greater than 1"
            )
        conc = min(next_conc, conc_end)
    return levels


def main():
    """Build and print the benchmark matrix for one configuration key.

    Command-line arguments:
        --config-files: one or more YAML files, each containing a mapping of
            configuration keys; keys must be unique across all files.
        --key: the configuration key to expand.
        --seq-lens: optional subset of sequence-length labels to keep; when
            omitted every configured sequence length is included.
        --step-size: multiplier between successive concurrency values
            (default: 2).

    The matrix is printed to stdout as a JSON list and also returned.

    Raises:
        ValueError: if a config file is missing, keys are duplicated across
            files, the key is unknown, or a concurrency sweep cannot advance.
        AssertionError: if a required field is missing from the configuration.
    """
    parser = argparse.ArgumentParser(
        description='Generate benchmark matrix from a specific configuration key'
    )
    parser.add_argument(
        '--config-files',
        nargs='+',
        required=True,
        help='One or more configuration files (YAML format)'
    )
    parser.add_argument(
        '--key',
        required=True,
        help='Configuration key to use'
    )
    parser.add_argument(
        '--seq-lens',
        nargs='+',
        choices=list(seq_len_stoi.keys()),
        required=False,
        help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included."
    )
    parser.add_argument(
        '--step-size',
        type=int,
        default=2,
        help='Step size for concurrency values (default: 2)'
    )

    args = parser.parse_args()

    # Convert seq-lens to set of (isl, osl) tuples for filtering
    seq_lens_filter = None
    if args.seq_lens:
        seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens}

    # Load and merge all config files
    all_config_data = {}
    for config_file in args.config_files:
        try:
            with open(config_file, 'r') as f:
                config_data = yaml.safe_load(f)
                assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary"

                # Check for duplicate keys
                duplicate_keys = set(all_config_data.keys()) & set(config_data.keys())
                if duplicate_keys:
                    raise ValueError(
                        f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}"
                    )

                all_config_data.update(config_data)
        except FileNotFoundError:
            raise ValueError(f"Input file '{config_file}' does not exist.")

    # Check if the key exists
    if args.key not in all_config_data:
        available_keys = ', '.join(sorted(all_config_data.keys()))
        raise ValueError(
            f"Key '{args.key}' not found in configuration files. "
            f"Available keys: {available_keys}"
        )

    val = all_config_data[args.key]

    # Validate required fields
    seq_len_configs = val.get('seq-len-configs')
    assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'"

    image = val.get('image')
    model = val.get('model')
    precision = val.get('precision')
    framework = val.get('framework')
    runner = val.get('runner')

    assert None not in (image, model, precision, framework, runner), \
        f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'"

    matrix_values = []

    # Process each sequence length configuration
    for seq_config in seq_len_configs:
        isl = seq_config.get('isl')
        osl = seq_config.get('osl')

        assert None not in (isl, osl), \
            f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'"

        # Filter by sequence lengths if specified
        if seq_lens_filter and (isl, osl) not in seq_lens_filter:
            continue

        bmk_space = seq_config.get('bmk-space')
        assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'"

        for bmk in bmk_space:
            tp = bmk.get('tp')
            conc_start = bmk.get('conc-start')
            conc_end = bmk.get('conc-end')
            ep = bmk.get('ep')
            dp_attn = bmk.get('dp-attn')

            assert None not in (tp, conc_start, conc_end), \
                f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'"

            # Generate one entry per concurrency value in the sweep
            for conc in _concurrency_levels(conc_start, conc_end, args.step_size, args.key):
                entry = {
                    'image': image,
                    'model': model,
                    'precision': precision,
                    'framework': framework,
                    'runner': runner,
                    'isl': isl,
                    'osl': osl,
                    'tp': tp,
                    'conc': conc,
                    'max-model-len': isl + osl,
                }

                # Add optional fields if they exist
                if ep is not None:
                    entry['ep'] = ep
                if dp_attn is not None:
                    entry['dp-attn'] = dp_attn

                matrix_values.append(entry)

    print(json.dumps(matrix_values))
    return matrix_values
149+
150+
# Run the CLI only when this file is executed as a script, so importing the
# module does not invoke main().
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)