Skip to content

Commit 67efd3c

Browse files
cquil11 authored and functionstackx committed
debug 10
1 parent 9d264aa commit 67efd3c

1 file changed

Lines changed: 78 additions & 31 deletions

File tree

.github/workflows/label-validation.yml

Lines changed: 78 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ jobs:
1616
runs-on: ubuntu-latest
1717
outputs:
1818
search-space-config: ${{ steps.get-jobs.outputs.search-space-config }}
19+
gb200-config: ${{ steps.get-jobs.outputs.gb200-config }}
1920
steps:
2021
- name: Checkout code
2122
uses: actions/checkout@v4
@@ -33,58 +34,82 @@ jobs:
3334
pattern = r'^([^_]+)_([^_]+)$'
3435
3536
matching = []
37+
gb200_labels = []
3638
for label in labels:
3739
match = re.match(pattern, label['name'])
3840
if match:
39-
matching.append({'runner-type': match.group(1), 'model-prefix': match.group(2)})
40-
print(f"Matched label: {label['name']}")
41+
runner_type = match.group(1)
42+
model_prefix = match.group(2)
4143
42-
if not matching:
44+
if runner_type == 'gb200':
45+
gb200_labels.append({'runner-type': runner_type, 'model-prefix': model_prefix})
46+
print(f"Matched GB200 label: {label['name']}")
47+
else:
48+
matching.append({'runner-type': runner_type, 'model-prefix': model_prefix})
49+
print(f"Matched label: {label['name']}")
50+
51+
if not matching and not gb200_labels:
4352
print("No matching labels found")
4453
with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
4554
f.write('search-space-config=[]\n')
55+
f.write('gb200-config=[]\n')
4656
exit(0)
4757
48-
# Generate configs for all matching labels
49-
subprocess.run(['pip', 'install', 'pydantic'], check=True)
50-
58+
# Generate configs for standard labels
5159
all_configs = []
52-
for label in matching:
53-
result = subprocess.run([
54-
'python3', f"{os.environ['GITHUB_WORKSPACE']}/utils/matrix-logic/generate_sweep_configs.py",
55-
'full-sweep',
56-
'--runner-type', label['runner-type'],
57-
'--model-prefix', label['model-prefix'],
58-
'--seq-lens', '1k1k',
59-
'--test-mode',
60-
'--config-files',
61-
f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/nvidia-master.yaml",
62-
f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/amd-master.yaml",
63-
'--runner-config', f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/runners.yaml"
64-
], capture_output=True, text=True)
65-
66-
if result.returncode != 0:
67-
print(f"Error generating configs:")
68-
print(f"STDOUT: {result.stdout}")
69-
print(f"STDERR: {result.stderr}")
70-
exit(1)
71-
72-
all_configs.extend(json.loads(result.stdout))
73-
74-
print(f"Total configs: {len(all_configs)}")
60+
if matching:
61+
subprocess.run(['pip', 'install', 'pydantic'], check=True)
62+
63+
for label in matching:
64+
result = subprocess.run([
65+
'python3', f"{os.environ['GITHUB_WORKSPACE']}/utils/matrix-logic/generate_sweep_configs.py",
66+
'full-sweep',
67+
'--runner-type', label['runner-type'],
68+
'--model-prefix', label['model-prefix'],
69+
'--seq-lens', '1k1k',
70+
'--test-mode',
71+
'--config-files',
72+
f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/nvidia-master.yaml",
73+
f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/amd-master.yaml",
74+
'--runner-config', f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/runners.yaml"
75+
], capture_output=True, text=True)
76+
77+
if result.returncode != 0:
78+
print(f"Error generating configs:")
79+
print(f"STDOUT: {result.stdout}")
80+
print(f"STDERR: {result.stderr}")
81+
exit(1)
82+
83+
all_configs.extend(json.loads(result.stdout))
84+
85+
# Handle GB200 configs (use static config like in full-sweep-test.yml)
86+
gb200_configs = []
87+
if gb200_labels:
88+
# Static GB200 config from full-sweep-test.yml
89+
gb200_configs = [
90+
{"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", "model": "deepseek-ai/DeepSeek-R1-0528", "framework": "dynamo-sglang", "precision": "fp4", "mtp": "on"},
91+
{"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", "model": "deepseek-ai/DeepSeek-R1-0528", "framework": "dynamo-sglang", "precision": "fp4", "mtp": "off"},
92+
{"image": "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", "model": "deepseek-r1-fp4", "framework": "dynamo-trtllm", "precision": "fp4", "mtp": "on"},
93+
{"image": "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", "model": "deepseek-r1-fp4", "framework": "dynamo-trtllm", "precision": "fp4", "mtp": "off"}
94+
]
95+
96+
print(f"Total standard configs: {len(all_configs)}")
97+
print(f"Total GB200 configs: {len(gb200_configs)}")
98+
7599
with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
76100
f.write(f'search-space-config={json.dumps(all_configs)}\n')
101+
f.write(f'gb200-config={json.dumps(gb200_configs)}\n')
77102
78103
validate:
79104
needs: get-jobs
80-
# Prolly unnecessary
81105
if: ${{ needs.get-jobs.outputs.search-space-config != '[]' }}
82106
uses: ./.github/workflows/benchmark-tmpl.yml
83107
strategy:
84108
fail-fast: false
85109
matrix:
86110
config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }}
87111
secrets: inherit
112+
# Label each matrix job with its own entry. fromJson(...search-space-config) is the
# whole config list (it feeds `matrix.config`), so `.runner` / `.image` on it does not
# resolve to a per-job value. Assumes each entry carries `runner` and `image` keys,
# as the original expression implied - TODO confirm against generate_sweep_configs.py.
name: validate ${{ matrix.config.runner }} ${{ matrix.config.image }}
88113
with:
89114
exp-name: ${{ matrix.config.exp-name }}
90115
isl: ${{ matrix.config.isl }}
@@ -100,8 +125,30 @@ jobs:
100125
dp-attn: ${{ matrix.config.dp-attn }}
101126
conc: ${{ matrix.config.conc }}
102127

128+
# Calls the multi-node benchmark template once per entry of the gb200-config
# output produced by the get-jobs job.
validate-gb200:
129+
needs: get-jobs
130+
# Skipped when get-jobs wrote an empty list ('[]') for gb200-config.
if: ${{ needs.get-jobs.outputs.gb200-config != '[]' }}
131+
uses: ./.github/workflows/benchmark-multinode-tmpl.yml
132+
name: gb200 validation
133+
strategy:
134+
# Keep the remaining matrix entries running when one of them fails.
fail-fast: false
135+
matrix:
136+
# One matrix job per element of the JSON list emitted by get-jobs.
config: ${{ fromJson(needs.get-jobs.outputs.gb200-config) }}
137+
secrets: inherit
138+
# runner, exp-name, isl, osl and max-model-len are fixed literals here;
# image, model, framework, precision and mtp-mode come from the matrix entry.
with:
139+
runner: gb200
140+
image: ${{ matrix.config.image }}
141+
model: ${{ matrix.config.model }}
142+
framework: ${{ matrix.config.framework }}
143+
precision: ${{ matrix.config.precision }}
144+
exp-name: dsr1_1k1k
145+
# NOTE(review): isl/osl are quoted strings while max-model-len is a bare number;
# presumably this matches the input types declared by the template - confirm.
isl: "1024"
146+
osl: "1024"
147+
max-model-len: 2048
148+
mtp-mode: ${{ matrix.config.mtp }}
149+
103150
calc-success-rate:
104-
needs: validate
151+
needs: [validate, validate-gb200]
105152
if: ${{ always() }}
106153
runs-on: ubuntu-latest
107154

0 commit comments

Comments
 (0)