1616 runs-on : ubuntu-latest
1717 outputs :
1818 search-space-config : ${{ steps.get-jobs.outputs.search-space-config }}
19+ gb200-config : ${{ steps.get-jobs.outputs.gb200-config }}
1920 steps :
2021 - name : Checkout code
2122 uses : actions/checkout@v4
@@ -33,58 +34,82 @@ jobs:
# Split labels named "<runner-type>_<model-prefix>" into standard sweep labels
# and GB200 labels, then publish one JSON config list per group as step outputs.
# NOTE(review): `labels` and the `re`/`os`/`json`/`subprocess` imports are
# defined earlier in this script, outside the visible excerpt -- confirm.
pattern = r'^([^_]+)_([^_]+)$'

matching = []
gb200_labels = []
for label in labels:
    match = re.match(pattern, label['name'])
    if not match:
        continue

    entry = {'runner-type': match.group(1), 'model-prefix': match.group(2)}
    if entry['runner-type'] == 'gb200':
        gb200_labels.append(entry)
        print(f"Matched GB200 label: {label['name']}")
    else:
        matching.append(entry)
        print(f"Matched label: {label['name']}")

if not matching and not gb200_labels:
    print("No matching labels found")
    # Both outputs must still be written so downstream `!= '[]'` checks see
    # an explicit empty list.
    with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
        f.write('search-space-config=[]\n')
        f.write('gb200-config=[]\n')
    # `exit()` is an interactive helper injected by `site`; raise directly.
    raise SystemExit(0)

workspace = os.environ['GITHUB_WORKSPACE']

# Generate configs for standard labels
all_configs = []
if matching:
    # Installed only when the generator is actually going to be run.
    subprocess.run(['pip', 'install', 'pydantic'], check=True)

    for label in matching:
        result = subprocess.run([
            'python3', f"{workspace}/utils/matrix-logic/generate_sweep_configs.py",
            'full-sweep',
            '--runner-type', label['runner-type'],
            '--model-prefix', label['model-prefix'],
            '--seq-lens', '1k1k',
            '--test-mode',
            '--config-files',
            f"{workspace}/.github/configs/nvidia-master.yaml",
            f"{workspace}/.github/configs/amd-master.yaml",
            '--runner-config', f"{workspace}/.github/configs/runners.yaml"
        ], capture_output=True, text=True)

        if result.returncode != 0:
            print("Error generating configs:")
            print(f"STDOUT: {result.stdout}")
            print(f"STDERR: {result.stderr}")
            raise SystemExit(1)

        try:
            all_configs.extend(json.loads(result.stdout))
        except ValueError as err:
            # The generator exited 0 but did not print valid JSON; show what
            # it printed instead of failing with a bare traceback.
            print(f"Generator output is not valid JSON: {err}")
            print(f"STDOUT: {result.stdout}")
            raise SystemExit(1)

# Handle GB200 configs (use static config like in full-sweep-test.yml)
# The list is fixed: any GB200 label enables all four entries, and the label's
# model prefix is not consulted here.
gb200_configs = []
if gb200_labels:
    # Static GB200 config from full-sweep-test.yml
    gb200_configs = [
        {"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", "model": "deepseek-ai/DeepSeek-R1-0528", "framework": "dynamo-sglang", "precision": "fp4", "mtp": "on"},
        {"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", "model": "deepseek-ai/DeepSeek-R1-0528", "framework": "dynamo-sglang", "precision": "fp4", "mtp": "off"},
        {"image": "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", "model": "deepseek-r1-fp4", "framework": "dynamo-trtllm", "precision": "fp4", "mtp": "on"},
        {"image": "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", "model": "deepseek-r1-fp4", "framework": "dynamo-trtllm", "precision": "fp4", "mtp": "off"}
    ]

print(f"Total standard configs: {len(all_configs)}")
print(f"Total GB200 configs: {len(gb200_configs)}")

with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
    f.write(f'search-space-config={json.dumps(all_configs)}\n')
    f.write(f'gb200-config={json.dumps(gb200_configs)}\n')
77102
78103 validate :
79104 needs : get-jobs
80- # Prolly unnecessary
81105 if : ${{ needs.get-jobs.outputs.search-space-config != '[]' }}
82106 uses : ./.github/workflows/benchmark-tmpl.yml
83107 strategy :
84108 fail-fast : false
85109 matrix :
86110 config : ${{ fromJson(needs.get-jobs.outputs.search-space-config) }}
87111 secrets : inherit
# Per-entry display name. `fromJson(...)` of the job output is the whole list,
# so `.runner` / `.image` on it resolve to empty; read the current matrix entry.
# Assumes each entry carries `runner` and `image` keys, as the expression this
# replaces did -- confirm against the generator's output.
name: validate ${{ matrix.config.runner }} ${{ matrix.config.image }}
88113 with :
89114 exp-name : ${{ matrix.config.exp-name }}
90115 isl : ${{ matrix.config.isl }}
@@ -100,8 +125,30 @@ jobs:
100125 dp-attn : ${{ matrix.config.dp-attn }}
101126 conc : ${{ matrix.config.conc }}
102127
# Calls the multi-node benchmark template once per entry of the `gb200-config`
# output of get-jobs; skipped when that output is the empty list '[]'.
validate-gb200:
  needs: get-jobs
  if: ${{ needs.get-jobs.outputs.gb200-config != '[]' }}
  uses: ./.github/workflows/benchmark-multinode-tmpl.yml
  name: gb200 validation
  strategy:
    # Let the remaining matrix entries finish even if one of them fails.
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.get-jobs.outputs.gb200-config) }}
  secrets: inherit
  with:
    runner: gb200
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    exp-name: dsr1_1k1k
    # Quoted so they are passed as strings; no padding inside the quotes.
    isl: "1024"
    osl: "1024"
    max-model-len: 2048
    # "on" / "off", taken from the `mtp` field of the matrix entry.
    mtp-mode: ${{ matrix.config.mtp }}
149+
103150 calc-success-rate :
104- needs : validate
151+ needs : [ validate, validate-gb200]
105152 if : ${{ always() }}
106153 runs-on : ubuntu-latest
107154
0 commit comments