Validate PR #167 #28

Workflow file for this run

.github/workflows/label-validation.yml at a370ae6

	# Validates a pull request by running the benchmark configurations that its
	# labels select.
	name: PR Label Validation
	run-name: 'Validate PR #${{ github.event.pull_request.number }}'

	# Runs when a label is added to a pull request targeting main, or when new
	# commits are pushed to such a pull request.
	on:
	  pull_request:
	    branches: [main]
	    types:
	      - labeled
	      - synchronize

	# One active run per pull request: a newer run cancels the one in progress.
	concurrency:
	  cancel-in-progress: true
	  group: 'PR#${{ github.event.pull_request.number }}'

	jobs:
	get-jobs:
	  # Turns the pull request's labels into two benchmark matrices.
	  #
	  # A label of the form `<runner-type>_<model-prefix>` (exactly one
	  # underscore) selects work:
	  #   * runner-type `gb200`  -> a fixed, hard-coded list of GB200 configs
	  #   * any other runner-type -> configs produced by
	  #     utils/matrix-logic/generate_sweep_configs.py
	  #
	  # Both outputs are JSON arrays serialized on one line; `[]` means there
	  # is nothing to run for that matrix.
	  runs-on: ubuntu-latest
	  outputs:
	    search-space-config: ${{ steps.get-jobs.outputs.search-space-config }}
	    gb200-config: ${{ steps.get-jobs.outputs.gb200-config }}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4

	    - id: get-jobs
	      shell: python
	      env:
	        # Label names and descriptions are free text. Hand them to the
	        # script as data through the environment instead of expanding the
	        # expression inside the Python source, where a label containing
	        # three single quotes would terminate the string literal and be
	        # executed as code.
	        PR_LABELS_JSON: ${{ toJson(github.event.pull_request.labels) }}
	      run: |
	        import json
	        import os
	        import re
	        import subprocess
	        import sys

	        workspace = os.environ['GITHUB_WORKSPACE']

	        # Get matching labels
	        labels = json.loads(os.environ['PR_LABELS_JSON'])
	        pattern = r'^([^_]+)_([^_]+)$'

	        matching = []
	        gb200_labels = []
	        for label in labels:
	            match = re.match(pattern, label['name'])
	            if match:
	                runner_type = match.group(1)
	                model_prefix = match.group(2)

	                if runner_type == 'gb200':
	                    gb200_labels.append({'runner-type': runner_type, 'model-prefix': model_prefix})
	                    print(f"Matched GB200 label: {label['name']}")
	                else:
	                    matching.append({'runner-type': runner_type, 'model-prefix': model_prefix})
	                    print(f"Matched label: {label['name']}")

	        # Nothing selected: publish empty matrices so the downstream jobs
	        # see '[]' and skip themselves.
	        if not matching and not gb200_labels:
	            print("No matching labels found")
	            with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
	                f.write('search-space-config=[]\n')
	                f.write('gb200-config=[]\n')
	            sys.exit(0)

	        # Generate configs for standard labels
	        all_configs = []
	        if matching:
	            # The generator script needs pydantic; install it only when
	            # the generator is actually going to run.
	            subprocess.run(['pip', 'install', 'pydantic'], check=True)

	            for label in matching:
	                result = subprocess.run([
	                    'python3', f"{workspace}/utils/matrix-logic/generate_sweep_configs.py",
	                    'full-sweep',
	                    '--runner-type', label['runner-type'],
	                    '--model-prefix', label['model-prefix'],
	                    '--seq-lens', '1k1k',
	                    '--test-mode',
	                    '--config-files',
	                    f"{workspace}/.github/configs/nvidia-master.yaml",
	                    f"{workspace}/.github/configs/amd-master.yaml",
	                    '--runner-config', f"{workspace}/.github/configs/runners.yaml"
	                ], capture_output=True, text=True)

	                # Output is captured, so echo it before failing the step.
	                if result.returncode != 0:
	                    print("Error generating configs:")
	                    print(f"STDOUT: {result.stdout}")
	                    print(f"STDERR: {result.stderr}")
	                    sys.exit(1)

	                # The generator's stdout is parsed as a JSON list of configs.
	                all_configs.extend(json.loads(result.stdout))

	        # Handle GB200 configs (use static config like in full-sweep-test.yml)
	        gb200_configs = []
	        if gb200_labels:
	            # Static GB200 config from full-sweep-test.yml. The model-prefix
	            # of a gb200 label is not consulted: any gb200 label selects the
	            # whole list.
	            gb200_configs = [
	                {"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", "model": "deepseek-ai/DeepSeek-R1-0528", "framework": "dynamo-sglang", "precision": "fp4", "mtp": "on"},
	                {"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", "model": "deepseek-ai/DeepSeek-R1-0528", "framework": "dynamo-sglang", "precision": "fp4", "mtp": "off"},
	                {"image": "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", "model": "deepseek-r1-fp4", "framework": "dynamo-trtllm", "precision": "fp4", "mtp": "on"},
	                {"image": "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", "model": "deepseek-r1-fp4", "framework": "dynamo-trtllm", "precision": "fp4", "mtp": "off"}
	            ]

	        print(f"Total standard configs: {len(all_configs)}")
	        print(f"Total GB200 configs: {len(gb200_configs)}")

	        # json.dumps emits a single line, which the `name=value` output
	        # file format requires.
	        with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
	            f.write(f'search-space-config={json.dumps(all_configs)}\n')
	            f.write(f'gb200-config={json.dumps(gb200_configs)}\n')

	validate:
	  # Fans out one reusable-workflow call per entry of the
	  # `search-space-config` matrix published by get-jobs. Skipped when that
	  # matrix is the empty list.
	  name: validate ${{ matrix.config.runner }}
	  needs: get-jobs
	  if: needs.get-jobs.outputs.search-space-config != '[]'
	  strategy:
	    # Let every configuration finish even if a sibling fails.
	    fail-fast: false
	    matrix:
	      config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }}
	  uses: ./.github/workflows/benchmark-tmpl.yml
	  secrets: inherit
	  # Every input is forwarded unchanged from the matrix entry.
	  with:
	    exp-name: ${{ matrix.config.exp-name }}
	    runner: ${{ matrix.config.runner }}
	    image: ${{ matrix.config.image }}
	    model: ${{ matrix.config.model }}
	    framework: ${{ matrix.config.framework }}
	    precision: ${{ matrix.config.precision }}
	    isl: ${{ matrix.config.isl }}
	    osl: ${{ matrix.config.osl }}
	    max-model-len: ${{ matrix.config.max-model-len }}
	    tp: ${{ matrix.config.tp }}
	    ep: ${{ matrix.config.ep }}
	    dp-attn: ${{ matrix.config.dp-attn }}
	    conc: ${{ matrix.config.conc }}

	validate-gb200:
	  # Fans out one multi-node reusable-workflow call per entry of the
	  # `gb200-config` matrix published by get-jobs. Skipped when that matrix
	  # is the empty list.
	  name: gb200 validation
	  needs: get-jobs
	  if: needs.get-jobs.outputs.gb200-config != '[]'
	  strategy:
	    # Let every configuration finish even if a sibling fails.
	    fail-fast: false
	    matrix:
	      config: ${{ fromJson(needs.get-jobs.outputs.gb200-config) }}
	  uses: ./.github/workflows/benchmark-multinode-tmpl.yml
	  secrets: inherit
	  with:
	    # Fixed for every GB200 run.
	    runner: gb200
	    exp-name: dsr1_1k1k
	    isl: '1024'
	    osl: '1024'
	    # NOTE(review): passed as a number while isl/osl are strings; confirm
	    # this matches the input types declared by the called workflow.
	    max-model-len: 2048
	    # Taken from the matrix entry.
	    image: ${{ matrix.config.image }}
	    model: ${{ matrix.config.model }}
	    framework: ${{ matrix.config.framework }}
	    precision: ${{ matrix.config.precision }}
	    mtp-mode: ${{ matrix.config.mtp }}

	calc-success-rate:
	  # Collects the result artifacts of both validation jobs and publishes a
	  # statistics file. `always()` makes this run even when a validation job
	  # failed or was skipped.
	  needs: [validate, validate-gb200]
	  if: ${{ always() }}
	  runs-on: ubuntu-latest

	  env:
	    # Directory the result artifacts are downloaded into.
	    RESULTS_DIR: "results/"
	    # Base name (without extension) of the statistics file.
	    STATS_FILENAME: "run_stats"
	    GITHUB_TOKEN: ${{ secrets.REPO_PAT }}

	  steps:
	    # Same major version as the checkout in get-jobs; v3 runs on a Node
	    # runtime that GitHub has deprecated.
	    - uses: actions/checkout@v4
	      with:
	        token: ${{ secrets.REPO_PAT }}
	        fetch-depth: 0

	    - name: Download results artifacts
	      uses: actions/download-artifact@v4
	      with:
	        path: ${{ env.RESULTS_DIR }}
	        pattern: results_*

	    - name: Install python dependencies
	      run: pip install PyGithub

	    # Presumably writes "$STATS_FILENAME.json", which the upload step
	    # below expects; verify against utils/calc_success_rate.py.
	    - name: Calculate success rate
	      run: python3 utils/calc_success_rate.py "$STATS_FILENAME"

	    - uses: actions/upload-artifact@v4
	      with:
	        name: "run-stats"
	        path: ${{ env.STATS_FILENAME }}.json

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Validate PR #167 #28

Workflow file

Validate PR #167 #28

Uh oh!

Workflow file for this run