Validate PR #167 #28
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Validates a pull request by running the benchmark configs selected by its labels. | |
| name: PR Label Validation | |
| run-name: "Validate PR #${{ github.event.pull_request.number }}" | |
| # One concurrency group per PR number: a newer run cancels the one still in progress. | |
| concurrency: | |
|   group: "PR#${{ github.event.pull_request.number }}" | |
|   cancel-in-progress: true | |
| # Triggered when a label is added to, or new commits are pushed to, a PR whose base is main. | |
| on: | |
|   pull_request: | |
|     types: | |
|       - labeled | |
|       - synchronize | |
|     branches: [main] | |
| jobs: | |
|   # Turns the PR's "<runner-type>_<model-prefix>" labels into the two benchmark matrices | |
|   # (search-space-config and gb200-config) consumed by the jobs that need this one. | |
|   get-jobs: | |
|     runs-on: ubuntu-latest | |
|     outputs: | |
|       search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} | |
|       gb200-config: ${{ steps.get-jobs.outputs.gb200-config }} | |
|     steps: | |
|       - name: Checkout code | |
|         uses: actions/checkout@v4 | |
|       - id: get-jobs | |
|         shell: python | |
|         env: | |
|           # Label data is handed over through the environment rather than being | |
|           # templated into the script text, so a label name or description that | |
|           # contains quotes cannot break or alter the Python source. | |
|           PR_LABELS_JSON: ${{ toJson(github.event.pull_request.labels) }} | |
|         run: | | |
|           import json | |
|           import os | |
|           import re | |
|           import subprocess | |
|           import sys | |
|           # A selecting label is "<runner-type>_<model-prefix>" with exactly one underscore. | |
|           LABEL_PATTERN = re.compile(r'^([^_]+)_([^_]+)$') | |
|           workspace = os.environ['GITHUB_WORKSPACE'] | |
|           def write_outputs(standard_configs, gb200_configs): | |
|               """Append both matrices to GITHUB_OUTPUT as single-line JSON values.""" | |
|               with open(os.environ['GITHUB_OUTPUT'], 'a') as f: | |
|                   f.write(f'search-space-config={json.dumps(standard_configs)}\n') | |
|                   f.write(f'gb200-config={json.dumps(gb200_configs)}\n') | |
|           # Get matching labels | |
|           labels = json.loads(os.environ['PR_LABELS_JSON']) | |
|           matching = [] | |
|           gb200_labels = [] | |
|           for label in labels: | |
|               match = LABEL_PATTERN.match(label['name']) | |
|               if not match: | |
|                   continue | |
|               runner_type, model_prefix = match.groups() | |
|               entry = {'runner-type': runner_type, 'model-prefix': model_prefix} | |
|               if runner_type == 'gb200': | |
|                   gb200_labels.append(entry) | |
|                   print(f"Matched GB200 label: {label['name']}") | |
|               else: | |
|                   matching.append(entry) | |
|                   print(f"Matched label: {label['name']}") | |
|           if not matching and not gb200_labels: | |
|               # Nothing selected: publish empty matrices so the dependent jobs are skipped. | |
|               print("No matching labels found") | |
|               write_outputs([], []) | |
|               sys.exit(0) | |
|           # Generate configs for standard labels | |
|           all_configs = [] | |
|           if matching: | |
|               # Install via the same python3 that runs the generator below. | |
|               subprocess.run(['python3', '-m', 'pip', 'install', 'pydantic'], check=True) | |
|           for label in matching: | |
|               result = subprocess.run([ | |
|                   'python3', f"{workspace}/utils/matrix-logic/generate_sweep_configs.py", | |
|                   'full-sweep', | |
|                   '--runner-type', label['runner-type'], | |
|                   '--model-prefix', label['model-prefix'], | |
|                   '--seq-lens', '1k1k', | |
|                   '--test-mode', | |
|                   '--config-files', | |
|                   f"{workspace}/.github/configs/nvidia-master.yaml", | |
|                   f"{workspace}/.github/configs/amd-master.yaml", | |
|                   '--runner-config', f"{workspace}/.github/configs/runners.yaml" | |
|               ], capture_output=True, text=True) | |
|               if result.returncode != 0: | |
|                   print("Error generating configs:") | |
|                   print(f"STDOUT: {result.stdout}") | |
|                   print(f"STDERR: {result.stderr}") | |
|                   sys.exit(1) | |
|               # The generator's stdout is parsed as a JSON list of matrix entries. | |
|               all_configs.extend(json.loads(result.stdout)) | |
|           # Handle GB200 configs (use static config like in full-sweep-test.yml) | |
|           gb200_configs = [] | |
|           if gb200_labels: | |
|               # Static GB200 config from full-sweep-test.yml | |
|               gb200_configs = [ | |
|                   {"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", "model": "deepseek-ai/DeepSeek-R1-0528", "framework": "dynamo-sglang", "precision": "fp4", "mtp": "on"}, | |
|                   {"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", "model": "deepseek-ai/DeepSeek-R1-0528", "framework": "dynamo-sglang", "precision": "fp4", "mtp": "off"}, | |
|                   {"image": "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", "model": "deepseek-r1-fp4", "framework": "dynamo-trtllm", "precision": "fp4", "mtp": "on"}, | |
|                   {"image": "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", "model": "deepseek-r1-fp4", "framework": "dynamo-trtllm", "precision": "fp4", "mtp": "off"} | |
|               ] | |
|           print(f"Total standard configs: {len(all_configs)}") | |
|           print(f"Total GB200 configs: {len(gb200_configs)}") | |
|           write_outputs(all_configs, gb200_configs) | |
|   # Runs the benchmark-tmpl.yml reusable workflow once per entry of search-space-config. | |
|   validate: | |
|     name: validate ${{ matrix.config.runner }} | |
|     needs: get-jobs | |
|     # Skipped when get-jobs published an empty standard matrix. | |
|     if: ${{ needs.get-jobs.outputs.search-space-config != '[]' }} | |
|     strategy: | |
|       # Let the remaining matrix entries finish even if one of them fails. | |
|       fail-fast: false | |
|       matrix: | |
|         config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} | |
|     uses: ./.github/workflows/benchmark-tmpl.yml | |
|     secrets: inherit | |
|     # Every input is forwarded unchanged from the generated matrix entry. | |
|     with: | |
|       exp-name: ${{ matrix.config.exp-name }} | |
|       isl: ${{ matrix.config.isl }} | |
|       osl: ${{ matrix.config.osl }} | |
|       max-model-len: ${{ matrix.config.max-model-len }} | |
|       runner: ${{ matrix.config.runner }} | |
|       image: ${{ matrix.config.image }} | |
|       model: ${{ matrix.config.model }} | |
|       framework: ${{ matrix.config.framework }} | |
|       precision: ${{ matrix.config.precision }} | |
|       tp: ${{ matrix.config.tp }} | |
|       ep: ${{ matrix.config.ep }} | |
|       dp-attn: ${{ matrix.config.dp-attn }} | |
|       conc: ${{ matrix.config.conc }} | |
|   # Runs the benchmark-multinode-tmpl.yml reusable workflow once per entry of gb200-config. | |
|   validate-gb200: | |
|     name: gb200 validation | |
|     needs: get-jobs | |
|     # Skipped when get-jobs published an empty GB200 matrix. | |
|     if: ${{ needs.get-jobs.outputs.gb200-config != '[]' }} | |
|     strategy: | |
|       # Let the remaining matrix entries finish even if one of them fails. | |
|       fail-fast: false | |
|       matrix: | |
|         config: ${{ fromJson(needs.get-jobs.outputs.gb200-config) }} | |
|     uses: ./.github/workflows/benchmark-multinode-tmpl.yml | |
|     secrets: inherit | |
|     with: | |
|       # Fixed for every GB200 run; only the values below that read matrix.config vary. | |
|       runner: gb200 | |
|       exp-name: dsr1_1k1k | |
|       isl: "1024" | |
|       osl: "1024" | |
|       max-model-len: 2048 | |
|       # Taken from the static GB200 matrix entry. | |
|       image: ${{ matrix.config.image }} | |
|       model: ${{ matrix.config.model }} | |
|       framework: ${{ matrix.config.framework }} | |
|       precision: ${{ matrix.config.precision }} | |
|       mtp-mode: ${{ matrix.config.mtp }} | |
|   # Collects the results_* artifacts and runs utils/calc_success_rate.py, then uploads | |
|   # the resulting stats file as the "run-stats" artifact. | |
|   calc-success-rate: | |
|     needs: [validate, validate-gb200] | |
|     # Run even when a validation job failed or was skipped. | |
|     if: ${{ always() }} | |
|     runs-on: ubuntu-latest | |
|     env: | |
|       RESULTS_DIR: "results/" | |
|       STATS_FILENAME: "run_stats" | |
|       GITHUB_TOKEN: ${{ secrets.REPO_PAT }} | |
|     steps: | |
|       # Same checkout major version as the get-jobs job. | |
|       - uses: actions/checkout@v4 | |
|         with: | |
|           token: ${{ secrets.REPO_PAT }} | |
|           fetch-depth: 0 | |
|       - name: Download results artifacts | |
|         uses: actions/download-artifact@v4 | |
|         with: | |
|           path: ${{ env.RESULTS_DIR }} | |
|           pattern: results_* | |
|       - name: Install python dependencies | |
|         run: pip install PyGithub | |
|       - name: Calculate success rate | |
|         # Quoted so the value always reaches the script as a single argument. | |
|         run: python3 utils/calc_success_rate.py "$STATS_FILENAME" | |
|       - uses: actions/upload-artifact@v4 | |
|         with: | |
|           name: "run-stats" | |
|           path: ${{ env.STATS_FILENAME }}.json | |