# Source: InferenceX/.github/workflows/label-validation.yml at efcb4e477de4783465e8177aec3477e5da07983a (SemiAnalysisAI/InferenceX)
# Label-driven validation for pull requests.
name: PR Label Validation
run-name: 'Validate PR #${{ github.event.pull_request.number }}'

# Triggered when a label is added to a PR or new commits are pushed to it.
on:
  pull_request:
    types:
      - labeled
      - synchronize

# At most one active run per PR number; starting a new run cancels the one
# already in progress for the same group.
concurrency:
  group: 'PR#${{ github.event.pull_request.number }}'
  cancel-in-progress: true

jobs:
  get-jobs:
    # Turns PR labels shaped like `<runner-type>_<model-prefix>` into a JSON
    # list of sweep configs, published as the `search-space-config` output.
    # The output is the literal string `[]` when no label matches.
    runs-on: ubuntu-latest
    outputs:
      search-space-config: ${{ steps.get-jobs.outputs.search-space-config }}
    steps:
      - name: Checkout code
        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0

      - id: get-jobs
        shell: python
        env:
          # The label payload is handed over through the environment instead
          # of being templated into the script body, so label text (names,
          # descriptions) can never be interpreted as Python source.
          PR_LABELS: ${{ toJson(github.event.pull_request.labels) }}
        run: |
          import json
          import os
          import re
          import subprocess
          import sys

          # Get matching labels
          labels = json.loads(os.environ['PR_LABELS']) or []

          # A label matches when it is exactly two non-empty parts separated
          # by a single underscore: <runner-type>_<model-prefix>.
          pattern = r'^([^_]+)_([^_]+)$'

          matching = []
          for label in labels:
              match = re.match(pattern, label['name'])
              if match:
                  matching.append({
                      'runner-type': match.group(1),
                      'model-prefix': match.group(2),
                  })
                  print(f"Matched label: {label['name']}")

          # Nothing to sweep: publish an empty list so downstream jobs can
          # detect it, and finish successfully.
          if not matching:
              print("No matching labels found")
              with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
                  f.write('search-space-config=[]\n')
              sys.exit(0)

          workspace = os.environ['GITHUB_WORKSPACE']

          # Installed before invoking the generator script below
          # (presumably a dependency of that script -- confirm).
          subprocess.run(['pip', 'install', 'pydantic'], check=True)

          # Generate configs for standard labels
          all_configs = []
          for label in matching:
              result = subprocess.run([
                  'python3', f"{workspace}/utils/matrix_logic/generate_sweep_configs.py",
                  'full-sweep',
                  '--runner-type', label['runner-type'],
                  '--model-prefix', label['model-prefix'],
                  '--seq-lens', '1k1k',
                  '--config-files',
                  f"{workspace}/.github/configs/nvidia-master.yaml",
                  f"{workspace}/.github/configs/amd-master.yaml",
                  '--runner-config', f"{workspace}/.github/configs/runners.yaml"
              ], capture_output=True, text=True)

              # Surface the generator's own output before failing the step.
              if result.returncode != 0:
                  print("Error generating configs:")
                  print(f"STDOUT: {result.stdout}")
                  print(f"STDERR: {result.stderr}")
                  sys.exit(1)

              # The generator's stdout is parsed as a JSON list of configs.
              all_configs.extend(json.loads(result.stdout))

          print(f"Total standard configs: {len(all_configs)}")

          # json.dumps emits a single line, which the key=value output
          # format requires.
          with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
              f.write(f'search-space-config={json.dumps(all_configs)}\n')

  validate:
    # Fans out one call of the benchmark template workflow per config entry
    # produced by get-jobs.
    name: validate ${{ matrix.config.runner }}
    needs: get-jobs
    # Skipped when get-jobs published an empty config list.
    if: needs.get-jobs.outputs.search-space-config != '[]'
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }}
    uses: ./.github/workflows/benchmark-tmpl.yml
    secrets: inherit
    # Every input is forwarded unchanged from the same-named field of the
    # current matrix entry.
    with:
      exp-name: ${{ matrix.config.exp-name }}
      runner: ${{ matrix.config.runner }}
      image: ${{ matrix.config.image }}
      framework: ${{ matrix.config.framework }}
      model: ${{ matrix.config.model }}
      model-prefix: ${{ matrix.config.model-prefix }}
      precision: ${{ matrix.config.precision }}
      isl: ${{ matrix.config.isl }}
      osl: ${{ matrix.config.osl }}
      max-model-len: ${{ matrix.config.max-model-len }}
      conc: ${{ matrix.config.conc }}
      tp: ${{ matrix.config.tp }}
      ep: ${{ matrix.config.ep }}
      dp-attn: ${{ matrix.config.dp-attn }}
      spec-decoding: ${{ matrix.config.spec-decoding }}
      disagg: ${{ matrix.config.disagg }}

  collect-results:
    # Calls the results-collection workflow once the validate matrix is done.
    needs: validate
    # Still runs when validate failed or was skipped, but not when the run
    # itself was cancelled. This workflow uses cancel-in-progress, and
    # `always()` would keep this job running in a run that was superseded.
    if: ${{ !cancelled() }}
    uses: ./.github/workflows/collect-results.yml
    secrets: inherit

  calc-success-rate:
    # Downloads the `results_*` artifacts, runs utils/calc_success_rate.py,
    # and uploads the resulting stats JSON as the `run-stats` artifact.
    needs: collect-results
    # Still runs when upstream jobs failed or were skipped, but not when the
    # run itself was cancelled. This workflow uses cancel-in-progress, and
    # `always()` would keep this job running in a run that was superseded.
    if: ${{ !cancelled() }}
    runs-on: ubuntu-latest

    env:
      RESULTS_DIR: "results/"
      STATS_FILENAME: "run_stats"
      GITHUB_TOKEN: ${{ secrets.REPO_PAT }}

    steps:
      - name: Checkout code
        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
        with:
          token: ${{ secrets.REPO_PAT }}
          # Full history rather than a shallow clone.
          fetch-depth: 0

      - name: Download results artifacts
        uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
        with:
          path: ${{ env.RESULTS_DIR }}
          pattern: results_*

      - name: Install python dependencies
        run: pip install PyGithub

      - name: Calculate success rate
        # Quoted so the argument stays a single word whatever the value is.
        run: python3 utils/calc_success_rate.py "$STATS_FILENAME"

      - name: Upload run stats
        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
        with:
          name: "run-stats"
          path: ${{ env.STATS_FILENAME }}.json