InferenceMAX/.github/workflows/benchmark-tmpl.yml at main · austenstone/InferenceMAX · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# Reusable benchmark workflow. It is only triggered through `workflow_call`,
# i.e. it is invoked by other workflows, never directly by repository events.
#
# Every input except `runner` is copied into a workflow-level environment
# variable in the `env:` section below; `runner` selects the job's `runs-on`.
#
# NOTE(review): the workflow reads `secrets.HF_TOKEN` and `secrets.REPO_PAT`
# but declares no `secrets:` here — callers presumably pass them with
# `secrets: inherit`; confirm against the calling workflows.
name: Template - Benchmark
on:
  workflow_call:
    inputs:
      # Runner label used for `runs-on`; also exported as RUNNER_TYPE to the
      # "Process result" step.
      runner:
        required: true
        type: string
      # Exported as IMAGE (presumably the container image to benchmark — confirm).
      image:
        required: true
        type: string
      # Exported as MODEL.
      model:
        required: true
        type: string
      # Exported as PRECISION; part of the job name and the result file name.
      precision:
        required: true
        type: string
      # Exported as FRAMEWORK; part of the result file name.
      framework:
        required: true
        type: string
      # Exported as EXP_NAME; prefix of the job name and the result file name.
      exp-name:
        required: true
        type: string
      # Exported as ISL (presumably input sequence length — confirm).
      isl:
        required: true
        type: string
      # Exported as OSL (presumably output sequence length — confirm).
      osl:
        required: true
        type: string
      # Exported as TP; shown as `tp=` in the job name.
      tp:
        required: true
        type: string
      # Exported as EP_SIZE; shown as `ep=` in the job name.
      ep:
        required: true
        type: string
      # The only boolean input. Exported as DP_ATTENTION; shown as `dpa=` in the
      # job name.
      dp-attn:
        required: true
        type: boolean
      # Exported as MAX_MODEL_LEN.
      max-model-len:
        required: true
        type: string
      # Exported as CONC (presumably request concurrency — confirm).
      conc:
        required: true
        type: string
      # The only optional input; exported as RANDOM_RANGE_RATIO.
      random-range-ratio:
        required: false
        type: string
        default: '0.8'

# Workflow-level environment: these variables are set for every step of the
# job. They are presumably the contract with runners/launch_*.sh and
# utils/process_result.py — verify against those scripts before renaming any.
env:
  # NOTE(review): defined at workflow level, so the token is visible to every
  # step (including third-party actions), not only the step that runs the
  # benchmark. Consider scoping it to that step if nothing else needs it.
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
  # Fixed absolute path (presumably a shared Hugging Face cache mounted on the
  # runner hosts — confirm).
  HF_HUB_CACHE: '/mnt/hf_hub_cache/'
  # The remaining variables mirror the workflow_call inputs one-to-one.
  EXP_NAME: ${{ inputs.exp-name }}
  MODEL: ${{ inputs.model }}
  ISL: ${{ inputs.isl }}
  OSL: ${{ inputs.osl }}
  MAX_MODEL_LEN: ${{ inputs.max-model-len }}
  RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }}
  IMAGE: ${{ inputs.image }}
  FRAMEWORK: ${{ inputs.framework }}
  PRECISION: ${{ inputs.precision }}
  TP: ${{ inputs.tp }}
  # Note the name change: input `ep` is exported as EP_SIZE.
  EP_SIZE: ${{ inputs.ep }}
  # Boolean input; it reaches the environment as the string `true` or `false`.
  DP_ATTENTION: ${{ inputs.dp-attn }}
  CONC: ${{ inputs.conc }}

# GITHUB_TOKEN scope for this workflow: read access to repository contents is
# the only permission requested.
permissions:
  contents: read

# One job per invocation: clean up the runner, check out the repository, run
# the runner-specific launch script, post-process the result, upload it.
jobs:
  benchmark:
    # The caller decides which runner label the job is scheduled on.
    runs-on: ${{ inputs.runner }}
    # Hard limit of 3 hours for the whole job, including the cleanup wait loops.
    timeout-minutes: 180
    # Display name that encodes the benchmark configuration.
    name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}'
    steps:
      # Frees the runner before benchmarking. Two independent parts:
      #   1. Docker (only when the docker CLI exists and the daemon answers):
      #      remove every container on the host.
      #   2. Slurm (only when `squeue` exists): cancel every job owned by the
      #      runner's user and wait until the queue is empty.
      # The wait loops have no bound of their own; they are limited only by the
      # job-level `timeout-minutes`.
      - name: Resource cleanup
        run: |
          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
            host=$(hostname)

            # These three hosts get a gentler, per-container shutdown followed
            # by a GPU idle check; every other host force-removes in one go.
            if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then
              echo "[INFO] Running container-by-container cleanup on $host"

              for cid in $(docker ps -aq); do
                echo "[INFO] Cleaning container $cid"

                # Try graceful first
                docker stop -t 90 "$cid" || true

                # Wait until it's really dead
                docker wait "$cid" >/dev/null 2>&1 || true

                # Force remove if anything lingers
                docker rm -f "$cid" >/dev/null 2>&1 || true
              done

              # Give a moment for GPU processes to fully terminate
              sleep 2

              # Verify GPUs are now idle
              if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then
                echo "[WARN] After stop, GPU still busy:"
                nvidia-smi
                # Last resort if driver allows and GPUs appear idle otherwise:
                # nvidia-smi --gpu-reset -i 0,1,2,3,4,5,6,7 2>/dev/null || true
              fi
            else
              echo "[Docker] Cleaning up resources ..."
              docker ps -aq | xargs -r docker rm -f
              docker network prune -f
              # Block until no container (running or stopped) is left.
              while [ -n "$(docker ps -aq)" ]; do
                docker ps -a
                sleep 5
              done
            fi
          fi
          if command -v squeue >/dev/null 2>&1; then
            echo "[Slurm] Cleaning up resources ..."
            # Resolve the user once. Fall back to `id -un` when USER is not
            # exported, and quote it so `-u` always receives exactly one
            # argument (an empty unquoted expansion would make `-u` swallow
            # the next option instead).
            slurm_user="${USER:-$(id -un)}"
            scancel -u "$slurm_user"
            # Block until the user's queue is empty.
            while [ -n "$(squeue -u "$slurm_user" --noheader --format='%i')" ]; do
              squeue -u "$slurm_user"
              sleep 5
            done
          fi

      # Checks out the repository with full history (`fetch-depth: 0`), pinned
      # to a commit SHA and authenticated with the REPO_PAT secret.
      # NOTE(review): actions/checkout persists the token in the local git
      # config by default, so later steps on this runner can read it. If no
      # later step needs git credentials, consider `persist-credentials: false`.
      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          token: ${{ secrets.REPO_PAT }}
          fetch-depth: 0

      # Runs the runner-specific launch script and verifies that it produced a
      # result file.
      #   - The script is runners/launch_<prefix>.sh, where <prefix> is the
      #     runner name up to (not including) its first underscore.
      #   - The script is expected to leave `$RESULT_FILENAME.json` in the
      #     working directory; if it does not, the step fails.
      #   - On success RESULT_FILENAME is written to $GITHUB_ENV so that later
      #     steps can read it as `env.RESULT_FILENAME`.
      - name: Launch job script
        env:
          RUNNER_NAME: ${{ runner.name }}
          RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_dpa_${{ env.DP_ATTENTION }}_conc${{ env.CONC }}_${{ runner.name }}
        run: |
          # Quoted so a runner name with whitespace or glob characters cannot
          # split or expand the script path.
          bash "./runners/launch_${RUNNER_NAME%%_*}.sh"
          if [ -f "${RESULT_FILENAME}.json" ]; then
            # Quoted redirect target, as recommended for $GITHUB_ENV.
            echo "RESULT_FILENAME=${RESULT_FILENAME}" >> "$GITHUB_ENV"
          else
            echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2
            exit 1
          fi

      # Runs utils/process_result.py with RUNNER_TYPE set to the caller-supplied
      # runner label (in addition to the workflow-level environment).
      # NOTE(review): the script presumably reads RESULT_FILENAME and writes
      # agg_<RESULT_FILENAME>.json, which the upload step expects — confirm
      # against the script.
      - name: Process result
        env:
          RUNNER_TYPE: ${{ inputs.runner }}
        run: |
          python3 utils/process_result.py
      # Publishes agg_<RESULT_FILENAME>.json as a workflow artifact named
      # <RESULT_FILENAME>. RESULT_FILENAME is exported to $GITHUB_ENV by the
      # "Launch job script" step.
      - name: Upload result
        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
        with:
          name: ${{ env.RESULT_FILENAME }}
          path: agg_${{ env.RESULT_FILENAME }}.json
          # The action's default is to warn and succeed when the path matches
          # nothing, which would leave a green job without an artifact. Fail
          # instead, consistent with the launch step's hard failure on a
          # missing result file.
          if-no-files-found: error