|
# Manually dispatched workflow. The get-jobs job generates a job matrix for one
# config key and keeps only its first entry; the profile job then runs that
# entry and publishes the resulting trace files.
name: Profile

on:
  workflow_dispatch:
    inputs:
      config-key:
        description: "Config key from config yaml"
        type: string
        required: true
      config-file:
        description: "Config file to use"
        type: string
        required: false
        default: '.github/configs/nvidia-master.yaml'
      conc:
        description: "Concurrency value (must exist in config's conc-range/list)"
        type: string
        required: false
        default: '64'
      moe-debug:
        description: "Enable MoE debug patch and log (MOE_DEBUG_LOG)"
        type: boolean
        required: false
        default: false
      # No default: when left empty, the checkout steps fall back to github.ref.
      ref:
        description: "Ref (branch/sha) to checkout"
        type: string
        required: false

permissions:
  contents: read

# Workflow-level environment, visible to every job and step.
env:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
  HF_HUB_CACHE: '/mnt/hf_hub_cache/'
  RANDOM_RANGE_RATIO: '0.8'
  # Base URL used by the "Push profile to storage repo" step to build the viewer link.
  PERFETTO_RELAY_URL: https://semianalysisai.github.io/InferenceX-trace-storage
| 38 | + |
jobs:
  # Produces the matrix consumed by the profile job.
  get-jobs:
    runs-on: ubuntu-latest
    # Both values are written by the step with id "filter".
    outputs:
      count: ${{ steps.filter.outputs.count }}
      filtered-matrix: ${{ steps.filter.outputs.filtered }}
    steps:
      # Check out the requested ref, or the ref the workflow was dispatched on
      # when the "ref" input is empty.
      - name: Checkout code
        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
        with:
          ref: ${{ inputs.ref || github.ref }}
| 50 | + |
| 51 | + - id: gen |
| 52 | + name: Generate matrix via script |
| 53 | + run: | |
| 54 | + pip install pydantic |
| 55 | + CLI_ARGS="test-config --config-files ${{ inputs.config-file }} --config-keys ${{ inputs.config-key }} --conc ${{ inputs.conc }}" |
| 56 | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py $CLI_ARGS) |
| 57 | + echo "raw=$CONFIG_JSON" >> $GITHUB_OUTPUT |
| 58 | +
|
| 59 | + - id: filter |
| 60 | + name: Take first generated job |
| 61 | + shell: python |
| 62 | + run: | |
| 63 | + import json, os, sys |
| 64 | + raw = '${{ steps.gen.outputs.raw }}' |
| 65 | + try: |
| 66 | + data = json.loads(raw) |
| 67 | + except Exception as e: |
| 68 | + print('Invalid generator output:', e, file=sys.stderr) |
| 69 | + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: |
| 70 | + f.write("filtered=[]\ncount=0\n") |
| 71 | + raise |
| 72 | +
|
| 73 | + if not isinstance(data, list): |
| 74 | + print('Generator output is not a list.', file=sys.stderr) |
| 75 | + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: |
| 76 | + f.write("filtered=[]\ncount=0\n") |
| 77 | + raise SystemExit(1) |
| 78 | +
|
| 79 | + filt = data[:1] |
| 80 | +
|
| 81 | + out = json.dumps(filt) |
| 82 | + print(out) |
| 83 | + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: |
| 84 | + f.write(f"filtered={out}\n") |
| 85 | + f.write(f"count={len(filt)}\n") |
| 86 | +
|
| 87 | + - name: Fail if no matching entries |
| 88 | + if: ${{ steps.filter.outputs.count == '0' }} |
| 89 | + run: | |
| 90 | + echo "No entries produced for config-key=${{ inputs.config-key }}, seq-lens=${{ inputs.seq-lens }}, conc=${{ inputs.conc }}." >&2 |
| 91 | + exit 1 |
| 92 | +
|
| 93 | + profile: |
| 94 | + needs: get-jobs |
| 95 | + strategy: |
| 96 | + fail-fast: false |
| 97 | + matrix: |
| 98 | + config: ${{ fromJson(needs.get-jobs.outputs.filtered-matrix) }} |
| 99 | + runs-on: ${{ matrix.config.runner }} |
| 100 | + env: |
| 101 | + EXP_NAME: ${{ matrix.config.exp-name }} |
| 102 | + MODEL: ${{ matrix.config.model }} |
| 103 | + MODEL_PREFIX: ${{ matrix.config.model-prefix }} |
| 104 | + ISL: ${{ matrix.config.isl }} |
| 105 | + OSL: ${{ matrix.config.osl }} |
| 106 | + MAX_MODEL_LEN: ${{ matrix.config.max-model-len }} |
| 107 | + IMAGE: ${{ matrix.config.image }} |
| 108 | + FRAMEWORK: ${{ matrix.config.framework }} |
| 109 | + PRECISION: ${{ matrix.config.precision }} |
| 110 | + TP: ${{ matrix.config.tp }} |
| 111 | + EP_SIZE: ${{ matrix.config.ep }} |
| 112 | + DP_ATTENTION: ${{ matrix.config['dp-attn'] }} |
| 113 | + CONC: ${{ matrix.config.conc }} |
| 114 | + SPEC_DECODING: ${{ matrix.config.spec-decoding }} |
| 115 | + DISAGG: ${{ matrix.config.disagg }} |
| 116 | + MOE_DEBUG: '0' |
| 117 | + MOE_DEBUG_LOG: ${{ (inputs.moe-debug) && '/workspace/moe_debug.tp0.log' || '' }} |
| 118 | + steps: |
| 119 | + - name: Resource cleanup |
| 120 | + run: | |
| 121 | + # Cleanup Docker resources |
| 122 | + if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then |
| 123 | + echo "[Docker] Cleaning up resources ..." |
| 124 | + docker ps -aq | xargs -r docker rm -f |
| 125 | + docker network prune -f |
| 126 | + while [ -n "$(docker ps -aq)" ]; do |
| 127 | + docker ps -a |
| 128 | + sleep 5 |
| 129 | + done |
| 130 | + fi |
| 131 | +
|
| 132 | + # Cleanup SLURM resources |
| 133 | + if command -v squeue >/dev/null 2>&1; then |
| 134 | + if [[ "${{ runner.name }}" == mi355x-amds* || "${{ runner.name }}" == gb200-nv* || "${{ runner.name }}" == gb300-nv* || "${{ runner.name }}" == h100-dgxc-slurm* || "${{ runner.name }}" == h200-dgxc-slurm* || "${{ runner.name }}" == b200-dgxc-slurm* ]]; then |
| 135 | + echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." |
| 136 | + scancel --name="${{ runner.name }}" || true |
| 137 | + while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do |
| 138 | + squeue --name="${{ runner.name }}" |
| 139 | + sleep 5 |
| 140 | + done |
| 141 | + else |
| 142 | + echo "[Slurm] Cleaning up jobs for user: $USER ..." |
| 143 | + scancel -u "$USER" || true |
| 144 | + while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do |
| 145 | + squeue -u "$USER" |
| 146 | + sleep 5 |
| 147 | + done |
| 148 | + fi |
| 149 | + fi |
| 150 | +
|
| 151 | + - name: Checkout code |
| 152 | + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 |
| 153 | + with: |
| 154 | + fetch-depth: 0 |
| 155 | + ref: ${{ inputs.ref || github.ref }} |
| 156 | + |
| 157 | + - name: Launch + Profile (single-node sglang/vllm) |
| 158 | + id: run |
| 159 | + env: |
| 160 | + RUNNER_NAME: ${{ runner.name }} |
| 161 | + PROFILE: '1' |
| 162 | + SGLANG_TORCH_PROFILER_DIR: /workspace/ |
| 163 | + VLLM_TORCH_PROFILER_DIR: /workspace/ |
| 164 | + VLLM_RPC_TIMEOUT: '1800000' |
| 165 | + shell: bash |
| 166 | + run: | |
| 167 | + set -euo pipefail |
| 168 | + ep_val="${EP_SIZE:-1}" |
| 169 | + res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}" |
| 170 | + export RESULT_FILENAME="${res_name}" |
| 171 | + echo "RESULT_FILENAME=${res_name}" >> "$GITHUB_ENV" |
| 172 | +
|
| 173 | + bash ./runners/launch_${RUNNER_NAME%%_*}.sh |
| 174 | +
|
| 175 | + if [ ! -f "${res_name}.json" ]; then |
| 176 | + echo "Run failed: Benchmark result ${res_name}.json not found." >&2 |
| 177 | + exit 1 |
| 178 | + fi |
| 179 | +
|
| 180 | + trace_path="profile_${res_name}.trace.json.gz" |
| 181 | + if [ -f "$trace_path" ]; then |
| 182 | + echo "trace=$trace_path" >> "$GITHUB_OUTPUT" |
| 183 | + if [ "${FRAMEWORK}" = "sglang" ]; then |
| 184 | + # Try to locate corresponding TP-0 traces produced by SGLang profiler |
| 185 | + merged_latest=$(ls -t profiles/merged-*.trace.json.gz 2>/dev/null | head -n1 || true) |
| 186 | + if [ -n "${merged_latest}" ] && [ -f "${merged_latest}" ]; then |
| 187 | + ts_name="${merged_latest##*/}" |
| 188 | + ts_name="${ts_name#merged-}" |
| 189 | + ts_name="${ts_name%.trace.json.gz}" |
| 190 | + tp0_decode="profiles/${ts_name}-TP-0-DECODE.trace.json.gz" |
| 191 | + tp0_extend="profiles/${ts_name}-TP-0-EXTEND.trace.json.gz" |
| 192 | + if [ -f "${tp0_decode}" ]; then |
| 193 | + echo "tp0_decode=${tp0_decode}" >> "$GITHUB_OUTPUT" |
| 194 | + fi |
| 195 | + if [ -f "${tp0_extend}" ]; then |
| 196 | + echo "tp0_extend=${tp0_extend}" >> "$GITHUB_OUTPUT" |
| 197 | + fi |
| 198 | + fi |
| 199 | + fi |
| 200 | + else |
| 201 | + echo "Profile trace not found: $trace_path" >&2 |
| 202 | + fi |
| 203 | +
|
| 204 | + - name: Process result (json -> agg) |
| 205 | + env: |
| 206 | + RUNNER_TYPE: ${{ matrix.config.runner }} |
| 207 | + run: | |
| 208 | + python3 utils/process_result.py |
| 209 | +
|
| 210 | + - name: Upload profile as artifact |
| 211 | + if: ${{ steps.run.outputs.trace != '' }} |
| 212 | + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 |
| 213 | + with: |
| 214 | + name: profile_${{ env.RESULT_FILENAME }} |
| 215 | + path: profile_${{ env.RESULT_FILENAME }}.trace.json.gz |
| 216 | + if-no-files-found: ignore |
| 217 | + |
| 218 | + - name: Upload TP-0-DECODE trace as artifact |
| 219 | + if: ${{ steps.run.outputs.tp0_decode != '' }} |
| 220 | + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 |
| 221 | + with: |
| 222 | + name: profile_${{ env.RESULT_FILENAME }}_TP0_DECODE |
| 223 | + path: ${{ steps.run.outputs.tp0_decode }} |
| 224 | + if-no-files-found: ignore |
| 225 | + |
| 226 | + - name: Upload TP-0-EXTEND trace as artifact |
| 227 | + if: ${{ steps.run.outputs.tp0_extend != '' }} |
| 228 | + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 |
| 229 | + with: |
| 230 | + name: profile_${{ env.RESULT_FILENAME }}_TP0_EXTEND |
| 231 | + path: ${{ steps.run.outputs.tp0_extend }} |
| 232 | + if-no-files-found: ignore |
| 233 | + |
| 234 | + - name: Upload MoE debug log as artifact |
| 235 | + if: ${{ env.MOE_DEBUG == '1' }} |
| 236 | + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 |
| 237 | + with: |
| 238 | + name: moe_debug_${{ env.RESULT_FILENAME }} |
| 239 | + path: "moe_debug.tp0.log" |
| 240 | + if-no-files-found: ignore |
| 241 | + |
| 242 | + - name: Checkout storage repo |
| 243 | + if: ${{ steps.run.outputs.trace != '' }} |
| 244 | + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 |
| 245 | + with: |
| 246 | + repository: SemiAnalysisAI/InferenceX-trace-storage |
| 247 | + path: storage |
| 248 | + ref: master |
| 249 | + ssh-key: ${{ secrets.PROFILER_STORAGE_DEPLOY_KEY }} |
| 250 | + fetch-depth: 0 |
| 251 | + |
| 252 | + - name: Push profile to storage repo |
| 253 | + if: ${{ steps.run.outputs.trace != '' }} |
| 254 | + id: push |
| 255 | + env: |
| 256 | + TRACE_LOCAL: ${{ steps.run.outputs.trace }} |
| 257 | + shell: bash |
| 258 | + run: | |
| 259 | + set -euo pipefail |
| 260 | +
|
| 261 | + dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}" |
| 262 | + mkdir -p "$dest_dir" |
| 263 | + cp "$TRACE_LOCAL" "$dest_dir/trace.json.gz" |
| 264 | +
|
| 265 | + pushd storage >/dev/null |
| 266 | + git config user.name "github-actions" |
| 267 | + git config user.email "github-actions@github.com" |
| 268 | + git add -A |
| 269 | + git commit -m "Add profile: ${GITHUB_SHA} ${{ matrix.config['exp-name'] }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}" || echo "Nothing to commit" |
| 270 | + git push |
| 271 | + STORAGE_SHA="$(git rev-parse HEAD)" |
| 272 | + popd >/dev/null |
| 273 | +
|
| 274 | + export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}/trace.json.gz" |
| 275 | + export TITLE="${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}" |
| 276 | +
|
| 277 | + enc_src="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["RAW_URL"], safe=""))')" |
| 278 | + enc_title="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["TITLE"], safe=""))')" |
| 279 | +
|
| 280 | + relay="${PERFETTO_RELAY_URL%/}" |
| 281 | + RELAY_URL="${relay}/?src=${enc_src}&title=${enc_title}" |
| 282 | +
|
| 283 | + echo "raw_url=$RAW_URL" >> "$GITHUB_OUTPUT" |
| 284 | + echo "relay_url=$RELAY_URL" >> "$GITHUB_OUTPUT" |
| 285 | +
|
| 286 | + - name: Print Perfetto link (relay) |
| 287 | + if: ${{ steps.push.outputs.relay_url != '' }} |
| 288 | + env: |
| 289 | + RELAY_URL: ${{ steps.push.outputs.relay_url }} |
| 290 | + RAW_URL: ${{ steps.push.outputs.raw_url }} |
| 291 | + shell: bash |
| 292 | + run: | |
| 293 | + set -euo pipefail |
| 294 | + echo "RAW trace URL: $RAW_URL" |
| 295 | + echo "Perfetto Relay URL: $RELAY_URL" |
| 296 | + printf "\n**Perfetto (Relay):** %s\n" "$RELAY_URL" >> "$GITHUB_STEP_SUMMARY" |
0 commit comments