# Profile workflow (workflow file for run "Profile #34").

name: Profile

# Manual trigger only: every knob below is supplied through the
# "Run workflow" form (or the equivalent API call).
on:
  workflow_dispatch:
    inputs:
      # Forwarded to the matrix generator as --config-keys.
      config-key:
        description: "Config key from config yaml"
        required: true
        type: string
      # Forwarded to the matrix generator as --config-files.
      config-file:
        description: "Config file to use"
        required: false
        type: string
        default: '.github/configs/nvidia-master.yaml'
      # Forwarded to the matrix generator as --conc.
      conc:
        description: "Concurrency value (must exist in config's conc-range/list)"
        required: false
        type: string
        default: '64'
      # Drives the MOE_DEBUG / MOE_DEBUG_LOG variables of the profile job.
      moe-debug:
        description: "Enable MoE debug patch and log (MOE_DEBUG_LOG)"
        required: false
        type: boolean
        default: false
      # No default: when left empty the checkout steps fall back to github.ref.
      ref:
        description: "Ref (branch/sha) to checkout"
        required: false
        type: string
# Token scope for every job in this workflow: read-only repository contents.
permissions:
  contents: read
# Workflow-level environment, visible to every job and step.
env:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
  HF_HUB_CACHE: '/mnt/hf_hub_cache/'
  RANDOM_RANGE_RATIO: '0.8'
  # Base URL of the relay page; the profile job appends ?src=...&title=... to it.
  PERFETTO_RELAY_URL: 'https://semianalysisai.github.io/InferenceX-trace-storage'
  PYTHONDONTWRITEBYTECODE: '1'
  PYTHONPYCACHEPREFIX: '/tmp/inferencex-pycache'
jobs:
  # ---------------------------------------------------------------------------
  # get-jobs: run the sweep-config generator for the requested config key and
  # concurrency, keep only the first generated entry, and expose it as a
  # one-element matrix for the profile job.
  # ---------------------------------------------------------------------------
  get-jobs:
    runs-on: ubuntu-latest
    outputs:
      filtered-matrix: ${{ steps.filter.outputs.filtered }}
      count: ${{ steps.filter.outputs.count }}
    steps:
      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          ref: ${{ inputs.ref || github.ref }}

      - id: gen
        name: Generate matrix via script
        env:
          # User-supplied inputs travel through the environment instead of
          # being expanded into the script text, so their content can never be
          # interpreted as shell syntax.
          INPUT_CONFIG_FILE: ${{ inputs.config-file }}
          INPUT_CONFIG_KEY: ${{ inputs.config-key }}
          INPUT_CONC: ${{ inputs.conc }}
        run: |
          pip install pydantic
          # $CLI_ARGS is expanded unquoted on purpose: the argument string is
          # split on whitespace, exactly as it was before.
          CLI_ARGS="test-config --config-files ${INPUT_CONFIG_FILE} --config-keys ${INPUT_CONFIG_KEY} --conc ${INPUT_CONC}"
          CONFIG_JSON=$(python3 "${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py" $CLI_ARGS)
          # Delimiter syntax keeps the output intact even if the generator
          # prints JSON spanning several lines.
          {
            printf '%s\n' 'raw<<__SWEEP_MATRIX_EOF__'
            printf '%s\n' "$CONFIG_JSON"
            printf '%s\n' '__SWEEP_MATRIX_EOF__'
          } >> "$GITHUB_OUTPUT"

      - id: filter
        name: Take first generated job
        shell: python
        env:
          # Read from the environment rather than pasted into a Python string
          # literal: quotes and backslashes inside the JSON would otherwise be
          # re-interpreted (or break the script) before json.loads sees them.
          RAW_MATRIX: ${{ steps.gen.outputs.raw }}
        run: |
          import json, os, sys
          raw = os.environ.get('RAW_MATRIX', '')
          try:
              data = json.loads(raw)
          except Exception as e:
              print('Invalid generator output:', e, file=sys.stderr)
              with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
                  f.write("filtered=[]\ncount=0\n")
              raise
          if not isinstance(data, list):
              print('Generator output is not a list.', file=sys.stderr)
              with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
                  f.write("filtered=[]\ncount=0\n")
              raise SystemExit(1)
          # Only the first generated entry is profiled.
          filt = data[:1]
          out = json.dumps(filt)
          print(out)
          with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
              f.write(f"filtered={out}\n")
              f.write(f"count={len(filt)}\n")

      - name: Fail if no matching entries
        if: ${{ steps.filter.outputs.count == '0' }}
        env:
          INPUT_CONFIG_KEY: ${{ inputs.config-key }}
          INPUT_CONC: ${{ inputs.conc }}
        run: |
          echo "No entries produced for config-key=${INPUT_CONFIG_KEY}, conc=${INPUT_CONC}." >&2
          exit 1

  # ---------------------------------------------------------------------------
  # profile: run the selected configuration on its runner with profiling
  # enabled, upload the traces as artifacts, push the main trace to the storage
  # repository and print a Perfetto relay link.
  # ---------------------------------------------------------------------------
  profile:
    needs: get-jobs
    strategy:
      fail-fast: false
      matrix:
        config: ${{ fromJson(needs.get-jobs.outputs.filtered-matrix) }}
    runs-on: ${{ matrix.config.runner }}
    # Matrix fields exported once so the scripts below can use plain shell
    # variables.
    env:
      EXP_NAME: ${{ matrix.config.exp-name }}
      MODEL: ${{ matrix.config.model }}
      MODEL_PREFIX: ${{ matrix.config.model-prefix }}
      ISL: ${{ matrix.config.isl }}
      OSL: ${{ matrix.config.osl }}
      MAX_MODEL_LEN: ${{ matrix.config.max-model-len }}
      IMAGE: ${{ matrix.config.image }}
      FRAMEWORK: ${{ matrix.config.framework }}
      PRECISION: ${{ matrix.config.precision }}
      TP: ${{ matrix.config.tp }}
      EP_SIZE: ${{ matrix.config.ep }}
      DP_ATTENTION: ${{ matrix.config['dp-attn'] }}
      CONC: ${{ matrix.config.conc }}
      SPEC_DECODING: ${{ matrix.config.spec-decoding }}
      DISAGG: ${{ matrix.config.disagg }}
      MOE_DEBUG: ${{ (inputs.moe-debug) && '1' || '0' }}
      MOE_DEBUG_LOG: ${{ (inputs.moe-debug) && '/workspace/moe_debug.tp0.log' || '' }}
    steps:
      - name: Resource cleanup
        env:
          RUNNER_NAME: ${{ runner.name }}
        run: |
          # Cleanup Docker resources
          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
            echo "[Docker] Cleaning up resources ..."
            docker ps -aq | xargs -r docker rm -f
            docker network prune -f
            # Block until no container is left.
            while [ -n "$(docker ps -aq)" ]; do
              docker ps -a
              sleep 5
            done
          fi
          # Cleanup SLURM resources
          if command -v squeue >/dev/null 2>&1; then
            # On the listed runner families only jobs named after this runner
            # are cancelled; everywhere else all jobs of the current user are.
            if [[ "$RUNNER_NAME" == mi355x-amds* || "$RUNNER_NAME" == gb200-nv* || "$RUNNER_NAME" == gb300-nv* || "$RUNNER_NAME" == h100-dgxc-slurm* || "$RUNNER_NAME" == h200-dgxc-slurm* || "$RUNNER_NAME" == b200-dgxc-slurm* ]]; then
              echo "[Slurm] Cleaning up jobs with name: $RUNNER_NAME ..."
              scancel --name="$RUNNER_NAME" || true
              while [ -n "$(squeue --name="$RUNNER_NAME" --noheader --format='%i')" ]; do
                squeue --name="$RUNNER_NAME"
                sleep 5
              done
            else
              echo "[Slurm] Cleaning up jobs for user: $USER ..."
              scancel -u "$USER" || true
              while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do
                squeue -u "$USER"
                sleep 5
              done
            fi
          fi

      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          fetch-depth: 0
          ref: ${{ inputs.ref || github.ref }}
          clean: false

      - name: Launch + Profile (single-node sglang/vllm)
        id: run
        env:
          RUNNER_NAME: ${{ runner.name }}
          PROFILE: '1'
          SGLANG_TORCH_PROFILER_DIR: /workspace/
          VLLM_TORCH_PROFILER_DIR: /workspace/
          VLLM_RPC_TIMEOUT: '1800000'
        shell: bash
        run: |
          set -euo pipefail
          ep_val="${EP_SIZE:-1}"
          res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}"
          export RESULT_FILENAME="${res_name}"
          # Also published to later steps, which read it as env.RESULT_FILENAME.
          echo "RESULT_FILENAME=${res_name}" >> "$GITHUB_ENV"
          # The launcher is selected by the runner-name prefix up to the first "_".
          bash "./runners/launch_${RUNNER_NAME%%_*}.sh"
          if [ ! -f "${res_name}.json" ]; then
            echo "Run failed: Benchmark result ${res_name}.json not found." >&2
            exit 1
          fi
          trace_path="profile_${res_name}.trace.json.gz"
          if [ -f "$trace_path" ]; then
            echo "trace=$trace_path" >> "$GITHUB_OUTPUT"
            if [ "${FRAMEWORK}" = "sglang" ]; then
              # Try to locate corresponding TP-0 traces produced by SGLang profiler
              merged_latest=$(ls -t profiles/merged-*.trace.json.gz 2>/dev/null | head -n1 || true)
              if [ -n "${merged_latest}" ] && [ -f "${merged_latest}" ]; then
                # Strip directory, "merged-" prefix and extension to get the
                # stem shared by the per-rank trace files.
                ts_name="${merged_latest##*/}"
                ts_name="${ts_name#merged-}"
                ts_name="${ts_name%.trace.json.gz}"
                tp0_decode="profiles/${ts_name}-TP-0-DECODE.trace.json.gz"
                tp0_extend="profiles/${ts_name}-TP-0-EXTEND.trace.json.gz"
                if [ -f "${tp0_decode}" ]; then
                  echo "tp0_decode=${tp0_decode}" >> "$GITHUB_OUTPUT"
                fi
                if [ -f "${tp0_extend}" ]; then
                  echo "tp0_extend=${tp0_extend}" >> "$GITHUB_OUTPUT"
                fi
              fi
            fi
          else
            # A missing trace is not fatal: the upload/push steps are skipped.
            echo "Profile trace not found: $trace_path" >&2
          fi

      - name: Process result (json -> agg)
        env:
          RUNNER_TYPE: ${{ matrix.config.runner }}
        run: |
          python3 utils/process_result.py

      - name: Upload profile as artifact
        if: ${{ steps.run.outputs.trace != '' }}
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f  # v7.0.0
        with:
          name: profile_${{ env.RESULT_FILENAME }}
          path: profile_${{ env.RESULT_FILENAME }}.trace.json.gz
          if-no-files-found: ignore

      - name: Upload TP-0-DECODE trace as artifact
        if: ${{ steps.run.outputs.tp0_decode != '' }}
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f  # v7.0.0
        with:
          name: profile_${{ env.RESULT_FILENAME }}_TP0_DECODE
          path: ${{ steps.run.outputs.tp0_decode }}
          if-no-files-found: ignore

      - name: Upload TP-0-EXTEND trace as artifact
        if: ${{ steps.run.outputs.tp0_extend != '' }}
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f  # v7.0.0
        with:
          name: profile_${{ env.RESULT_FILENAME }}_TP0_EXTEND
          path: ${{ steps.run.outputs.tp0_extend }}
          if-no-files-found: ignore

      - name: Upload MoE debug log as artifact
        if: ${{ env.MOE_DEBUG == '1' }}
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f  # v7.0.0
        with:
          name: moe_debug_${{ env.RESULT_FILENAME }}
          path: "moe_debug.tp0.log"
          if-no-files-found: ignore

      - name: Checkout storage repo
        if: ${{ steps.run.outputs.trace != '' }}
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          repository: SemiAnalysisAI/InferenceX-trace-storage
          path: storage
          ref: master
          ssh-key: ${{ secrets.PROFILER_STORAGE_DEPLOY_KEY }}
          fetch-depth: 0

      - name: Push profile to storage repo
        if: ${{ steps.run.outputs.trace != '' }}
        id: push
        env:
          TRACE_LOCAL: ${{ steps.run.outputs.trace }}
          RUNNER_TYPE: ${{ matrix.config.runner }}
          # Same "ep, or 1 when unset" fallback the paths were built with before.
          EP_OR_ONE: ${{ matrix.config.ep || 1 }}
        shell: bash
        run: |
          set -euo pipefail
          # Built once from the job-level env (EXP_NAME, PRECISION, TP, CONC,
          # FRAMEWORK) and reused for the directory, the raw URL and the title.
          export TITLE="${EXP_NAME}_${PRECISION}_tp${TP}_ep${EP_OR_ONE}_conc${CONC}"
          rel_dir="profiles/${GITHUB_SHA}/${RUNNER_TYPE}/${FRAMEWORK}/${TITLE}"
          dest_dir="storage/${rel_dir}"
          mkdir -p "$dest_dir"
          cp "$TRACE_LOCAL" "$dest_dir/trace.json.gz"
          pushd storage >/dev/null
          git config user.name "github-actions"
          git config user.email "github-actions@github.com"
          git add -A
          # An unchanged trace is not an error.
          git commit -m "Add profile: ${GITHUB_SHA} ${EXP_NAME} tp${TP} ep${EP_OR_ONE} conc${CONC}" || echo "Nothing to commit"
          git push
          # The raw URL is pinned to the storage commit that is HEAD after the push.
          STORAGE_SHA="$(git rev-parse HEAD)"
          popd >/dev/null
          # RAW_URL and TITLE are exported because the Python one-liners below
          # read them from os.environ.
          export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/${rel_dir}/trace.json.gz"
          enc_src="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["RAW_URL"], safe=""))')"
          enc_title="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["TITLE"], safe=""))')"
          # Drop one trailing slash, if any, before appending the query string.
          relay="${PERFETTO_RELAY_URL%/}"
          RELAY_URL="${relay}/?src=${enc_src}&title=${enc_title}"
          echo "raw_url=$RAW_URL" >> "$GITHUB_OUTPUT"
          echo "relay_url=$RELAY_URL" >> "$GITHUB_OUTPUT"

      - name: Print Perfetto link (relay)
        if: ${{ steps.push.outputs.relay_url != '' }}
        env:
          RELAY_URL: ${{ steps.push.outputs.relay_url }}
          RAW_URL: ${{ steps.push.outputs.raw_url }}
        shell: bash
        run: |
          set -euo pipefail
          echo "RAW trace URL: $RAW_URL"
          echo "Perfetto Relay URL: $RELAY_URL"
          printf "\n**Perfetto (Relay):** %s\n" "$RELAY_URL" >> "$GITHUB_STEP_SUMMARY"