Profile #81
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Manually dispatched workflow: resolves one benchmark configuration from a
# config file, runs it with profiling enabled on the runner named in that
# configuration, and publishes the resulting trace.
name: Profile

on:
  workflow_dispatch:
    inputs:
      # Key looked up in the config file; forwarded to the matrix generator
      # as --config-keys.
      config-key:
        description: "Config key from config yaml"
        required: true
        type: string
      # Repository-relative path forwarded to the generator as --config-files.
      config-file:
        description: "Config file to use"
        required: false
        type: string
        default: '.github/configs/nvidia-master.yaml'
      # Kept as a string so it is passed to the generator verbatim.
      conc:
        description: "Concurrency value (must exist in config's conc-range/list)"
        required: false
        type: string
        default: '64'
      # When true, the profile job exports a non-empty MOE_DEBUG_LOG path.
      moe-debug:
        description: "Enable MoE debug patch and log (MOE_DEBUG_LOG)"
        required: false
        type: boolean
        default: false
      # Optional; both jobs fall back to github.sha when this is empty.
      ref:
        description: "Ref (branch/sha) to checkout"
        required: false
        type: string

# Default token scope for every job in this workflow.
permissions:
  contents: read

# Workflow-level environment, visible to every job and step.
env:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
  HF_HUB_CACHE: '/mnt/hf_hub_cache/'
  RANDOM_RANGE_RATIO: '0.8'
  # Base URL used by the profile job to build the Perfetto relay link.
  PERFETTO_RELAY_URL: https://semianalysisai.github.io/InferenceX-trace-storage
  PYTHONDONTWRITEBYTECODE: '1'
  PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache
jobs:
  # Runs the matrix generator for the requested config and keeps only the
  # first generated entry, exposing it as a one-element JSON list.
  get-jobs:
    runs-on: ubuntu-latest
    outputs:
      filtered-matrix: ${{ steps.filter.outputs.filtered }}
      count: ${{ steps.filter.outputs.count }}
    steps:
      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          ref: ${{ inputs.ref || github.sha }}

      - id: gen
        name: Generate matrix via script
        # Dispatch inputs are handed over through the environment instead of
        # being expanded into the script text, so their content can never be
        # interpreted as shell syntax.
        env:
          CONFIG_FILE: ${{ inputs.config-file }}
          CONFIG_KEY: ${{ inputs.config-key }}
          CONC_VALUE: ${{ inputs.conc }}
        shell: bash
        run: |
          set -eo pipefail
          pip install pydantic
          # Split on whitespace so several space-separated values still become
          # separate arguments, without glob expansion or command evaluation.
          read -r -a config_files <<< "${CONFIG_FILE}"
          read -r -a config_keys <<< "${CONFIG_KEY}"
          read -r -a conc_values <<< "${CONC_VALUE}"
          CONFIG_JSON="$(python3 "${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py" \
            test-config \
            --config-files "${config_files[@]}" \
            --config-keys "${config_keys[@]}" \
            --conc "${conc_values[@]}")"
          # Delimiter form keeps the output intact even if the JSON spans
          # several lines.
          delim="MATRIX_$(date +%s)_${RANDOM}"
          {
            echo "raw<<${delim}"
            echo "${CONFIG_JSON}"
            echo "${delim}"
          } >> "$GITHUB_OUTPUT"

      - id: filter
        name: Take first generated job
        # The generator output arrives as an environment variable; pasting it
        # into a Python string literal would break on quotes or backslashes.
        env:
          RAW_MATRIX: ${{ steps.gen.outputs.raw }}
        shell: python
        run: |
          import json, os, sys

          def write_outputs(filtered, count):
              # Append both step outputs in one place.
              with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
                  f.write(f"filtered={filtered}\ncount={count}\n")

          raw = os.environ.get('RAW_MATRIX', '')
          try:
              data = json.loads(raw)
          except Exception as e:
              print('Invalid generator output:', e, file=sys.stderr)
              write_outputs('[]', 0)
              raise
          if not isinstance(data, list):
              print('Generator output is not a list.', file=sys.stderr)
              write_outputs('[]', 0)
              raise SystemExit(1)
          # Only the first generated entry is profiled.
          filt = data[:1]
          out = json.dumps(filt)
          print(out)
          write_outputs(out, len(filt))

      - name: Fail if no matching entries
        if: ${{ steps.filter.outputs.count == '0' }}
        env:
          CONFIG_KEY: ${{ inputs.config-key }}
          CONC_VALUE: ${{ inputs.conc }}
        run: |
          echo "No entries produced for config-key=${CONFIG_KEY}, conc=${CONC_VALUE}." >&2
          exit 1

  # Runs the selected configuration with profiling enabled on the runner named
  # in the matrix entry, then uploads and publishes the collected traces.
  profile:
    needs: get-jobs
    strategy:
      fail-fast: false
      matrix:
        config: ${{ fromJson(needs.get-jobs.outputs.filtered-matrix) }}
    runs-on: ${{ matrix.config.runner }}
    env:
      EXP_NAME: ${{ matrix.config.exp-name }}
      MODEL: ${{ matrix.config.model }}
      MODEL_PREFIX: ${{ matrix.config.model-prefix }}
      ISL: ${{ matrix.config.isl }}
      OSL: ${{ matrix.config.osl }}
      MAX_MODEL_LEN: ${{ matrix.config.max-model-len }}
      IMAGE: ${{ matrix.config.image }}
      FRAMEWORK: ${{ matrix.config.framework }}
      PRECISION: ${{ matrix.config.precision }}
      TP: ${{ matrix.config.tp }}
      EP_SIZE: ${{ matrix.config.ep }}
      DP_ATTENTION: ${{ matrix.config['dp-attn'] }}
      CONC: ${{ matrix.config.conc }}
      SPEC_DECODING: ${{ matrix.config.spec-decoding }}
      DISAGG: ${{ matrix.config.disagg }}
      # NOTE(review): fixed at '0' regardless of the moe-debug input; only
      # MOE_DEBUG_LOG follows the input. Confirm this is intentional.
      MOE_DEBUG: '0'
      MOE_DEBUG_LOG: ${{ (inputs.moe-debug) && '/workspace/moe_debug.tp0.log' || '' }}
    steps:
      - name: Resource cleanup
        env:
          RUNNER_NAME: ${{ runner.name }}
        run: |
          # Cleanup Docker resources
          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
            echo "[Docker] Cleaning up resources ..."
            docker ps -aq | xargs -r docker rm -f
            docker network prune -f
            # Wait until no containers remain.
            while [ -n "$(docker ps -aq)" ]; do
              docker ps -a
              sleep 5
            done
          fi
          # Cleanup SLURM resources
          if command -v squeue >/dev/null 2>&1; then
            echo "[Slurm] Cleaning up jobs with name: ${RUNNER_NAME} ..."
            scancel --name="${RUNNER_NAME}" || true
            # Wait until no job with this runner's name is listed.
            while [ -n "$(squeue --name="${RUNNER_NAME}" --noheader --format='%i')" ]; do
              squeue --name="${RUNNER_NAME}"
              sleep 5
            done
          fi

      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          fetch-depth: 0
          ref: ${{ inputs.ref || github.sha }}
          clean: false

      - name: Launch + Profile (single-node sglang/vllm)
        id: run
        env:
          RUNNER_NAME: ${{ runner.name }}
          PROFILE: '1'
          SGLANG_TORCH_PROFILER_DIR: /workspace/
          VLLM_TORCH_PROFILER_DIR: /workspace/
          VLLM_RPC_TIMEOUT: '1800000'
        shell: bash
        run: |
          set -euo pipefail
          ep_val="${EP_SIZE:-1}"
          res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}"
          export RESULT_FILENAME="${res_name}"
          # Later steps read RESULT_FILENAME through the env context.
          echo "RESULT_FILENAME=${res_name}" >> "$GITHUB_ENV"
          # The launch script is chosen by the runner name up to its first "_".
          bash "./runners/launch_${RUNNER_NAME%%_*}.sh"
          if [ ! -f "${res_name}.json" ]; then
            echo "Run failed: Benchmark result ${res_name}.json not found." >&2
            exit 1
          fi
          trace_path="profile_${res_name}.trace.json.gz"
          if [ -f "$trace_path" ]; then
            echo "trace=$trace_path" >> "$GITHUB_OUTPUT"
            if [ "${FRAMEWORK}" = "sglang" ]; then
              # Try to locate corresponding TP-0 traces produced by SGLang profiler
              merged_latest=$(ls -t profiles/merged-*.trace.json.gz 2>/dev/null | head -n1 || true)
              if [ -n "${merged_latest}" ] && [ -f "${merged_latest}" ]; then
                # Strip directory, "merged-" prefix and extension to get the
                # name stem shared with the per-rank trace files.
                ts_name="${merged_latest##*/}"
                ts_name="${ts_name#merged-}"
                ts_name="${ts_name%.trace.json.gz}"
                tp0_decode="profiles/${ts_name}-TP-0-DECODE.trace.json.gz"
                tp0_extend="profiles/${ts_name}-TP-0-EXTEND.trace.json.gz"
                if [ -f "${tp0_decode}" ]; then
                  echo "tp0_decode=${tp0_decode}" >> "$GITHUB_OUTPUT"
                fi
                if [ -f "${tp0_extend}" ]; then
                  echo "tp0_extend=${tp0_extend}" >> "$GITHUB_OUTPUT"
                fi
              fi
            fi
          else
            # A missing trace is reported but does not fail the step.
            echo "Profile trace not found: $trace_path" >&2
          fi

      - name: Process result (json -> agg)
        env:
          RUNNER_TYPE: ${{ matrix.config.runner }}
        run: |
          python3 utils/process_result.py

      - name: Upload profile as artifact
        if: ${{ steps.run.outputs.trace != '' }}
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
        with:
          name: profile_${{ env.RESULT_FILENAME }}
          path: ${{ steps.run.outputs.trace }}
          if-no-files-found: ignore

      - name: Upload TP-0-DECODE trace as artifact
        if: ${{ steps.run.outputs.tp0_decode != '' }}
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
        with:
          name: profile_${{ env.RESULT_FILENAME }}_TP0_DECODE
          path: ${{ steps.run.outputs.tp0_decode }}
          if-no-files-found: ignore

      - name: Upload TP-0-EXTEND trace as artifact
        if: ${{ steps.run.outputs.tp0_extend != '' }}
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
        with:
          name: profile_${{ env.RESULT_FILENAME }}_TP0_EXTEND
          path: ${{ steps.run.outputs.tp0_extend }}
          if-no-files-found: ignore

      - name: Upload MoE debug log as artifact
        # Gated on the dispatch input that also sets MOE_DEBUG_LOG. The former
        # check against env.MOE_DEBUG could never pass because that variable
        # is fixed at '0' for this job.
        if: ${{ inputs.moe-debug }}
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
        with:
          name: moe_debug_${{ env.RESULT_FILENAME }}
          path: "moe_debug.tp0.log"
          if-no-files-found: ignore

      - name: Checkout storage repo
        if: ${{ steps.run.outputs.trace != '' }}
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          repository: SemiAnalysisAI/InferenceX-trace-storage
          path: storage
          ref: master
          token: ${{ secrets.REPO_PAT }}
          fetch-depth: 0

      - name: Push profile to storage repo
        if: ${{ steps.run.outputs.trace != '' }}
        id: push
        env:
          TRACE_LOCAL: ${{ steps.run.outputs.trace }}
          RUNNER_TYPE: ${{ matrix.config.runner }}
          # Same fallback as before: a missing ep value is labelled 1.
          EP_LABEL: ${{ matrix.config.ep || 1 }}
        shell: bash
        run: |
          set -euo pipefail
          # The slug and relative directory are built once and reused for the
          # copy destination, the commit message, the raw URL and the title.
          title="${EXP_NAME}_${PRECISION}_tp${TP}_ep${EP_LABEL}_conc${CONC}"
          rel_dir="profiles/${GITHUB_SHA}/${RUNNER_TYPE}/${FRAMEWORK}/${title}"
          dest_dir="storage/${rel_dir}"
          mkdir -p "$dest_dir"
          cp "$TRACE_LOCAL" "$dest_dir/trace.json.gz"
          pushd storage >/dev/null
          git config user.name "github-actions"
          git config user.email "github-actions@github.com"
          git add -A
          # An unchanged tree is not an error; the push below is then a no-op.
          git commit -m "Add profile: ${GITHUB_SHA} ${EXP_NAME} tp${TP} ep${EP_LABEL} conc${CONC}" || echo "Nothing to commit"
          git push
          STORAGE_SHA="$(git rev-parse HEAD)"
          popd >/dev/null
          # Exported so the inline Python below can read them from os.environ.
          export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/${rel_dir}/trace.json.gz"
          export TITLE="${title}"
          enc_src="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["RAW_URL"], safe=""))')"
          enc_title="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["TITLE"], safe=""))')"
          # Drop one trailing slash so the relay URL has exactly one before "?".
          relay="${PERFETTO_RELAY_URL%/}"
          RELAY_URL="${relay}/?src=${enc_src}&title=${enc_title}"
          echo "raw_url=$RAW_URL" >> "$GITHUB_OUTPUT"
          echo "relay_url=$RELAY_URL" >> "$GITHUB_OUTPUT"

      - name: Print Perfetto link (relay)
        if: ${{ steps.push.outputs.relay_url != '' }}
        env:
          RELAY_URL: ${{ steps.push.outputs.relay_url }}
          RAW_URL: ${{ steps.push.outputs.raw_url }}
        shell: bash
        run: |
          set -euo pipefail
          echo "RAW trace URL: $RAW_URL"
          echo "Perfetto Relay URL: $RELAY_URL"
          # Also surface the link on the run's summary page.
          printf "\n**Perfetto (Relay):** %s\n" "$RELAY_URL" >> "$GITHUB_STEP_SUMMARY"