# Skip to content
#
# e2e Test - llm-d-vllm smoke - dsr1 fp8 h200 1P+1D conc=1 #2699
#
# e2e Test - llm-d-vllm smoke - dsr1 fp8 h200 1P+1D conc=1
#
# e2e Test - llm-d-vllm smoke - dsr1 fp8 h200 1P+1D conc=1 #2699
#
# Workflow file for this run

# End-to-end test workflow. It can be started manually (workflow_dispatch) or
# invoked from another workflow (workflow_call); both triggers declare the same
# four string inputs.
name: End-to-End Tests

# Run title: the explicit test name when given, otherwise the generator command.
run-name: "e2e Test - ${{ inputs.test-name || inputs.generate-cli-command || github.event.inputs.generate-cli-command }}"

on:
  # Manual trigger.
  workflow_dispatch:
    inputs:
      generate-cli-command:
        type: string
        required: true
        description: 'Command passed to generate matrix script'
      test-name:
        type: string
        required: false
        description: 'Name for this test run'
      ref:
        type: string
        required: false
        description: 'Ref (branch/sha) to checkout for generating configs'
      duration-override:
        type: string
        required: false
        default: ''
        description: 'Override matrix.config.duration (seconds). Empty = use matrix value.'

  # Reusable-workflow trigger; mirrors the manual inputs one-for-one.
  workflow_call:
    inputs:
      generate-cli-command:
        type: string
        required: true
        description: 'Command passed to generate matrix script'
      test-name:
        type: string
        required: false
        description: 'Name for this test run'
      ref:
        type: string
        required: false
        description: 'Ref (branch/sha) to checkout for generating configs'
      duration-override:
        type: string
        required: false
        default: ''
        description: 'Override matrix.config.duration (seconds). Empty = use matrix value.'
jobs:
# Runs utils/matrix_logic/generate_sweep_configs.py with the caller-supplied
# arguments and splits the JSON list it prints into six job matrices, one per
# downstream sweep job. Each matrix is published as a JSON-array string output;
# an empty matrix is the literal string "[]".
get-jobs:
  runs-on: ubuntu-latest
  outputs:
    single-node-config: ${{ steps.get-jobs.outputs.single-node-config }}
    multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }}
    eval-config: ${{ steps.get-jobs.outputs.eval-config }}
    multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }}
    agentic-config: ${{ steps.get-jobs.outputs.agentic-config }}
    multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }}
  steps:
    # Check out the requested ref, falling back to the triggering commit when
    # the `ref` input is empty or unset.
    - name: Checkout code
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
      with:
        ref: ${{ inputs.ref || github.sha }}
    - id: get-jobs
      env:
        # Handed over through the environment rather than interpolated into
        # the script text, so shell metacharacters in the input ($(...), ;,
        # backticks, ...) are never evaluated by the shell.
        GENERATE_CLI_COMMAND: ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }}
      run: |
        pip install pydantic
        config_file="${RUNNER_TEMP}/sweep_configs.json"
        # xargs splits the command string into arguments (honouring quotes and
        # backslashes) and execs the generator directly, without a shell.
        printf '%s' "$GENERATE_CLI_COMMAND" \
          | xargs python3 "${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py" \
          > "$config_file"
        # Partition the generated configs and emit one `name=<json>` line per
        # matrix into the step output file.
        python3 - "$config_file" >> "$GITHUB_OUTPUT" <<'PY'
        import json
        import sys

        with open(sys.argv[1]) as fh:
            configs = json.load(fh)


        def multi_node(cfg):
            # Multi-node entries are the ones carrying a 'prefill' section.
            return 'prefill' in cfg


        def agentic(cfg):
            return cfg.get('scenario-type') == 'agentic-coding'


        def benchmark(cfg):
            return not agentic(cfg) and not cfg.get('eval-only', False)


        def evaluation(cfg):
            # Agentic entries are excluded from both eval matrices; the
            # original multi-node eval filter omitted this check, unlike its
            # single-node counterpart.
            return not agentic(cfg) and cfg.get('run-eval', False)


        matrices = {
            'agentic-config': lambda c: agentic(c) and not multi_node(c),
            'multi-node-agentic-config': lambda c: agentic(c) and multi_node(c),
            'single-node-config': lambda c: benchmark(c) and not multi_node(c),
            'multi-node-config': lambda c: benchmark(c) and multi_node(c),
            'eval-config': lambda c: evaluation(c) and not multi_node(c),
            'multi-node-eval-config': lambda c: evaluation(c) and multi_node(c),
        }
        for output_name, keep in matrices.items():
            selected = [cfg for cfg in configs if keep(cfg)]
            print(f'{output_name}={json.dumps(selected)}')
        PY
# Calls the multi-node benchmark template once per entry of the multi-node
# matrix produced by get-jobs. Benchmark only: run-eval is forced to false.
test-sweep-multi-node:
  name: 'multi-node /'
  needs: get-jobs
  # Skip entirely when the matrix is the empty JSON array.
  if: needs.get-jobs.outputs.multi-node-config != '[]'
  uses: ./.github/workflows/benchmark-multinode-tmpl.yml
  secrets: inherit
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.get-jobs.outputs.multi-node-config) }}
  with:
    # Identity / environment
    exp-name: ${{ matrix.config.exp-name }}
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    ref: ${{ inputs.ref }}
    # Workload
    isl: ${{ matrix.config.isl }}
    osl: ${{ matrix.config.osl }}
    max-model-len: ${{ matrix.config.max-model-len }}
    conc-list: ${{ toJson(matrix.config.conc) }}
    spec-decoding: ${{ matrix.config.spec-decoding }}
    disagg: ${{ matrix.config.disagg }}
    run-eval: false
    # Prefill workers
    prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
    prefill-tp: ${{ matrix.config.prefill.tp }}
    prefill-ep: ${{ matrix.config.prefill.ep }}
    prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
    prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}
    # Decode workers
    decode-num-worker: ${{ matrix.config.decode.num-worker }}
    decode-tp: ${{ matrix.config.decode.tp }}
    decode-ep: ${{ matrix.config.decode.ep }}
    decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
    decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
# Calls the multi-node benchmark template in eval-only mode, once per entry of
# the multi-node eval matrix produced by get-jobs.
test-sweep-multi-node-evals:
  name: 'multi-node eval /'
  needs: get-jobs
  # Skip entirely when the matrix is the empty JSON array.
  if: needs.get-jobs.outputs.multi-node-eval-config != '[]'
  uses: ./.github/workflows/benchmark-multinode-tmpl.yml
  secrets: inherit
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.get-jobs.outputs.multi-node-eval-config) }}
  with:
    # Identity / environment
    exp-name: ${{ matrix.config.exp-name }}
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    ref: ${{ inputs.ref }}
    # Workload
    isl: ${{ matrix.config.isl }}
    osl: ${{ matrix.config.osl }}
    max-model-len: ${{ matrix.config.max-model-len }}
    conc-list: ${{ toJson(matrix.config.conc) }}
    spec-decoding: ${{ matrix.config.spec-decoding }}
    disagg: ${{ matrix.config.disagg }}
    # Evaluation switches
    run-eval: true
    eval-only: true
    eval-conc: ${{ matrix.config.eval-conc }}
    # Prefill workers
    prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
    prefill-tp: ${{ matrix.config.prefill.tp }}
    prefill-ep: ${{ matrix.config.prefill.ep }}
    prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
    prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}
    # Decode workers
    decode-num-worker: ${{ matrix.config.decode.num-worker }}
    decode-tp: ${{ matrix.config.decode.tp }}
    decode-ep: ${{ matrix.config.decode.ep }}
    decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
    decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
# Calls the single-node benchmark template for each agentic-coding config.
# Sequence-length inputs are passed as the fixed string "0" and the scenario
# type is pinned to agentic-coding.
test-sweep-agentic:
  name: 'agentic /'
  needs: get-jobs
  # Skip entirely when the matrix is the empty JSON array.
  if: needs.get-jobs.outputs.agentic-config != '[]'
  uses: ./.github/workflows/benchmark-tmpl.yml
  secrets: inherit
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.get-jobs.outputs.agentic-config) }}
  with:
    # Identity / environment
    exp-name: ${{ matrix.config.exp-name }}
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    ref: ${{ inputs.ref }}
    # Parallelism
    tp: ${{ matrix.config.tp }}
    ep: ${{ matrix.config.ep }}
    dp-attn: ${{ matrix.config.dp-attn }}
    # Agentic workload
    scenario-type: agentic-coding
    conc: ${{ matrix.config.conc }}
    offloading: ${{ matrix.config.offloading }}
    # A non-empty duration-override input wins over the matrix value.
    duration: ${{ inputs.duration-override != '' && inputs.duration-override || matrix.config.duration }}
    # Fixed values supplied for the template's remaining inputs
    isl: "0"
    osl: "0"
    max-model-len: "0"
    spec-decoding: "none"
    disagg: "false"
    run-eval: false
# Calls the multi-node benchmark template for each multi-node agentic-coding
# config. The single concurrency value is passed both as `conc` and wrapped in
# brackets as `conc-list`.
test-sweep-multi-node-agentic:
  name: 'multi-node agentic /'
  needs: get-jobs
  # Skip entirely when the matrix is the empty JSON array.
  if: needs.get-jobs.outputs.multi-node-agentic-config != '[]'
  uses: ./.github/workflows/benchmark-multinode-tmpl.yml
  secrets: inherit
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.get-jobs.outputs.multi-node-agentic-config) }}
  with:
    # Identity / environment
    exp-name: ${{ matrix.config.exp-name }}
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    ref: ${{ inputs.ref }}
    # Agentic workload
    scenario-type: agentic-coding
    conc: ${{ matrix.config.conc }}
    conc-list: "[${{ matrix.config.conc }}]"
    # A non-empty duration-override input wins over the matrix value.
    duration: ${{ inputs.duration-override != '' && inputs.duration-override || matrix.config.duration }}
    spec-decoding: ${{ matrix.config.spec-decoding }}
    disagg: ${{ matrix.config.disagg }}
    run-eval: false
    # Fixed values supplied for the sequence-length inputs
    isl: "0"
    osl: "0"
    max-model-len: "0"
    # Prefill workers
    prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
    prefill-tp: ${{ matrix.config.prefill.tp }}
    prefill-ep: ${{ matrix.config.prefill.ep }}
    prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
    prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}
    # Decode workers
    decode-num-worker: ${{ matrix.config.decode.num-worker }}
    decode-tp: ${{ matrix.config.decode.tp }}
    decode-ep: ${{ matrix.config.decode.ep }}
    decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
    decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
# Calls the single-node benchmark template once per entry of the single-node
# matrix produced by get-jobs. Benchmark only: run-eval is forced to false.
test-sweep-single-node:
  name: 'single-node /'
  needs: get-jobs
  # Skip entirely when the matrix is the empty JSON array.
  if: needs.get-jobs.outputs.single-node-config != '[]'
  uses: ./.github/workflows/benchmark-tmpl.yml
  secrets: inherit
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.get-jobs.outputs.single-node-config) }}
  with:
    # Identity / environment
    exp-name: ${{ matrix.config.exp-name }}
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    ref: ${{ inputs.ref }}
    # Parallelism
    tp: ${{ matrix.config.tp }}
    ep: ${{ matrix.config.ep }}
    dp-attn: ${{ matrix.config.dp-attn }}
    # Workload
    isl: ${{ matrix.config.isl }}
    osl: ${{ matrix.config.osl }}
    max-model-len: ${{ matrix.config.max-model-len }}
    conc: ${{ matrix.config.conc }}
    spec-decoding: ${{ matrix.config.spec-decoding }}
    disagg: ${{ matrix.config.disagg }}
    run-eval: false
# Calls the single-node benchmark template in eval-only mode, once per entry of
# the eval matrix produced by get-jobs.
test-sweep-evals:
  name: 'eval /'
  needs: get-jobs
  # Skip entirely when the matrix is the empty JSON array.
  if: needs.get-jobs.outputs.eval-config != '[]'
  uses: ./.github/workflows/benchmark-tmpl.yml
  secrets: inherit
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.get-jobs.outputs.eval-config) }}
  with:
    # Identity / environment
    exp-name: ${{ matrix.config.exp-name }}
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    ref: ${{ inputs.ref }}
    # Parallelism
    tp: ${{ matrix.config.tp }}
    ep: ${{ matrix.config.ep }}
    dp-attn: ${{ matrix.config.dp-attn }}
    # Workload
    isl: ${{ matrix.config.isl }}
    osl: ${{ matrix.config.osl }}
    max-model-len: ${{ matrix.config.max-model-len }}
    conc: ${{ matrix.config.conc }}
    spec-decoding: ${{ matrix.config.spec-decoding }}
    disagg: ${{ matrix.config.disagg }}
    # Evaluation switches
    run-eval: true
    eval-only: true
# Invokes the collect-results workflow with the "bmk" prefix after the four
# benchmark sweeps. Runs even if a sweep failed, provided at least one of them
# was not skipped.
collect-results:
  needs:
    - test-sweep-multi-node
    - test-sweep-single-node
    - test-sweep-agentic
    - test-sweep-multi-node-agentic
  if: >-
    ${{ always() && (needs.test-sweep-multi-node.result != 'skipped'
    || needs.test-sweep-single-node.result != 'skipped'
    || needs.test-sweep-agentic.result != 'skipped'
    || needs.test-sweep-multi-node-agentic.result != 'skipped') }}
  uses: ./.github/workflows/collect-results.yml
  secrets: inherit
  with:
    result-prefix: 'bmk'
# Invokes the collect-evals workflow after the two eval sweeps. Runs even if a
# sweep failed, provided at least one of them was not skipped.
collect-evals:
  needs:
    - test-sweep-evals
    - test-sweep-multi-node-evals
  if: >-
    ${{ always() && (needs.test-sweep-evals.result != 'skipped'
    || needs.test-sweep-multi-node-evals.result != 'skipped') }}
  uses: ./.github/workflows/collect-evals.yml
  secrets: inherit
# Downloads every `agentic_*` artifact, runs collect_sweep_results.py over
# them, and uploads the output directory as the `agentic_aggregated` artifact.
# Runs even if an agentic sweep failed, provided at least one was not skipped.
collect-agentic-results:
  runs-on: ubuntu-latest
  needs:
    - test-sweep-agentic
    - test-sweep-multi-node-agentic
  if: >-
    ${{ always() && (needs.test-sweep-agentic.result != 'skipped'
    || needs.test-sweep-multi-node-agentic.result != 'skipped') }}
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
      with:
        submodules: true

    # NOTE(review): this is the only action in the workflow referenced by tag
    # rather than by commit SHA - consider pinning it like the others.
    - uses: actions/setup-python@v6
      with:
        python-version: "3.11"

    - name: Install dependencies
      run: pip install pandas matplotlib numpy

    - name: Download agentic artifacts
      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c  # v8.0.1
      with:
        path: results/
        pattern: "agentic_*"

    - name: Run aggregation
      env:
        # Lets the aggregation script import from both helper directories.
        PYTHONPATH: utils/agentic-benchmark/scripts:utils/agentic-benchmark/analysis
      run: |
        python utils/agentic-benchmark/scripts/collect_sweep_results.py results/ aggregated/

    - name: Upload aggregated results
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
      with:
        name: agentic_aggregated
        path: aggregated/
# Final job: always runs after the three collectors, downloads the `results_*`
# artifacts, runs utils/calc_success_rate.py, and uploads the JSON file named
# after STATS_FILENAME as the `run-stats` artifact.
calc-success-rate:
  runs-on: ubuntu-latest
  needs:
    - collect-results
    - collect-evals
    - collect-agentic-results
  if: ${{ always() }}
  env:
    GITHUB_TOKEN: ${{ secrets.REPO_PAT }}
    RESULTS_DIR: 'results/'
    STATS_FILENAME: 'run_stats'
  steps:
    # Full-history checkout using the repository PAT.
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
      with:
        fetch-depth: 0
        token: ${{ secrets.REPO_PAT }}

    - name: Download results artifacts
      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c  # v8.0.1
      with:
        pattern: results_*
        path: ${{ env.RESULTS_DIR }}

    - name: Install python dependencies
      run: pip install PyGithub

    - name: Calculate success rate
      run: python3 utils/calc_success_rate.py $STATS_FILENAME

    - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
      with:
        name: 'run-stats'
        path: ${{ env.STATS_FILENAME }}.json