e2e Test - llm-d-vllm smoke - dsr1 fp8 h200 1P+1D conc=1 #2699

Workflow file for this run

.github/workflows/e2e-tests.yml at 4138df9

	# Workflow display name; the run title prefers the optional test name and
	# falls back to the generate command.
	name: End-to-End Tests
	run-name: e2e Test - ${{ inputs.test-name || inputs.generate-cli-command || github.event.inputs.generate-cli-command }}

	# Two triggers with identical inputs: manual dispatch and reusable-workflow call.
	on:
	  workflow_dispatch:
	    inputs:
	      generate-cli-command:
	        description: "Command passed to generate matrix script"
	        required: true
	        type: string
	      test-name:
	        description: "Name for this test run"
	        required: false
	        type: string
	      ref:
	        description: "Ref (branch/sha) to checkout for generating configs"
	        required: false
	        type: string
	      duration-override:
	        description: "Override matrix.config.duration (seconds). Empty = use matrix value."
	        required: false
	        type: string
	        default: ""
	  workflow_call:
	    inputs:
	      generate-cli-command:
	        description: "Command passed to generate matrix script"
	        required: true
	        type: string
	      test-name:
	        description: "Name for this test run"
	        required: false
	        type: string
	      ref:
	        description: "Ref (branch/sha) to checkout for generating configs"
	        required: false
	        type: string
	      duration-override:
	        description: "Override matrix.config.duration (seconds). Empty = use matrix value."
	        required: false
	        type: string
	        default: ""

	jobs:
	# Runs the sweep-config generator and splits its JSON list into one list per
	# downstream job, published as job outputs.
	get-jobs:
	  runs-on: ubuntu-latest
	  outputs:
	    single-node-config: ${{ steps.get-jobs.outputs.single-node-config }}
	    multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }}
	    eval-config: ${{ steps.get-jobs.outputs.eval-config }}
	    multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }}
	    agentic-config: ${{ steps.get-jobs.outputs.agentic-config }}
	    multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }}
	  steps:
	    # Check out the requested ref; an empty or missing `ref` input falls back
	    # to the commit that triggered the run.
	    - name: Checkout code
	      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        ref: ${{ inputs.ref || github.sha }}

	    # Partitioning rules applied to the generator output:
	    #   - entries with a 'prefill' key go to the multi-node lists, others to single-node
	    #   - scenario-type 'agentic-coding' entries go to the agentic lists
	    #   - entries with 'eval-only' set are excluded from the benchmark lists
	    #   - entries with 'run-eval' set go to the eval lists
	    # NOTE(review): the generate command is expanded into the script text before
	    # the shell runs, so it is parsed as shell syntax. Only trusted callers
	    # should be able to supply it.
	    # NOTE(review): the multi-node eval filter does not exclude 'agentic-coding'
	    # entries, while the single-node eval filter does. Confirm this is intended.
	    - id: get-jobs
	      run: |
	        pip install pydantic
	        CONFIG_JSON=$(python3 "${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py" \
	          ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
	        AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x]))")
	        MULTI_AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' in x]))")
	        SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
	        MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
	        EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and x.get('run-eval', False)]))")
	        MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))")
	        {
	          echo "agentic-config=$AGENTIC"
	          echo "multi-node-agentic-config=$MULTI_AGENTIC"
	          echo "single-node-config=$SINGLE"
	          echo "multi-node-config=$MULTI"
	          echo "eval-config=$EVALS"
	          echo "multi-node-eval-config=$MULTI_EVAL"
	        } >> "$GITHUB_OUTPUT"

	# Benchmark sweep over the multi-node-config list: one call to the multi-node
	# benchmark template per matrix entry, with evaluation disabled.
	test-sweep-multi-node:
	  name: multi-node /
	  needs: get-jobs
	  # Skip the job entirely when get-jobs produced an empty list.
	  if: needs.get-jobs.outputs.multi-node-config != '[]'
	  strategy:
	    fail-fast: false
	    matrix:
	      config: ${{ fromJson(needs.get-jobs.outputs.multi-node-config) }}
	  uses: ./.github/workflows/benchmark-multinode-tmpl.yml
	  secrets: inherit
	  with:
	    # Workload, runner and model settings copied from the matrix entry.
	    isl: ${{ matrix.config.isl }}
	    osl: ${{ matrix.config.osl }}
	    max-model-len: ${{ matrix.config.max-model-len }}
	    runner: ${{ matrix.config.runner }}
	    image: ${{ matrix.config.image }}
	    model: ${{ matrix.config.model }}
	    model-prefix: ${{ matrix.config.model-prefix }}
	    framework: ${{ matrix.config.framework }}
	    precision: ${{ matrix.config.precision }}
	    exp-name: ${{ matrix.config.exp-name }}
	    # `conc` is serialized to a JSON string for the template.
	    conc-list: ${{ toJson(matrix.config.conc) }}
	    spec-decoding: ${{ matrix.config.spec-decoding }}
	    disagg: ${{ matrix.config.disagg }}

	    # Prefill worker settings.
	    prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
	    prefill-tp: ${{ matrix.config.prefill.tp }}
	    prefill-ep: ${{ matrix.config.prefill.ep }}
	    prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
	    prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}

	    # Decode worker settings.
	    decode-num-worker: ${{ matrix.config.decode.num-worker }}
	    decode-tp: ${{ matrix.config.decode.tp }}
	    decode-ep: ${{ matrix.config.decode.ep }}
	    decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
	    decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
	    run-eval: false
	    ref: ${{ inputs.ref }}

	# Eval-only pass over the multi-node-eval-config list, using the same
	# multi-node template as the benchmark sweep.
	test-sweep-multi-node-evals:
	  name: multi-node eval /
	  needs: get-jobs
	  # Skip the job entirely when get-jobs produced an empty list.
	  if: needs.get-jobs.outputs.multi-node-eval-config != '[]'
	  strategy:
	    fail-fast: false
	    matrix:
	      config: ${{ fromJson(needs.get-jobs.outputs.multi-node-eval-config) }}
	  uses: ./.github/workflows/benchmark-multinode-tmpl.yml
	  secrets: inherit
	  with:
	    # Workload, runner and model settings copied from the matrix entry.
	    isl: ${{ matrix.config.isl }}
	    osl: ${{ matrix.config.osl }}
	    max-model-len: ${{ matrix.config.max-model-len }}
	    runner: ${{ matrix.config.runner }}
	    image: ${{ matrix.config.image }}
	    model: ${{ matrix.config.model }}
	    model-prefix: ${{ matrix.config.model-prefix }}
	    framework: ${{ matrix.config.framework }}
	    precision: ${{ matrix.config.precision }}
	    exp-name: ${{ matrix.config.exp-name }}
	    # `conc` is serialized to a JSON string for the template.
	    conc-list: ${{ toJson(matrix.config.conc) }}
	    spec-decoding: ${{ matrix.config.spec-decoding }}
	    disagg: ${{ matrix.config.disagg }}

	    # Prefill worker settings.
	    prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
	    prefill-tp: ${{ matrix.config.prefill.tp }}
	    prefill-ep: ${{ matrix.config.prefill.ep }}
	    prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
	    prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}

	    # Decode worker settings.
	    decode-num-worker: ${{ matrix.config.decode.num-worker }}
	    decode-tp: ${{ matrix.config.decode.tp }}
	    decode-ep: ${{ matrix.config.decode.ep }}
	    decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
	    decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}

	    # Evaluation is on and eval-only is set for this job.
	    run-eval: true
	    eval-only: true
	    eval-conc: ${{ matrix.config.eval-conc }}
	    ref: ${{ inputs.ref }}

	# Agentic-coding sweep over the agentic-config list, using the single-node
	# benchmark template.
	test-sweep-agentic:
	  needs: get-jobs
	  # Skip the job entirely when get-jobs produced an empty list.
	  if: ${{ needs.get-jobs.outputs.agentic-config != '[]' }}
	  uses: ./.github/workflows/benchmark-tmpl.yml
	  name: agentic /
	  strategy:
	    fail-fast: false
	    matrix:
	      config: ${{ fromJson(needs.get-jobs.outputs.agentic-config) }}
	  secrets: inherit
	  with:
	    exp-name: ${{ matrix.config.exp-name }}
	    runner: ${{ matrix.config.runner }}
	    image: ${{ matrix.config.image }}
	    model: ${{ matrix.config.model }}
	    model-prefix: ${{ matrix.config.model-prefix }}
	    framework: ${{ matrix.config.framework }}
	    precision: ${{ matrix.config.precision }}
	    tp: ${{ matrix.config.tp }}
	    ep: ${{ matrix.config.ep }}
	    dp-attn: ${{ matrix.config.dp-attn }}
	    conc: ${{ matrix.config.conc }}
	    offloading: ${{ matrix.config.offloading }}
	    # A non-empty duration-override input wins over the matrix entry's duration.
	    duration: ${{ inputs.duration-override != '' && inputs.duration-override || matrix.config.duration }}
	    # Fixed values passed for this scenario instead of matrix fields.
	    isl: '0'
	    osl: '0'
	    max-model-len: '0'
	    spec-decoding: 'none'
	    disagg: 'false'
	    run-eval: false
	    scenario-type: agentic-coding
	    ref: ${{ inputs.ref }}

	# Agentic-coding sweep over the multi-node-agentic-config list, using the
	# multi-node benchmark template.
	test-sweep-multi-node-agentic:
	  needs: get-jobs
	  # Skip the job entirely when get-jobs produced an empty list.
	  if: ${{ needs.get-jobs.outputs.multi-node-agentic-config != '[]' }}
	  uses: ./.github/workflows/benchmark-multinode-tmpl.yml
	  name: multi-node agentic /
	  strategy:
	    fail-fast: false
	    matrix:
	      config: ${{ fromJson(needs.get-jobs.outputs.multi-node-agentic-config) }}
	  secrets: inherit
	  with:
	    exp-name: ${{ matrix.config.exp-name }}
	    # Fixed values passed for this scenario instead of matrix fields.
	    isl: '0'
	    osl: '0'
	    max-model-len: '0'
	    runner: ${{ matrix.config.runner }}
	    image: ${{ matrix.config.image }}
	    model: ${{ matrix.config.model }}
	    model-prefix: ${{ matrix.config.model-prefix }}
	    framework: ${{ matrix.config.framework }}
	    precision: ${{ matrix.config.precision }}
	    # The entry's `conc` value is wrapped in brackets to form a one-element list string.
	    conc-list: '[${{ matrix.config.conc }}]'
	    spec-decoding: ${{ matrix.config.spec-decoding }}
	    disagg: ${{ matrix.config.disagg }}
	    # Prefill worker settings.
	    prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
	    prefill-tp: ${{ matrix.config.prefill.tp }}
	    prefill-ep: ${{ matrix.config.prefill.ep }}
	    prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
	    prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}
	    # Decode worker settings.
	    decode-num-worker: ${{ matrix.config.decode.num-worker }}
	    decode-tp: ${{ matrix.config.decode.tp }}
	    decode-ep: ${{ matrix.config.decode.ep }}
	    decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
	    decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
	    conc: ${{ matrix.config.conc }}
	    # A non-empty duration-override input wins over the matrix entry's duration.
	    duration: ${{ inputs.duration-override != '' && inputs.duration-override || matrix.config.duration }}
	    run-eval: false
	    scenario-type: agentic-coding
	    ref: ${{ inputs.ref }}

	# Benchmark sweep over the single-node-config list: one call to the
	# single-node benchmark template per matrix entry, with evaluation disabled.
	test-sweep-single-node:
	  name: single-node /
	  needs: get-jobs
	  # Skip the job entirely when get-jobs produced an empty list.
	  if: needs.get-jobs.outputs.single-node-config != '[]'
	  strategy:
	    fail-fast: false
	    matrix:
	      config: ${{ fromJson(needs.get-jobs.outputs.single-node-config) }}
	  uses: ./.github/workflows/benchmark-tmpl.yml
	  secrets: inherit
	  with:
	    # Every value except run-eval and ref is copied from the matrix entry.
	    exp-name: ${{ matrix.config.exp-name }}
	    isl: ${{ matrix.config.isl }}
	    osl: ${{ matrix.config.osl }}
	    max-model-len: ${{ matrix.config.max-model-len }}
	    runner: ${{ matrix.config.runner }}
	    image: ${{ matrix.config.image }}
	    model: ${{ matrix.config.model }}
	    model-prefix: ${{ matrix.config.model-prefix }}
	    framework: ${{ matrix.config.framework }}
	    precision: ${{ matrix.config.precision }}
	    tp: ${{ matrix.config.tp }}
	    ep: ${{ matrix.config.ep }}
	    dp-attn: ${{ matrix.config.dp-attn }}
	    conc: ${{ matrix.config.conc }}
	    spec-decoding: ${{ matrix.config.spec-decoding }}
	    disagg: ${{ matrix.config.disagg }}
	    run-eval: false
	    ref: ${{ inputs.ref }}

	# Eval-only pass over the eval-config list, using the single-node benchmark
	# template.
	test-sweep-evals:
	  name: eval /
	  needs: get-jobs
	  # Skip the job entirely when get-jobs produced an empty list.
	  if: needs.get-jobs.outputs.eval-config != '[]'
	  strategy:
	    fail-fast: false
	    matrix:
	      config: ${{ fromJson(needs.get-jobs.outputs.eval-config) }}
	  uses: ./.github/workflows/benchmark-tmpl.yml
	  secrets: inherit
	  with:
	    # Workload, runner and model settings copied from the matrix entry.
	    exp-name: ${{ matrix.config.exp-name }}
	    isl: ${{ matrix.config.isl }}
	    osl: ${{ matrix.config.osl }}
	    max-model-len: ${{ matrix.config.max-model-len }}
	    runner: ${{ matrix.config.runner }}
	    image: ${{ matrix.config.image }}
	    model: ${{ matrix.config.model }}
	    model-prefix: ${{ matrix.config.model-prefix }}
	    framework: ${{ matrix.config.framework }}
	    precision: ${{ matrix.config.precision }}
	    tp: ${{ matrix.config.tp }}
	    ep: ${{ matrix.config.ep }}
	    dp-attn: ${{ matrix.config.dp-attn }}
	    conc: ${{ matrix.config.conc }}
	    spec-decoding: ${{ matrix.config.spec-decoding }}
	    disagg: ${{ matrix.config.disagg }}
	    # Evaluation is on and eval-only is set for this job.
	    run-eval: true
	    eval-only: true
	    ref: ${{ inputs.ref }}

	# Calls the collect-results workflow after the four benchmark sweeps.
	# always() lets it run even if a sweep failed; it is skipped only when
	# every sweep was skipped.
	collect-results:
	  needs: [test-sweep-multi-node, test-sweep-single-node, test-sweep-agentic, test-sweep-multi-node-agentic]
	  if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped' || needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }}
	  uses: ./.github/workflows/collect-results.yml
	  secrets: inherit
	  with:
	    result-prefix: "bmk"

	# Calls the collect-evals workflow after the two eval sweeps.
	# always() lets it run even if a sweep failed; it is skipped only when
	# both eval sweeps were skipped.
	collect-evals:
	  needs: [test-sweep-evals, test-sweep-multi-node-evals]
	  if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped') }}
	  uses: ./.github/workflows/collect-evals.yml
	  secrets: inherit

	# Downloads the agentic_* artifacts, runs the aggregation script over them,
	# and uploads the aggregated output as one artifact.
	# always() lets it run even if a sweep failed; it is skipped only when
	# both agentic sweeps were skipped.
	collect-agentic-results:
	  needs: [test-sweep-agentic, test-sweep-multi-node-agentic]
	  if: ${{ always() && (needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }}
	  runs-on: ubuntu-latest
	  steps:
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        submodules: true

	    # NOTE(review): this action is referenced by tag, while the other
	    # actions in this workflow are pinned to a commit SHA.
	    - uses: actions/setup-python@v6
	      with:
	        python-version: '3.11'

	    - name: Install dependencies
	      run: pip install pandas matplotlib numpy

	    - name: Download agentic artifacts
	      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
	      with:
	        pattern: 'agentic_*'
	        path: results/

	    # PYTHONPATH adds the benchmark's scripts and analysis directories to the
	    # Python import path for the aggregation script.
	    - name: Run aggregation
	      env:
	        PYTHONPATH: utils/agentic-benchmark/scripts:utils/agentic-benchmark/analysis
	      run: |
	        python utils/agentic-benchmark/scripts/collect_sweep_results.py results/ aggregated/

	    - name: Upload aggregated results
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: agentic_aggregated
	        path: aggregated/

	# Final job: downloads the results_* artifacts, runs the success-rate script,
	# and uploads the stats JSON. Runs regardless of how the collector jobs ended.
	calc-success-rate:
	  runs-on: ubuntu-latest
	  needs: [collect-results, collect-evals, collect-agentic-results]
	  if: always()

	  # Job-wide environment, visible to every step below.
	  env:
	    RESULTS_DIR: 'results/'
	    STATS_FILENAME: 'run_stats'
	    GITHUB_TOKEN: ${{ secrets.REPO_PAT }}

	  steps:
	    # Full-history checkout authenticated with the repository PAT.
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        token: ${{ secrets.REPO_PAT }}
	        fetch-depth: 0

	    - name: Download results artifacts
	      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
	      with:
	        pattern: results_*
	        path: ${{ env.RESULTS_DIR }}

	    - name: Install python dependencies
	      run: pip install PyGithub

	    # The script receives the stats file base name; the upload step below
	    # expects a file with that name plus `.json`.
	    - name: Calculate success rate
	      run: python3 utils/calc_success_rate.py $STATS_FILENAME

	    - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: run-stats
	        path: ${{ env.STATS_FILENAME }}.json

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

e2e Test - llm-d-vllm smoke - dsr1 fp8 h200 1P+1D conc=1 #2699

Workflow file

e2e Test - llm-d-vllm smoke - dsr1 fp8 h200 1P+1D conc=1 #2699

Uh oh!

Workflow file for this run