From 11f5d35ab035bc1651dd92fb5fc8f31e2c0d7679 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 6 Nov 2025 09:08:44 -0600 Subject: [PATCH 1/6] remove h200-trt runner type --- .github/README.md | 2 +- .github/configs/CONFIGS.md | 2 +- .github/configs/nvidia-master.yaml | 4 ++-- .github/configs/runners.yaml | 11 ----------- .github/workflows/full-sweep-test.yml | 2 +- utils/matrix-logic/generate_sweep_configs.py | 4 ++-- utils/plot_perf.py | 1 - 7 files changed, 7 insertions(+), 19 deletions(-) diff --git a/.github/README.md b/.github/README.md index 69fc1069f..003b8809f 100644 --- a/.github/README.md +++ b/.github/README.md @@ -96,7 +96,7 @@ full-sweep --precision fp8 --seq-lens 1k8k --config-files .github/configs/nvidia **Test all TRT configs on H200 runners:** ``` -full-sweep --framework trt --runner-type h200 h200-trt --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +full-sweep --framework trt --runner-type h200 --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml ``` **Quick smoke test of all configs (highest TP, lowest concurrency only):** diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md index 218e17821..9d3c24309 100644 --- a/.github/configs/CONFIGS.md +++ b/.github/configs/CONFIGS.md @@ -49,4 +49,4 @@ Notes: ## Runners -The `runners.yaml` config represents the available runners in the repository. The keys are the runner *types* (i.e., the GPUs as well as some specific combinations like `h200-trt`) whereas the value is a list of *runner nodes*. This config is used to verify the master configs. +The `runners.yaml` config represents the available runners in the repository. The keys are the runner *types* (i.e., the GPUs as well as some specific combinations like `b200-trt`) whereas the value is a list of *runner nodes*. This config is used to verify the master configs. 
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4f8ea4ca7..917136739 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -144,7 +144,7 @@ dsr1-fp8-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 - runner: h200-trt + runner: h200 precision: fp8 framework: trt # For all sequence lengths, EP=TP @@ -258,7 +258,7 @@ gptoss-fp4-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev model: openai/gpt-oss-120b model-prefix: gptoss - runner: h200-trt + runner: h200 precision: fp4 framework: trt # For all sequence lengths, EP=TP, DP_ATTENTION=false diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index ccbb15cbc..948db2754 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -14,17 +14,6 @@ h200: - 'h200-nv_1' - 'h200-nv_2' - 'h200-nv_3' -h200-trt: -- 'h200-cw_0' -- 'h200-cw_1' -- 'h200-nb_0' -- 'h200-nb_1' -- 'h200-nb_2' -- 'h200-nb_3' -- 'h200-nv_0' -- 'h200-nv_1' -- 'h200-nv_2' -- 'h200-nv_3' b200-trt: - 'b200-nv_0' - 'b200-nv_1' diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 9cd08e163..c9d64e602 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -56,7 +56,7 @@ jobs: set -x # Build runner type filters based on inputs - RUNNER_TYPES="${{ inputs.use_h100 && 'h100' || '' }} ${{ inputs.use_h200 && 'h200' || '' }} ${{ inputs.use_h200 && 'h200 h200-trt' || '' }} ${{ inputs.use_b200 && 'b200 b200-trt' || '' }} ${{ inputs.use_mi300x && 'mi300x' || '' }} ${{ inputs.use_mi325x && 'mi325x' || '' }} ${{ inputs.use_mi355x && 'mi355x' || '' }}" + RUNNER_TYPES="${{ inputs.use_h100 && 'h100' || '' }} ${{ inputs.use_h200 && 'h200' || '' }} ${{ inputs.use_h200 && 'h200' || '' }} ${{ inputs.use_b200 && 'b200 b200-trt' || '' }} ${{ inputs.use_mi300x && 'mi300x' || '' }} ${{ 
inputs.use_mi325x && 'mi325x' || '' }} ${{ inputs.use_mi355x && 'mi355x' || '' }}" # DSR1 doesn't support H100, so exclude it DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index bb0e22911..0c3ccac51 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -824,7 +824,7 @@ def main(): test_config_parser.add_argument( '--runner-type', required=True, - help='Runner type (e.g., h200-trt, h100)' + help='Runner type (e.g., b200-trt, h100)' ) test_config_parser.add_argument( '--runner-config', @@ -847,7 +847,7 @@ def main(): test_config_parser.add_argument( '--runner-type', required=True, - help='Runner type (e.g., h200-trt, h100)' + help='Runner type (e.g., b200-trt, h100)' ) test_config_parser.add_argument( '--model-prefix', diff --git a/utils/plot_perf.py b/utils/plot_perf.py index 1cab81cdc..6ef3d9848 100644 --- a/utils/plot_perf.py +++ b/utils/plot_perf.py @@ -9,7 +9,6 @@ hw_color = { 'h100': 'lightgreen', 'h200': 'green', # H200 VLLM - 'h200-trt': 'darkgreen', # H200 TRT-LLM 'b200': 'black', # B200 VLLM 'b200-trt': 'gray', # B200 TRT-LLM 'mi300x': 'pink', From d8e1142280c178c0ab9b7a37973e53a53dc795e5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 6 Nov 2025 09:32:13 -0600 Subject: [PATCH 2/6] bug fix --- .github/workflows/full-sweep-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index c9d64e602..6c25d2c4b 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -56,7 +56,7 @@ jobs: set -x # Build runner type filters based on inputs - RUNNER_TYPES="${{ inputs.use_h100 && 'h100' || '' }} ${{ inputs.use_h200 && 'h200' || '' }} ${{ inputs.use_h200 && 'h200' || '' }} ${{ inputs.use_b200 && 'b200 b200-trt' || '' }} ${{ inputs.use_mi300x && 
'mi300x' || '' }} ${{ inputs.use_mi325x && 'mi325x' || '' }} ${{ inputs.use_mi355x && 'mi355x' || '' }}" + RUNNER_TYPES="${{ inputs.use_h100 && 'h100' || '' }} ${{ inputs.use_h200 && 'h200' || '' }} ${{ inputs.use_b200 && 'b200 b200-trt' || '' }} ${{ inputs.use_mi300x && 'mi300x' || '' }} ${{ inputs.use_mi325x && 'mi325x' || '' }} ${{ inputs.use_mi355x && 'mi355x' || '' }}" # DSR1 doesn't support H100, so exclude it DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) From 2ae76b36c772d851c34d17697861a7433e02d0fa Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 6 Nov 2025 09:36:34 -0600 Subject: [PATCH 3/6] undo changes in plot perf --- utils/plot_perf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/plot_perf.py b/utils/plot_perf.py index 6ef3d9848..1cab81cdc 100644 --- a/utils/plot_perf.py +++ b/utils/plot_perf.py @@ -9,6 +9,7 @@ hw_color = { 'h100': 'lightgreen', 'h200': 'green', # H200 VLLM + 'h200-trt': 'darkgreen', # H200 TRT-LLM 'b200': 'black', # B200 VLLM 'b200-trt': 'gray', # B200 TRT-LLM 'mi300x': 'pink', From c1bcd2d35c388a42a09b016369799d37f0efc6e4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 6 Nov 2025 09:43:49 -0600 Subject: [PATCH 4/6] debug summarize py --- utils/summarize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/summarize.py b/utils/summarize.py index 7668729fe..1c8d26a10 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -6,6 +6,7 @@ results = [] results_dir = Path(sys.argv[1]) for result_path in results_dir.rglob(f'*.json'): + print(f"Found result file: {result_path}") with open(result_path) as f: result = json.load(f) results.append(result) From 0ad04f5788dbb9e030b4e983c203007c168b6bb2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 6 Nov 2025 09:53:13 -0600 Subject: [PATCH 5/6] argv[2] not even used in summarize.py -- removing from workflow --- .github/workflows/collect-results.yml | 2 +- utils/summarize.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git 
a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index 2bd499090..aa00aa2ae 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -26,7 +26,7 @@ jobs: pattern: ${{ inputs.exp-name && format('{0}_*', inputs.exp-name) || '*' }} - name: Print summary - run: python3 utils/summarize.py results/ ${{ inputs.exp-name || 'all' }} >> $GITHUB_STEP_SUMMARY + run: python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY - name: Aggregate results run: python3 utils/collect_results.py results/ ${{ inputs.exp-name || 'all' }} diff --git a/utils/summarize.py b/utils/summarize.py index 1c8d26a10..7668729fe 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -6,7 +6,6 @@ results = [] results_dir = Path(sys.argv[1]) for result_path in results_dir.rglob(f'*.json'): - print(f"Found result file: {result_path}") with open(result_path) as f: result = json.load(f) results.append(result) From f01a2439ef070d72a23328d0ae64714f5f98dc17 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 6 Nov 2025 10:09:35 -0600 Subject: [PATCH 6/6] fix race condition in e2e-tests.yml --- .github/workflows/e2e-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 8b7654ff9..e675ea93b 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -55,7 +55,7 @@ jobs: secrets: inherit calc-success-rate: - needs: test-sweep + needs: collect-results if: ${{ always() }} runs-on: ubuntu-latest