SemiAnalysisAI
diff --git a/‎.github/workflows/70b-tmpl.yml‎
Lines changed: 230 additions & 0 deletions b/‎.github/workflows/70b-tmpl.yml‎
Lines changed: 230 additions & 0 deletions
diff --git a/‎.github/workflows/full-sweep-tmpl.yml‎
Lines changed: 75 additions & 0 deletions b/‎.github/workflows/full-sweep-tmpl.yml‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎.github/workflows/runner-model-sweep-test.yml‎
Lines changed: 11 additions & 0 deletions b/‎.github/workflows/runner-model-sweep-test.yml‎
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,230 @@
+name: Template - LLaMA 70B
+
+on:
+  workflow_call:  # reusable workflow: other workflows invoke it through `uses:`
+    inputs:  # the five string inputs below are forwarded verbatim to every benchmark job
+      exp-name:
+        required: true
+        type: string
+      isl:
+        required: true
+        type: string
+      osl:
+        required: true
+        type: string
+      max-model-len:
+        required: true
+        type: string
+      random-range-ratio:
+        required: true
+        type: string
+      # Hardware toggles: each one gates the matching bmk-<gpu>-* job(s) below via `if:`.
+      use_h100:
+        type: boolean
+        required: true
+      use_h200:
+        type: boolean
+        required: true
+      use_b200:
+        type: boolean
+        required: true
+      use_mi300x:
+        type: boolean
+        required: true
+      use_mi325x:
+        type: boolean
+        required: true
+      use_mi355x:
+        type: boolean
+        required: true
+
+jobs:
+  bmk-h100-fp8:  # vLLM FP8 benchmark on the h100 runner; skipped unless use_h100 is true
+    if: ${{ inputs.use_h100 }}
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      runner: h100
+      image: 'vllm/vllm-openai:v0.10.2'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+      framework: 'vllm'
+      precision: 'fp8'
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      tp-list: '[2, 4, 8]'  # only job in this template whose list omits TP=1
+
+  bmk-h200-fp8:  # vLLM FP8 benchmark on the h200 runner; skipped unless use_h200 is true
+    if: ${{ inputs.use_h200 }}
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      runner: h200
+      image: 'vllm/vllm-openai:v0.10.2'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+      framework: 'vllm'
+      precision: 'fp8'
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      tp-list: '[1, 2, 4, 8]'  # no conc-list passed here, unlike the TRT jobs
+
+  bmk-h200-trt-fp8:  # TensorRT-LLM FP8 benchmark on the h200-trt runner; shares the use_h200 toggle with bmk-h200-fp8
+    if: ${{ inputs.use_h200 }}
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      runner: h200-trt
+      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+      framework: 'trt'
+      precision: 'fp8'
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      tp-list: '[1, 2, 4, 8]'
+      conc-list: '[4, 8, 16, 32, 64, 128]'  # H200 can achieve TPS/User >= 30 with larger concurrency, up to 128
+
+  bmk-b200-fp8:  # vLLM FP8 benchmark on the b200 runner; skipped unless use_b200 is true
+    if: ${{ inputs.use_b200 }}
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      runner: b200
+      image: 'vllm/vllm-openai:v0.10.2'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+      framework: 'vllm'
+      precision: 'fp8'
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      tp-list: '[1, 2, 4, 8]'  # TP=2 and TP=4 are included on B200 so the list matches the MI355X jobs
+
+  bmk-b200-trt-fp8:  # TensorRT-LLM FP8 benchmark on the b200-trt runner; shares the use_b200 toggle
+    if: ${{ inputs.use_b200 }}
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      runner: b200-trt
+      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+      framework: 'trt'
+      precision: 'fp8'
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      tp-list: '[1, 2, 4, 8]'  # TP=2 and TP=4 are included on B200 so the list matches the MI355X jobs
+      conc-list: '[4, 8, 16, 32, 64, 128]'  # list stops at 128. NOTE(review): the previous comment said B200 holds TPS/User >= 30 up to 256 - confirm whether 256 should be added
+
+  bmk-mi300x-fp8:  # vLLM (ROCm image) FP8 benchmark on the mi300x runner; skipped unless use_mi300x is true
+    if: ${{ inputs.use_mi300x }}
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      runner: mi300x
+      image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'
+      model: 'amd/Llama-3.3-70B-Instruct-FP8-KV'  # AMD-published checkpoint; the NVIDIA jobs use nvidia/...-FP8 instead
+      framework: 'vllm'
+      precision: 'fp8'
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      tp-list: '[1, 2, 4, 8]'
+
+  bmk-mi325x-fp8:  # vLLM (ROCm image) FP8 benchmark on the mi325x runner; skipped unless use_mi325x is true
+    if: ${{ inputs.use_mi325x }}
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      runner: mi325x
+      image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'  # same image as the mi300x and mi355x jobs
+      model: 'amd/Llama-3.3-70B-Instruct-FP8-KV'
+      framework: 'vllm'
+      precision: 'fp8'
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      tp-list: '[1, 2, 4, 8]'
+
+  bmk-mi355x-fp8:  # vLLM (ROCm image) FP8 benchmark on the mi355x runner; skipped unless use_mi355x is true
+    if: ${{ inputs.use_mi355x }}
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      runner: mi355x
+      image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'
+      model: 'amd/Llama-3.3-70B-Instruct-FP8-KV'
+      framework: 'vllm'
+      precision: 'fp8'
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      tp-list: '[1, 2, 4, 8]'  # the list the B200 jobs' comments refer to
+
+  bmk-b200-fp4:  # vLLM FP4 benchmark on the b200 runner; same toggle as the B200 FP8 jobs
+    if: ${{ inputs.use_b200 }}
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      runner: b200
+      image: 'vllm/vllm-openai:v0.10.2'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP4'
+      framework: 'vllm'
+      precision: 'fp4'
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      tp-list: '[1, 2, 4, 8]'  # TP=2 and TP=4 are included on B200 so the list matches the MI355X jobs
+
+  bmk-b200-trt-fp4:  # TensorRT-LLM FP4 benchmark on the b200-trt runner; same toggle as the other B200 jobs
+    if: ${{ inputs.use_b200 }}
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      runner: b200-trt
+      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP4'
+      framework: 'trt'
+      precision: 'fp4'
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      tp-list: '[1, 2, 4, 8]'  # TP=2 and TP=4 are included on B200 so the list matches the MI355X jobs
+      conc-list: '[4, 8, 16, 32, 64, 128]'  # B200 can achieve TPS/User >= 30 with larger concurrency, up to 128
+
+  bmk-mi355x-fp4:  # vLLM (ROCm image) FP4 benchmark on the mi355x runner; same toggle as bmk-mi355x-fp8
+    if: ${{ inputs.use_mi355x }}
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      runner: mi355x
+      image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'
+      model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview'  # MXFP4 checkpoint, reported to the benchmark as precision 'fp4'
+      framework: 'vllm'
+      precision: 'fp4'
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      tp-list: '[1, 2, 4, 8]'
@@ -37,6 +37,31 @@ on:
         default: false
 
 jobs:
+  _70b-1k1k:  # leading '_' because a job ID must start with a letter or '_', not a digit
+    if: ${{ inputs.run_1k1k }}
+    uses: ./.github/workflows/70b-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: '70b_1k1k'  # must match the exp-name given to collect-70b-1k1k-results
+      isl: 1024  # NOTE(review): 70b-tmpl.yml declares isl/osl/max-model-len/random-range-ratio as type: string - confirm unquoted numbers are accepted
+      osl: 1024
+      max-model-len: 2048  # isl + osl
+      random-range-ratio: 0.8
+      use_h100: ${{ inputs.use_h100 }}
+      use_h200: ${{ inputs.use_h200 }}
+      use_b200: ${{ inputs.use_b200 }}
+      use_mi300x: ${{ inputs.use_mi300x }}
+      use_mi325x: ${{ inputs.use_mi325x }}
+      use_mi355x: ${{ inputs.use_mi355x }}
+
+  collect-70b-1k1k-results:  # runs after the 70B 1k1k benchmark jobs
+    needs: _70b-1k1k
+    if: ${{ inputs.run_1k1k && always() }}  # always(): run even if _70b-1k1k failed, but only when the 1k1k sweep was requested
+    uses: ./.github/workflows/collect-results.yml
+    secrets: inherit
+    with:
+      exp-name: '70b_1k1k'
+
   dsr1-1k1k:
     if: ${{ inputs.run_1k1k }}
     uses: ./.github/workflows/dsr1-tmpl.yml
@@ -87,6 +112,31 @@ jobs:
     with:
       exp-name: 'gptoss_1k1k'
 
+  _70b-8k1k:  # leading '_' because a job ID must start with a letter or '_', not a digit
+    if: ${{ inputs.run_8k1k }}
+    uses: ./.github/workflows/70b-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: '70b_8k1k'  # must match the exp-name given to collect-70b-8k1k-results
+      isl: 8192  # NOTE(review): 70b-tmpl.yml declares isl/osl/max-model-len/random-range-ratio as type: string - confirm unquoted numbers are accepted
+      osl: 1024
+      max-model-len: 9216  # isl + osl
+      random-range-ratio: 0.8
+      use_h100: ${{ inputs.use_h100 }}
+      use_h200: ${{ inputs.use_h200 }}
+      use_b200: ${{ inputs.use_b200 }}
+      use_mi300x: ${{ inputs.use_mi300x }}
+      use_mi325x: ${{ inputs.use_mi325x }}
+      use_mi355x: ${{ inputs.use_mi355x }}
+
+  collect-70b-8k1k-results:  # runs after the 70B 8k1k benchmark jobs
+    needs: _70b-8k1k
+    if: ${{ inputs.run_8k1k && always() }}  # always(): run even if _70b-8k1k failed, but only when the 8k1k sweep was requested
+    uses: ./.github/workflows/collect-results.yml
+    secrets: inherit
+    with:
+      exp-name: '70b_8k1k'
+
   dsr1-8k1k:
     if: ${{ inputs.run_8k1k }}
     uses: ./.github/workflows/dsr1-tmpl.yml
@@ -137,6 +187,31 @@ jobs:
     with:
       exp-name: 'gptoss_8k1k'
 
+  _70b-1k8k:  # leading '_' because a job ID must start with a letter or '_', not a digit
+    if: ${{ inputs.run_1k8k }}
+    uses: ./.github/workflows/70b-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: '70b_1k8k'  # must match the exp-name given to collect-70b-1k8k-results
+      isl: 1024  # NOTE(review): 70b-tmpl.yml declares isl/osl/max-model-len/random-range-ratio as type: string - confirm unquoted numbers are accepted
+      osl: 8192
+      max-model-len: 9216  # isl + osl
+      random-range-ratio: 0.8
+      use_h100: ${{ inputs.use_h100 }}
+      use_h200: ${{ inputs.use_h200 }}
+      use_b200: ${{ inputs.use_b200 }}
+      use_mi300x: ${{ inputs.use_mi300x }}
+      use_mi325x: ${{ inputs.use_mi325x }}
+      use_mi355x: ${{ inputs.use_mi355x }}
+
+  collect-70b-1k8k-results:  # runs after the 70B 1k8k benchmark jobs
+    needs: _70b-1k8k
+    if: ${{ inputs.run_1k8k && always() }}  # always(): run even if _70b-1k8k failed, but only when the 1k8k sweep was requested
+    uses: ./.github/workflows/collect-results.yml
+    secrets: inherit
+    with:
+      exp-name: '70b_1k8k'
+
   dsr1-1k8k:
     if: ${{ inputs.run_1k8k }}
     uses: ./.github/workflows/dsr1-tmpl.yml
 
@@ -33,6 +33,7 @@ jobs:
           - 'h100-cw_0'
           - 'h100-cw_1'
         config:
+          - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
           - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }
 
     name: '${{ matrix.runner }}'
@@ -69,6 +70,7 @@ jobs:
           - 'h200-nv_2'
           - 'h200-nv_3'
         config:
+          - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
           - { image: 'lmsysorg/sglang:v0.5.2rc2-cu126', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
           - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }
 
@@ -106,6 +108,7 @@ jobs:
           - 'h200-nv_2'
           - 'h200-nv_3'
         config:
+          - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' }
           - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' }
           - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' }
 
@@ -137,6 +140,8 @@ jobs:
           - 'b200-nvd_2'
           - 'b200-nvd_3'
         config:
+          - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
+          - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' }
           - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
           - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' }
           - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }
@@ -169,6 +174,8 @@ jobs:
           - 'b200-nb_0'
           - 'b200-nb_1'
         config:
+          - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' }
+          - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', framework: 'trt', precision: 'fp4', exp-name: '70b_test' }
           - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' }
           - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'trt', precision: 'fp4', exp-name: 'dsr1_test' }
           - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' }
@@ -204,6 +211,7 @@ jobs:
           - 'mi300x-cr_0'
           - 'mi300x-oci_0'
         config:
+          - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }
 
@@ -236,6 +244,7 @@ jobs:
           - 'mi325x-tw_2'
           - 'mi325x-tw_3'
         config:
+          - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }
 
@@ -267,6 +276,8 @@ jobs:
           - 'mi355x-amd_2'
           - 'mi355x-amd_3'
         config:
+          - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
+          - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'amd/DeepSeek-R1-0528-MXFP4-Preview', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }