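b200 test: run GSM8K eval on B200 with TensorRT-LLM and add optional lm-eval step to the B200 TRT benchmark script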

Oseltamivir · Oseltamivir · commit 822521f03b1f · 2025-12-02T02:53:18.000+08:00
diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
@@ -49,10 +49,10 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: h100-cw_0
-      image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }}
+      runner: b200-nvd_2
+      image: ${{ inputs.image || 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' }}
       model: ${{ inputs.model || 'openai/gpt-oss-120b' }}
-      framework: vllm
+      framework: trt
       precision: fp4
       exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }}
       tp: '4'
@@ -61,4 +61,4 @@ jobs:
       port: ${{ inputs.port || '8888' }}
       eval-task: gsm8k
       num-fewshot: ${{ inputs.num_fewshot || '5' }}
-      limit: ${{ inputs.limit || '200' }} 
+      limit: ${{ inputs.limit || '200' }}
diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml
@@ -81,7 +81,6 @@ jobs:
     steps:
       - name: Resource cleanup
         run: |
-          sudo rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/
           # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm)
           safe_timeout() {
             if command -v timeout >/dev/null 2>&1; then
diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh
@@ -94,3 +94,10 @@ run_benchmark_serving \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
     --result-dir /workspace/
+
+# After throughput, run evaluation only if RUN_EVAL is true (unset/empty counts as false)
+if [ "${RUN_EVAL:-false}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests "$(( $CONC * 2 ))"
+    append_lm_eval_summary
+fi
+set +x