Update glm5-fp4-b300-sglang and -mtp SGLang image to v0.5.12-cu130 (#1420)

Klaud-Cold · web-flow · commit 8862360995f3 · 2026-05-24T20:08:32.000-07:00
Concurrency 128 is breaking, but pareto works.
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -2352,7 +2352,7 @@ glm5-fp4-b200-sglang-mtp:
   # does not have a B300-specific recipe, so this config reuses the existing
   # GLM-5 FP4 B200 SGLang recipe as-is until B300-specific tuning is available.
 glm5-fp4-b300-sglang:
-  image: lmsysorg/sglang:v0.5.11-cu130
+  image: lmsysorg/sglang:v0.5.12-cu130
   model: nvidia/GLM-5-NVFP4
   model-prefix: glm5
   runner: b300
@@ -2373,7 +2373,7 @@ glm5-fp4-b300-sglang:
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
 
 glm5-fp4-b300-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.11-cu130
+  image: lmsysorg/sglang:v0.5.12-cu130
   model: nvidia/GLM-5-NVFP4
   model-prefix: glm5
   runner: b300
diff --git a/benchmarks/single_node/glm5_fp4_b300.sh b/benchmarks/single_node/glm5_fp4_b300.sh
@@ -24,6 +24,13 @@ nvidia-smi
 
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
+# Downgrade flashinfer to the version pinned by sglang v0.5.11 to test whether
+# the flashinfer bump caused the suspected trtllm batched-GEMM regression
+# (sgl-project/sglang#25563, suggested by @trevor-m). sglang v0.5.12's
+# pyproject.toml moved flashinfer_python from 0.6.8.post1 to 0.6.11.post1, and
+# the trtllm GEMM crash at bs=128 + EAGLE on B300 appeared in the same image bump.
+pip install --no-deps "flashinfer_python==0.6.8.post1" "flashinfer_cubin==0.6.8.post1"
+
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
diff --git a/benchmarks/single_node/glm5_fp4_b300_mtp.sh b/benchmarks/single_node/glm5_fp4_b300_mtp.sh
@@ -24,6 +24,12 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
+# Downgrade flashinfer to the version pinned by sglang v0.5.11 to test whether
+# the flashinfer bump caused the suspected trtllm batched-GEMM regression
+# (sgl-project/sglang#25563, suggested by @trevor-m). sglang v0.5.12's
+# pyproject.toml moved flashinfer_python from 0.6.8.post1 to 0.6.11.post1, and
+# the trtllm GEMM crash at bs=128 + EAGLE on B300 appeared in the same image bump.
+pip install --no-deps "flashinfer_python==0.6.8.post1" "flashinfer_cubin==0.6.8.post1"
 
 export SGL_ENABLE_JIT_DEEPGEMM=1
 export SGLANG_ENABLE_SPEC_V2=1
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3043,6 +3043,13 @@
     - "Update SGLang image from nightly-dev-cu13-20260518-c67b2870 to nightly-dev-cu13-20260519-dbac4647"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1492
 
+- config-keys:
+    - glm5-fp4-b300-sglang
+    - glm5-fp4-b300-sglang-mtp
+  description:
+    - "Update SGLang image from v0.5.11-cu130 to v0.5.12-cu130; pin flashinfer to 0.6.8.post1"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1420
+
 - config-keys:
     - dsr1-fp4-b200-sglang-mtp
   description: