Update glm5-fp8-b300-sglang and glm5-fp8-b300-sglang-mtp SGLang image to v0.5.12-cu130 (#1421)

Klaud-Cold · github-actions[bot] · claude-fix-bot · web-flow · commit 67230af8a3ba · 2026-05-18T12:17:31.000-04:00
Ref #1154

Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com>
Co-authored-by: Klaud Cold <Klaud-Cold@users.noreply.github.com>
Co-authored-by: claude-fix-bot <claude-fix-bot@local>
Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -2250,7 +2250,7 @@ glm5-fp8-b200-sglang-agentic:
       - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] }
 
 glm5-fp8-b300-sglang:
-  image: lmsysorg/sglang:v0.5.11-cu130
+  image: lmsysorg/sglang:v0.5.12-cu130
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: b300
@@ -2269,7 +2269,7 @@ glm5-fp8-b300-sglang:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
 
 glm5-fp8-b300-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.11-cu130
+  image: lmsysorg/sglang:v0.5.12-cu130
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: b300
diff --git a/benchmarks/single_node/glm5_fp8_b300.sh b/benchmarks/single_node/glm5_fp8_b300.sh
@@ -25,7 +25,11 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
 
-export SGL_ENABLE_JIT_DEEPGEMM=1
+# Workaround for sgl-project/sglang#25551: v0.5.12 DeepGemm TMA-descriptor
+# regression on B300 (sm_103) crashes CUDA graph capture with
+# CUDA_ERROR_ILLEGAL_ADDRESS. Disabling JIT DeepGemm bypasses the affected
+# kernel path. Restore to =1 once the upstream regression is fixed.
+export SGL_ENABLE_JIT_DEEPGEMM=0
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
diff --git a/benchmarks/single_node/glm5_fp8_b300_mtp.sh b/benchmarks/single_node/glm5_fp8_b300_mtp.sh
@@ -25,7 +25,11 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
 
-export SGL_ENABLE_JIT_DEEPGEMM=1
+# Workaround for sgl-project/sglang#25551: v0.5.12 DeepGemm TMA-descriptor
+# regression on B300 (sm_103) crashes CUDA graph capture with
+# CUDA_ERROR_ILLEGAL_ADDRESS. Disabling JIT DeepGemm bypasses the affected
+# kernel path. Restore to =1 once the upstream regression is fixed.
+export SGL_ENABLE_JIT_DEEPGEMM=0
 export SGLANG_ENABLE_SPEC_V2=1
 
 SERVER_LOG=/workspace/server.log
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -2747,3 +2747,11 @@
   description:
     - "Update SGLang image from v0.5.9-rocm700-mi30x to v0.5.12-rocm700-mi30x"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1425
+
+- config-keys:
+    - glm5-fp8-b300-sglang
+    - glm5-fp8-b300-sglang-mtp
+  description:
+    - "Update SGLang image from v0.5.11-cu130 to v0.5.12-cu130"
+    - "Disable JIT DeepGemm (SGL_ENABLE_JIT_DEEPGEMM=0) to bypass v0.5.12 DeepGemm TMA-descriptor regression on B300 — see sgl-project/sglang#25551"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1421