Revert "Fix glm5-fp8-b300 DeepGemm regression" (#1536)

functionstackx · web-flow · commit f4b05bc144fc · 2026-05-20T17:16:03.000-04:00
This reverts commit 12fb33e.
diff --git a/benchmarks/single_node/glm5_fp8_b300.sh b/benchmarks/single_node/glm5_fp8_b300.sh
@@ -23,17 +23,13 @@ nvidia-smi
 
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
-pip install --break-system-packages --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
-
-# Testing @trevor-m's suggestion in sgl-project/sglang#25551 (comment 4481466979):
-# downgrade sgl-deep-gemm 0.1.0 → 0.0.1 inside the v0.5.12 container to check
-# whether the deepgemm version jump is what causes the B300 TMA-descriptor
-# CUDA_ERROR_ILLEGAL_ADDRESS regression. Re-enabling JIT DeepGemm so the
-# downgraded version actually runs.
-# --break-system-packages required: the container's Python is PEP-668 externally-managed,
-# so the previous attempt silently failed and left the bundled 0.1.0 in place.
-pip install --break-system-packages --no-deps "sgl-deep-gemm==0.0.1"
-export SGL_ENABLE_JIT_DEEPGEMM=1
+pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
+
+# Workaround for sgl-project/sglang#25551: v0.5.12 DeepGemm TMA-descriptor
+# regression on B300 (sm_103) crashes CUDA graph capture with
+# CUDA_ERROR_ILLEGAL_ADDRESS. Disabling JIT DeepGemm bypasses the affected
+# kernel path. Restore to =1 once the upstream regression is fixed.
+export SGL_ENABLE_JIT_DEEPGEMM=0
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
diff --git a/benchmarks/single_node/glm5_fp8_b300_mtp.sh b/benchmarks/single_node/glm5_fp8_b300_mtp.sh
@@ -23,17 +23,13 @@ nvidia-smi
 
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
-pip install --break-system-packages --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
-
-# Testing @trevor-m's suggestion in sgl-project/sglang#25551 (comment 4481466979):
-# downgrade sgl-deep-gemm 0.1.0 → 0.0.1 inside the v0.5.12 container to check
-# whether the deepgemm version jump is what causes the B300 TMA-descriptor
-# CUDA_ERROR_ILLEGAL_ADDRESS regression. Re-enabling JIT DeepGemm so the
-# downgraded version actually runs.
-# --break-system-packages required: the container's Python is PEP-668 externally-managed,
-# so the previous attempt silently failed and left the bundled 0.1.0 in place.
-pip install --break-system-packages --no-deps "sgl-deep-gemm==0.0.1"
-export SGL_ENABLE_JIT_DEEPGEMM=1
+pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
+
+# Workaround for sgl-project/sglang#25551: v0.5.12 DeepGemm TMA-descriptor
+# regression on B300 (sm_103) crashes CUDA graph capture with
+# CUDA_ERROR_ILLEGAL_ADDRESS. Disabling JIT DeepGemm bypasses the affected
+# kernel path. Restore to =1 once the upstream regression is fixed.
+export SGL_ENABLE_JIT_DEEPGEMM=0
 export SGLANG_ENABLE_SPEC_V2=1
 
 SERVER_LOG=/workspace/server.log
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3050,10 +3050,3 @@
   description:
     - "Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1475
-
-- config-keys:
-    - glm5-fp8-b300-sglang
-    - glm5-fp8-b300-sglang-mtp
-  description:
-    - "Test @trevor-m's suggestion in sgl-project/sglang#25551: pin sgl-deep-gemm==0.0.1 inside v0.5.12 container to isolate whether the deep-gemm 0.0.1→0.1.0 upgrade is the source of the B300 CUDA_ERROR_ILLEGAL_ADDRESS regression."
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1512