Skip to content

Commit fb1e9d4

Browse files
committed
Update dpskv4 GB300 MTP SGLang image to nightly-20260518 and clean env vars
1 parent 97ac477 commit fb1e9d4

11 files changed

Lines changed: 63 additions & 178 deletions

.github/configs/amd-master.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ qwen3.5-bf16-mi355x-sglang-mtp:
162162
- { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
163163

164164
qwen3.5-bf16-mi300x-sglang:
165-
image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
165+
image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
166166
model: Qwen/Qwen3.5-397B-A17B
167167
model-prefix: qwen3.5
168168
runner: mi300x
@@ -527,7 +527,7 @@ kimik2.5-int4-mi355x-vllm:
527527
- { tp: 8, conc-start: 4, conc-end: 64 }
528528

529529
kimik2.5-int4-mi325x-vllm:
530-
image: vllm/vllm-openai-rocm:v0.21.0
530+
image: vllm/vllm-openai-rocm:v0.18.0
531531
model: moonshotai/Kimi-K2.5
532532
model-prefix: kimik2.5
533533
runner: mi325x
@@ -802,7 +802,7 @@ minimaxm2.5-fp8-mi300x-vllm-agentic:
802802
- { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] }
803803

804804
minimaxm2.5-fp8-mi325x-vllm:
805-
image: vllm/vllm-openai-rocm:v0.21.0
805+
image: vllm/vllm-openai-rocm:v0.18.0
806806
model: MiniMaxAI/MiniMax-M2.5
807807
model-prefix: minimaxm2.5
808808
runner: mi325x
@@ -872,7 +872,7 @@ gptoss-fp4-mi300x-vllm:
872872
- { tp: 8, conc-start: 1, conc-end: 16 }
873873

874874
gptoss-fp4-mi325x-vllm:
875-
image: vllm/vllm-openai-rocm:v0.21.0
875+
image: vllm/vllm-openai-rocm:v0.17.0
876876
model: openai/gpt-oss-120b
877877
model-prefix: gptoss
878878
runner: mi325x

.github/configs/nvidia-master.yaml

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2052,7 +2052,7 @@ dsv4-fp4-b300-sglang-mtp:
20522052
- { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp }
20532053

20542054
qwen3.5-bf16-b200-sglang:
2055-
image: lmsysorg/sglang:v0.5.12-cu130
2055+
image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
20562056
model: Qwen/Qwen3.5-397B-A17B
20572057
model-prefix: qwen3.5
20582058
runner: b200
@@ -2071,7 +2071,7 @@ qwen3.5-bf16-b200-sglang:
20712071
- { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
20722072

20732073
qwen3.5-bf16-b200-sglang-mtp:
2074-
image: lmsysorg/sglang:v0.5.12-cu130
2074+
image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
20752075
model: Qwen/Qwen3.5-397B-A17B
20762076
model-prefix: qwen3.5
20772077
runner: b200
@@ -2089,22 +2089,24 @@ qwen3.5-bf16-b200-sglang-mtp:
20892089
search-space:
20902090
- { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
20912091

2092-
# agentic-coding sibling — temporarily disabled, blocked by e2e-tests.yml
2093-
# artifact-name mismatch (downloads `agentic_*` but benchmark-tmpl.yml uploads
2094-
# as `bmk_agentic_*`). Re-enable once that workflow is aligned.
2095-
# qwen3.5-bf16-b200-sglang-agentic:
2096-
# image: lmsysorg/sglang:v0.5.12-cu130
2097-
# model: Qwen/Qwen3.5-397B-A17B
2098-
# model-prefix: qwen3.5
2099-
# runner: b200
2100-
# precision: bf16
2101-
# framework: sglang
2102-
# multinode: false
2103-
# scenarios:
2104-
# agentic-coding:
2105-
# - duration: 1800
2106-
# search-space:
2107-
# - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
2092+
# Diverged from qwen3.5-bf16-b200-sglang (agentic-coding sibling). Metadata is
2093+
# identical to origin/main's qwen3.5-bf16-b200-sglang; the split exists because this
2094+
# PR adds an agentic-coding scenarios block that differs from main
2095+
# (either main had none or had a different conc/offload sweep).
2096+
# The original qwen3.5-bf16-b200-sglang entry stays byte-identical to origin/main.
2097+
qwen3.5-bf16-b200-sglang-agentic:
2098+
image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
2099+
model: Qwen/Qwen3.5-397B-A17B
2100+
model-prefix: qwen3.5
2101+
runner: b200
2102+
precision: bf16
2103+
framework: sglang
2104+
multinode: false
2105+
scenarios:
2106+
agentic-coding:
2107+
- duration: 1800
2108+
search-space:
2109+
- { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
21082110

21092111
qwen3.5-fp8-b200-sglang:
21102112
image: lmsysorg/sglang:nightly-dev-20260422-de962f32
@@ -2632,7 +2634,7 @@ kimik2.5-int4-h200-vllm-agentic:
26322634
- { tp: 8, offloading: cpu, conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] }
26332635

26342636
kimik2.5-fp4-b200-vllm:
2635-
image: vllm/vllm-openai:v0.20.2
2637+
image: vllm/vllm-openai:v0.17.0
26362638
model: nvidia/Kimi-K2.5-NVFP4
26372639
model-prefix: kimik2.5
26382640
runner: b200
@@ -4252,7 +4254,7 @@ gptoss-fp4-b200-vllm-agentic:
42524254
- { tp: 8, offloading: cpu, conc-list: [64, 96, 128, 192, 256] }
42534255

42544256
minimaxm2.5-fp8-b200-vllm:
4255-
image: vllm/vllm-openai:v0.21.0
4257+
image: vllm/vllm-openai:v0.19.0-cu130
42564258
model: MiniMaxAI/MiniMax-M2.5
42574259
model-prefix: minimaxm2.5
42584260
runner: b200
@@ -4353,7 +4355,7 @@ minimaxm2.5-fp8-b300-vllm-agentic:
43534355
- { tp: 4, offloading: cpu, conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
43544356

43554357
minimaxm2.5-fp4-b200-vllm:
4356-
image: vllm/vllm-openai:v0.21.0
4358+
image: vllm/vllm-openai:v0.19.0-cu130
43574359
model: nvidia/MiniMax-M2.5-NVFP4
43584360
model-prefix: minimaxm2.5
43594361
runner: b200

KLAUD_DEBUG.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ Seen on: #1460 (dsv4-fp8-h200-sglang+mtp).
6666

6767
## 4. Upstream sglang v0.5.12 B300 regressions
6868

69-
Three distinct upstream regressions on NVIDIA B300 (Blackwell Ultra, `sm_103` — compute capability 10.3) shipped in `lmsysorg/sglang:v0.5.12-cu130`. (sm_120 is for *consumer* Blackwell / RTX 50 series, not B300 — don't propagate that.)
69+
Three distinct upstream regressions on NVIDIA B300 (Blackwell Ultra, `sm_103` — compute capability 10.3) shipped in `lmsysorg/sglang:v0.5.12-cu130`. (`sm_120` is *consumer* Blackwell / RTX 50 series, not B300.)
7070

7171
### 4a. DeepGemm TMA-descriptor crash (GLM-5-FP8)
7272
**Symptom:** CUDA graph capture aborts with `CUDA_ERROR_ILLEGAL_ADDRESS (700)` at `/deepgemm/csrc/.../runtime_utils.hpp:143` on the **first batch size** for **every TP rank**. Server never serves a prompt.
@@ -86,17 +86,17 @@ Filed upstream: sgl-project/sglang#25551. Seen on #1421.
8686
2. Comment out the MTP/EAGLE scenarios on B300 in the recipe.
8787
3. Pin to v0.5.11-cu130.
8888

89-
Filed upstream: sgl-project/sglang#25563. Seen on #1420.
89+
Filed upstream: sgl-project/sglang#25563. Seen on #1420.
9090

9191
### 4c. flash_attn SM-arch assertion (qwen3.5-bf16)
9292
**Symptom:** All 4 TP workers AssertionError on first forward pass:
9393
```
9494
File "/opt/venv/.../sglang/srt/layers/attention/flashattention_backend.py:..."
9595
assert sm_100 <= arch <= sm_110f
9696
```
97-
B300 is `sm_103` (compute capability 10.3, Blackwell Ultra) — which is *nominally inside* the asserted `sm_100..sm_110f` range, yet the assertion still fires. Best guess is the cute kernel's `Arch.sm_110f` set only matches the architecture-specific feature-flag variants it was compiled for (e.g. `sm_100`, `sm_100f`, `sm_110`, `sm_110f`) and `sm_103` / `sm_103a` isn't in that explicit list. Server never becomes healthy; warmup times out at 600s.
97+
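B300 is `sm_103` (compute capability 10.3, Blackwell Ultra), which is *nominally inside* the asserted `sm_100..sm_110f` range, yet the assertion still fires. Best guess: the check only matches the explicit architecture variants the kernel was compiled for (e.g. `sm_100`, `sm_100f`, `sm_110`, `sm_110f`), and `sm_103` / `sm_103a` is not in that list. Server never becomes healthy; warmup times out at 600s.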
9898

99-
**Fix:** Needs an sglang image with `flash_attn` that recognises `sm_103` / `sm_103a` — no local workaround. Pin to `v0.5.11-cu130` in the meantime.
99+
**Fix:** Needs an sglang image whose `flash_attn` recognises `sm_103` / `sm_103a` — no local workaround. Pin to `v0.5.11-cu130` in the meantime.
100100

101101
Seen on #1422.
102102

benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ dynamo:
1111

1212
model:
1313
path: "deepseek-v4-pro"
14-
container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
14+
container: "lmsysorg/sglang:nightly-dev-cu13-20260518-c67b2870"
1515
precision: "mxfp4"
1616

1717
sbatch_directives:
@@ -31,14 +31,12 @@ backend:
3131

3232
prefill_environment:
3333
PYTHONUNBUFFERED: "1"
34-
SGLANG_RADIX_DISABLE_REUSE: "1"
34+
SGLANG_RADIX_FORCE_MISS: "1"
3535
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
3636
SGLANG_DEFAULT_THINKING: "1"
3737
SGLANG_DSV4_REASONING_EFFORT: "max"
3838
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
39-
SGLANG_OPT_USE_JIT_NORM: "1"
40-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
41-
SGLANG_OPT_USE_TOPK_V2: "1"
39+
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
4240
NCCL_MNNVL_ENABLE: "1"
4341
NCCL_CUMEM_ENABLE: "1"
4442
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
@@ -49,14 +47,12 @@ backend:
4947

5048
decode_environment:
5149
PYTHONUNBUFFERED: "1"
52-
SGLANG_RADIX_DISABLE_REUSE: "1"
50+
SGLANG_RADIX_FORCE_MISS: "1"
5351
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
5452
SGLANG_DEFAULT_THINKING: "1"
5553
SGLANG_DSV4_REASONING_EFFORT: "max"
5654
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
57-
SGLANG_OPT_USE_JIT_NORM: "1"
58-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
59-
SGLANG_OPT_USE_TOPK_V2: "1"
55+
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
6056
NCCL_MNNVL_ENABLE: "1"
6157
NCCL_CUMEM_ENABLE: "1"
6258
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"

benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ dynamo:
1111

1212
model:
1313
path: "deepseek-v4-pro"
14-
container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
14+
container: "lmsysorg/sglang:nightly-dev-cu13-20260518-c67b2870"
1515
precision: "mxfp4"
1616

1717
sbatch_directives:
@@ -31,23 +31,16 @@ backend:
3131

3232
prefill_environment:
3333
PYTHONUNBUFFERED: "1"
34-
SGLANG_RADIX_DISABLE_REUSE: "1"
34+
SGLANG_RADIX_FORCE_MISS: "1"
3535
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
3636
SGLANG_DEFAULT_THINKING: "1"
3737
SGLANG_DSV4_REASONING_EFFORT: "max"
3838
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
39-
SGLANG_OPT_USE_JIT_NORM: "1"
40-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
41-
SGLANG_OPT_USE_TOPK_V2: "1"
42-
4339
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
44-
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
45-
SGLANG_OPT_USE_FAST_MASK_EP: "1"
40+
4641
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
47-
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
4842
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
4943
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
50-
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
5144
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
5245

5346
NCCL_MNNVL_ENABLE: "1"
@@ -60,14 +53,11 @@ backend:
6053

6154
decode_environment:
6255
PYTHONUNBUFFERED: "1"
63-
SGLANG_RADIX_DISABLE_REUSE: "1"
56+
SGLANG_RADIX_FORCE_MISS: "1"
6457
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
6558
SGLANG_DEFAULT_THINKING: "1"
6659
SGLANG_DSV4_REASONING_EFFORT: "max"
6760
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
68-
SGLANG_OPT_USE_JIT_NORM: "1"
69-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
70-
SGLANG_OPT_USE_TOPK_V2: "1"
7161
NCCL_MNNVL_ENABLE: "1"
7262
NCCL_CUMEM_ENABLE: "1"
7363
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"

benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ dynamo:
1111

1212
model:
1313
path: "deepseek-v4-pro"
14-
container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
14+
container: "lmsysorg/sglang:nightly-dev-cu13-20260518-c67b2870"
1515
precision: "mxfp4"
1616

1717
sbatch_directives:
@@ -33,23 +33,16 @@ backend:
3333

3434
prefill_environment:
3535
PYTHONUNBUFFERED: "1"
36-
SGLANG_RADIX_DISABLE_REUSE: "1"
36+
SGLANG_RADIX_FORCE_MISS: "1"
3737
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
3838
SGLANG_DEFAULT_THINKING: "1"
3939
SGLANG_DSV4_REASONING_EFFORT: "max"
4040
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
41-
SGLANG_OPT_USE_JIT_NORM: "1"
42-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
43-
SGLANG_OPT_USE_TOPK_V2: "1"
44-
4541
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
46-
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
47-
SGLANG_OPT_USE_FAST_MASK_EP: "1"
42+
4843
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
49-
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
5044
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
5145
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
52-
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
5346
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
5447

5548
NCCL_MNNVL_ENABLE: "1"
@@ -62,22 +55,16 @@ backend:
6255

6356
decode_environment:
6457
PYTHONUNBUFFERED: "1"
65-
SGLANG_RADIX_DISABLE_REUSE: "1"
58+
SGLANG_RADIX_FORCE_MISS: "1"
6659
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
6760
SGLANG_DEFAULT_THINKING: "1"
6861
SGLANG_DSV4_REASONING_EFFORT: "max"
6962
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
70-
SGLANG_OPT_USE_JIT_NORM: "1"
71-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
72-
SGLANG_OPT_USE_TOPK_V2: "1"
73-
7463
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
75-
SGLANG_OPT_USE_FAST_MASK_EP: "1"
64+
7665
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
77-
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
7866
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "2048"
7967
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
80-
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
8168
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
8269

8370
NCCL_MNNVL_ENABLE: "1"

benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ dynamo:
1111

1212
model:
1313
path: "deepseek-v4-pro"
14-
container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
14+
container: "lmsysorg/sglang:nightly-dev-cu13-20260518-c67b2870"
1515
precision: "mxfp4"
1616

1717
sbatch_directives:
@@ -33,23 +33,16 @@ backend:
3333

3434
prefill_environment:
3535
PYTHONUNBUFFERED: "1"
36-
SGLANG_RADIX_DISABLE_REUSE: "1"
36+
SGLANG_RADIX_FORCE_MISS: "1"
3737
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
3838
SGLANG_DEFAULT_THINKING: "1"
3939
SGLANG_DSV4_REASONING_EFFORT: "max"
4040
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
41-
SGLANG_OPT_USE_JIT_NORM: "1"
42-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
43-
SGLANG_OPT_USE_TOPK_V2: "1"
44-
4541
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
46-
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
47-
SGLANG_OPT_USE_FAST_MASK_EP: "1"
42+
4843
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
49-
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
5044
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
5145
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
52-
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
5346
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
5447

5548
NCCL_MNNVL_ENABLE: "1"
@@ -62,22 +55,16 @@ backend:
6255

6356
decode_environment:
6457
PYTHONUNBUFFERED: "1"
65-
SGLANG_RADIX_DISABLE_REUSE: "1"
58+
SGLANG_RADIX_FORCE_MISS: "1"
6659
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
6760
SGLANG_DEFAULT_THINKING: "1"
6861
SGLANG_DSV4_REASONING_EFFORT: "max"
6962
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
70-
SGLANG_OPT_USE_JIT_NORM: "1"
71-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
72-
SGLANG_OPT_USE_TOPK_V2: "1"
73-
7463
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
75-
SGLANG_OPT_USE_FAST_MASK_EP: "1"
64+
7665
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
77-
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
7866
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "2048"
7967
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
80-
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
8168
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
8269

8370
NCCL_MNNVL_ENABLE: "1"

0 commit comments

Comments
 (0)