Skip to content

Commit 80c944e

Browse files
Update dpskv4 GB300 non-MTP disagg SGLang image to nightly-20260519 (#1492)
* Update dpskv4 GB300 non-MTP disagg SGLang image and clean env vars
* Update dsv4 GB300 non-MTP disagg SGLang image to nightly-20260519
* Add custom_tokenizer to sglang DSV4 GB300 recipes
  Fix benchmark client crash due to transformers not recognizing the deepseek_v4 model type when loading the tokenizer.
* Set SGLANG_OPT_FP8_WO_A_GEMM=0 for all prefill and decode environments
---------
Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
1 parent afb49f5 commit 80c944e

8 files changed

Lines changed: 87 additions & 155 deletions

.github/configs/nvidia-master.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8740,7 +8740,7 @@ dsv4-fp4-gb300-dynamo-vllm:
87408740
dp-attn: true
87418741

87428742
dsv4-fp4-gb300-dynamo-sglang:
8743-
image: lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev
8743+
image: lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647
87448744
model: deepseek-ai/DeepSeek-V4-Pro
87458745
model-prefix: dsv4
87468746
runner: gb300-cw

benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml

Lines changed: 12 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ name: "disagg-gb300-10p1d-dep4-dep16-14-c8192"
3333

3434
model:
3535
path: "deepseek-v4-pro"
36-
container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev"
36+
container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647"
3737
precision: "fp4"
3838

3939
dynamo:
@@ -74,24 +74,14 @@ backend:
7474

7575
prefill_environment:
7676
PYTHONUNBUFFERED: "1"
77-
SGLANG_RADIX_DISABLE_REUSE: "1"
77+
SGLANG_RADIX_FORCE_MISS: "1"
7878
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
79-
SGLANG_ENABLE_THINKING: "1"
80-
SGLANG_REASONING_EFFORT: "max"
79+
SGLANG_DEFAULT_THINKING: "1"
80+
SGLANG_DSV4_REASONING_EFFORT: "max"
8181
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
8282
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
83-
SGLANG_OPT_USE_JIT_NORM: "1"
84-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
85-
SGLANG_OPT_USE_TOPK_V2: "1"
86-
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
87-
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
88-
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
89-
SGLANG_OPT_USE_FAST_MASK_EP: "1"
90-
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
9183
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192"
92-
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
9384
SGLANG_OPT_USE_ONLINE_COMPRESS: "1"
94-
SGLANG_OPT_FP8_WO_A_GEMM: "1"
9585
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
9686
NCCL_MNNVL_ENABLE: "1"
9787
NCCL_CUMEM_ENABLE: "1"
@@ -104,24 +94,17 @@ backend:
10494
SGLANG_LOG_FORWARD_ITERS: "1"
10595
SGLANG_LOG_MS: "1"
10696
SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
97+
SGLANG_OPT_FP8_WO_A_GEMM: "0"
10798

10899
decode_environment:
109100
PYTHONUNBUFFERED: "1"
110-
SGLANG_RADIX_DISABLE_REUSE: "1"
101+
SGLANG_RADIX_FORCE_MISS: "1"
111102
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
112-
SGLANG_ENABLE_THINKING: "1"
113-
SGLANG_REASONING_EFFORT: "max"
103+
SGLANG_DEFAULT_THINKING: "1"
104+
SGLANG_DSV4_REASONING_EFFORT: "max"
114105
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
115106
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
116-
SGLANG_OPT_USE_JIT_NORM: "1"
117-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
118-
SGLANG_OPT_USE_TOPK_V2: "1"
119-
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
120-
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
121-
SGLANG_OPT_USE_FAST_MASK_EP: "1"
122-
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
123107
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280"
124-
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
125108
SGLANG_OPT_USE_ONLINE_COMPRESS: "1"
126109
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
127110
NCCL_MNNVL_ENABLE: "1"
@@ -136,7 +119,7 @@ backend:
136119
SGLANG_LOG_FORWARD_ITERS: "1"
137120
SGLANG_LOG_MS: "1"
138121
SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
139-
# SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
122+
SGLANG_OPT_FP8_WO_A_GEMM: "0"
140123
# SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 is single-node only and corrupts results in 2-node decode setups.
141124

142125
sglang_config:
@@ -152,7 +135,7 @@ backend:
152135
expert-parallel-size: 4
153136

154137
enable-dp-attention: true
155-
moe-a2a-backend: "deepep"
138+
moe-a2a-backend: "megamoe"
156139
deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}'
157140
moe-dense-tp-size: 1
158141

@@ -171,7 +154,7 @@ backend:
171154
stream-interval: 60
172155

173156
load-balance-method: "total_requests"
174-
moe-a2a-backend: "deepep"
157+
moe-a2a-backend: "megamoe"
175158

176159
disaggregation-mode: "decode"
177160
disaggregation-transfer-backend: mooncake
@@ -196,3 +179,4 @@ benchmark:
196179
concurrencies: "8192"
197180
req_rate: "inf"
198181
use_chat_template: false
182+
custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"

benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml

Lines changed: 12 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ name: "disagg-gb300-12p1d-dep4-dep12-15-c21504"
3333

3434
model:
3535
path: "deepseek-v4-pro"
36-
container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev"
36+
container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647"
3737
precision: "fp4"
3838

3939
dynamo:
@@ -74,24 +74,14 @@ backend:
7474

7575
prefill_environment:
7676
PYTHONUNBUFFERED: "1"
77-
SGLANG_RADIX_DISABLE_REUSE: "1"
77+
SGLANG_RADIX_FORCE_MISS: "1"
7878
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
79-
SGLANG_ENABLE_THINKING: "1"
80-
SGLANG_REASONING_EFFORT: "max"
79+
SGLANG_DEFAULT_THINKING: "1"
80+
SGLANG_DSV4_REASONING_EFFORT: "max"
8181
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
8282
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
83-
SGLANG_OPT_USE_JIT_NORM: "1"
84-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
85-
SGLANG_OPT_USE_TOPK_V2: "1"
86-
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
87-
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
88-
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
89-
SGLANG_OPT_USE_FAST_MASK_EP: "1"
90-
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
9183
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192"
92-
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
9384
SGLANG_OPT_USE_ONLINE_COMPRESS: "1"
94-
SGLANG_OPT_FP8_WO_A_GEMM: "1"
9585
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
9686
NCCL_MNNVL_ENABLE: "1"
9787
NCCL_CUMEM_ENABLE: "1"
@@ -104,24 +94,17 @@ backend:
10494
SGLANG_LOG_FORWARD_ITERS: "1"
10595
SGLANG_LOG_MS: "1"
10696
SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
97+
SGLANG_OPT_FP8_WO_A_GEMM: "0"
10798

10899
decode_environment:
109100
PYTHONUNBUFFERED: "1"
110-
SGLANG_RADIX_DISABLE_REUSE: "1"
101+
SGLANG_RADIX_FORCE_MISS: "1"
111102
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
112-
SGLANG_ENABLE_THINKING: "1"
113-
SGLANG_REASONING_EFFORT: "max"
103+
SGLANG_DEFAULT_THINKING: "1"
104+
SGLANG_DSV4_REASONING_EFFORT: "max"
114105
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
115106
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
116-
SGLANG_OPT_USE_JIT_NORM: "1"
117-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
118-
SGLANG_OPT_USE_TOPK_V2: "1"
119-
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
120-
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
121-
SGLANG_OPT_USE_FAST_MASK_EP: "1"
122-
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
123107
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280"
124-
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
125108
SGLANG_OPT_USE_ONLINE_COMPRESS: "1"
126109
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
127110
NCCL_MNNVL_ENABLE: "1"
@@ -136,7 +119,7 @@ backend:
136119
SGLANG_LOG_FORWARD_ITERS: "1"
137120
SGLANG_LOG_MS: "1"
138121
SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
139-
# SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
122+
SGLANG_OPT_FP8_WO_A_GEMM: "0"
140123
# SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 is single-node only and corrupts results in 2-node decode setups.
141124

142125
sglang_config:
@@ -152,7 +135,7 @@ backend:
152135
expert-parallel-size: 4
153136

154137
enable-dp-attention: true
155-
moe-a2a-backend: "deepep"
138+
moe-a2a-backend: "megamoe"
156139
deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}'
157140
moe-dense-tp-size: 1
158141

@@ -171,7 +154,7 @@ backend:
171154
stream-interval: 60
172155

173156
load-balance-method: "total_requests"
174-
moe-a2a-backend: "deepep"
157+
moe-a2a-backend: "megamoe"
175158

176159
disaggregation-mode: "decode"
177160
disaggregation-transfer-backend: mooncake
@@ -196,3 +179,4 @@ benchmark:
196179
concurrencies: "21504"
197180
req_rate: "inf"
198181
use_chat_template: false
182+
custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"

benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml

Lines changed: 12 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ name: "disagg-gb300-1p1d-dep4-dep16-5-c1024"
3333

3434
model:
3535
path: "deepseek-v4-pro"
36-
container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev"
36+
container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647"
3737
precision: "fp4"
3838

3939
dynamo:
@@ -74,24 +74,14 @@ backend:
7474

7575
prefill_environment:
7676
PYTHONUNBUFFERED: "1"
77-
SGLANG_RADIX_DISABLE_REUSE: "1"
77+
SGLANG_RADIX_FORCE_MISS: "1"
7878
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
79-
SGLANG_ENABLE_THINKING: "1"
80-
SGLANG_REASONING_EFFORT: "max"
79+
SGLANG_DEFAULT_THINKING: "1"
80+
SGLANG_DSV4_REASONING_EFFORT: "max"
8181
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
8282
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
83-
SGLANG_OPT_USE_JIT_NORM: "1"
84-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
85-
SGLANG_OPT_USE_TOPK_V2: "1"
86-
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
87-
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
88-
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
89-
SGLANG_OPT_USE_FAST_MASK_EP: "1"
90-
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
9183
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192"
92-
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
9384
SGLANG_OPT_USE_ONLINE_COMPRESS: "1"
94-
SGLANG_OPT_FP8_WO_A_GEMM: "1"
9585
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
9686
NCCL_MNNVL_ENABLE: "1"
9787
NCCL_CUMEM_ENABLE: "1"
@@ -104,24 +94,17 @@ backend:
10494
SGLANG_LOG_FORWARD_ITERS: "1"
10595
SGLANG_LOG_MS: "1"
10696
SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
97+
SGLANG_OPT_FP8_WO_A_GEMM: "0"
10798

10899
decode_environment:
109100
PYTHONUNBUFFERED: "1"
110-
SGLANG_RADIX_DISABLE_REUSE: "1"
101+
SGLANG_RADIX_FORCE_MISS: "1"
111102
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
112-
SGLANG_ENABLE_THINKING: "1"
113-
SGLANG_REASONING_EFFORT: "max"
103+
SGLANG_DEFAULT_THINKING: "1"
104+
SGLANG_DSV4_REASONING_EFFORT: "max"
114105
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
115106
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
116-
SGLANG_OPT_USE_JIT_NORM: "1"
117-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
118-
SGLANG_OPT_USE_TOPK_V2: "1"
119-
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
120-
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
121-
SGLANG_OPT_USE_FAST_MASK_EP: "1"
122-
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
123107
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280"
124-
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
125108
SGLANG_OPT_USE_ONLINE_COMPRESS: "1"
126109
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
127110
NCCL_MNNVL_ENABLE: "1"
@@ -136,7 +119,7 @@ backend:
136119
SGLANG_LOG_FORWARD_ITERS: "1"
137120
SGLANG_LOG_MS: "1"
138121
SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
139-
# SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
122+
SGLANG_OPT_FP8_WO_A_GEMM: "0"
140123
# SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 is single-node only and corrupts results in 2-node decode setups.
141124

142125
sglang_config:
@@ -152,7 +135,7 @@ backend:
152135
expert-parallel-size: 4
153136

154137
enable-dp-attention: true
155-
moe-a2a-backend: "deepep"
138+
moe-a2a-backend: "megamoe"
156139
deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}'
157140
moe-dense-tp-size: 1
158141

@@ -171,7 +154,7 @@ backend:
171154
stream-interval: 60
172155

173156
load-balance-method: "total_requests"
174-
moe-a2a-backend: "deepep"
157+
moe-a2a-backend: "megamoe"
175158

176159
disaggregation-mode: "decode"
177160
disaggregation-transfer-backend: mooncake
@@ -196,3 +179,4 @@ benchmark:
196179
concurrencies: "1024"
197180
req_rate: "inf"
198181
use_chat_template: false
182+
custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"

benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml

Lines changed: 12 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ name: "disagg-gb300-1p1d-tp4-tp4-2-c1"
3333

3434
model:
3535
path: "deepseek-v4-pro"
36-
container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev"
36+
container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647"
3737
precision: "fp4"
3838

3939
# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin
@@ -80,40 +80,37 @@ backend:
8080

8181
prefill_environment:
8282
PYTHONUNBUFFERED: "1"
83-
SGLANG_RADIX_DISABLE_REUSE: "1"
83+
SGLANG_RADIX_FORCE_MISS: "1"
8484
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
85-
SGLANG_ENABLE_THINKING: "1"
86-
SGLANG_REASONING_EFFORT: "max"
85+
SGLANG_DEFAULT_THINKING: "1"
86+
SGLANG_DSV4_REASONING_EFFORT: "max"
8787
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
88-
SGLANG_OPT_USE_JIT_NORM: "1"
89-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
90-
SGLANG_OPT_USE_TOPK_V2: "1"
88+
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
9189
NCCL_MNNVL_ENABLE: "1"
9290
NCCL_CUMEM_ENABLE: "1"
9391
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
9492
MC_FORCE_MNNVL: "1"
9593
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
9694
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
9795
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
96+
SGLANG_OPT_FP8_WO_A_GEMM: "0"
9897

9998
decode_environment:
10099
PYTHONUNBUFFERED: "1"
101-
SGLANG_RADIX_DISABLE_REUSE: "1"
100+
SGLANG_RADIX_FORCE_MISS: "1"
102101
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
103-
SGLANG_ENABLE_THINKING: "1"
104-
SGLANG_REASONING_EFFORT: "max"
102+
SGLANG_DEFAULT_THINKING: "1"
103+
SGLANG_DSV4_REASONING_EFFORT: "max"
105104
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
106-
SGLANG_OPT_USE_JIT_NORM: "1"
107-
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
108-
SGLANG_OPT_USE_TOPK_V2: "1"
105+
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
109106
NCCL_MNNVL_ENABLE: "1"
110107
NCCL_CUMEM_ENABLE: "1"
111108
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
112109
MC_FORCE_MNNVL: "1"
113110
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
114111
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
115112
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
116-
# SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
113+
SGLANG_OPT_FP8_WO_A_GEMM: "0"
117114
# SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 is single-node only and corrupts results in 2-node decode setups.
118115

119116
sglang_config:
@@ -167,3 +164,4 @@ benchmark:
167164
concurrencies: "1"
168165
req_rate: "inf"
169166
use_chat_template: false
167+
custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"

0 commit comments

Comments
 (0)