2525nvidia-smi
2626
2727# Common SGLANG env vars (apply to every config).
28- export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
2928export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
3029export SGLANG_OPT_USE_JIT_NORM=1
3130export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
@@ -48,6 +47,8 @@ EVAL_CONTEXT_ARGS=""
4847if [ " ${EVAL_ONLY} " = " true" ]; then
4948 setup_eval_context
5049 EVAL_CONTEXT_ARGS=" --context-length $EVAL_MAX_MODEL_LEN "
50+ else
51+ EVAL_CONTEXT_ARGS=" --context-length 16384"
5152fi
5253
5354start_gpu_monitor --output " $PWD /gpu_metrics.csv"
@@ -60,105 +61,59 @@ else
6061 SWA_FULL_TOKENS_RATIO=0.1
6162fi
6263
63- # Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm
64- # script's pattern). DP-attention runs the empirically-tuned high-concurrency
65- # recipe (flashinfer_mxfp4 runner + halved prefill chunks + prefill-delayer);
66- # single-instance uses flashinfer_mxfp4 with the cookbook defaults.
64+ # Pick the launch recipe based on the two-line submission frontier:
65+ # TP8/no-DP-attn for low latency and DEP8/DP-attn for throughput.
6766DEEPEP_CONFIG=' {"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
6867
69- # Default; the DP-attn branch below overrides to 0.94.
70- MEM_FRACTION_STATIC=0.90
71-
7268if [ " ${DP_ATTENTION} " = " true" ]; then
69+ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
70+ export SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION=8
7371 export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
7472 export SGLANG_OPT_USE_FAST_MASK_EP=1
7573 export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
7674 export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
7775 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
78- # ep=8 in the yaml signals the mega_moe deepep backend; check high-conc
79- # recipes first (they also have ep=8) so they aren't shadowed by the
80- # medium-conc EP_SIZE=8 branch below.
81- if [ " $CONC " = " 2048" ] || [ " $CONC " = " 4096" ] || [ " $CONC " = " 8192" ]; then
82- export NVSHMEM_DISABLE_IB=1
83- export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
84- export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
85- export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
86- if [ " $CONC " = " 2048" ]; then
87- export SGLANG_LOG_FORWARD_ITERS=1
88- export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
89- CUDA_GRAPH_MAX_BS=288
90- MAX_RUNNING_REQUESTS=2560
91- MEM_FRACTION_STATIC=0.87
92- SWA_FULL_TOKENS_RATIO=0.06
93- TOKENIZER_WORKER_NUM=4
94- elif [ " $CONC " = " 4096" ]; then
95- export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
96- CUDA_GRAPH_MAX_BS=544
97- MAX_RUNNING_REQUESTS=4352
98- MEM_FRACTION_STATIC=0.835
99- SWA_FULL_TOKENS_RATIO=0.075
100- TOKENIZER_WORKER_NUM=8
101- else
102- export SGLANG_OPT_USE_ONLINE_COMPRESS=1
103- export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
104- CUDA_GRAPH_MAX_BS=1088
105- MAX_RUNNING_REQUESTS=8192
106- MEM_FRACTION_STATIC=0.80
107- SWA_FULL_TOKENS_RATIO=0.3
108- TOKENIZER_WORKER_NUM=16
109- fi
110- PARALLEL_ARGS=(
111- --dp-size " $TP "
112- --enable-dp-attention
113- --moe-a2a-backend deepep
114- --cuda-graph-max-bs " $CUDA_GRAPH_MAX_BS "
115- --deepep-config " $DEEPEP_CONFIG "
116- --chunked-prefill-size 65536
117- --tokenizer-worker-num " $TOKENIZER_WORKER_NUM "
118- --enable-prefill-delayer
119- )
120- if [ " $CONC " = " 4096" ]; then
121- PARALLEL_ARGS+=(--decode-log-interval 5)
122- fi
123- if [ " $CONC " = " 8192" ]; then
124- PARALLEL_ARGS+=(--stream-interval 30)
125- fi
126- elif [ " ${EP_SIZE} " = " 8" ]; then
127- export NVSHMEM_DISABLE_IB=1
128- export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
129- export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
130- export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550
131- PARALLEL_ARGS=(
132- --dp-size " $TP "
133- --enable-dp-attention
134- --moe-a2a-backend deepep
135- --cuda-graph-max-bs 550
136- --deepep-config " $DEEPEP_CONFIG "
137- --chunked-prefill-size 16384
138- --enable-prefill-delayer
139- )
140- MAX_RUNNING_REQUESTS=768
141- MEM_FRACTION_STATIC=0.94
142- else
143- export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
144- export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
145- export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
146- PARALLEL_ARGS=(
147- --dp-size " $TP "
148- --enable-dp-attention
149- --moe-runner-backend flashinfer_mxfp4
150- --disable-flashinfer-autotune
151- --deepep-config " $DEEPEP_CONFIG "
152- --chunked-prefill-size 16384
153- --enable-prefill-delayer
154- )
155- MEM_FRACTION_STATIC=0.94
156- fi
76+ export NVSHMEM_DISABLE_IB=1
77+ export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
78+ export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
79+ export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
80+ export SGLANG_OPT_USE_ONLINE_COMPRESS=1
81+ export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=2048
82+ export SGLANG_EXPERIMENTAL_ENABLE_PIECEWISE_CUDA_GRAPH_MOE_A2A=1
83+ export NCCL_MNNVL_ENABLE=1
84+ export NCCL_CUMEM_ENABLE=1
85+ export MC_FORCE_MNNVL=1
86+ export SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True
87+
88+ MEM_FRACTION_STATIC=0.835
89+ MAX_RUNNING_REQUESTS=4352
90+ SWA_FULL_TOKENS_RATIO=0.075
91+ PARALLEL_ARGS=(
92+ --dp-size " $TP "
93+ --enable-dp-attention
94+ --moe-a2a-backend deepep
95+ --deepep-config " $DEEPEP_CONFIG "
96+ --cuda-graph-max-bs 544
97+ --enable-mixed-chunk
98+ --chunked-prefill-size 16384
99+ --max-prefill-tokens 16384
100+ --tokenizer-worker-num 8
101+ --decode-log-interval 5
102+ --stream-interval 30
103+ )
157104else
105+ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=1
106+ MEM_FRACTION_STATIC=0.90
107+ MAX_RUNNING_REQUESTS=512
158108 PARALLEL_ARGS=(
159109 --moe-runner-backend flashinfer_mxfp4
160110 --chunked-prefill-size 8192
161111 --disable-flashinfer-autotune
112+ --cuda-graph-max-bs 512
113+ --tokenizer-worker-num 8
114+ --decode-log-interval 60
115+ --stream-interval 30
116+ --scheduler-recv-interval 30
162117 )
163118fi
164119
@@ -177,7 +132,7 @@ PYTHONNOUSERSITE=1 sglang serve \
177132 --port $PORT \
178133 --trust-remote-code \
179134 --tp $TP \
180- --max-running-requests " ${ MAX_RUNNING_REQUESTS:- $(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))} " \
135+ --max-running-requests " $MAX_RUNNING_REQUESTS " \
181136 --mem-fraction-static " $MEM_FRACTION_STATIC " \
182137 --swa-full-tokens-ratio " $SWA_FULL_TOKENS_RATIO " \
183138 " ${PARALLEL_ARGS[@]} " $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
0 commit comments