@@ -33,6 +33,9 @@ BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
3333BENCH_NUM_PROMPTS_MULTIPLIER=" ${BENCH_NUM_PROMPTS_MULTIPLIER:- 10} "
3434BENCH_MAX_CONCURRENCY=" ${BENCH_MAX_CONCURRENCY:- 512} "
3535
#######################################
# Print the largest non-negative integer found in an x-delimited list
# (for example "64x128x512" -> 512).
# Arguments: $1 - the x-delimited list; whitespace around entries is ignored.
# Outputs:   the maximum on stdout; a warning on stderr per skipped entry.
# Returns:   0 if at least one numeric entry was found, 1 otherwise.
#######################################
bench_max_of_x_list() {
  local list=${1//[[:space:]]/}
  local -a entries=()
  local entry max=""

  IFS='x' read -r -a entries <<< "$list"
  # The ${arr[@]+...} form keeps an empty array safe under `set -u` on older bash.
  for entry in ${entries[@]+"${entries[@]}"}; do
    [[ -n "$entry" ]] || continue
    if [[ ! "$entry" =~ ^[0-9]+$ ]]; then
      printf 'WARNING: ignoring non-numeric concurrency entry: %s\n' "$entry" >&2
      continue
    fi
    # 10# forces base 10 so entries such as "08" are not parsed as octal.
    if [[ -z "$max" ]] || (( 10#$entry > max )); then
      max=$(( 10#$entry ))
    fi
  done

  [[ -n "$max" ]] || return 1
  printf '%s\n' "$max"
}

# Extract the maximum concurrency from the x-delimited list.
# If the list holds no usable number, fall back to 512 (the default this script
# uses for BENCH_MAX_CONCURRENCY) instead of propagating an empty value.
if ! BENCH_MAX_CONC_VALUE=$(bench_max_of_x_list "${BENCH_MAX_CONCURRENCY:-}"); then
  printf 'WARNING: no numeric value in BENCH_MAX_CONCURRENCY="%s"; using 512\n' \
    "${BENCH_MAX_CONCURRENCY:-}" >&2
  BENCH_MAX_CONC_VALUE=512
fi
38+
3639# Dry Run for debugging purpose
3740DRY_RUN=" ${DRY_RUN:- 0} "
3841
@@ -184,6 +187,15 @@ else
184187 prefill_enable_two_batch_overlap=" false"
185188fi
186189
#######################################
# When both DP and EP are enabled for prefill, override max-running-requests
# with the maximum benchmark concurrency and derive the MoE max-input-token
# budget from the per-rank dispatch budget.
# Globals read:    PREFILL_ENABLE_DP, PREFILL_ENABLE_EP, BENCH_MAX_CONC_VALUE,
#                  PREFILL_TP_SIZE, MORI_MAX_DISPATCH_TOKENS_PREFILL
# Globals written: prefill_max_running_requests, prefill_dp_ranks,
#                  MORI_MOE_MAX_INPUT_TOKENS_PREFILL
# Outputs:         one summary line on stdout when the override is applied;
#                  a warning on stderr when it is skipped for invalid input.
# Returns:         0 always; on invalid input the earlier values are kept.
#######################################
apply_prefill_dp_ep_override() {
  if [[ "${PREFILL_ENABLE_DP:-}" != "true" || "${PREFILL_ENABLE_EP:-}" != "true" ]]; then
    return 0
  fi

  # Every input must be a positive integer; otherwise the arithmetic below
  # would silently produce 0 or an empty --max-running-requests value.
  local -r positive_int='^[[:space:]]*[1-9][0-9]*[[:space:]]*$'
  local name
  for name in BENCH_MAX_CONC_VALUE PREFILL_TP_SIZE MORI_MAX_DISPATCH_TOKENS_PREFILL; do
    if [[ ! "${!name:-}" =~ $positive_int ]]; then
      printf '[DP+EP override] Prefill: skipped, %s is not a positive integer: "%s"\n' \
        "$name" "${!name:-}" >&2
      return 0
    fi
  done

  prefill_max_running_requests=$(( BENCH_MAX_CONC_VALUE ))
  prefill_dp_ranks=$(( PREFILL_TP_SIZE ))
  # MORI_MAX_DISPATCH_TOKENS_PREFILL itself is not modified by this override
  # (the original note gives its value as 8192 -- set elsewhere in the script).
  MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$(( MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2 ))
  echo "[DP+EP override] Prefill: max-running-requests=${prefill_max_running_requests}, MOE_MAX_INPUT=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
}

apply_prefill_dp_ep_override
198+
187199# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp)
188200if [[ " $DECODE_ENABLE_DP " == " true" ]]; then
189201 decode_cuda_graph_bs=($( seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END ) )
@@ -196,6 +208,18 @@ else
196208 decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP
197209fi
198210
#######################################
# When both DP and EP are enabled for decode, override max-running-requests
# with the maximum benchmark concurrency and derive the dispatch-token,
# MoE max-input-token and inter-kernel-switch settings from it.
# Globals read:    DECODE_ENABLE_DP, DECODE_ENABLE_EP, BENCH_MAX_CONC_VALUE,
#                  DECODE_TP_SIZE
# Globals written: decode_max_running_requests, decode_dp_ranks,
#                  MORI_MAX_DISPATCH_TOKENS_DECODE,
#                  MORI_MOE_MAX_INPUT_TOKENS_DECODE,
#                  SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD (exported)
# Outputs:         one summary line on stdout when the override is applied;
#                  a warning on stderr when it is skipped for invalid input.
# Returns:         0 always; on invalid input the earlier values are kept.
#######################################
apply_decode_dp_ep_override() {
  if [[ "${DECODE_ENABLE_DP:-}" != "true" || "${DECODE_ENABLE_EP:-}" != "true" ]]; then
    return 0
  fi

  # Both inputs must be positive integers: a zero or empty rank count would
  # make the division below fail with "division by 0".
  local -r positive_int='^[[:space:]]*[1-9][0-9]*[[:space:]]*$'
  local name
  for name in BENCH_MAX_CONC_VALUE DECODE_TP_SIZE; do
    if [[ ! "${!name:-}" =~ $positive_int ]]; then
      printf '[DP+EP override] Decode: skipped, %s is not a positive integer: "%s"\n' \
        "$name" "${!name:-}" >&2
      return 0
    fi
  done

  decode_max_running_requests=$(( BENCH_MAX_CONC_VALUE ))
  decode_dp_ranks=$(( DECODE_TP_SIZE ))
  # Ceiling division: the per-rank budget times the rank count must cover the
  # full concurrency, and must never round down to 0 when concurrency < ranks.
  MORI_MAX_DISPATCH_TOKENS_DECODE=$(( (decode_max_running_requests + decode_dp_ranks - 1) / decode_dp_ranks ))
  MORI_MOE_MAX_INPUT_TOKENS_DECODE=$(( MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10 ))
  # Update the derived variable and export it for child processes.
  SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$(( MORI_MAX_DISPATCH_TOKENS_DECODE * 2 ))
  export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD
  echo "[DP+EP override] Decode: max-running-requests=${decode_max_running_requests}, DISPATCH_TOKENS=${MORI_MAX_DISPATCH_TOKENS_DECODE}, MOE_MAX_INPUT=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}, INTER_KERNEL_SWITCH=${SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD}"
}

apply_decode_dp_ep_override
222+
199223# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
200224PREFILL_MODE_FLAGS=" --mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
201225if [[ " $PREFILL_DISABLE_RADIX_CACHE " == " True" ]] || [[ " $PREFILL_DISABLE_RADIX_CACHE " == " true" ]]; then
@@ -343,11 +367,6 @@ if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]
343367 DECODE_SERVER_CONFIG=$( echo " $DECODE_SERVER_CONFIG " | sed ' s/--ep-dispatch-algorithm fake//g' )
344368 unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL
345369 unset MORI_MOE_MAX_INPUT_TOKENS_DECODE
346- # NOTE: currently, with fp8_combine set, the evals do not pass on the InferenceX eval harness
347- # or on the SGLang native harness for high concurrency 4k, and get nowhere near the golden score of
348- # 0.95 on even basic GSM8k grade-school math, as confirmed by @billishyahao from AMD
349- # and by @Oseltamivir. This was initially merged with @billishyahao promising
350- # a fast-follow PR to fix the evals by adding quant correction in the fp8 combine
351370fi
352371
353372# =============================================================================
@@ -398,7 +417,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
398417 PREFILL_MORI_MOE_ENV=" SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} "
399418 fi
400419 set +x
401- PREFILL_CMD=" ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
420+ PREFILL_CMD=" SGLANG_MORI_COMBINE_DTYPE= ${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
402421 --model-path $MODEL_DIR /$MODEL_NAME \
403422 --disaggregation-mode prefill \
404423 --disaggregation-ib-device ${IBDEVICES} \
@@ -630,7 +649,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
630649 PREFILL_MORI_MOE_ENV=" SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} "
631650 fi
632651 set +x
633- PREFILL_CMD=" ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
652+ PREFILL_CMD=" SGLANG_MORI_COMBINE_DTYPE= ${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
634653 --model-path $MODEL_DIR /${MODEL_NAME} \
635654 --disaggregation-mode prefill \
636655 --disaggregation-ib-device ${IBDEVICES} \
698717 DECODE_MORI_MOE_ENV=" SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} "
699718 fi
700719 set +x
701- DECODE_CMD=" ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
720+ DECODE_CMD=" SGLANG_MORI_COMBINE_DTYPE= ${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
702721 --model-path ${MODEL_DIR} /${MODEL_NAME} \
703722 --disaggregation-mode decode \
704723 --disaggregation-ib-device ${IBDEVICES} \
758777fi
759778
760779echo " Script completed successfully"
761- exit 0
780+ exit 0
0 commit comments