Skip to content

Commit e8b2230

Browse files
ichbinblauclaude
andcommitted
fix: restore DP+EP override blocks and trailing newline in server_sglang.sh
Add BENCH_MAX_CONC_VALUE extraction and the two DP+EP override blocks that the refactor from server.sh dropped. These adjust max-running-requests, dispatch tokens, and MOE input tokens when both DP and EP are enabled. Also add trailing newline for POSIX compliance. server_sglang.sh now matches upstream server.sh exactly. Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
1 parent d034fcd commit e8b2230

1 file changed

Lines changed: 25 additions & 6 deletions

File tree

benchmarks/multi_node/amd_utils/server_sglang.sh

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
3333
BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
3434
BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"
3535

36+
# Extract the maximum concurrency from the x-delimited list
37+
BENCH_MAX_CONC_VALUE=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
38+
3639
# Dry Run for debugging purpose
3740
DRY_RUN="${DRY_RUN:-0}"
3841

@@ -184,6 +187,15 @@ else
184187
prefill_enable_two_batch_overlap="false"
185188
fi
186189

190+
# When both DP and EP are enabled, override max-running-requests with max bench concurrency
191+
if [[ "$PREFILL_ENABLE_DP" == "true" ]] && [[ "$PREFILL_ENABLE_EP" == "true" ]]; then
192+
prefill_max_running_requests=$BENCH_MAX_CONC_VALUE
193+
prefill_dp_ranks=$PREFILL_TP_SIZE
194+
# MORI_MAX_DISPATCH_TOKENS_PREFILL stays at 8192 (no change)
195+
MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2))
196+
echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL"
197+
fi
198+
187199
# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp)
188200
if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
189201
decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END))
@@ -196,6 +208,18 @@ else
196208
decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP
197209
fi
198210

211+
# When both DP and EP are enabled, override max-running-requests and dispatch tokens
212+
if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; then
213+
decode_max_running_requests=$BENCH_MAX_CONC_VALUE
214+
decode_dp_ranks=$DECODE_TP_SIZE
215+
MORI_MAX_DISPATCH_TOKENS_DECODE=$((BENCH_MAX_CONC_VALUE / decode_dp_ranks))
216+
MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10))
217+
# Update derived variable
218+
SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
219+
export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD
220+
echo "[DP+EP override] Decode: max-running-requests=$decode_max_running_requests, DISPATCH_TOKENS=$MORI_MAX_DISPATCH_TOKENS_DECODE, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_DECODE, INTER_KERNEL_SWITCH=$SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD"
221+
fi
222+
199223
# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
200224
PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
201225
if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
@@ -343,11 +367,6 @@ if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]
343367
DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
344368
unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL
345369
unset MORI_MOE_MAX_INPUT_TOKENS_DECODE
346-
# NOTE: currently, with fp8_combine set, the evals do not pass on the InferenceX eval harness
347-
# or on the SGLang native harness for high concurrency 4k, and get nowhere near the golden score of
348-
# 0.95 on even basic GSM8k grade school math as confirmed by @billishyahao from AMD
349-
# and as confirmed by @Oseltamivir. This was initially merged with @billishyahao promising
350-
# a fast-follow PR to fix the evals via quant correction in the fp8 combine
351370
fi
352371

353372
# =============================================================================
@@ -758,4 +777,4 @@ else
758777
fi
759778

760779
echo "Script completed successfully"
761-
exit 0
780+
exit 0

0 commit comments

Comments
 (0)