Skip to content

Commit 8fa3c96

Browse files
committed
fix(agentic): cap MI355X HiCache graph capture
1 parent dbfbd56 commit 8fa3c96

1 file changed

Lines changed: 12 additions & 1 deletion

File tree

benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ mkdir -p "$RESULT_DIR"
4444

4545
CACHE_ARGS=()
4646
WARMUP_ARGS=()
47+
CUDA_GRAPH_MAX_BS="$CONC"
4748
case "$OFFLOADING" in
4849
none)
4950
CACHE_ARGS=(--disable-radix-cache)
@@ -83,6 +84,16 @@ case "$OFFLOADING" in
8384
# request has timed out after 600s on this Qwen MI355X path. Let aiperf
8485
# own benchmark traffic instead of blocking server readiness on it.
8586
WARMUP_ARGS=(--skip-server-warmup)
87+
# Keep request concurrency as the swept variable, but do not force
88+
# HiCache runs to capture ROCm graphs at every high concurrency point.
89+
# The conc=32 HiCache job crashed after startup readiness, before any
90+
# aiperf traffic, while conc=16 is the highest known-good capture size
91+
# for this model/server path. Requests above the capture size can still
92+
# run; they just do not require a larger captured graph at startup.
93+
HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}"
94+
if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
95+
CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
96+
fi
8697
;;
8798
*)
8899
echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
@@ -105,7 +116,7 @@ SGLANG_CMD=(
105116
--trust-remote-code
106117
--tokenizer-worker-num 6
107118
--enable-aiter-allreduce-fusion
108-
--cuda-graph-max-bs "$CONC"
119+
--cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
109120
--max-running-requests "$CONC"
110121
--max-prefill-tokens 32768
111122
--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL"

0 commit comments

Comments
 (0)