@@ -44,6 +44,7 @@ mkdir -p "$RESULT_DIR"
4444
4545CACHE_ARGS=()
4646WARMUP_ARGS=()
47+ CUDA_GRAPH_MAX_BS="$CONC"
4748case "$OFFLOADING" in
4849 none)
4950 CACHE_ARGS=(--disable-radix-cache)
@@ -83,6 +84,16 @@ case "$OFFLOADING" in
8384 # request has timed out after 600s on this Qwen MI355X path. Let aiperf
8485 # own benchmark traffic instead of blocking server readiness on it.
8586 WARMUP_ARGS=(--skip-server-warmup)
87+ # Keep request concurrency as the swept variable, but do not force
88+ # HiCache runs to capture ROCm graphs at every high concurrency point.
89+ # The conc=32 HiCache job crashed after startup readiness, before any
90+ # aiperf traffic, while conc=16 is the highest known-good capture size
91+ # for this model/server path. Requests above the capture size can still
92+ # run; they just do not require a larger captured graph at startup.
93+ HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}"
94+ if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
95+ CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
96+ fi
8697 ;;
8798 * )
8899 echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
@@ -105,7 +116,7 @@ SGLANG_CMD=(
105116 --trust-remote-code
106117 --tokenizer-worker-num 6
107118 --enable-aiter-allreduce-fusion
108- --cuda-graph-max-bs "$CONC"
119+ --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
109120 --max-running-requests "$CONC"
110121 --max-prefill-tokens 32768
111122 --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL"
0 commit comments