#!/usr/bin/env bash
set -euo pipefail
set -x

# Agentic trace replay benchmark for Qwen3.5 FP8 on MI300X using SGLang.
#
# Base server recipe follows the upstream MI300X reference
# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe):
# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75.
# NOTE(review): the launch command in this script passes
# --mem-fraction-static 0.8 and no allreduce-fusion flag; confirm whether the
# deviation from the reference recipe is intentional.
# The agentic harness (resolve_trace_source / build_replay_cmd /
# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
# --disable-radix-cache is dropped because agentic replay needs prefix reuse.
#
# Required env vars (the names passed to check_env_vars below):
#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE,
#   DP_ATTENTION
#
# Optional env vars (default in parentheses):
#   PORT (8888), DURATION (1800), SCHEDULER_RECV_INTERVAL (30),
#   MODEL_PATH (unset: the model is fetched with a plain `hf download`)
#
# OFFLOADING values:
#   none    - SGLang GPU KV with the default RadixAttention prefix cache.
#   hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.

# Shared helpers (check_env_vars, the agentic harness functions,
# wait_for_server_ready) come from the benchmark library two levels up.
source "$(dirname "$0")/../../benchmark_lib.sh"

check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION

PORT=${PORT:-8888}
DURATION=${DURATION:-1800}
# EP_SIZE is also in the required list above, so this default is only a
# safety net for the case where the check lets an empty value through.
EP_SIZE=${EP_SIZE:-1}

SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30}

# Under Slurm, record which job and node produced this run.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi
34+
# Make sure the model weights are available before the server starts.
#   * MODEL_PATH unset or empty (stand-alone runs): run a plain `hf download`
#     and export MODEL_PATH as the repo id.
#   * MODEL_PATH set: download into that directory only when it is missing or
#     empty. `hf download` creates the target dir if needed and is itself
#     idempotent, so this check merely skips a redundant call.
if [[ -z "${MODEL_PATH:-}" ]]; then
  hf download "$MODEL"
  export MODEL_PATH="$MODEL"
elif [[ ! -d "$MODEL_PATH" ]] || [[ -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
  hf download "$MODEL" --local-dir "$MODEL_PATH"
fi

# Best-effort GPU inventory for the job log; a missing or failing tool must
# not abort the run under `set -e`.
rocm-smi || true
amd-smi || true
49+
# ---- Select trace corpus ----------------------------------------------------
# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
# unfiltered 052726 corpus, whose ~1M-token traces get rejected and add no perf
# signal at high concurrency.
# Previously used loader: semianalysis_cc_traces_weka_with_subagents_256k
# Current loader is the 060226 variant of the 256k-capped corpus.
WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
export WEKA_LOADER_OVERRIDE

# ---- Resolve traces and install deps ----------------------------------------
resolve_trace_source
install_agentic_deps
61+
# ---- Cache / offload config -------------------------------------------------
# Translates OFFLOADING into server flags:
#   CACHE_ARGS        - extra KV-cache flags (empty for OFFLOADING=none)
#   WARMUP_ARGS       - extra warmup flags (empty for OFFLOADING=none)
#   CUDA_GRAPH_MAX_BS - graph-capture batch-size cap, CONC unless lowered below
SERVER_LOG="$RESULT_DIR/server.log"
mkdir -p "$RESULT_DIR"

CACHE_ARGS=()
WARMUP_ARGS=()
CUDA_GRAPH_MAX_BS="$CONC"
case "$OFFLOADING" in
  none)
    # Leave SGLang's default RadixAttention prefix cache on — agentic
    # replay needs it; --disable-radix-cache would zero the hit rate.
    ;;
  hicache)
    # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per
    # TP rank (one hierarchical KV, one hierarchical Mamba), so the
    # node-total DRAM budget divides by TP and the host-pool count.
    # The budget is the caller-supplied TOTAL_CPU_DRAM_GB; it is not
    # overridden here, so the required env var actually drives the sizing.
    HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
    HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}"
    HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
    # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which
    # requires page_size=1. Keep the safer direct/layer_first copy path;
    # kernel/page_first faults on first prefill in this mode on ROCm.
    HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
    HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
    HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
    # An explicit HICACHE_SIZE_GB wins; otherwise split the node budget
    # evenly (integer division) across ranks and host pools.
    HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
    # Clamp to the per-rank, per-pool ceiling.
    if (( HICACHE_SIZE_GB > HICACHE_MAX_SIZE_GB_PER_RANK_POOL )); then
      HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
    fi
    # Integer division can round a small budget down to zero; fail loudly.
    if (( HICACHE_SIZE_GB < 1 )); then
      echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
      exit 1
    fi
    echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
    CACHE_ARGS=(
      --page-size "$HICACHE_PAGE_SIZE"
      --enable-hierarchical-cache
      --hicache-size "$HICACHE_SIZE_GB"
      --hicache-io-backend "$HICACHE_IO_BACKEND"
      --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
      --hicache-write-policy "$HICACHE_WRITE_POLICY"
    )
    # HiCache startup reaches API readiness but SGLang's internal warmup
    # request can time out on this path; let aiperf own benchmark traffic.
    WARMUP_ARGS=(--skip-server-warmup)
    # Don't force ROCm graph capture at every high concurrency point: lower
    # CUDA_GRAPH_MAX_BS to HICACHE_CUDA_GRAPH_MAX_BS when CONC exceeds it.
    # This cap only takes effect if the launch command passes
    # CUDA_GRAPH_MAX_BS to --cuda-graph-max-bs.
    # NOTE(review): an earlier note cited conc=16 as the highest known-good
    # capture size for this model/server path, yet the default cap is 256 —
    # confirm which value is intended.
    HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}"
    if (( HICACHE_CUDA_GRAPH_MAX_BS < CUDA_GRAPH_MAX_BS )); then
      CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
    fi
    ;;
  *)
    echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
    exit 1
    ;;
esac
120+
echo "Starting SGLang server..."
export PYTHONNOUSERSITE=1

# Assemble the launch command as an array so every value stays a single,
# correctly quoted word.
#   * --model-path uses MODEL_PATH (the location prepared by the download
#     step) rather than the bare repo id, so weights already downloaded to a
#     local directory are the ones that get loaded.
#   * --served-model-name keeps the API-visible model name equal to MODEL,
#     as it was when the server was launched straight from the repo id.
#   * --cuda-graph-max-bs uses CUDA_GRAPH_MAX_BS so the HiCache capture cap
#     computed in the cache/offload section is honoured; without a cap it
#     equals CONC.
# NOTE(review): the referenced upstream recipe is described as using
# mem-fraction 0.75, while this launch uses 0.8 — confirm which is intended.
server_args=(
  --attention-backend aiter
  --model-path "$MODEL_PATH"
  --served-model-name "$MODEL"
  --host=0.0.0.0
  --port "$PORT"
  --tensor-parallel-size "$TP"
  --ep-size "$EP_SIZE"
  --trust-remote-code
  --model-loader-extra-config '{"enable_multithread_load": true}'
  --watchdog-timeout 1200
  --tokenizer-worker-num 6
  --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
  --max-running-requests "$CONC"
  --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL"
  --mem-fraction-static 0.8
  "${CACHE_ARGS[@]}"
  "${WARMUP_ARGS[@]}"
  --enable-metrics
)

# Run the server in the background with stdout and stderr captured in
# SERVER_LOG; the PID is handed to the readiness check below.
python3 -m sglang.launch_server "${server_args[@]}" > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
echo "Server PID: $SERVER_PID"

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
146+
# ---- Run benchmark ----------------------------------------------------------
# Invoke the two harness steps in order, each with the results directory as
# its only argument: first build the replay command, then run the replay and
# write its outputs. `set -e` stops the sequence at the first failing step.
for harness_step in build_replay_cmd run_agentic_replay_and_write_outputs; do
  "$harness_step" "$RESULT_DIR"
done
0 commit comments