# Strict mode: abort on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail
# Trace every command so the job log shows exactly what was executed.
set -o xtrace
44
5- # Agentic trace replay benchmark for GLM-5.1 FP4 on MI355X using SGLang.
5+ # Agentic trace replay benchmark for Qwen3.5 FP8 on MI300X using SGLang.
6+ #
7+ # Base server recipe follows the upstream MI300X reference
8+ # (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe):
9+ # aiter attention backend, aiter allreduce fusion, mem-fraction 0.75.
10+ # The agentic harness (resolve_trace_source / build_replay_cmd /
11+ # run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
12+ # --disable-radix-cache is dropped because agentic replay needs prefix reuse.
613#
714# Required env vars:
8- # MODEL, TP, CONC, RESULT_DIR
15+ # MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION (optional, with defaults: PORT=8888, DURATION=1800)
16+ #
17+ # OFFLOADING values:
18+ # none - SGLang GPU KV with the default RadixAttention prefix cache.
19+ # hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.
920
# Pull in the shared helper library that lives two directories above this
# script; check_env_vars and the replay helpers are not defined in this file,
# so they presumably come from there.
source "$(dirname "$0")/../../benchmark_lib.sh"

# Validate the caller-provided environment before doing any work.
# NOTE(review): EP_SIZE is listed as required here yet also receives a default
# below, and DP_ATTENTION is required but not referenced in the visible part of
# this script — confirm which of the two is intended.
check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION

# Optional knobs: keep the caller's value, otherwise fall back to a default.
: "${PORT:=8888}"
: "${DURATION:=1800}"
: "${EP_SIZE:=1}"
1728
1829if [[ -n " ${SLURM_JOB_ID:- } " ]]; then
1930 echo " JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:- unknown} "
3041 hf download " $MODEL "
3142 export MODEL_PATH=" $MODEL "
3243fi
44+
# Best-effort GPU inventory for the job log; a missing or failing tool must not
# abort the run under errexit.
rocm-smi || :
amd-smi || :

# ---- Select trace corpus -----------------------------------------------------
# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
# signal at high concurrency.
# Previous, undated 256k selection kept for reference:
#   export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
# Active selection is the 060226 revision of the capped corpus. The override is
# exported before resolve_trace_source runs (presumably so that helper picks it
# up — confirm against benchmark_lib.sh).
WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
export WEKA_LOADER_OVERRIDE

# ---- Resolve traces and install deps ----------------------------------------
resolve_trace_source
@@ -48,26 +67,85 @@ mkdir -p "$RESULT_DIR"
4867
pip install -U transformers

# ---- KV-cache offloading configuration ---------------------------------------
# Translates OFFLOADING into SGLang launch settings:
#   CACHE_ARGS        - cache flags appended to the launch command
#                       (stays empty for OFFLOADING=none)
#   WARMUP_ARGS       - warmup flags appended to the launch command
#                       (stays empty for OFFLOADING=none)
#   CUDA_GRAPH_MAX_BS - graph-capture batch-size cap, intended for
#                       --cuda-graph-max-bs; starts at CONC
# Exits with status 1 on an unsupported OFFLOADING value or when the computed
# HiCache pool size is below 1 GB.
CACHE_ARGS=()
WARMUP_ARGS=()
CUDA_GRAPH_MAX_BS="$CONC"
case "$OFFLOADING" in
    none)
        # Leave SGLang's default RadixAttention prefix cache on — agentic
        # replay needs it; --disable-radix-cache would zero the hit rate.
        ;;
    hicache)
        # NOTE(review): the host-pool, page-size and copy-path defaults below
        # were written for the GLM-5.1 FP4 recipe (standard transformer, no
        # hybrid Mamba path, standard paged attention with no no_buffer
        # scheduler constraint). Confirm they also hold for the model this
        # script serves before relying on them.

        # Honor the caller's node-total DRAM budget. The previous unconditional
        # `TOTAL_CPU_DRAM_GB=3000` silently discarded the required env var;
        # 3000 now only acts as a fallback when the variable is empty.
        TOTAL_CPU_DRAM_GB="${TOTAL_CPU_DRAM_GB:-3000}"
        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}"
        # Per-rank, per-pool ceiling; the older HICACHE_MAX_SIZE_GB_PER_RANK
        # name is still accepted as a fallback.
        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-500}}"
        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
        # Keep the safer direct/layer_first copy path on ROCm.
        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"

        # The node-total DRAM budget divides by TP and host-pool count (integer
        # division), unless the caller pins HICACHE_SIZE_GB explicitly.
        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
        if (( HICACHE_SIZE_GB > HICACHE_MAX_SIZE_GB_PER_RANK_POOL )); then
            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
        fi
        if (( HICACHE_SIZE_GB < 1 )); then
            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
            exit 1
        fi
        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"

        CACHE_ARGS=(
            --page-size "$HICACHE_PAGE_SIZE"
            --enable-hierarchical-cache
            --hicache-size "$HICACHE_SIZE_GB"
            --hicache-io-backend "$HICACHE_IO_BACKEND"
            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
            --hicache-write-policy "$HICACHE_WRITE_POLICY"
        )
        # HiCache startup reaches API readiness but SGLang's internal warmup
        # request can time out on this path; let aiperf own benchmark traffic.
        WARMUP_ARGS=(--skip-server-warmup)
        # Don't force ROCm graph capture at every high concurrency point; conc=16
        # is the highest known-good capture size for this model/server path.
        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}"
        if (( HICACHE_CUDA_GRAPH_MAX_BS < CUDA_GRAPH_MAX_BS )); then
            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
        fi
        ;;
    *)
        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
        exit 1
        ;;
esac
124+
echo "Starting SGLang server..."
# Keep per-user site-packages out of the server's Python import path.
export PYTHONNOUSERSITE=1

# NOTE(review): transformers is already upgraded earlier in this script; this
# second run happens after PYTHONNOUSERSITE is exported — confirm whether that
# ordering is intentional or the call is redundant.
pip install -U transformers

# Launch the server in the background with stdout/stderr captured in
# SERVER_LOG, and remember its PID in SERVER_PID.
#
# --cuda-graph-max-bs takes CUDA_GRAPH_MAX_BS rather than CONC so that the cap
# computed for OFFLOADING=hicache actually applies; for OFFLOADING=none the two
# values are identical. CACHE_ARGS / WARMUP_ARGS may be empty arrays, in which
# case they contribute no arguments.
#
# NOTE(review): the tool-call/reasoning parsers (glm47/glm45), the
# --nsa-*-backend flags and --mem-fraction-static 0.85 do not match this file's
# header, which describes a Qwen3.5 recipe with aiter backends and mem-fraction
# 0.75. EP_SIZE is required by the environment check but not passed here.
# Confirm the intended flag set before trusting results.
python3 -m sglang.launch_server \
    --model-path "$MODEL_PATH" \
    --served-model-name "$MODEL" \
    --host=0.0.0.0 \
    --port "$PORT" \
    --tensor-parallel-size "$TP" \
    --trust-remote-code \
    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" \
    --max-running-requests "$CONC" \
    --mem-fraction-static 0.85 \
    --tool-call-parser glm47 \
    --reasoning-parser glm45 \
    --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \
    --nsa-prefill-backend tilelang \
    --nsa-decode-backend tilelang \
    --watchdog-timeout 1200 \
    --kv-cache-dtype fp8_e4m3 \
    --tokenizer-worker-num "$((TP * 2))" \
    "${CACHE_ARGS[@]}" \
    "${WARMUP_ARGS[@]}" \
    --enable-metrics > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
echo "Server PID: $SERVER_PID"
@@ -77,4 +155,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
# ---- Run benchmark ----------------------------------------------------------
# Both helpers are defined outside this file (presumably in the sourced
# benchmark_lib.sh); each is handed the results directory.
build_replay_cmd "$RESULT_DIR"

run_agentic_replay_and_write_outputs "$RESULT_DIR"
0 commit comments