22set -euo pipefail
33set -x
44
5- # Agentic trace replay benchmark for MiniMax-M2 .5 FP8 on MI355X using vLLM.
5+ # Agentic trace replay benchmark for Kimi-K2.5 FP4 on MI355X using vLLM.
66#
77# Required env vars:
8- # MODEL, TP, CONC, RESULT_DIR
8+ # MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR
9+ #
10+ # OFFLOADING values:
11+ # none - vLLM GPU KV only.
12+ # cpu - vLLM native CPU offload.
13+ # lmcache - LMCache MP server + vLLM LMCacheMPConnector.
914
1015source " $( dirname " $0 " ) /../../benchmark_lib.sh"
1116
12- check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
17+ check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
1318
14- if [ -z " ${MAX_MODEL_LEN :- } " ] || [ " $MAX_MODEL_LEN " = " 0 " ] ; then
15- MAX_MODEL_LEN=131072
16- fi
19+ PORT= ${PORT :- 8888}
20+ DURATION= ${DURATION :- 1800}
21+ EP_SIZE= ${EP_SIZE :- 1}
1722
1823if [[ -n " ${SLURM_JOB_ID:- } " ]]; then
1924 echo " JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:- unknown} "
@@ -24,6 +29,10 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
2429 export HIP_VISIBLE_DEVICES=" $ROCR_VISIBLE_DEVICES "
2530fi
2631
32+ if [[ " $MODEL " != /* ]]; then hf download " $MODEL " ; fi
33+ rocm-smi || true
34+ amd-smi || true
35+
2736# `hf download` creates the target dir if missing and is itself idempotent.
2837# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
2938# Either way, MODEL_PATH is what the server is launched with.
@@ -35,59 +44,237 @@ else
3544 hf download " $MODEL "
3645 export MODEL_PATH=" $MODEL "
3746fi
38- rocm-smi || true
39- amd-smi || true
4047
4148# ---- Resolve traces and install deps ----------------------------------------
4249# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
4350# corpus has requests up to ~1M proxy tokens that would be rejected.
4451# Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
45- export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
52+ # export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
53+ # 060226
54+ export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
4655
4756resolve_trace_source
4857install_agentic_deps
4958
5059# ---- Server config ----------------------------------------------------------
5160SERVER_LOG=" $RESULT_DIR /server.log"
61+ LMCACHE_LOG=" $RESULT_DIR /lmcache_server.log"
5262mkdir -p " $RESULT_DIR "
5363
54- OFFLOAD_ARGS=" "
64+ OFFLOAD_ARGS=()
65+ PREFIX_CACHE_ARGS=()
66+
67+ # ---- Lmcache config ----------------------------------------------------------
68+ LMCACHE_PID=" "
69+
#######################################
# Stop the background LMCache MP server, if one was started and is still alive.
# Installed as the EXIT trap handler, so it must never block indefinitely:
# SIGTERM is sent first, and if the process is still present after the grace
# period it is killed with SIGKILL before being reaped.
# Globals:
#   LMCACHE_PID                  (read) PID of the server; empty/unset = no-op
#   LMCACHE_SHUTDOWN_GRACE_SECS  (read) integer seconds to wait after SIGTERM
#                                before escalating (default 10)
# Returns: 0 always (cleanup is best-effort).
#######################################
cleanup_lmcache_server() {
  local pid="${LMCACHE_PID:-}"
  local grace_secs="${LMCACHE_SHUTDOWN_GRACE_SECS:-10}"
  # Poll in 0.1 s steps so a server that exits promptly does not cost a
  # full second of shutdown latency.
  local polls_left=$(( grace_secs * 10 ))

  # Nothing to do if no server was launched or it has already exited.
  if [[ -z "$pid" ]] || ! kill -0 "$pid" 2>/dev/null; then
    return 0
  fi

  kill "$pid" 2>/dev/null || true
  while (( polls_left > 0 )) && kill -0 "$pid" 2>/dev/null; do
    sleep 0.1
    polls_left=$(( polls_left - 1 ))
  done

  # Escalate: an unconditional `wait` here would hang the EXIT trap forever
  # if the server ignores or stalls on SIGTERM.
  if kill -0 "$pid" 2>/dev/null; then
    echo "LMCache server (PID $pid) did not exit after SIGTERM; sending SIGKILL." >&2
    kill -KILL "$pid" 2>/dev/null || true
  fi

  # Reap the child; errors (e.g. already reaped) are irrelevant during cleanup.
  wait "$pid" 2>/dev/null || true
}
76+
77+ trap cleanup_lmcache_server EXIT
78+
# Stop the background `tail -f` that mirrors the LMCache log and reap it.
_stop_lmcache_log_tail() {
  local tail_pid="$1"
  kill "$tail_pid" 2>/dev/null || true
  wait "$tail_pid" 2>/dev/null || true
}

#######################################
# Block until the LMCache MP server answers its HTTP healthcheck.
# While waiting, the server log is streamed to stdout via a background
# `tail -f`. The script exits with status 1 if the server process dies or if
# the healthcheck never succeeds within the allowed number of attempts.
# Globals:
#   LMCACHE_LOG                       (read) path of the server log file
#   LMCACHE_PID                       (read) server PID; liveness is checked
#                                     only when non-empty
#   LMCACHE_HTTP_PORT                 (read) port probed on 127.0.0.1
#   LMCACHE_READY_ATTEMPTS            (read) probe attempts, ~1 s apart
#                                     (default 120)
#   LMCACHE_READY_PROBE_TIMEOUT_SECS  (read) per-probe curl timeout (default 5)
# Returns: 0 once the healthcheck succeeds; otherwise exits the script with 1.
#######################################
wait_for_lmcache_ready() {
  # `local -` (bash >= 4.4) makes shell-option changes function-local, so the
  # `set +x` below no longer leaks out and silently disables the caller's
  # xtrace after this function returns.
  local -
  { set +x; } 2>/dev/null

  local attempts="${LMCACHE_READY_ATTEMPTS:-120}"
  local probe_timeout="${LMCACHE_READY_PROBE_TIMEOUT_SECS:-5}"
  local health_url="http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"
  local tail_pid=""
  local i

  # Wait for the log file to appear, bailing out if the server is already gone.
  while [[ ! -f "$LMCACHE_LOG" ]]; do
    if [[ -n "${LMCACHE_PID:-}" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
      echo "LMCache server died before creating log file. Exiting." >&2
      exit 1
    fi
    sleep 1
  done

  # Mirror the log from its first line while we poll.
  tail -f -n +1 "$LMCACHE_LOG" &
  tail_pid=$!

  for (( i = 1; i <= attempts; i++ )); do
    # --max-time bounds each probe: without it a connection that is accepted
    # but never answered would block forever and defeat the attempts limit.
    if curl --output /dev/null --silent --fail --max-time "$probe_timeout" "$health_url"; then
      _stop_lmcache_log_tail "$tail_pid"
      return 0
    fi
    if [[ -n "${LMCACHE_PID:-}" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
      echo "LMCache server died before becoming healthy. Log follows:" >&2
      _stop_lmcache_log_tail "$tail_pid"
      cat "$LMCACHE_LOG" >&2 || true
      exit 1
    fi
    sleep 1
  done

  echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2
  _stop_lmcache_log_tail "$tail_pid"
  cat "$LMCACHE_LOG" >&2 || true
  exit 1
}
117+
55118case " $OFFLOADING " in
56119 none) ;;
57120 cpu)
58- # SimpleCPUOffloadConnector now works on ROCm with the
59- # vllm/vllm-openai-rocm:nightly-51f22dcfd0... image (vllm-project/vllm@20cac26b).
60- # Use the same offload path as NVIDIA so cross-vendor cpu-offload
61- # numbers are apples-to-apples.
62- # MI355X nodes have substantial DRAM; override workflow default (600 GB)
63- # so we offload up to 2 TB of KV cache.
64- TOTAL_CPU_DRAM_GB=2000
65- export VLLM_USE_SIMPLE_KV_OFFLOAD=1
66- OFFLOAD_ARGS=" --kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager"
121+ unset VLLM_USE_SIMPLE_KV_OFFLOAD
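122+ # MI355X nodes have ~2.7 TiB of host DRAM available for offload.
123+ # NOTE(review): this comment originally said to reserve 2.5 TB for the offload pool (leaving ~200 GB
124+ # headroom for worker RSS / page cache / slurm cgroup), but the value set below is 3000 GB — confirm which is intended.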
125+ TOTAL_CPU_DRAM_GB=3000
126+ TOTAL_CPU_DRAM_PARTITION_GB=" ${TOTAL_CPU_DRAM_PARTITION_GB:- $((TOTAL_CPU_DRAM_GB / (8 / TP)))} "
127+ # Use vLLM's regular native KV-offload path (OffloadingConnector),
128+ # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
129+ # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
130+ # would switch it to SimpleCPUOffloadConnector. We intentionally leave
131+ # that env var UNSET here so the regular OffloadingConnector path is
132+ # used. The shortcut --kv_offloading_backend native + --kv_offloading_size
133+ # form constructs the KVTransferConfig at engine startup
134+ # (vllm/config/vllm.py:662).
135+
136+ # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default)
137+ # This gives extra cache hit than disabling hybrid kv cache manager
138+ OFFLOAD_ARGS=(
139+ --kv_offloading_backend native
140+ --kv_offloading_size " $TOTAL_CPU_DRAM_PARTITION_GB "
141+ )
142+ ;;
143+ lmcache)
144+ { set +x; } 2> /dev/null
145+ unset VLLM_USE_SIMPLE_KV_OFFLOAD
146+
# Build LMCache from source with HIP support and install it in editable mode.
# The clone is skipped when a checkout already exists: `git clone` refuses to
# write into an existing non-empty directory, which under `set -e` would abort
# any rerun from the same working directory.
# NOTE(review): this builds whatever the default branch currently points at;
# consider pinning a tag/commit so benchmark runs are reproducible.
if [[ ! -d LMCache/.git ]]; then
  git clone https://github.com/LMCache/LMCache.git
fi
# Run the build in a subshell so the caller's working directory is untouched
# even if a step fails part-way (no `cd ..` to forget or skip).
(
  cd LMCache
  pip install -r requirements/build.txt
  CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation
)
152+
153+ python3 -c " import lmcache.integration.vllm.lmcache_mp_connector" > /dev/null
154+
155+ # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
156+ # pool, but let the external MP server own that pool so vLLM does not
157+ # split --kv-offloading-size across TP ranks through the integrated
158+ # LMCache backend.
159+ TOTAL_CPU_DRAM_GB=3000
160+ LMCACHE_HOST=" ${LMCACHE_HOST:- 127.0.0.1} "
161+ LMCACHE_PORT=" ${LMCACHE_PORT:- 5555} "
162+ LMCACHE_HTTP_PORT=" ${LMCACHE_HTTP_PORT:- 8080} "
163+ # LMCacheMPConnector concatenates lmcache.mp.host and port into the
164+ # ZMQ endpoint. Bind the server to a raw host, but pass the connector a
165+ # ZMQ-style host string.
166+ LMCACHE_CONNECT_HOST=" ${LMCACHE_CONNECT_HOST:- tcp:// $LMCACHE_HOST } "
167+ LMCACHE_L1_SIZE_GB=" ${LMCACHE_L1_SIZE_GB:- $((TOTAL_CPU_DRAM_GB / (8 / TP)))} "
168+ LMCACHE_L1_INIT_SIZE_GB=" ${LMCACHE_L1_INIT_SIZE_GB:- 20} "
169+ # LMCache read locks are leases on chunks that lookup has promised
170+ # vLLM can retrieve. The default 300s TTL is too short for this
171+ # long-context agentic queue: TP8/conc32 can spend >300s between
172+ # lookup and retrieve while GPU KV is saturated, which leaves the
173+ # object present in L1 but no longer readable. Keep the 2.5 TB pool
174+ # size unchanged and only extend the lookup-to-retrieve lease.
175+ LMCACHE_L1_READ_TTL_SECONDS=" ${LMCACHE_L1_READ_TTL_SECONDS:- 7200} "
176+ # (srok) check 256 vs 32
177+ # LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
178+ LMCACHE_CHUNK_SIZE=" ${LMCACHE_CHUNK_SIZE:- 32} "
179+ LMCACHE_MAX_WORKERS=" ${LMCACHE_MAX_WORKERS:- $TP } "
180+ export PYTHONHASHSEED=" ${PYTHONHASHSEED:- 0} "
181+ export LMCACHE_BLOCKING_TIMEOUT_SECS=120
182+
183+ set -x
184+ echo " Starting LMCache MP server..."
185+ LMCACHE_CMD=(
186+ lmcache server
187+ --host " $LMCACHE_HOST "
188+ --port " $LMCACHE_PORT "
189+ --http-host " $LMCACHE_HOST "
190+ --http-port " $LMCACHE_HTTP_PORT "
191+ --l1-size-gb " $LMCACHE_L1_SIZE_GB "
192+ --l1-init-size-gb " $LMCACHE_L1_INIT_SIZE_GB "
193+ --l1-read-ttl-seconds " $LMCACHE_L1_READ_TTL_SECONDS "
194+ --chunk-size " $LMCACHE_CHUNK_SIZE "
195+ --max-workers " $LMCACHE_MAX_WORKERS "
196+ --eviction-policy LRU
197+ )
198+ printf ' %q ' " ${LMCACHE_CMD[@]} " > " $RESULT_DIR /lmcache_command.txt"
199+ printf ' \n' >> " $RESULT_DIR /lmcache_command.txt"
200+ " ${LMCACHE_CMD[@]} " > " $LMCACHE_LOG " 2>&1 &
201+ LMCACHE_PID=$!
202+ echo " LMCache server PID: $LMCACHE_PID "
203+ wait_for_lmcache_ready
204+
205+ PREFIX_CACHE_ARGS=(--enable-prefix-caching)
206+ # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default)
207+ # This gives extra cache hit than disabling hybrid kv cache manager
208+ OFFLOAD_ARGS=(
209+ --kv-transfer-config
210+ " {\" kv_connector\" :\" LMCacheMPConnector\" ,\" kv_connector_module_path\" :\" lmcache.integration.vllm.lmcache_mp_connector\" ,\" kv_role\" :\" kv_both\" ,\" kv_connector_extra_config\" :{\" lmcache.mp.host\" :\" $LMCACHE_CONNECT_HOST \" ,\" lmcache.mp.port\" :$LMCACHE_PORT }}"
211+ )
67212 ;;
68213 * ) echo " Error: unsupported OFFLOADING value '$OFFLOADING '" >&2 ; exit 1 ;;
69214esac
70215
71- if [ " $EP_SIZE " -gt 1 ]; then EP=" --enable-expert-parallel" ; else EP=" " ; fi
216+ # ---- LLM server config ----------------------------------------------------------
217+ EP_ARGS=()
218+ if [ " $EP_SIZE " -gt 1 ]; then
219+ EP_ARGS=(--enable-expert-parallel)
220+ fi
72221
73222echo " Starting vllm server..."
223+ export PYTHONNOUSERSITE=1
224+
225+ # Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
226+ pip install -q amd-quark
227+
# Workaround for MEC FW <177 RCCL memory reclaim issue

# Print the MEC firmware version: the last field of the first line containing
# "MEC" in `rocm-smi --showfw` output. Prints nothing when rocm-smi is missing,
# fails, or reports no MEC line.
mec_fw_version() {
  # `|| true` is essential: under `set -euo pipefail` a failing rocm-smi (or
  # no matching line) would otherwise make the enclosing command substitution
  # fail and abort the script before the empty-version fallback can run.
  rocm-smi --showfw 2>/dev/null | awk '/MEC/ { print $NF; exit }' || true
}

# Export HSA_NO_SCRATCH_RECLAIM=1 when the MEC firmware version is below 177
# or cannot be determined (empty or non-numeric output).
# Globals: version (written; kept global as in the original script),
#          HSA_NO_SCRATCH_RECLAIM (exported when the workaround applies).
apply_mec_fw_workaround() {
  version=$(mec_fw_version)
  # Unknown/unparseable version -> apply the workaround conservatively.
  # `10#` forces base 10 so a leading zero is not read as octal.
  if [[ ! "$version" =~ ^[0-9]+$ ]] || (( 10#$version < 177 )); then
    export HSA_NO_SCRATCH_RECLAIM=1
  fi
}

apply_mec_fw_workaround
233+
74234export VLLM_ROCM_USE_AITER=1
75235export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
76- export PYTHONNOUSERSITE=1
236+ export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0
237+ VLLM_BLOCK_SIZE=32
238+ ASYNC_SCHEDULING_ARGS=" "
239+
240+ if [[ " $TP " == " 8" && " $EP_SIZE " == " 8" ]]; then
241+ export VLLM_ROCM_USE_AITER_MOE=0
242+ ASYNC_SCHEDULING_ARGS=" --no-async-scheduling"
243+ echo " TP8/EP8: using block size 32, shuffle disabled, AITER MoE disabled, async scheduling disabled."
244+ elif (( CONC < 64 )) ; then
245+ ASYNC_SCHEDULING_ARGS=" --no-async-scheduling"
246+ echo " c${CONC} : using block size 32, shuffle disabled, async scheduling disabled."
247+ elif (( CONC == 64 )) ; then
248+ ASYNC_SCHEDULING_ARGS=" --no-async-scheduling"
249+ export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
250+ VLLM_BLOCK_SIZE=16
251+ echo " c64: using block size 16, shuffle enabled, async scheduling disabled."
252+ else
253+ export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
254+ VLLM_BLOCK_SIZE=16
255+ echo " c${CONC} : using block size 16, shuffle enabled, async scheduling enabled."
256+ fi
77257
78- vllm serve " $MODEL_PATH " --served-model-name " $MODEL " \
79- --host 0.0.0.0 \
80- --port $PORT \
81- --tensor-parallel-size=$TP \
82- $EP \
83- --gpu-memory-utilization 0.95 \
84- --max-model-len $MAX_MODEL_LEN \
85- --kv-cache-dtype fp8 \
86- --block-size=32 \
87- --max-num-seqs $CONC \
88- --attention-backend " ROCM_AITER_UNIFIED_ATTN" \
89- --trust-remote-code \
90- $OFFLOAD_ARGS > " $SERVER_LOG " 2>&1 &
258+ { set +x; } 2> /dev/null
259+ VLLM_CMD=(
260+ vllm serve " $MODEL "
261+ --host 0.0.0.0
262+ --port " $PORT "
263+ --tensor-parallel-size=" $TP "
264+ " ${EP_ARGS[@]} "
265+ --gpu-memory-utilization 0.95
266+ --kv-cache-dtype fp8
267+ --block-size=$VLLM_BLOCK_SIZE
268+ --trust-remote-code
269+ --attention-backend " ROCM_AITER_FA"
270+ --max-num-seqs " $CONC "
271+ $ASYNC_SCHEDULING_ARGS
272+ " ${PREFIX_CACHE_ARGS[@]} "
273+ " ${OFFLOAD_ARGS[@]} "
274+ )
275+ printf ' %q ' " ${VLLM_CMD[@]} " | tee " $RESULT_DIR /vllm_command.txt"
276+ printf ' \n' | tee -a " $RESULT_DIR /vllm_command.txt"
277+ " ${VLLM_CMD[@]} " > " $SERVER_LOG " 2>&1 &
91278SERVER_PID=$!
92279echo " Server PID: $SERVER_PID "
93280
0 commit comments