Skip to content

Commit dc25a0b

Browse files
seungrokj and claude committed
[AMD] minimaxm2.5-fp8-mi355x-vllm-agentic: add lmcache variant config and update script
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent fe3afa9 commit dc25a0b

2 files changed

Lines changed: 241 additions & 35 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2536,7 +2536,7 @@ kimik2.5-fp4-mi355x-vllm-agentic:
25362536

25372537
# target
25382538
kimik2.5-fp4-mi355x-vllm-agentic-lmcache:
2539-
image: vllm/vllm-openai-rocm:v0.21.0
2539+
image: vllm/vllm-openai-rocm:v0.22.0
25402540
model: amd/Kimi-K2.5-MXFP4
25412541
model-prefix: kimik2.5
25422542
runner: mi355x
@@ -2568,6 +2568,25 @@ minimaxm2.5-fp8-mi355x-vllm-agentic:
25682568
- { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
25692569
- { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] }
25702570

2571+
# target
2572+
minimaxm2.5-fp8-mi355x-vllm-agentic-lmcache:
2573+
image: vllm/vllm-openai-rocm:v0.22.0
2574+
model: MiniMaxAI/MiniMax-M2.5
2575+
model-prefix: minimaxm2.5
2576+
runner: mi355x
2577+
precision: fp8
2578+
framework: vllm
2579+
multinode: false
2580+
scenarios:
2581+
agentic-coding:
2582+
# MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
2583+
# Compute saturates first; cpu offload likely won't help, but worth confirming.
2584+
# AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
2585+
- duration: 1800
2586+
search-space:
2587+
- { tp: 1, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
2588+
- { tp: 1, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
2589+
25712590
minimaxm2.5-fp8-mi300x-vllm-agentic:
25722591
image: vllm/vllm-openai-rocm:v0.22.0
25732592
model: MiniMaxAI/MiniMax-M2.5

benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh

Lines changed: 221 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,23 @@
22
set -euo pipefail
33
set -x
44

5-
# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI355X using vLLM.
5+
# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI355X using vLLM.
66
#
77
# Required env vars:
8-
# MODEL, TP, CONC, RESULT_DIR
8+
# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION
9+
#
10+
# OFFLOADING values:
11+
# none - vLLM GPU KV only.
12+
# cpu - vLLM native CPU offload.
13+
# lmcache - LMCache MP server + vLLM LMCacheMPConnector.
914

1015
source "$(dirname "$0")/../../benchmark_lib.sh"
1116

12-
check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
17+
check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
1318

14-
if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
15-
MAX_MODEL_LEN=131072
16-
fi
19+
PORT=${PORT:-8888}
20+
DURATION=${DURATION:-1800}
21+
EP_SIZE=${EP_SIZE:-1}
1722

1823
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
1924
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -24,6 +29,10 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
2429
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
2530
fi
2631

32+
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
33+
rocm-smi || true
34+
amd-smi || true
35+
2736
# `hf download` creates the target dir if missing and is itself idempotent.
2837
# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE: download by model ID and set MODEL_PATH="$MODEL".
2938
# Either way, MODEL_PATH is what the server is launched with.
@@ -35,59 +44,237 @@ else
3544
hf download "$MODEL"
3645
export MODEL_PATH="$MODEL"
3746
fi
38-
rocm-smi || true
39-
amd-smi || true
4047

4148
# ---- Resolve traces and install deps ----------------------------------------
4249
# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
4350
# corpus has requests up to ~1M proxy tokens that would be rejected.
4451
# Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
45-
export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
52+
#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
53+
# Use the 060226 trace corpus (256k-capped variant) instead.
54+
export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
4655

4756
resolve_trace_source
4857
install_agentic_deps
4958

5059
# ---- Server config ----------------------------------------------------------
5160
SERVER_LOG="$RESULT_DIR/server.log"
61+
LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
5262
mkdir -p "$RESULT_DIR"
5363

54-
OFFLOAD_ARGS=""
64+
OFFLOAD_ARGS=()
65+
PREFIX_CACHE_ARGS=()
66+
67+
# ---- LMCache config ----------------------------------------------------------
68+
LMCACHE_PID=""
69+
70+
# Stop the background LMCache server, if this script started one.
# Reads the global LMCACHE_PID; does nothing when it is empty or the process
# has already exited. Failures of kill/wait are deliberately ignored so the
# function can run safely as an EXIT trap handler.
cleanup_lmcache_server() {
71+
# kill -0 only probes whether the PID is still alive; it sends no signal.
if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then
72+
kill "$LMCACHE_PID" 2>/dev/null || true
73+
# Wait for the server process to finish before continuing.
wait "$LMCACHE_PID" 2>/dev/null || true
74+
fi
75+
}
76+
77+
trap cleanup_lmcache_server EXIT
78+
79+
# Block until the LMCache server answers its HTTP healthcheck.
# Globals read: LMCACHE_LOG, LMCACHE_PID, LMCACHE_HTTP_PORT, and
#   LMCACHE_READY_ATTEMPTS (number of polls, default 120, one second apart).
# Returns 0 once the healthcheck succeeds. Calls `exit 1` (ending the whole
# script, not just this function) if the server process dies or the attempts
# run out; in the polling loop the server log is dumped to stderr first.
# NOTE(review): xtrace is switched off on entry and not restored before return.
# NOTE(review): the loop counter `i` is not declared local.
wait_for_lmcache_ready() {
80+
{ set +x; } 2>/dev/null
81+
local attempts="${LMCACHE_READY_ATTEMPTS:-120}"
82+
local tail_pid=""
83+
84+
# Phase 1: wait for the log file to appear. This loop has no attempt limit;
# it only ends when the file exists or the server process is gone.
while [ ! -f "$LMCACHE_LOG" ]; do
85+
if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
86+
echo "LMCache server died before creating log file. Exiting." >&2
87+
exit 1
88+
fi
89+
sleep 1
90+
done
91+
92+
# Stream the server log (from its first line) to stdout while polling.
tail -f -n +1 "$LMCACHE_LOG" &
93+
tail_pid=$!
94+
95+
# Phase 2: poll the healthcheck endpoint, at most $attempts times.
for ((i = 1; i <= attempts; i++)); do
96+
# NOTE(review): the probe always targets 127.0.0.1, independent of LMCACHE_HOST.
if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then
97+
# Healthy: stop the background tail before returning.
kill "$tail_pid" 2>/dev/null || true
98+
wait "$tail_pid" 2>/dev/null || true
99+
return 0
100+
fi
101+
# Fail fast if the server process exited instead of waiting out the timeout.
if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
102+
echo "LMCache server died before becoming healthy. Log follows:" >&2
103+
kill "$tail_pid" 2>/dev/null || true
104+
wait "$tail_pid" 2>/dev/null || true
105+
cat "$LMCACHE_LOG" >&2 || true
106+
exit 1
107+
fi
108+
sleep 1
109+
done
110+
111+
# All attempts used without a successful healthcheck.
echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2
112+
kill "$tail_pid" 2>/dev/null || true
113+
wait "$tail_pid" 2>/dev/null || true
114+
cat "$LMCACHE_LOG" >&2 || true
115+
exit 1
116+
}
117+
55118
case "$OFFLOADING" in
56119
none) ;;
57120
cpu)
58-
# SimpleCPUOffloadConnector now works on ROCm with the
59-
# vllm/vllm-openai-rocm:nightly-51f22dcfd0... image (vllm-project/vllm@20cac26b).
60-
# Use the same offload path as NVIDIA so cross-vendor cpu-offload
61-
# numbers are apples-to-apples.
62-
# MI355X nodes have substantial DRAM; override workflow default (600 GB)
63-
# so we offload up to 2 TB of KV cache.
64-
TOTAL_CPU_DRAM_GB=2000
65-
export VLLM_USE_SIMPLE_KV_OFFLOAD=1
66-
OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager"
121+
unset VLLM_USE_SIMPLE_KV_OFFLOAD
122+
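# MI355X nodes have ~2.7 TiB of host DRAM available for offload. The pool
123+
# total is set to 3000 GB below and divided by (8 / TP) per partition; headroom is needed for
124+
# worker RSS / page cache / slurm cgroup. NOTE(review): this comment previously said 2.5 TB reserved with ~200 GB headroom; reconcile it with the 3000 GB value.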
125+
TOTAL_CPU_DRAM_GB=3000
126+
TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
127+
# Use vLLM's regular native KV-offload path (OffloadingConnector),
128+
# NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
129+
# OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
130+
# would switch it to SimpleCPUOffloadConnector. We intentionally leave
131+
# that env var UNSET here so the regular OffloadingConnector path is
132+
# used. The shortcut --kv_offloading_backend native + --kv_offloading_size
133+
# form constructs the KVTransferConfig at engine startup
134+
# (vllm/config/vllm.py:662).
135+
136+
# Do not pass --disable-hybrid-kv-cache-manager: keep the hybrid KV cache manager enabled (the default).
137+
# This yields more cache hits than disabling the hybrid KV cache manager.
138+
OFFLOAD_ARGS=(
139+
--kv_offloading_backend native
140+
--kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
141+
)
142+
;;
143+
lmcache)
144+
{ set +x; } 2>/dev/null
145+
unset VLLM_USE_SIMPLE_KV_OFFLOAD
146+
147+
git clone https://github.com/LMCache/LMCache.git
148+
cd LMCache
149+
pip install -r requirements/build.txt
150+
CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation
151+
cd ..
152+
153+
python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
154+
155+
# Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
156+
# pool, but let the external MP server own that pool so vLLM does not
157+
# split --kv-offloading-size across TP ranks through the integrated
158+
# LMCache backend.
159+
TOTAL_CPU_DRAM_GB=3000
160+
LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
161+
LMCACHE_PORT="${LMCACHE_PORT:-5555}"
162+
LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
163+
# LMCacheMPConnector concatenates lmcache.mp.host and port into the
164+
# ZMQ endpoint. Bind the server to a raw host, but pass the connector a
165+
# ZMQ-style host string.
166+
LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
167+
LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
168+
LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
169+
# LMCache read locks are leases on chunks that lookup has promised
170+
# vLLM can retrieve. The default 300s TTL is too short for this
171+
# long-context agentic queue: TP8/conc32 can spend >300s between
172+
# lookup and retrieve while GPU KV is saturated, which leaves the
173+
# object present in L1 but no longer readable. Keep the 2.5 TB pool
174+
# size unchanged and only extend the lookup-to-retrieve lease.
175+
LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}"
176+
# TODO(srok): compare LMCACHE_CHUNK_SIZE 256 vs 32.
177+
#LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
178+
LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-32}"
179+
LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
180+
export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
181+
export LMCACHE_BLOCKING_TIMEOUT_SECS=120
182+
183+
set -x
184+
echo "Starting LMCache MP server..."
185+
LMCACHE_CMD=(
186+
lmcache server
187+
--host "$LMCACHE_HOST"
188+
--port "$LMCACHE_PORT"
189+
--http-host "$LMCACHE_HOST"
190+
--http-port "$LMCACHE_HTTP_PORT"
191+
--l1-size-gb "$LMCACHE_L1_SIZE_GB"
192+
--l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB"
193+
--l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS"
194+
--chunk-size "$LMCACHE_CHUNK_SIZE"
195+
--max-workers "$LMCACHE_MAX_WORKERS"
196+
--eviction-policy LRU
197+
)
198+
printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt"
199+
printf '\n' >> "$RESULT_DIR/lmcache_command.txt"
200+
"${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 &
201+
LMCACHE_PID=$!
202+
echo "LMCache server PID: $LMCACHE_PID"
203+
wait_for_lmcache_ready
204+
205+
PREFIX_CACHE_ARGS=(--enable-prefix-caching)
206+
# Do not pass --disable-hybrid-kv-cache-manager: keep the hybrid KV cache manager enabled (the default).
207+
# This yields more cache hits than disabling the hybrid KV cache manager.
208+
OFFLOAD_ARGS=(
209+
--kv-transfer-config
210+
"{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}"
211+
)
67212
;;
68213
*) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
69214
esac
70215

71-
if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel"; else EP=" "; fi
216+
# ---- LLM server config ----------------------------------------------------------
217+
EP_ARGS=()
218+
if [ "$EP_SIZE" -gt 1 ]; then
219+
EP_ARGS=(--enable-expert-parallel)
220+
fi
72221

73222
echo "Starting vllm server..."
223+
export PYTHONNOUSERSITE=1
224+
225+
# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
226+
pip install -q amd-quark
227+
228+
# Workaround for MEC FW <177 RCCL memory reclaim issue
229+
version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
230+
if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
231+
export HSA_NO_SCRATCH_RECLAIM=1
232+
fi
233+
74234
export VLLM_ROCM_USE_AITER=1
75235
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
76-
export PYTHONNOUSERSITE=1
236+
export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0
237+
VLLM_BLOCK_SIZE=32
238+
ASYNC_SCHEDULING_ARGS=""
239+
240+
if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
241+
export VLLM_ROCM_USE_AITER_MOE=0
242+
ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
243+
echo "TP8/EP8: using block size 32, shuffle disabled, AITER MoE disabled, async scheduling disabled."
244+
elif (( CONC < 64 )); then
245+
ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
246+
echo "c${CONC}: using block size 32, shuffle disabled, async scheduling disabled."
247+
elif (( CONC == 64 )); then
248+
ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
249+
export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
250+
VLLM_BLOCK_SIZE=16
251+
echo "c64: using block size 16, shuffle enabled, async scheduling disabled."
252+
else
253+
export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
254+
VLLM_BLOCK_SIZE=16
255+
echo "c${CONC}: using block size 16, shuffle enabled, async scheduling enabled."
256+
fi
77257

78-
vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
79-
--host 0.0.0.0 \
80-
--port $PORT \
81-
--tensor-parallel-size=$TP \
82-
$EP \
83-
--gpu-memory-utilization 0.95 \
84-
--max-model-len $MAX_MODEL_LEN \
85-
--kv-cache-dtype fp8 \
86-
--block-size=32 \
87-
--max-num-seqs $CONC \
88-
--attention-backend "ROCM_AITER_UNIFIED_ATTN" \
89-
--trust-remote-code \
90-
$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
258+
{ set +x; } 2>/dev/null
259+
VLLM_CMD=(
260+
vllm serve "$MODEL"
261+
--host 0.0.0.0
262+
--port "$PORT"
263+
--tensor-parallel-size="$TP"
264+
"${EP_ARGS[@]}"
265+
--gpu-memory-utilization 0.95
266+
--kv-cache-dtype fp8
267+
--block-size=$VLLM_BLOCK_SIZE
268+
--trust-remote-code
269+
--attention-backend "ROCM_AITER_FA"
270+
--max-num-seqs "$CONC"
271+
$ASYNC_SCHEDULING_ARGS
272+
"${PREFIX_CACHE_ARGS[@]}"
273+
"${OFFLOAD_ARGS[@]}"
274+
)
275+
printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
276+
printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
277+
"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
91278
SERVER_PID=$!
92279
echo "Server PID: $SERVER_PID"
93280

0 commit comments

Comments
 (0)