[AMD] minimaxm2.5-fp8-mi355x-vllm-agentic: add lmcache variant config and update script

seungrokj · claude · seungrokj · commit dc25a0bb0857 · 2026-06-03T23:26:02.000+09:00
Co-Authored-By: Claude Sonnet 4.6 &lt;noreply@anthropic.com&gt;
diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
@@ -2536,7 +2536,7 @@ kimik2.5-fp4-mi355x-vllm-agentic:
 
 # target
 kimik2.5-fp4-mi355x-vllm-agentic-lmcache:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: amd/Kimi-K2.5-MXFP4
   model-prefix: kimik2.5
   runner: mi355x
@@ -2568,6 +2568,25 @@ minimaxm2.5-fp8-mi355x-vllm-agentic:
       - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
       - { tp: 4, ep: 4, offloading: cpu,  conc-list: [48, 56, 64, 72, 96] }
 
+# target
+minimaxm2.5-fp8-mi355x-vllm-agentic-lmcache:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi355x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # NOTE: figures carried over from the tp=4 ep=4 config above: compute ceiling ~60
+    # (empirical), KV cliff ~91 (analytical). This variant sweeps tp=1 ep=1 -- TODO re-measure.
+    # Compares no offload vs LMCache (LMCacheMPConnector); the native OffloadingConnector cpu path is not swept here.
+    - duration: 1800
+      search-space:
+      - { tp: 1, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
+      - { tp: 1, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
+
 minimaxm2.5-fp8-mi300x-vllm-agentic:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
@@ -2,18 +2,23 @@
 set -euo pipefail
 set -x
 
-# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI355X using vLLM.
+# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI355X using vLLM.
 #
 # Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION
+#
+# OFFLOADING values:
+#   none    - vLLM GPU KV only.
+#   cpu     - vLLM native CPU offload.
+#   lmcache - LMCache MP server + vLLM LMCacheMPConnector.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
 
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -24,6 +29,10 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+rocm-smi || true
+amd-smi || true
+
 # `hf download` creates the target dir if missing and is itself idempotent.
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
@@ -35,59 +44,237 @@ else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
 fi
-rocm-smi || true
-amd-smi || true
 
 # ---- Resolve traces and install deps ----------------------------------------
 # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
 # corpus has requests up to ~1M proxy tokens that would be rejected.
 # Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+# Switched to the 060226 trace corpus (still the 256k-capped variant).
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
 
 resolve_trace_source
 install_agentic_deps
 
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
+LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
 mkdir -p "$RESULT_DIR"
 
-OFFLOAD_ARGS=""
+OFFLOAD_ARGS=()
+PREFIX_CACHE_ARGS=()
+
+# ---- LMCache config ----------------------------------------------------------
+LMCACHE_PID=""
+
+cleanup_lmcache_server() {
+    if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then
+        kill "$LMCACHE_PID" 2>/dev/null || true
+        wait "$LMCACHE_PID" 2>/dev/null || true
+    fi
+}
+
+trap cleanup_lmcache_server EXIT
+
+wait_for_lmcache_ready() {
+    { set +x; } 2>/dev/null
+    local attempts="${LMCACHE_READY_ATTEMPTS:-120}"
+    local tail_pid=""
+
+    while [ ! -f "$LMCACHE_LOG" ]; do
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before creating log file. Exiting." >&2
+            exit 1
+        fi
+        sleep 1
+    done
+
+    tail -f -n +1 "$LMCACHE_LOG" &
+    tail_pid=$!
+
+    for ((i = 1; i <= attempts; i++)); do
+        if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            return 0
+        fi
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before becoming healthy. Log follows:" >&2
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            cat "$LMCACHE_LOG" >&2 || true
+            exit 1
+        fi
+        sleep 1
+    done
+
+    echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2
+    kill "$tail_pid" 2>/dev/null || true
+    wait "$tail_pid" 2>/dev/null || true
+    cat "$LMCACHE_LOG" >&2 || true
+    exit 1
+}
+
 case "$OFFLOADING" in
     none) ;;
     cpu)
-        # SimpleCPUOffloadConnector now works on ROCm with the
-        # vllm/vllm-openai-rocm:nightly-51f22dcfd0... image (vllm-project/vllm@20cac26b).
-        # Use the same offload path as NVIDIA so cross-vendor cpu-offload
-        # numbers are apples-to-apples.
-        # MI355X nodes have substantial DRAM; override workflow default (600 GB)
-        # so we offload up to 2 TB of KV cache.
-        TOTAL_CPU_DRAM_GB=2000
-        export VLLM_USE_SIMPLE_KV_OFFLOAD=1
-        OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager"
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+        # MI355X nodes have ~2.7 TiB of host DRAM available for offload. The
+        # node-wide budget set below is 3000 GB (not 2.5 TB), divided by (8 / TP).
+        # NOTE(review): confirm headroom left for worker RSS / page cache / slurm cgroup.
+        TOTAL_CPU_DRAM_GB=3000
+        TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
+        # Use vLLM's regular native KV-offload path (OffloadingConnector),
+        # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
+        # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
+        # would switch it to SimpleCPUOffloadConnector. We intentionally leave
+        # that env var UNSET here so the regular OffloadingConnector path is
+        # used. The shortcut --kv_offloading_backend native + --kv_offloading_size
+        # form constructs the KVTransferConfig at engine startup
+        # (vllm/config/vllm.py:662).
+
+        # --disable-hybrid-kv-cache-manager is intentionally omitted: the hybrid KV
+        # cache manager (the default) yields more cache hits than running with it disabled.
+        OFFLOAD_ARGS=(
+            --kv_offloading_backend native
+            --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
+        )
+        ;;
+    lmcache)
+        { set +x; } 2>/dev/null
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+
+        git clone https://github.com/LMCache/LMCache.git
+        cd LMCache
+        pip install -r requirements/build.txt 
+        CXX=hipcc BUILD_WITH_HIP=1 pip install -e .   --no-build-isolation
+        cd ..
+
+        python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
+
+        # Match the B200 Kimi LMCache setup: keep a semantic CPU KV pool
+        # (3000 GB node-wide budget below, divided by (8 / TP) for the L1 size),
+        # but let the external MP server own that pool so vLLM does not split
+        # --kv-offloading-size across TP ranks through the integrated LMCache backend.
+        TOTAL_CPU_DRAM_GB=3000
+        LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
+        LMCACHE_PORT="${LMCACHE_PORT:-5555}"
+        LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
+        # LMCacheMPConnector concatenates lmcache.mp.host and port into the
+        # ZMQ endpoint. Bind the server to a raw host, but pass the connector a
+        # ZMQ-style host string.
+        LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
+        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
+        LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
+        # LMCache read locks are leases on chunks that lookup has promised
+        # vLLM can retrieve. The default 300s TTL is too short for this
+        # long-context agentic queue: TP8/conc32 can spend >300s between
+        # lookup and retrieve while GPU KV is saturated, which leaves the
+        # object present in L1 but no longer readable. Keep the L1 pool size
+        # unchanged and only extend the lookup-to-retrieve lease.
+        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}"
+        # TODO(srok): compare chunk size 256 vs 32.
+        #LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
+        LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-32}"
+        LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
+        export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
+        export LMCACHE_BLOCKING_TIMEOUT_SECS=120
+
+        set -x
+        echo "Starting LMCache MP server..."
+        LMCACHE_CMD=(
+            lmcache server
+            --host "$LMCACHE_HOST"
+            --port "$LMCACHE_PORT"
+            --http-host "$LMCACHE_HOST"
+            --http-port "$LMCACHE_HTTP_PORT"
+            --l1-size-gb "$LMCACHE_L1_SIZE_GB"
+            --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB"
+            --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS"
+            --chunk-size "$LMCACHE_CHUNK_SIZE"
+            --max-workers "$LMCACHE_MAX_WORKERS"
+            --eviction-policy LRU
+        )
+        printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt"
+        printf '\n' >> "$RESULT_DIR/lmcache_command.txt"
+        "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 &
+        LMCACHE_PID=$!
+        echo "LMCache server PID: $LMCACHE_PID"
+        wait_for_lmcache_ready
+
+        PREFIX_CACHE_ARGS=(--enable-prefix-caching)
+        # --disable-hybrid-kv-cache-manager is intentionally omitted: the hybrid KV
+        # cache manager (the default) yields more cache hits than running with it disabled.
+        OFFLOAD_ARGS=(
+            --kv-transfer-config
+            "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}"
+        )
         ;;
     *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
 esac
 
-if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel"; else EP=" "; fi
+# ---- LLM server config ----------------------------------------------------------
+EP_ARGS=()
+if [ "$EP_SIZE" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
 
 echo "Starting vllm server..."
+export PYTHONNOUSERSITE=1
+
+# Install amd-quark (manual install due to ROCm vLLM bug). NOTE(review): comment carried over from an MXFP4 script; confirm this FP8 model needs it.
+pip install -q amd-quark
+
+# Workaround for MEC FW <177 RCCL memory reclaim issue
+version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
+if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
+    export HSA_NO_SCRATCH_RECLAIM=1
+fi
+
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
-export PYTHONNOUSERSITE=1
+export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0
+VLLM_BLOCK_SIZE=32
+ASYNC_SCHEDULING_ARGS=""
+
+if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
+    export VLLM_ROCM_USE_AITER_MOE=0
+    ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+    echo "TP8/EP8: using block size 32, shuffle disabled, AITER MoE disabled, async scheduling disabled."
+elif (( CONC < 64 )); then
+    ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+    echo "c${CONC}: using block size 32, shuffle disabled, async scheduling disabled."
+elif (( CONC == 64 )); then
+    ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+    VLLM_BLOCK_SIZE=16
+    echo "c64: using block size 16, shuffle enabled, async scheduling disabled."
+else
+    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+    VLLM_BLOCK_SIZE=16
+    echo "c${CONC}: using block size 16, shuffle enabled, async scheduling enabled."
+fi
 
-vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
---host 0.0.0.0 \
---port $PORT \
---tensor-parallel-size=$TP \
-$EP \
---gpu-memory-utilization 0.95 \
---max-model-len $MAX_MODEL_LEN \
---kv-cache-dtype fp8 \
---block-size=32 \
---max-num-seqs $CONC \
---attention-backend "ROCM_AITER_UNIFIED_ATTN" \
---trust-remote-code \
-$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
+{ set +x; } 2>/dev/null
+VLLM_CMD=(
+    vllm serve "$MODEL"
+    --host 0.0.0.0
+    --port "$PORT"
+    --tensor-parallel-size="$TP"
+    "${EP_ARGS[@]}"
+    --gpu-memory-utilization 0.95
+    --kv-cache-dtype fp8 
+    --block-size=$VLLM_BLOCK_SIZE 
+    --trust-remote-code
+    --attention-backend "ROCM_AITER_FA" 
+    --max-num-seqs "$CONC"
+    $ASYNC_SCHEDULING_ARGS 
+    "${PREFIX_CACHE_ARGS[@]}"
+    "${OFFLOAD_ARGS[@]}"
+)
+printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
+"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"