diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 7f1c8192d..ef4ff772d 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -433,6 +433,22 @@ qwen3.5-fp4-mi355x-sglang:
       - { tp: 2, conc-start: 4, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 16 }
 
+# target
+qwen3.5-fp4-mi355x-sglang-agentic-hicache:
+  image: lmsysorg/sglang:v0.5.12-rocm720-mi35x
+  model: amd/Qwen3.5-397B-A17B-MXFP4
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 2, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] }
+      - { tp: 2, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] }
+
 qwen3.5-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: amd/Qwen3.5-397B-A17B-MXFP4
@@ -872,6 +888,22 @@ minimaxm2.5-fp4-mi355x-atom:
       - { tp: 4, conc-start: 4, conc-end: 128 }
       - { tp: 8, conc-start: 4, conc-end: 16 }
 
+# target
+minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: amd/MiniMax-M2.5-MXFP4
+  model-prefix: minimaxm2.5
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 1, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
+      - { tp: 1, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
+
 minimaxm2.5-fp4-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: amd/MiniMax-M2.5-MXFP4
@@ -2494,6 +2526,23 @@ glm5.1-fp4-mi355x-sglang-agentic:
       # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
       - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
 
+# target
+glm5.1-fp4-mi355x-sglang-agentic-hicache:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
+  model: amd/GLM-5.1-MXFP4
+  model-prefix: glm5.1
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
+      - { tp: 2, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] }
+      - { tp: 2, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48] }
+
 kimik2.5-fp4-mi355x-vllm-agentic:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: amd/Kimi-K2.5-MXFP4
@@ -2518,6 +2567,22 @@ kimik2.5-fp4-mi355x-vllm-agentic:
       - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
       - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
 
+# target
+kimik2.5-fp4-mi355x-vllm-agentic-lmcache:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 4, ep: 1, offloading: none, conc-list:    [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
+      - { tp: 4, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
+
 minimaxm2.5-fp8-mi355x-vllm-agentic:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -2536,6 +2601,25 @@ minimaxm2.5-fp8-mi355x-vllm-agentic:
       - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
       - { tp: 4, ep: 4, offloading: cpu,  conc-list: [48, 56, 64, 72, 96] }
 
+# target
+minimaxm2.5-fp8-mi355x-vllm-agentic-lmcache:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi355x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
+    # Compute saturates first; cpu offload likely won't help, but worth confirming.
+    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
+    - duration: 1800
+      search-space:
+      - { tp: 2, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
+      - { tp: 2, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
+
 minimaxm2.5-fp8-mi300x-vllm-agentic:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -2573,8 +2657,9 @@ minimaxm2.5-fp8-mi325x-vllm-agentic:
       - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
       - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
 
+# target
 qwen3.5-fp8-mi355x-sglang-agentic-hicache:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: mi355x
@@ -2585,8 +2670,8 @@ qwen3.5-fp8-mi355x-sglang-agentic-hicache:
     agentic-coding:
     - duration: 1800
       search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-      - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
+      - { tp: 4, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] }
+      - { tp: 4, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] }
 
 dsv4-fp4-mi355x-vllm-agentic:
   image: vllm/vllm-openai-rocm:v0.22.0
diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
index 3b85a31cd..6bea8dddd 100755
--- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
@@ -2,18 +2,29 @@
 set -euo pipefail
 set -x
 
-# Agentic trace replay benchmark for GLM-5.1 FP4 on MI355X using SGLang.
+# Agentic trace replay benchmark for Qwen3.5 FP8 on MI300X using SGLang.
+#
+# Base server recipe follows the upstream MI300X reference
+# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe):
+# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75.
+# The agentic harness (resolve_trace_source / build_replay_cmd /
+# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
+# --disable-radix-cache is dropped because agentic replay needs prefix reuse.
 #
 # Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV with the default RadixAttention prefix cache.
+#   hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC RESULT_DIR DURATION
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
 
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -30,8 +41,16 @@ else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
 fi
+
 rocm-smi || true
 amd-smi || true
+# ---- Resolve traces and install deps ----------------------------------------
+# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
+# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
+# signal at high concurrency.
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#060226
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
 
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
@@ -48,26 +67,85 @@ mkdir -p "$RESULT_DIR"
 
 pip install -U transformers
 
+CACHE_ARGS=()
+WARMUP_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # GLM-5.1 FP4 uses a standard transformer (no hybrid Mamba path),
+        # so one HiCache host pool per TP rank is sufficient.
+        # The node-total DRAM budget divides by TP and host-pool count.
+        TOTAL_CPU_DRAM_GB=3000
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}"
+        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-500}}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+        # GLM-5.1 uses standard paged attention (no no_buffer scheduler constraint),
+        # so page_size can be left at the default. Keep the safer direct/layer_first
+        # copy path on ROCm.
+        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
+            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
+        fi
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --page-size "$HICACHE_PAGE_SIZE"
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        # HiCache startup reaches API readiness but SGLang's internal warmup
+        # request can time out on this path; let aiperf own benchmark traffic.
+        WARMUP_ARGS=(--skip-server-warmup)
+        # Don't force ROCm graph capture at every high concurrency point; conc=16
+        # is the highest known-good capture size for this model/server path.
+        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}"
+        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
+            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
+        fi
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
 echo "Starting SGLang server..."
 export PYTHONNOUSERSITE=1
 
+pip install -U transformers
 python3 -m sglang.launch_server \
-    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
+    --model-path "$MODEL_PATH" \
+    --served-model-name "$MODEL" \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
     --cuda-graph-max-bs $CONC \
     --max-running-requests $CONC \
-    --context-length $MAX_MODEL_LEN \
     --mem-fraction-static 0.85 \
     --tool-call-parser glm47 \
     --reasoning-parser glm45 \
     --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \
     --nsa-prefill-backend tilelang \
     --nsa-decode-backend tilelang \
+    --watchdog-timeout 1200 \
     --kv-cache-dtype fp8_e4m3 \
     --tokenizer-worker-num $((TP*2)) \
+    "${CACHE_ARGS[@]}" \
+    "${WARMUP_ARGS[@]}" \
     --enable-metrics > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
@@ -77,4 +155,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
index 139b12256..b3211ff49 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
@@ -14,15 +14,11 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
-
-# Kimi-K2.5 advertises a 262144-token context window in vLLM 0.21.0.
-# Matrix defaults may export MAX_MODEL_LEN=0 to mean "server default"; for this
-# script we need the concrete value so AgentX filters prompt+max_tokens against
-# the same limit vLLM enforces.
-if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then
-    MAX_MODEL_LEN=262144
-fi
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
+
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -44,546 +40,22 @@ else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
 fi
+
 rocm-smi || true
 amd-smi || true
 
+# ---- Resolve traces and install deps ----------------------------------------
+# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
+# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
+# signal at high concurrency.
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#060226
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
+
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps
 
-# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
-pip install amd-quark
-
-# Disable AITER RMSNorm for TP < 8 due to accuracy issues
-if [ "${TP}" -lt 8 ]; then
-  export VLLM_ROCM_USE_AITER_RMSNORM=0
-fi
-
-write_lmcache_rocm_mp_patch() {
-    local patch_dir="$1"
-    mkdir -p "$patch_dir"
-    cat > "$patch_dir/sitecustomize.py" <<'PY'
-"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches."""
-
-import os
-import threading
-
-if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1":
-    import builtins
-    import sys
-
-    _orig_import = builtins.__import__
-
-    def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
-        _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator
-
-        if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False):
-            return
-
-        _orig_init = _LazyMemoryAllocator.__init__
-        _orig_allocate = _LazyMemoryAllocator.allocate
-        _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate
-
-        def _expand_to(self, target_size: int) -> None:
-            target_size = min(
-                self._final_size,
-                _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
-            )
-            lock = self._agentic_rocm_demand_expand_lock
-            with lock:
-                if target_size <= self._curr_size:
-                    return
-
-                start_size = self._curr_size
-                while self._curr_size < target_size:
-                    commit_start = self._curr_size
-                    commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
-                    while self._curr_size < commit_target:
-                        self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
-                        self._curr_size += self.PIN_CHUNK_SIZE
-                    self._commit_expansion(self._curr_size - commit_start)
-
-                self._log_expansion_progress(self._curr_size - start_size)
-
-        def _retry_with_demand_expansion(self, allocate_once):
-            obj = allocate_once()
-            step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64"))
-            step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3)))
-
-            while obj is None and self._curr_size < self._final_size:
-                _expand_to(self, self._curr_size + step_bytes)
-                obj = allocate_once()
-
-            return obj
-
-        def _patched_init(self, *args, **kwargs):
-            _orig_init(self, *args, **kwargs)
-            self._agentic_rocm_demand_expand_lock = threading.Lock()
-
-            # LMCache MP's upstream LazyMemoryAllocator currently expands to
-            # the final pinned size in a background thread. On ROCm Kimi TP4,
-            # vLLM reaches KV-cache registration only after that 2.5 TB pool
-            # is fully pinned, and the server-side IPC open path can stall
-            # before acknowledging register_kv_caches. Keep the same final
-            # capacity, but pin/commit extra host memory only when L1
-            # allocations actually need it.
-            self._stop_expand.set()
-            self._expand_thread.join()
-            _lazy_memory_allocator.logger.info(
-                "Agentic ROCm patch: using demand-driven LMCache pinned "
-                "memory expansion; final capacity remains %s MB",
-                self._final_size >> 20,
-            )
-
-        def _patched_allocate(
-            self,
-            shapes,
-            dtypes,
-            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
-            allocator_type=None,
-        ):
-            return _retry_with_demand_expansion(
-                self,
-                lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
-            )
-
-        def _patched_batched_allocate(
-            self,
-            shapes,
-            dtypes,
-            batch_size,
-            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
-            allocator_type=None,
-        ):
-            return _retry_with_demand_expansion(
-                self,
-                lambda: _orig_batched_allocate(
-                    self, shapes, dtypes, batch_size, fmt, allocator_type
-                ),
-            )
-
-        _LazyMemoryAllocator.__init__ = _patched_init
-        _LazyMemoryAllocator.allocate = _patched_allocate
-        _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
-        _LazyMemoryAllocator._agentic_rocm_demand_patch = True
-
-    def _patch_l1_memory_manager(_memory_manager) -> None:
-        _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None)
-        _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None)
-        if _L1MemoryManager is None or _LazyMemoryAllocator is None:
-            return
-        if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False):
-            return
-
-        _orig_get_memory_usage = _L1MemoryManager.get_memory_usage
-
-        def _patched_get_memory_usage(self):
-            allocator = getattr(self, "_allocator", None)
-            if isinstance(allocator, _LazyMemoryAllocator):
-                address_manager = allocator.get_address_manager()
-                used_size = (
-                    address_manager.get_heap_size() - address_manager.get_free_size()
-                )
-                return used_size, allocator._final_size
-            return _orig_get_memory_usage(self)
-
-        _L1MemoryManager.get_memory_usage = _patched_get_memory_usage
-        _L1MemoryManager._agentic_rocm_final_capacity_patch = True
-
-    def _maybe_patch_lazy_memory_allocator() -> None:
-        module = sys.modules.get("lmcache.v1.lazy_memory_allocator")
-        if module is not None and hasattr(module, "LazyMemoryAllocator"):
-            _patch_lazy_memory_allocator(module)
-
-    def _maybe_patch_l1_memory_manager() -> None:
-        module = sys.modules.get("lmcache.v1.distributed.memory_manager")
-        if module is not None and hasattr(module, "L1MemoryManager"):
-            _patch_l1_memory_manager(module)
-
-    def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0):
-        module = _orig_import(name, globals, locals, fromlist, level)
-        if name == "lmcache.v1.lazy_memory_allocator" or (
-            name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules
-        ):
-            _maybe_patch_lazy_memory_allocator()
-        if name == "lmcache.v1.distributed.memory_manager" or (
-            name.startswith("lmcache")
-            and "lmcache.v1.distributed.memory_manager" in sys.modules
-        ):
-            _maybe_patch_l1_memory_manager()
-        return module
-
-    builtins.__import__ = _agentic_rocm_import
-    _maybe_patch_lazy_memory_allocator()
-    _maybe_patch_l1_memory_manager()
-
-if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1":
-    import torch
-    import lmcache.non_cuda_equivalents as lmc
-
-    if not hasattr(lmc, "multi_layer_block_kv_transfer"):
-        _DTYPE_BY_NAME = {
-            "bfloat16": torch.bfloat16,
-            "float16": torch.float16,
-            "float32": torch.float32,
-        }
-
-        def _dtype_from_env() -> torch.dtype:
-            name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16")
-            try:
-                return _DTYPE_BY_NAME[name]
-            except KeyError as exc:
-                raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc
-
-        def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
-            block_stride = shape_desc.block_stride_elems or (
-                shape_desc.bs * shape_desc.nh * shape_desc.hs
-            )
-            base = lmc._tensor_from_ptr(
-                ptr,
-                (shape_desc.nb * block_stride,),
-                dtype,
-                device,
-            )
-            return torch.as_strided(
-                base,
-                (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs),
-                (block_stride, shape_desc.nh * shape_desc.hs, 1),
-            )
-
-        def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
-            return lmc._tensor_from_ptr(
-                ptr,
-                (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs),
-                dtype,
-                device,
-            )
-
-        def multi_layer_block_kv_transfer(
-            group_kv_pointers,
-            tmp_buffer_ptrs,
-            block_ids,
-            paged_memory_device,
-            direction,
-            shape_desc,
-            lmcache_chunk_size,
-            gpu_kv_format,
-            skip_blocks=0,
-        ) -> None:
-            # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with
-            # shape [num_blocks, block_size, hidden_size]. LMCache's Python
-            # fallback has no block-transfer entrypoint yet, so implement the
-            # same gather/scatter contract with torch indexing on ROCm.
-            if shape_desc.kv_size != 1:
-                raise NotImplementedError(
-                    "ROCm LMCache MP block fallback currently supports MLA KV caches only"
-                )
-
-            dtype = _dtype_from_env()
-            device = (
-                paged_memory_device
-                if isinstance(paged_memory_device, torch.device)
-                else torch.device(paged_memory_device)
-            )
-            num_layers = int(group_kv_pointers.numel())
-            blocks_per_chunk = lmcache_chunk_size // shape_desc.bs
-            direction_name = getattr(direction, "name", str(direction))
-
-            for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs):
-                start = chunk_idx * blocks_per_chunk
-                end = start + blocks_per_chunk
-                chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long)
-
-                dest_slot_offset = 0
-                if skip_blocks and chunk_idx == 0:
-                    chunk_blocks = chunk_blocks[int(skip_blocks):]
-                    dest_slot_offset = int(skip_blocks) * shape_desc.bs
-                if chunk_blocks.numel() == 0:
-                    continue
-
-                num_slots = int(chunk_blocks.numel()) * shape_desc.bs
-                tmp = _tmp_view(
-                    int(tmp_ptr),
-                    shape_desc,
-                    num_layers,
-                    lmcache_chunk_size,
-                    dtype,
-                    device,
-                )
-
-                for layer_idx in range(num_layers):
-                    paged = _paged_view(
-                        int(group_kv_pointers[layer_idx].item()),
-                        shape_desc,
-                        dtype,
-                        device,
-                    )
-                    tmp_slice = tmp[
-                        0,
-                        layer_idx,
-                        dest_slot_offset : dest_slot_offset + num_slots,
-                        :,
-                    ]
-                    if direction_name == "D2H":
-                        gathered = paged.index_select(0, chunk_blocks).reshape(
-                            num_slots, shape_desc.nh * shape_desc.hs
-                        )
-                        tmp_slice.copy_(gathered)
-                    elif direction_name == "H2D":
-                        src = tmp_slice.reshape(
-                            int(chunk_blocks.numel()),
-                            shape_desc.bs,
-                            shape_desc.nh * shape_desc.hs,
-                        )
-                        paged.index_copy_(0, chunk_blocks, src)
-                    else:
-                        raise ValueError(f"Unsupported transfer direction: {direction}")
-
-        lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer
-
-# ---- Chunked KV loading (prevents GPU block exhaustion at high concurrency) ----
-if os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "0") != "0":
-    import chunked_connector_patch  # noqa: F401
-
-# ---- vLLM scheduler assertion fix (stale KV transfer notifications) ----
-import scheduler_assertion_patch  # noqa: F401
-PY
-}
-
-write_chunked_connector_patch() {
-    local patch_dir="$1"
-    mkdir -p "$patch_dir"
-    cat > "$patch_dir/chunked_connector_patch.py" <<'PY'
-"""
-Monkey-patch for LMCacheMPConnector to add chunked KV loading.
-
-Fixes GPU block exhaustion deadlock at high concurrency by capping
-the number of external tokens reported AND retrieved per scheduling step.
-
-Usage: set CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD=<tokens> and import this
-module from sitecustomize.py before LMCache is loaded.
-"""
-
-import logging
-import os
-import sys
-import builtins
-
-logger = logging.getLogger("chunked_lmcache_patch")
-
-_MAX_TOKENS = int(os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "32768"))
-
-# Per-request chunk tracking (module-level, survives across calls)
-_chunk_state: dict[str, dict] = {}
-
-
-def _apply_patch():
-    """Patch LMCacheMPConnector in-place."""
-    mod = sys.modules.get("lmcache.integration.vllm.lmcache_mp_connector")
-    if mod is None:
-        return
-    cls = getattr(mod, "LMCacheMPConnector", None)
-    if cls is None or getattr(cls, "_chunked_patch_applied", False):
-        return
-
-    LMCacheMPRequestState = getattr(mod, "LMCacheMPRequestState", None)
-    _orig_get_matched = cls.get_num_new_matched_tokens
-    _orig_get_finished = cls.get_finished
-
-    def _get_blocks_per_chunk(self):
-        block_size = getattr(self, "block_size", 1)
-        return max(1, _MAX_TOKENS // block_size)
-
-    def _patched_get_num_new_matched_tokens(self, request, num_computed_tokens):
-        full_match = _orig_get_matched(self, request, num_computed_tokens)
-        if full_match <= 0 or _MAX_TOKENS <= 0:
-            return full_match
-
-        req_id = request.request_id
-        block_size = getattr(self, "block_size", 1)
-        blocks_per_chunk = _get_blocks_per_chunk(self)
-        full_match_blocks = full_match // block_size
-
-        state = _chunk_state.get(req_id)
-        if state is None or state.get("num_computed_at_start") != num_computed_tokens:
-            state = {
-                "full_match_blocks": full_match_blocks,
-                "chunk_end_blocks": 0,
-                "num_computed_at_start": num_computed_tokens,
-                "lookup_done": False,
-            }
-            _chunk_state[req_id] = state
-
-        if state["lookup_done"]:
-            return 0
-
-        remaining = state["full_match_blocks"] - state["chunk_end_blocks"]
-        if remaining <= 0:
-            state["lookup_done"] = True
-            return 0
-
-        this_chunk = min(remaining, blocks_per_chunk)
-        state["chunk_end_blocks"] += this_chunk
-        if state["chunk_end_blocks"] >= state["full_match_blocks"]:
-            state["lookup_done"] = True
-
-        capped = this_chunk * block_size
-        if capped < full_match:
-            logger.debug(
-                "Chunked LMCache: req %s capped %d -> %d tokens "
-                "(chunk %d/%d blocks)",
-                req_id, full_match, capped, this_chunk, full_match_blocks,
-            )
-
-        # Cap the tracker's hit blocks to match what we report
-        tracker = getattr(request, "kv_transfer_params", None)
-        if tracker is not None:
-            orig_hits = getattr(tracker, "num_lmcache_hit_blocks", 0)
-            if orig_hits > this_chunk:
-                tracker.num_lmcache_hit_blocks = this_chunk
-
-        return capped
-
-    def _patched_get_finished(self, scheduler_output):
-        result = _orig_get_finished(self, scheduler_output)
-        # Clean up chunk state for finished requests.
-        # vLLM passes scheduler_output as a set of request-ID strings
-        # (not a SchedulerOutput object), so iterate directly when it
-        # is a set/frozenset; fall back to the attribute path for
-        # forward compatibility.
-        if isinstance(scheduler_output, (set, frozenset)):
-            finished = scheduler_output
-        else:
-            finished = getattr(scheduler_output, "finished_req_ids", [])
-        for req in finished:
-            _chunk_state.pop(req, None)
-        return result
-
-    cls.get_num_new_matched_tokens = _patched_get_num_new_matched_tokens
-    cls.get_finished = _patched_get_finished
-    cls._chunked_patch_applied = True
-    logger.info(
-        "Chunked LMCache connector patch applied "
-        "(max_tokens_per_load=%d)", _MAX_TOKENS,
-    )
-
-
-_orig_import = builtins.__import__
-
-
-def _patching_import(name, *args, **kwargs):
-    module = _orig_import(name, *args, **kwargs)
-    if (
-        name == "lmcache.integration.vllm.lmcache_mp_connector"
-        or (
-            name.startswith("lmcache")
-            and "lmcache.integration.vllm.lmcache_mp_connector" in sys.modules
-        )
-    ):
-        _apply_patch()
-    return module
-
-
-builtins.__import__ = _patching_import
-_apply_patch()
-PY
-}
-
-write_scheduler_assertion_patch() {
-    local patch_dir="$1"
-    mkdir -p "$patch_dir"
-    cat > "$patch_dir/scheduler_assertion_patch.py" <<'PY'
-"""
-Patch vLLM scheduler to handle stale finished_recving gracefully.
-
-The assertion at scheduler.py crashes when a KV transfer reports
-"finished recving" but the request is already in RUNNING state.
-This happens when transfers complete asynchronously and the scheduler
-has already moved the request forward.
-
-Fix: Instead of asserting, log a warning and skip.
-"""
-
-import logging
-import sys
-import builtins
-
-logger = logging.getLogger("scheduler_assertion_patch")
-
-
-def _apply_patch():
-    """Patch vLLM scheduler's _update_from_kv_xfer_finished."""
-    sched_mod = sys.modules.get("vllm.v1.core.sched.scheduler")
-    if sched_mod is None:
-        return
-    req_mod = sys.modules.get("vllm.v1.request")
-    if req_mod is None:
-        return
-    Scheduler = getattr(sched_mod, "Scheduler", None)
-    RequestStatus = getattr(req_mod, "RequestStatus", None)
-    if Scheduler is None or RequestStatus is None:
-        return
-    if getattr(Scheduler, "_kv_xfer_patch_applied", False):
-        return
-
-    _orig_update = Scheduler._update_from_kv_xfer_finished
-
-    def _patched_update(self, kv_connector_output):
-        if self.connector is not None:
-            self.connector.update_connector_output(kv_connector_output)
-        for req_id in kv_connector_output.finished_recving or ():
-            if req_id not in self.requests:
-                continue
-            req = self.requests[req_id]
-            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
-                self.finished_recving_kv_req_ids.add(req_id)
-            elif RequestStatus.is_finished(req.status):
-                self._free_blocks(self.requests[req_id])
-            else:
-                logger.warning(
-                    "Stale finished_recving for req %s in status %s; skipping.",
-                    req_id, req.status.name,
-                )
-        for req_id in kv_connector_output.finished_sending or ():
-            if req_id not in self.requests:
-                continue
-            self._free_blocks(self.requests[req_id])
-
-    Scheduler._update_from_kv_xfer_finished = _patched_update
-    Scheduler._kv_xfer_patch_applied = True
-    logger.info("Scheduler KV transfer assertion patch applied")
-
-
-_orig_import = builtins.__import__
-
-
-def _patching_import(name, *args, **kwargs):
-    module = _orig_import(name, *args, **kwargs)
-    if (
-        name == "vllm.v1.core.sched.scheduler"
-        or (
-            name.startswith("vllm")
-            and "vllm.v1.core.sched.scheduler" in sys.modules
-        )
-    ):
-        _apply_patch()
-    return module
-
-
-builtins.__import__ = _patching_import
-_apply_patch()
-PY
-}
-
-# Workaround for MEC FW <177 RCCL memory reclaim issue
-version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
-if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
-    export HSA_NO_SCRATCH_RECLAIM=1
-fi
-
-export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
-
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
 LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
@@ -591,6 +63,8 @@ mkdir -p "$RESULT_DIR"
 
 OFFLOAD_ARGS=()
 PREFIX_CACHE_ARGS=()
+
+# ---- Lmcache config ----------------------------------------------------------
 LMCACHE_PID=""
 
 cleanup_lmcache_server() {
@@ -648,7 +122,9 @@ case "$OFFLOADING" in
         # MI355X nodes have ~2.7 TiB of host DRAM available for offload;
         # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for
         # worker RSS / page cache / slurm cgroup).
-        TOTAL_CPU_DRAM_GB=2500
+        #TODO: fix
+        TOTAL_CPU_DRAM_GB=3000
+        TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
         # Use vLLM's regular native KV-offload path (OffloadingConnector),
         # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
         # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
@@ -659,7 +135,7 @@ case "$OFFLOADING" in
         # (vllm/config/vllm.py:662).
         OFFLOAD_ARGS=(
             --kv_offloading_backend native
-            --kv_offloading_size "$TOTAL_CPU_DRAM_GB"
+            --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
             --disable-hybrid-kv-cache-manager
         )
         ;;
@@ -667,74 +143,20 @@ case "$OFFLOADING" in
         { set +x; } 2>/dev/null
         unset VLLM_USE_SIMPLE_KV_OFFLOAD
 
-        agentic_pip_install --quiet --no-cache-dir lmcache
-        # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and
-        # CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and
-        # during Kimi fused-MoE model inspection it imports nixl_ep whenever
-        # that module is importable, even when this run is not using EP/NIXL
-        # kernels. The CUDA extension then fails immediately on AMD nodes with
-        # "ImportError: libcuda.so.1".
-        #
-        # LMCache MP also uses CuPy stream APIs while registering vLLM's KV
-        # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime
-        # with cudaErrorInsufficientDriver when LMCache touches the stream. Use
-        # the ROCm 7 CuPy wheel so the same API dispatches through HIP.
-        python3 -m pip uninstall -y \
-            nixl nixl-cu12 nixl-cu13 nixl_ep \
-            >/dev/null 2>&1 || true
-        python3 -m pip uninstall -y \
-            cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \
-            >/dev/null 2>&1 || true
-        agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0
-        python3 - <<'PY'
-import importlib.util
-import sys
-
-spec = importlib.util.find_spec("nixl_ep")
-if spec is not None:
-    locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"])
-    print(
-        "Error: nixl_ep is still importable after LMCache install; "
-        "this ROCm Kimi run would import a CUDA-only nixl_ep module. "
-        f"location={locations}",
-        file=sys.stderr,
-    )
-    sys.exit(1)
-
-try:
-    from cupy_backends.cuda.api import runtime as cupy_runtime
-except Exception as exc:
-    print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr)
-    sys.exit(1)
-
-if not getattr(cupy_runtime, "is_hip", False):
-    print(
-        "Error: CuPy is still using the CUDA backend after installing "
-        "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.",
-        file=sys.stderr,
-    )
-    sys.exit(1)
-PY
-        LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch"
-        write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR"
-        write_chunked_connector_patch "$LMCACHE_ROCM_PATCH_DIR"
-        write_scheduler_assertion_patch "$LMCACHE_ROCM_PATCH_DIR"
-        export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1
-        export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16
-        export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1
-        # Cap external KV tokens loaded per scheduling step to prevent GPU
-        # block exhaustion deadlock at high concurrency (c>=32).  Default
-        # 32768 keeps peak block demand within the GPU KV pool.  Set to 0 to
-        # disable chunking (only safe at low concurrency).
-        export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD="${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:-32768}"
-        export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
+        git clone https://github.com/LMCache/LMCache.git
+        cd LMCache
+        pip install -r requirements/build.txt 
+        CXX=hipcc BUILD_WITH_HIP=1 pip install -e .   --no-build-isolation
+        cd ..
+
         python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
 
         # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
         # pool, but let the external MP server own that pool so vLLM does not
         # split --kv-offloading-size across TP ranks through the integrated
         # LMCache backend.
-        TOTAL_CPU_DRAM_GB=2500
+        #TODO: fix
+        TOTAL_CPU_DRAM_GB=3000
         LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
         LMCACHE_PORT="${LMCACHE_PORT:-5555}"
         LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
@@ -742,7 +164,7 @@ PY
         # ZMQ endpoint. Bind the server to a raw host, but pass the connector a
         # ZMQ-style host string.
         LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
-        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}"
+        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
         LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
         # LMCache read locks are leases on chunks that lookup has promised
         # vLLM can retrieve. The default 300s TTL is too short for this
@@ -750,10 +172,11 @@ PY
         # lookup and retrieve while GPU KV is saturated, which leaves the
         # object present in L1 but no longer readable. Keep the 2.5 TB pool
         # size unchanged and only extend the lookup-to-retrieve lease.
-        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}"
+        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}"
         LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
         LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
         export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
+        export LMCACHE_BLOCKING_TIMEOUT_SECS=120
 
         echo "Starting LMCache MP server..."
         LMCACHE_CMD=(
@@ -786,6 +209,7 @@ PY
     *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
 esac
 
+# ---- LLM server config ----------------------------------------------------------
 EP_ARGS=()
 if [ "$EP_SIZE" -gt 1 ]; then
     EP_ARGS=(--enable-expert-parallel)
@@ -794,6 +218,23 @@ fi
 echo "Starting vllm server..."
 export PYTHONNOUSERSITE=1
 
+# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
+pip install amd-quark
+
+# Disable AITER RMSNorm for TP < 8 due to accuracy issues
+if [ "${TP}" -lt 8 ]; then
+  export VLLM_ROCM_USE_AITER_RMSNORM=0
+fi
+
+# Workaround for MEC FW <177 RCCL memory reclaim issue
+version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
+if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
+    export HSA_NO_SCRATCH_RECLAIM=1
+fi
+
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
 { set +x; } 2>/dev/null
 VLLM_CMD=(
     vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
@@ -802,9 +243,9 @@ VLLM_CMD=(
     --tensor-parallel-size="$TP"
     "${EP_ARGS[@]}"
     --gpu-memory-utilization 0.90
+    --kv-cache-dtype fp8 \
     --block-size=1
     --trust-remote-code
-    --max-model-len "$MAX_MODEL_LEN"
     --max-num-seqs "$CONC"
     --mm-encoder-tp-mode data
     "${PREFIX_CACHE_ARGS[@]}"
@@ -821,4 +262,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh
new file mode 100755
index 000000000..640fe7f65
--- /dev/null
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh
@@ -0,0 +1,272 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for Kimi-K2.5 FP4 on MI355X using vLLM.
+#
+# Required env vars:
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR
+#
+# OFFLOADING values:
+#   none    - vLLM GPU KV only.
+#   cpu     - vLLM native CPU offload.
+#   lmcache - LMCache MP server + vLLM LMCacheMPConnector.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
+
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+# ROCR/HIP visibility for vLLM 0.14+
+if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
+rocm-smi || true
+amd-smi || true
+
+# ---- Resolve traces and install deps ----------------------------------------
+# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
+# corpus has requests up to ~1M proxy tokens that would be rejected.
+# Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#060226
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
+
+resolve_trace_source
+install_agentic_deps
+
+# ---- Server config ----------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
+mkdir -p "$RESULT_DIR"
+
+OFFLOAD_ARGS=()
+PREFIX_CACHE_ARGS=()
+
+# ---- Lmcache config ----------------------------------------------------------
+LMCACHE_PID=""
+
+cleanup_lmcache_server() {
+    if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then
+        kill "$LMCACHE_PID" 2>/dev/null || true
+        wait "$LMCACHE_PID" 2>/dev/null || true
+    fi
+}
+
+trap cleanup_lmcache_server EXIT
+
+wait_for_lmcache_ready() {
+    { set +x; } 2>/dev/null
+    local attempts="${LMCACHE_READY_ATTEMPTS:-120}"
+    local tail_pid=""
+
+    while [ ! -f "$LMCACHE_LOG" ]; do
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before creating log file. Exiting." >&2
+            exit 1
+        fi
+        sleep 1
+    done
+
+    tail -f -n +1 "$LMCACHE_LOG" &
+    tail_pid=$!
+
+    for ((i = 1; i <= attempts; i++)); do
+        if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            return 0
+        fi
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before becoming healthy. Log follows:" >&2
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            cat "$LMCACHE_LOG" >&2 || true
+            exit 1
+        fi
+        sleep 1
+    done
+
+    echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2
+    kill "$tail_pid" 2>/dev/null || true
+    wait "$tail_pid" 2>/dev/null || true
+    cat "$LMCACHE_LOG" >&2 || true
+    exit 1
+}
+
+case "$OFFLOADING" in
+    none) ;;
+    cpu)
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+        # MI355X nodes have ~2.7 TiB of host DRAM available for offload;
+        # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for
+        # worker RSS / page cache / slurm cgroup).
+        TOTAL_CPU_DRAM_GB=3000
+        TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
+        # Use vLLM's regular native KV-offload path (OffloadingConnector),
+        # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
+        # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
+        # would switch it to SimpleCPUOffloadConnector. We intentionally leave
+        # that env var UNSET here so the regular OffloadingConnector path is
+        # used. The shortcut --kv_offloading_backend native + --kv_offloading_size
+        # form constructs the KVTransferConfig at engine startup
+        # (vllm/config/vllm.py:662).
+
+        # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default)
+        # This gives extra cache hit than disabling hybrid kv cache manager
+        # srok,
+        # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma
+        # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60
+        OFFLOAD_ARGS=(
+            --kv_offloading_backend native
+            --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
+            --disable-hybrid-kv-cache-manager
+        )
+        ;;
+    lmcache)
+        { set +x; } 2>/dev/null
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+
+        git clone https://github.com/LMCache/LMCache.git
+        cd LMCache
+        pip install -r requirements/build.txt 
+        CXX=hipcc BUILD_WITH_HIP=1 pip install -e .   --no-build-isolation
+        cd ..
+
+        python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
+
+        # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
+        # pool, but let the external MP server own that pool so vLLM does not
+        # split --kv-offloading-size across TP ranks through the integrated
+        # LMCache backend.
+        TOTAL_CPU_DRAM_GB=3000
+        LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
+        LMCACHE_PORT="${LMCACHE_PORT:-5555}"
+        LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
+        # LMCacheMPConnector concatenates lmcache.mp.host and port into the
+        # ZMQ endpoint. Bind the server to a raw host, but pass the connector a
+        # ZMQ-style host string.
+        LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
+        #LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
+        # (srok)TODO: intentionally increased DRAM size
+        TOTAL_CPU_DRAM_GB=2000
+        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB))}"
+        LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
+        # LMCache read locks are leases on chunks that lookup has promised
+        # vLLM can retrieve. The default 300s TTL is too short for this
+        # long-context agentic queue: TP8/conc32 can spend >300s between
+        # lookup and retrieve while GPU KV is saturated, which leaves the
+        # object present in L1 but no longer readable. Keep the 2.5 TB pool
+        # size unchanged and only extend the lookup-to-retrieve lease.
+        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}"
+        # (srok) check 256 vs 32
+        #LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
+        LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-32}"
+        LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
+        export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
+        export LMCACHE_BLOCKING_TIMEOUT_SECS=120
+
+        set -x
+        echo "Starting LMCache MP server..."
+        LMCACHE_CMD=(
+            lmcache server
+            --host "$LMCACHE_HOST"
+            --port "$LMCACHE_PORT"
+            --http-host "$LMCACHE_HOST"
+            --http-port "$LMCACHE_HTTP_PORT"
+            --l1-size-gb "$LMCACHE_L1_SIZE_GB"
+            --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB"
+            --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS"
+            --chunk-size "$LMCACHE_CHUNK_SIZE"
+            --max-workers "$LMCACHE_MAX_WORKERS"
+            --eviction-policy LRU
+        )
+        printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt"
+        printf '\n' >> "$RESULT_DIR/lmcache_command.txt"
+        "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 &
+        LMCACHE_PID=$!
+        echo "LMCache server PID: $LMCACHE_PID"
+        wait_for_lmcache_ready
+
+        PREFIX_CACHE_ARGS=(--enable-prefix-caching)
+        # srok,
+        # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma
+        # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60
+        OFFLOAD_ARGS=(
+            --kv-transfer-config
+            "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}"
+            --disable-hybrid-kv-cache-manager
+        )
+        ;;
+    *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
+esac
+
+# ---- LLM server config ----------------------------------------------------------
+EP_ARGS=()
+if [ "$EP_SIZE" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
+
+echo "Starting vllm server..."
+export PYTHONNOUSERSITE=1
+
+# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
+pip install -q amd-quark
+
+# Workaround for MEC FW <177 RCCL memory reclaim issue
+version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
+if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
+    export HSA_NO_SCRATCH_RECLAIM=1
+fi
+
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
+{ set +x; } 2>/dev/null
+VLLM_CMD=(
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
+    --host 0.0.0.0
+    --port "$PORT"
+    --tensor-parallel-size="$TP"
+    "${EP_ARGS[@]}"
+    --gpu-memory-utilization 0.95
+    --kv-cache-dtype fp8 \
+    --block-size=32
+    --trust-remote-code
+    --attention-backend "ROCM_AITER_FA" 
+    --max-num-seqs "$CONC"
+    "${PREFIX_CACHE_ARGS[@]}"
+    "${OFFLOAD_ARGS[@]}"
+)
+printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
+"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
index cd114fe96..9f1f79a3f 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
@@ -2,18 +2,23 @@
 set -euo pipefail
 set -x
 
-# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI355X using vLLM.
+# Agentic trace replay benchmark for Kimi-K2.5 FP4 on MI355X using vLLM.
 #
 # Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR
+#
+# OFFLOADING values:
+#   none    - vLLM GPU KV only.
+#   cpu     - vLLM native CPU offload.
+#   lmcache - LMCache MP server + vLLM LMCacheMPConnector.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
 
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -24,6 +29,10 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+rocm-smi || true
+amd-smi || true
+
 # `hf download` creates the target dir if missing and is itself idempotent.
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
@@ -35,59 +44,240 @@ else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
 fi
-rocm-smi || true
-amd-smi || true
 
 # ---- Resolve traces and install deps ----------------------------------------
 # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
 # corpus has requests up to ~1M proxy tokens that would be rejected.
 # Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#060226
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
 
 resolve_trace_source
 install_agentic_deps
 
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
+LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
 mkdir -p "$RESULT_DIR"
 
-OFFLOAD_ARGS=""
+OFFLOAD_ARGS=()
+PREFIX_CACHE_ARGS=()
+
+# ---- Lmcache config ----------------------------------------------------------
+LMCACHE_PID=""
+
+cleanup_lmcache_server() {
+    if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then
+        kill "$LMCACHE_PID" 2>/dev/null || true
+        wait "$LMCACHE_PID" 2>/dev/null || true
+    fi
+}
+
+trap cleanup_lmcache_server EXIT
+
+wait_for_lmcache_ready() {
+    { set +x; } 2>/dev/null
+    local attempts="${LMCACHE_READY_ATTEMPTS:-120}"
+    local tail_pid=""
+
+    while [ ! -f "$LMCACHE_LOG" ]; do
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before creating log file. Exiting." >&2
+            exit 1
+        fi
+        sleep 1
+    done
+
+    tail -f -n +1 "$LMCACHE_LOG" &
+    tail_pid=$!
+
+    for ((i = 1; i <= attempts; i++)); do
+        if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            return 0
+        fi
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before becoming healthy. Log follows:" >&2
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            cat "$LMCACHE_LOG" >&2 || true
+            exit 1
+        fi
+        sleep 1
+    done
+
+    echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2
+    kill "$tail_pid" 2>/dev/null || true
+    wait "$tail_pid" 2>/dev/null || true
+    cat "$LMCACHE_LOG" >&2 || true
+    exit 1
+}
+
 case "$OFFLOADING" in
     none) ;;
     cpu)
-        # SimpleCPUOffloadConnector now works on ROCm with the
-        # vllm/vllm-openai-rocm:nightly-51f22dcfd0... image (vllm-project/vllm@20cac26b).
-        # Use the same offload path as NVIDIA so cross-vendor cpu-offload
-        # numbers are apples-to-apples.
-        # MI355X nodes have substantial DRAM; override workflow default (600 GB)
-        # so we offload up to 2 TB of KV cache.
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+        # MI355X nodes have ~2.7 TiB of host DRAM available for offload;
+        # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for
+        # worker RSS / page cache / slurm cgroup).
+        TOTAL_CPU_DRAM_GB=3000
+        TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
+        # Use vLLM's regular native KV-offload path (OffloadingConnector),
+        # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
+        # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
+        # would switch it to SimpleCPUOffloadConnector. We intentionally leave
+        # that env var UNSET here so the regular OffloadingConnector path is
+        # used. The shortcut --kv_offloading_backend native + --kv_offloading_size
+        # form constructs the KVTransferConfig at engine startup
+        # (vllm/config/vllm.py:662).
+
+        # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default)
+        # This gives extra cache hit than disabling hybrid kv cache manager
+        OFFLOAD_ARGS=(
+            --kv_offloading_backend native
+            --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
+        )
+        ;;
+    lmcache)
+        { set +x; } 2>/dev/null
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+
+        git clone https://github.com/LMCache/LMCache.git
+        cd LMCache
+        pip install -r requirements/build.txt 
+        CXX=hipcc BUILD_WITH_HIP=1 pip install -e .   --no-build-isolation
+        cd ..
+
+        python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
+
+        # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
+        # pool, but let the external MP server own that pool so vLLM does not
+        # split --kv-offloading-size across TP ranks through the integrated
+        # LMCache backend.
+        TOTAL_CPU_DRAM_GB=3000
+        LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
+        LMCACHE_PORT="${LMCACHE_PORT:-5555}"
+        LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
+        # LMCacheMPConnector concatenates lmcache.mp.host and port into the
+        # ZMQ endpoint. Bind the server to a raw host, but pass the connector a
+        # ZMQ-style host string.
+        LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
+        #LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
+        # (srok)TODO: intentionally increased DRAM size
         TOTAL_CPU_DRAM_GB=2000
-        export VLLM_USE_SIMPLE_KV_OFFLOAD=1
-        OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager"
+        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB))}"
+        LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
+        # LMCache read locks are leases on chunks that lookup has promised
+        # vLLM can retrieve. The default 300s TTL is too short for this
+        # long-context agentic queue: TP8/conc32 can spend >300s between
+        # lookup and retrieve while GPU KV is saturated, which leaves the
+        # object present in L1 but no longer readable. Keep the 2.5 TB pool
+        # size unchanged and only extend the lookup-to-retrieve lease.
+        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}"
+        # (srok) check 256 vs 32
+        #LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
+        LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-32}"
+        LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
+        export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
+        export LMCACHE_BLOCKING_TIMEOUT_SECS=120
+
+        set -x
+        echo "Starting LMCache MP server..."
+        LMCACHE_CMD=(
+            lmcache server
+            --host "$LMCACHE_HOST"
+            --port "$LMCACHE_PORT"
+            --http-host "$LMCACHE_HOST"
+            --http-port "$LMCACHE_HTTP_PORT"
+            --l1-size-gb "$LMCACHE_L1_SIZE_GB"
+            --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB"
+            --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS"
+            --chunk-size "$LMCACHE_CHUNK_SIZE"
+            --max-workers "$LMCACHE_MAX_WORKERS"
+            --eviction-policy LRU
+        )
+        printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt"
+        printf '\n' >> "$RESULT_DIR/lmcache_command.txt"
+        "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 &
+        LMCACHE_PID=$!
+        echo "LMCache server PID: $LMCACHE_PID"
+        wait_for_lmcache_ready
+
+        PREFIX_CACHE_ARGS=(--enable-prefix-caching)
+        # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default)
+        # This gives extra cache hit than disabling hybrid kv cache manager
+        OFFLOAD_ARGS=(
+            --kv-transfer-config
+            "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}"
+        )
         ;;
     *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
 esac
 
-if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel"; else EP=" "; fi
+# ---- LLM server config ----------------------------------------------------------
+EP_ARGS=()
+if [ "$EP_SIZE" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
 
 echo "Starting vllm server..."
+export PYTHONNOUSERSITE=1
+
+# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
+pip install -q amd-quark
+
+# Workaround for MEC FW <177 RCCL memory reclaim issue
+version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
+if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
+    export HSA_NO_SCRATCH_RECLAIM=1
+fi
+
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
-export PYTHONNOUSERSITE=1
+export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0
+VLLM_BLOCK_SIZE=32
+ASYNC_SCHEDULING_ARGS=""
+
+if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
+    export VLLM_ROCM_USE_AITER_MOE=0
+    ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+    echo "TP8/EP8: using block size 32, shuffle disabled, AITER MoE disabled, async scheduling disabled."
+elif (( CONC < 64 )); then
+    ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+    echo "c${CONC}: using block size 32, shuffle disabled, async scheduling disabled."
+elif (( CONC == 64 )); then
+    ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+    VLLM_BLOCK_SIZE=16
+    echo "c64: using block size 16, shuffle enabled, async scheduling disabled."
+else
+    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+    VLLM_BLOCK_SIZE=16
+    echo "c${CONC}: using block size 16, shuffle enabled, async scheduling enabled."
+fi
 
-vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
---host 0.0.0.0 \
---port $PORT \
---tensor-parallel-size=$TP \
-$EP \
---gpu-memory-utilization 0.95 \
---max-model-len $MAX_MODEL_LEN \
---kv-cache-dtype fp8 \
---block-size=32 \
---max-num-seqs $CONC \
---attention-backend "ROCM_AITER_UNIFIED_ATTN" \
---trust-remote-code \
-$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
+{ set +x; } 2>/dev/null
+VLLM_CMD=(
+    vllm serve "$MODEL"
+    --host 0.0.0.0
+    --port "$PORT"
+    --tensor-parallel-size="$TP"
+    "${EP_ARGS[@]}"
+    --gpu-memory-utilization 0.95
+    --kv-cache-dtype fp8 
+    --block-size=$VLLM_BLOCK_SIZE 
+    --trust-remote-code
+    --attention-backend "ROCM_AITER_FA" 
+    --max-num-seqs "$CONC"
+    $ASYNC_SCHEDULING_ARGS 
+    "${PREFIX_CACHE_ARGS[@]}"
+    "${OFFLOAD_ARGS[@]}"
+)
+printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
+"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
 
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh
similarity index 53%
rename from benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
rename to benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh
index cdded8860..fe85b05ab 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh
@@ -2,23 +2,31 @@
 set -euo pipefail
 set -x
 
-# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang.
+# Agentic trace replay benchmark for Qwen3.5 FP8 on MI300X using SGLang.
+#
+# Base server recipe follows the upstream MI300X reference
+# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe):
+# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75.
+# The agentic harness (resolve_trace_source / build_replay_cmd /
+# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
+# --disable-radix-cache is dropped because agentic replay needs prefix reuse.
 #
 # Required env vars:
-#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE
 #
 # OFFLOADING values:
-#   none    - SGLang GPU KV only with radix cache disabled.
-#   hicache - SGLang HiCache with local CPU hierarchical cache.
+#   none    - SGLang GPU KV with the default RadixAttention prefix cache.
+#   hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
+
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
 
 SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30}
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -35,14 +43,23 @@ else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
 fi
+
 rocm-smi || true
 amd-smi || true
 
+# ---- Resolve traces and install deps ----------------------------------------
+# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
+# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
+# signal at high concurrency.
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#060226
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
+
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps
 
-# ---- Server config ----------------------------------------------------------
+# ---- Cache / offload config -------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
 mkdir -p "$RESULT_DIR"
 
@@ -55,26 +72,19 @@ case "$OFFLOADING" in
         # replay needs it; --disable-radix-cache would zero the hit rate.
         ;;
     hicache)
-        # MI355X nodes have about 3 TB of host DRAM, but Qwen3.5's hybrid
-        # GDN/Mamba path allocates two HiCache host pools per TP rank: one for
-        # hierarchical KV cache and one for hierarchical Mamba cache. A 2 TB
-        # node-total target at TP=8 is therefore 2000 / (8 * 2) = 125 GB per
-        # host pool, not 250 GB. Keep overrides for one-off tuning.
-        TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-2000}"
+        # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per
+        # TP rank (one hierarchical KV, one hierarchical Mamba), so the
+        # node-total DRAM budget divides by TP and the host-pool count.
+        TOTAL_CPU_DRAM_GB=3000
         HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
-        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-180}}"
+        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}"
         HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
-        # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler on
-        # MI355X, which requires page_size=1. The kernel/page_first HiCache
-        # transfer path faults on first prefill in this mode on ROCm, so keep
-        # the default on the safer direct/layer_first copy path. These remain
-        # env-overridable for future SGLang/ROCm fixes.
+        # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which
+        # requires page_size=1. Keep the safer direct/layer_first copy path;
+        # kernel/page_first faults on first prefill in this mode on ROCm.
         HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
         HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
         HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
-        # SGLang --hicache-size is per rank per host pool, while the workflow
-        # input is a node-total DRAM budget. Divide by TP and the number of
-        # host pools unless HICACHE_SIZE_GB is set directly for one-off tuning.
         HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
         if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
             HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
@@ -92,17 +102,12 @@ case "$OFFLOADING" in
             --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
             --hicache-write-policy "$HICACHE_WRITE_POLICY"
         )
-        # HiCache startup reaches API readiness, but SGLang's internal warmup
-        # request has timed out after 600s on this Qwen MI355X path. Let aiperf
-        # own benchmark traffic instead of blocking server readiness on it.
+        # HiCache startup reaches API readiness but SGLang's internal warmup
+        # request can time out on this path; let aiperf own benchmark traffic.
         WARMUP_ARGS=(--skip-server-warmup)
-        # Keep request concurrency as the swept variable, but do not force
-        # HiCache runs to capture ROCm graphs at every high concurrency point.
-        # The conc=32 HiCache job crashed after startup readiness, before any
-        # aiperf traffic, while conc=16 is the highest known-good capture size
-        # for this model/server path. Requests above the capture size can still
-        # run; they just do not require a larger captured graph at startup.
-        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}"
+        # Don't force ROCm graph capture at every high concurrency point; conc=16
+        # is the highest known-good capture size for this model/server path.
+        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}"
         if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
             CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
         fi
@@ -116,31 +121,24 @@ esac
 echo "Starting SGLang server..."
 export PYTHONNOUSERSITE=1
 
-{ set +x; } 2>/dev/null
-SGLANG_CMD=(
-    python3 -m sglang.launch_server
-    --attention-backend triton
-    --model-path "$MODEL_PATH" --served-model-name "$MODEL"
-    --host=0.0.0.0
-    --port "$PORT"
-    --tensor-parallel-size "$TP"
-    --ep-size "$EP_SIZE"
-    --trust-remote-code
-    --tokenizer-worker-num 6
-    --enable-aiter-allreduce-fusion
-    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
-    --max-running-requests "$CONC"
-    --max-prefill-tokens 32768
-    --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL"
-    --mem-fraction-static 0.8
-    --context-length "$MAX_MODEL_LEN"
-    --enable-metrics
-    "${CACHE_ARGS[@]}"
-    "${WARMUP_ARGS[@]}"
-)
-printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt"
-printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt"
-"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 &
+python3 -m sglang.launch_server \
+    --attention-backend aiter \
+    --model-path $MODEL \
+    --host=0.0.0.0 \
+    --port $PORT \
+    --tensor-parallel-size $TP \
+    --ep-size $EP_SIZE \
+    --trust-remote-code \
+    --model-loader-extra-config '{"enable_multithread_load": true}' \
+    --watchdog-timeout 1200  \
+    --tokenizer-worker-num 6 \
+    --cuda-graph-max-bs $CONC \
+    --max-running-requests $CONC \
+    --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
+    --mem-fraction-static 0.8 \
+    "${CACHE_ARGS[@]}" \
+    "${WARMUP_ARGS[@]}" \
+    --enable-metrics > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
 
@@ -149,4 +147,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
index ff901b674..8c6f82410 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
@@ -2,18 +2,31 @@
 set -euo pipefail
 set -x
 
-# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang.
+# Agentic trace replay benchmark for Qwen3.5 FP8 on MI300X using SGLang.
+#
+# Base server recipe follows the upstream MI300X reference
+# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe):
+# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75.
+# The agentic harness (resolve_trace_source / build_replay_cmd /
+# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
+# --disable-radix-cache is dropped because agentic replay needs prefix reuse.
 #
 # Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV with the default RadixAttention prefix cache.
+#   hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
 
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
+
+SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -30,23 +43,87 @@ else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
 fi
+
 rocm-smi || true
 amd-smi || true
 
+# ---- Resolve traces and install deps ----------------------------------------
+# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
+# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
+# signal at high concurrency.
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#060226
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
+
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps
 
-# ---- Start SGLang server ----------------------------------------------------
+# ---- Cache / offload config -------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
 mkdir -p "$RESULT_DIR"
 
+CACHE_ARGS=()
+WARMUP_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per
+        # TP rank (one hierarchical KV, one hierarchical Mamba), so the
+        # node-total DRAM budget divides by TP and the host-pool count.
+        TOTAL_CPU_DRAM_GB=3000
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
+        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+        # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which
+        # requires page_size=1. Keep the safer direct/layer_first copy path;
+        # kernel/page_first faults on first prefill in this mode on ROCm.
+        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
+            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
+        fi
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --page-size "$HICACHE_PAGE_SIZE"
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        # HiCache startup reaches API readiness but SGLang's internal warmup
+        # request can time out on this path; let aiperf own benchmark traffic.
+        WARMUP_ARGS=(--skip-server-warmup)
+        # Don't force ROCm graph capture at every high concurrency point; conc=16
+        # is the highest known-good capture size for this model/server path.
+        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}"
+        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
+            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
+        fi
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
 echo "Starting SGLang server..."
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
     --attention-backend triton \
-    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
+    --model-path $MODEL \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
@@ -56,10 +133,10 @@ python3 -m sglang.launch_server \
     --enable-aiter-allreduce-fusion \
     --cuda-graph-max-bs $CONC \
     --max-running-requests $CONC \
-    --max-prefill-tokens 32768 \
-    --scheduler-recv-interval 30 \
+    --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
     --mem-fraction-static 0.8 \
-    --context-length $MAX_MODEL_LEN \
+    "${CACHE_ARGS[@]}" \
+    "${WARMUP_ARGS[@]}" \
     --enable-metrics > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
@@ -69,4 +146,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file