Skip to content

Commit 859aec5

Browse files
committed
fix(agentic): cap Kimi LMCache CPU pool per rank
1 parent 1a300d3 commit 859aec5

1 file changed

Lines changed: 21 additions & 10 deletions

File tree

benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -64,22 +64,33 @@ case "$OFFLOADING" in
6464
agentic_pip_install --quiet --no-cache-dir lmcache
6565
python3 -c "import lmcache.integration.vllm.vllm_v1_adapter" >/dev/null
6666

67-
# B200 DGXC nodes have ~2.7 TiB host DRAM. Keep LMCache's local CPU
68-
# pool at the same 2.5 TB envelope as native offload while leaving room
69-
# for vLLM worker RSS and page cache. vLLM splits this total across TP
70-
# ranks for --kv-offloading-backend=lmcache.
67+
# B200 DGXC nodes have ~2.7 TiB host DRAM. Keep the TP=8 LMCache
68+
# path at the same 2.5 TB envelope as native offload while leaving room
69+
# for vLLM worker RSS and page cache.
70+
#
71+
# vLLM splits --kv-offloading-size across TP ranks for LMCache. In the
72+
# current vLLM 0.21.0 + LMCache 0.4.5 integrated connector path, Kimi's
73+
# MLA/HND layout cannot use LazyMixedMemoryAllocator and falls back to a
74+
# full pinned MixedMemoryAllocator allocation. That means TP=4 with a
75+
# 2.5 TB total tries to cudaHostAlloc ~625 GB per rank and fails during
76+
# engine startup, while TP=8 at ~312.5 GB per rank starts successfully.
77+
# Cap lower-TP LMCache runs to the same proven per-rank envelope (default 313 GB; override with LMCACHE_MAX_LOCAL_CPU_GB_PER_RANK).
7178
TOTAL_CPU_DRAM_GB=2500
79+
LMCACHE_MAX_LOCAL_CPU_GB_PER_RANK="${LMCACHE_MAX_LOCAL_CPU_GB_PER_RANK:-313}"
80+
LMCACHE_TOTAL_CPU_DRAM_GB="$TOTAL_CPU_DRAM_GB"
81+
if (( LMCACHE_TOTAL_CPU_DRAM_GB > TP * LMCACHE_MAX_LOCAL_CPU_GB_PER_RANK )); then
82+
LMCACHE_TOTAL_CPU_DRAM_GB=$((TP * LMCACHE_MAX_LOCAL_CPU_GB_PER_RANK))
83+
fi
84+
echo "LMCache CPU offload pool: ${LMCACHE_TOTAL_CPU_DRAM_GB} GB total across TP=${TP}"
7285
export LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
73-
# Avoid pinning the full CPU pool during engine startup; the integrated
74-
# LMCache allocator grows as agentic prefixes accumulate.
75-
export LMCACHE_ENABLE_LAZY_MEMORY_ALLOCATOR="${LMCACHE_ENABLE_LAZY_MEMORY_ALLOCATOR:-true}"
76-
export LMCACHE_LAZY_MEMORY_INITIAL_RATIO="${LMCACHE_LAZY_MEMORY_INITIAL_RATIO:-0.01}"
77-
export LMCACHE_LAZY_MEMORY_STEP_RATIO="${LMCACHE_LAZY_MEMORY_STEP_RATIO:-0.02}"
86+
# Disable the lazy allocator by default to avoid its noisy failed fallback; the per-rank cap above is
87+
# the actual startup guard for this Kimi/vLLM/LMCache combination.
88+
export LMCACHE_ENABLE_LAZY_MEMORY_ALLOCATOR="${LMCACHE_ENABLE_LAZY_MEMORY_ALLOCATOR:-false}"
7889

7990
PREFIX_CACHE_ARGS=(--enable-prefix-caching)
8091
OFFLOAD_ARGS=(
8192
--kv-offloading-backend lmcache
82-
--kv-offloading-size "$TOTAL_CPU_DRAM_GB"
93+
--kv-offloading-size "$LMCACHE_TOTAL_CPU_DRAM_GB"
8394
--disable-hybrid-kv-cache-manager
8495
)
8596
;;

0 commit comments

Comments
 (0)