@@ -64,22 +64,33 @@ case "$OFFLOADING" in
6464 agentic_pip_install --quiet --no-cache-dir lmcache
6565 python3 -c "import lmcache.integration.vllm.vllm_v1_adapter" > /dev/null
6666
67- # B200 DGXC nodes have ~2.7 TiB host DRAM. Keep LMCache's local CPU
68- # pool at the same 2.5 TB envelope as native offload while leaving room
69- # for vLLM worker RSS and page cache. vLLM splits this total across TP
70- # ranks for --kv-offloading-backend=lmcache.
67+ # B200 DGXC nodes have ~2.7 TiB host DRAM. Keep the TP=8 LMCache
68+ # path at the same 2.5 TB envelope as native offload while leaving room
69+ # for vLLM worker RSS and page cache.
70+ #
71+ # vLLM splits --kv-offloading-size across TP ranks for LMCache. In the
72+ # current vLLM 0.21.0 + LMCache 0.4.5 integrated connector path, Kimi's
73+ # MLA/HND layout cannot use LazyMixedMemoryAllocator and falls back to a
74+ # full pinned MixedMemoryAllocator allocation. That means TP=4 with a
75+ # 2.5 TB total tries to cudaHostAlloc ~625 GB per rank and fails during
76+ # engine startup, while TP=8 at ~312.5 GB per rank starts successfully.
77+ # Cap lower-TP LMCache runs to the same proven per-rank envelope.
7178 TOTAL_CPU_DRAM_GB=2500
79+ LMCACHE_MAX_LOCAL_CPU_GB_PER_RANK="${LMCACHE_MAX_LOCAL_CPU_GB_PER_RANK:-313}"
80+ LMCACHE_TOTAL_CPU_DRAM_GB="$TOTAL_CPU_DRAM_GB"
81+ if (( LMCACHE_TOTAL_CPU_DRAM_GB > TP * LMCACHE_MAX_LOCAL_CPU_GB_PER_RANK )); then
82+ LMCACHE_TOTAL_CPU_DRAM_GB=$(( TP * LMCACHE_MAX_LOCAL_CPU_GB_PER_RANK ))
83+ fi
84+ echo "LMCache CPU offload pool: ${LMCACHE_TOTAL_CPU_DRAM_GB} GB total across TP=${TP}"
7285 export LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
73- # Avoid pinning the full CPU pool during engine startup; the integrated
74- # LMCache allocator grows as agentic prefixes accumulate.
75- export LMCACHE_ENABLE_LAZY_MEMORY_ALLOCATOR="${LMCACHE_ENABLE_LAZY_MEMORY_ALLOCATOR:-true}"
76- export LMCACHE_LAZY_MEMORY_INITIAL_RATIO="${LMCACHE_LAZY_MEMORY_INITIAL_RATIO:-0.01}"
77- export LMCACHE_LAZY_MEMORY_STEP_RATIO="${LMCACHE_LAZY_MEMORY_STEP_RATIO:-0.02}"
86+ # Avoid a noisy failed lazy-allocator fallback; the per-rank cap above is
87+ # the actual startup guard for this Kimi/vLLM/LMCache combination.
88+ export LMCACHE_ENABLE_LAZY_MEMORY_ALLOCATOR="${LMCACHE_ENABLE_LAZY_MEMORY_ALLOCATOR:-false}"
7889
7990 PREFIX_CACHE_ARGS=(--enable-prefix-caching)
8091 OFFLOAD_ARGS=(
8192 --kv-offloading-backend lmcache
82- --kv-offloading-size "$TOTAL_CPU_DRAM_GB"
93+ --kv-offloading-size "$LMCACHE_TOTAL_CPU_DRAM_GB"
8394 --disable-hybrid-kv-cache-manager
8495 )
8596 ;;
0 commit comments