@@ -90,6 +90,166 @@ wait_for_lmcache_ready() {
9090 exit 1
9191}
9292
# Write a sitecustomize.py into the directory given as $1.
#
# Python imports a module named `sitecustomize` at interpreter startup when one
# is importable, so putting this directory on PYTHONPATH makes the patch below
# run in every Python process started afterwards. The patch body is inert
# unless LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR=1 is set in the environment.
#
# Arguments:
#   $1  directory to create (if needed) and write sitecustomize.py into
# Returns:
#   non-zero if the argument is missing/empty or the directory cannot be made
write_lmcache_cuda_mp_patch () {
    if [ -z "${1:-}" ]; then
        # An empty directory would make the redirect below target
        # "/sitecustomize.py"; refuse instead of writing there.
        echo "write_lmcache_cuda_mp_patch: missing patch directory argument" >&2
        return 1
    fi
    local patch_dir="$1"
    mkdir -p -- "$patch_dir" || return 1
    # The delimiter is quoted, so the shell performs no expansion inside the
    # heredoc: the Python source is written out verbatim.
    cat > "$patch_dir/sitecustomize.py" <<'PY'
"""Runtime compatibility for LMCache MP on CUDA Kimi MLA KV caches."""

import os
import threading

if os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR") == "1":
    import builtins
    import sys

    _orig_import = builtins.__import__

    def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
        """Monkey-patch ``LazyMemoryAllocator`` for demand-driven expansion.

        ``_lazy_memory_allocator`` is the already-imported
        ``lmcache.v1.lazy_memory_allocator`` module. The patch is applied at
        most once per class; a marker attribute guards against re-entry.
        """
        _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator

        if getattr(_LazyMemoryAllocator, "_agentic_cuda_demand_patch", False):
            return

        _orig_init = _LazyMemoryAllocator.__init__
        _orig_allocate = _LazyMemoryAllocator.allocate
        _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate

        def _expand_to(self, target_size: int) -> None:
            """Pin and commit host memory until ``_curr_size`` reaches the target.

            The target is aligned to ``PIN_CHUNK_SIZE`` and capped at
            ``_final_size``. Work is done under the per-instance expand lock,
            in batches of at most ``COMMIT_SIZE`` bytes per commit.
            """
            target_size = min(
                self._final_size,
                _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
            )
            lock = self._agentic_cuda_demand_expand_lock
            with lock:
                # Another caller may already have grown the pool this far.
                if target_size <= self._curr_size:
                    return

                start_size = self._curr_size
                while self._curr_size < target_size:
                    commit_start = self._curr_size
                    commit_target = min(
                        target_size, self._curr_size + self.COMMIT_SIZE
                    )
                    # NOTE(review): every step pins a full PIN_CHUNK_SIZE, so
                    # _curr_size can pass commit_target/_final_size if
                    # _final_size is not a multiple of PIN_CHUNK_SIZE --
                    # confirm upstream keeps it aligned.
                    while self._curr_size < commit_target:
                        self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
                        self._curr_size += self.PIN_CHUNK_SIZE
                    self._commit_expansion(self._curr_size - commit_start)

                self._log_expansion_progress(self._curr_size - start_size)

        def _demand_step_bytes(self) -> int:
            """Return the per-expansion growth step in bytes.

            Read from ``LMCACHE_CUDA_DEMAND_PINNED_STEP_GB`` (default 64) and
            never smaller than ``COMMIT_SIZE``. An unparsable, NaN or infinite
            value falls back to the default instead of raising from inside an
            allocation call.
            """
            raw = os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_STEP_GB", "64")
            try:
                # int() raises ValueError for NaN and OverflowError for inf.
                step_bytes = int(float(raw) * (1024**3))
            except (ValueError, OverflowError):
                _lazy_memory_allocator.logger.warning(
                    "Agentic CUDA patch: ignoring invalid "
                    "LMCACHE_CUDA_DEMAND_PINNED_STEP_GB=%r; using 64 GB",
                    raw,
                )
                step_bytes = 64 * (1024**3)
            return max(self.COMMIT_SIZE, step_bytes)

        def _retry_with_demand_expansion(self, allocate_once):
            """Call ``allocate_once`` and grow the pool while it returns None.

            Returns the first non-None result, or None once the pool size
            observed before an attempt has reached ``_final_size`` and the
            attempt still failed.
            """
            step_bytes = None
            while True:
                # Snapshot the size *before* attempting. If another thread
                # expands in the meantime, _expand_to below becomes a no-op and
                # we simply retry against the larger pool instead of pinning a
                # further step that nobody asked for.
                observed_size = self._curr_size
                obj = allocate_once()
                if obj is not None or observed_size >= self._final_size:
                    return obj
                if step_bytes is None:
                    # Parsed only on the slow path; successful allocations
                    # never touch the environment.
                    step_bytes = _demand_step_bytes(self)
                _expand_to(self, observed_size + step_bytes)

        def _patched_init(self, *args, **kwargs):
            """Run the original constructor, then stop background expansion."""
            _orig_init(self, *args, **kwargs)
            self._agentic_cuda_demand_expand_lock = threading.Lock()

            # LMCache MP's upstream LazyMemoryAllocator currently expands to
            # the final pinned size in a background thread. On CUDA Kimi TP4,
            # vLLM reaches KV-cache registration only after that 1.5 TB pool
            # is fully pinned, and the server-side IPC open path can stall
            # before acknowledging register_kv_caches. Keep the same final
            # capacity, but pin/commit extra host memory only when L1
            # allocations actually need it.
            self._stop_expand.set()
            self._expand_thread.join()
            _lazy_memory_allocator.logger.info(
                "Agentic CUDA patch: using demand-driven LMCache pinned "
                "memory expansion; final capacity remains %s MB",
                self._final_size >> 20,
            )

        def _patched_allocate(
            self,
            shapes,
            dtypes,
            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
            allocator_type=None,
        ):
            """``allocate`` that expands the pool on demand before giving up."""
            return _retry_with_demand_expansion(
                self,
                lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
            )

        def _patched_batched_allocate(
            self,
            shapes,
            dtypes,
            batch_size,
            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
            allocator_type=None,
        ):
            """``batched_allocate`` that expands the pool on demand."""
            return _retry_with_demand_expansion(
                self,
                lambda: _orig_batched_allocate(
                    self, shapes, dtypes, batch_size, fmt, allocator_type
                ),
            )

        _LazyMemoryAllocator.__init__ = _patched_init
        _LazyMemoryAllocator.allocate = _patched_allocate
        _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
        _LazyMemoryAllocator._agentic_cuda_demand_patch = True

    def _patch_l1_memory_manager(_memory_manager) -> None:
        """Make ``L1MemoryManager.get_memory_usage`` report final capacity.

        ``_memory_manager`` is the already-imported
        ``lmcache.v1.distributed.memory_manager`` module. Nothing is patched
        if either class is missing from it, or if the patch was applied before.
        """
        _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None)
        _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None)
        if _L1MemoryManager is None or _LazyMemoryAllocator is None:
            return
        if getattr(_L1MemoryManager, "_agentic_cuda_final_capacity_patch", False):
            return

        _orig_get_memory_usage = _L1MemoryManager.get_memory_usage

        def _patched_get_memory_usage(self):
            """Return ``(used_bytes, total_bytes)`` for the L1 pool.

            For a LazyMemoryAllocator the total is ``_final_size`` rather than
            whatever the original method would report; other allocators defer
            to the original implementation.
            """
            allocator = getattr(self, "_allocator", None)
            if isinstance(allocator, _LazyMemoryAllocator):
                address_manager = allocator.get_address_manager()
                used_size = (
                    address_manager.get_heap_size() - address_manager.get_free_size()
                )
                return used_size, allocator._final_size
            return _orig_get_memory_usage(self)

        _L1MemoryManager.get_memory_usage = _patched_get_memory_usage
        _L1MemoryManager._agentic_cuda_final_capacity_patch = True

    def _maybe_patch_lazy_memory_allocator() -> None:
        """Patch the allocator module if it is loaded and exposes the class."""
        module = sys.modules.get("lmcache.v1.lazy_memory_allocator")
        if module is not None and hasattr(module, "LazyMemoryAllocator"):
            _patch_lazy_memory_allocator(module)

    def _maybe_patch_l1_memory_manager() -> None:
        """Patch the memory-manager module if it is loaded and exposes the class."""
        module = sys.modules.get("lmcache.v1.distributed.memory_manager")
        if module is not None and hasattr(module, "L1MemoryManager"):
            _patch_l1_memory_manager(module)

    def _agentic_cuda_import(name, globals=None, locals=None, fromlist=(), level=0):
        """``builtins.__import__`` replacement that patches LMCache once loaded.

        Delegates to the original import first, then -- for the target module
        names, or for any ``lmcache*`` import once a target is present in
        ``sys.modules`` -- attempts the corresponding patch. The patch
        functions are idempotent, so repeated triggers are harmless.
        """
        module = _orig_import(name, globals, locals, fromlist, level)
        if name == "lmcache.v1.lazy_memory_allocator" or (
            name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules
        ):
            _maybe_patch_lazy_memory_allocator()
        if name == "lmcache.v1.distributed.memory_manager" or (
            name.startswith("lmcache")
            and "lmcache.v1.distributed.memory_manager" in sys.modules
        ):
            _maybe_patch_l1_memory_manager()
        return module

    builtins.__import__ = _agentic_cuda_import
    # Cover the case where the target modules were imported before this hook
    # was installed.
    _maybe_patch_lazy_memory_allocator()
    _maybe_patch_l1_memory_manager()
PY
}
252+
93253case " $OFFLOADING " in
94254 none)
95255 ;;
@@ -114,6 +274,10 @@ case "$OFFLOADING" in
114274 unset VLLM_USE_SIMPLE_KV_OFFLOAD
115275
116276 agentic_pip_install --quiet --no-cache-dir lmcache
277+ LMCACHE_CUDA_PATCH_DIR=" $RESULT_DIR /lmcache_cuda_patch"
278+ write_lmcache_cuda_mp_patch " $LMCACHE_CUDA_PATCH_DIR "
279+ export LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR=1
280+ export PYTHONPATH=" $LMCACHE_CUDA_PATCH_DIR ${PYTHONPATH: +: $PYTHONPATH } "
117281 python3 -c " import lmcache.integration.vllm.lmcache_mp_connector" > /dev/null
118282
119283 # Keep the semantic CPU KV pool at 2.5 TB for every TP shape. MP mode
0 commit comments