manual

seungrokj · seungrokj · commit 806b3c948bd7 · 2026-05-26T13:40:00.000+09:00
Signed-off-by: seungrokj <seungrok.jung@amd.com>
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
@@ -90,6 +90,166 @@ wait_for_lmcache_ready() {
     exit 1
 }
 
+write_lmcache_cuda_mp_patch() {
+    local patch_dir="$1"
+    mkdir -p "$patch_dir"
+    cat > "$patch_dir/sitecustomize.py" <<'PY'
+"""Runtime compatibility for LMCache MP on CUDA Kimi MLA KV caches."""
+
+import os
+import threading
+
+if os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR") == "1":
+    import builtins
+    import sys
+
+    _orig_import = builtins.__import__
+
+    def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
+        """Switch ``LazyMemoryAllocator`` to demand-driven pinned-memory growth.
+
+        Args:
+            _lazy_memory_allocator: The imported module object that exposes
+                ``LazyMemoryAllocator``, ``align_to``, ``MemoryFormat`` and
+                ``logger``.
+
+        The class's ``__init__``, ``allocate`` and ``batched_allocate`` are
+        wrapped in place. The ``_agentic_cuda_demand_patch`` class flag makes
+        repeated calls a no-op.
+        """
+        _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator
+
+        if getattr(_LazyMemoryAllocator, "_agentic_cuda_demand_patch", False):
+            return
+
+        _orig_init = _LazyMemoryAllocator.__init__
+        _orig_allocate = _LazyMemoryAllocator.allocate
+        _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate
+
+        # Parse the step override once, at patch time. A malformed value must
+        # not surface later as an exception inside the allocation path, so it
+        # falls back to the 64 GB default with a single warning.
+        try:
+            _step_override_bytes = int(
+                float(os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_STEP_GB", "64"))
+                * (1024**3)
+            )
+        except (ValueError, OverflowError):
+            _step_override_bytes = 64 * (1024**3)
+            _lazy_memory_allocator.logger.warning(
+                "Agentic CUDA patch: ignoring invalid "
+                "LMCACHE_CUDA_DEMAND_PINNED_STEP_GB=%r; using 64 GB steps",
+                os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_STEP_GB"),
+            )
+
+        def _expand_to(self, target_size: int) -> None:
+            """Pin and commit memory until ``_curr_size`` reaches ``target_size``.
+
+            ``target_size`` is rounded up to ``PIN_CHUNK_SIZE`` and capped at
+            ``_final_size``. Memory is pinned chunk by chunk and committed in
+            batches of at most ``COMMIT_SIZE`` bytes, all under the
+            per-instance expansion lock.
+            """
+            target_size = min(
+                self._final_size,
+                _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
+            )
+            with self._agentic_cuda_demand_expand_lock:
+                # Another thread may already have grown the pool far enough.
+                if target_size <= self._curr_size:
+                    return
+
+                start_size = self._curr_size
+                while self._curr_size < target_size:
+                    commit_start = self._curr_size
+                    commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
+                    while self._curr_size < commit_target:
+                        # Clamp the last chunk so _curr_size never passes the
+                        # commit target (and therefore never passes
+                        # _final_size). With chunk-aligned sizes this is
+                        # always a full PIN_CHUNK_SIZE.
+                        chunk_size = min(
+                            self.PIN_CHUNK_SIZE, commit_target - self._curr_size
+                        )
+                        self._pin_memory_chunk(self._curr_size, chunk_size)
+                        self._curr_size += chunk_size
+                    self._commit_expansion(self._curr_size - commit_start)
+
+                self._log_expansion_progress(self._curr_size - start_size)
+
+        def _retry_with_demand_expansion(self, allocate_once):
+            """Call ``allocate_once``; on ``None`` grow the pool and retry.
+
+            Returns the first non-``None`` result, or ``None`` once the pool
+            has reached ``_final_size`` and allocation still fails.
+            """
+            obj = allocate_once()
+            if obj is not None:
+                return obj
+            if not hasattr(self, "_agentic_cuda_demand_expand_lock"):
+                # Instance was constructed before the class was patched, so
+                # _patched_init never ran for it. Leave expansion to whatever
+                # the original __init__ set up instead of failing on the
+                # missing lock.
+                return obj
+
+            step_bytes = max(self.COMMIT_SIZE, _step_override_bytes)
+            while obj is None and self._curr_size < self._final_size:
+                _expand_to(self, self._curr_size + step_bytes)
+                obj = allocate_once()
+
+            return obj
+
+        def _patched_init(self, *args, **kwargs):
+            """Run the original ``__init__``, then stop its background expansion."""
+            _orig_init(self, *args, **kwargs)
+            self._agentic_cuda_demand_expand_lock = threading.Lock()
+
+            # LMCache MP's upstream LazyMemoryAllocator currently expands to
+            # the final pinned size in a background thread. On CUDA Kimi TP4,
+            # vLLM reaches KV-cache registration only after that 1.5 TB pool
+            # is fully pinned, and the server-side IPC open path can stall
+            # before acknowledging register_kv_caches. Keep the same final
+            # capacity, but pin/commit extra host memory only when L1
+            # allocations actually need it.
+            self._stop_expand.set()
+            self._expand_thread.join()
+            _lazy_memory_allocator.logger.info(
+                "Agentic CUDA patch: using demand-driven LMCache pinned "
+                "memory expansion; final capacity remains %s MB",
+                self._final_size >> 20,
+            )
+
+        def _patched_allocate(
+            self,
+            shapes,
+            dtypes,
+            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
+            allocator_type=None,
+        ):
+            """``allocate`` wrapper that expands the pinned pool on failure."""
+            return _retry_with_demand_expansion(
+                self,
+                lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
+            )
+
+        def _patched_batched_allocate(
+            self,
+            shapes,
+            dtypes,
+            batch_size,
+            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
+            allocator_type=None,
+        ):
+            """``batched_allocate`` wrapper that expands the pool on failure."""
+            return _retry_with_demand_expansion(
+                self,
+                lambda: _orig_batched_allocate(
+                    self, shapes, dtypes, batch_size, fmt, allocator_type
+                ),
+            )
+
+        _LazyMemoryAllocator.__init__ = _patched_init
+        _LazyMemoryAllocator.allocate = _patched_allocate
+        _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
+        _LazyMemoryAllocator._agentic_cuda_demand_patch = True
+
+    def _patch_l1_memory_manager(_memory_manager) -> None:
+        """Make ``L1MemoryManager.get_memory_usage`` report the final capacity.
+
+        ``_memory_manager`` is the imported module object. Nothing happens if
+        it lacks ``L1MemoryManager`` or ``LazyMemoryAllocator``, or if the
+        ``_agentic_cuda_final_capacity_patch`` class flag is already set.
+        """
+        manager_cls = getattr(_memory_manager, "L1MemoryManager", None)
+        lazy_cls = getattr(_memory_manager, "LazyMemoryAllocator", None)
+        if manager_cls is None or lazy_cls is None:
+            return
+        if getattr(manager_cls, "_agentic_cuda_final_capacity_patch", False):
+            return
+
+        upstream_get_memory_usage = manager_cls.get_memory_usage
+
+        def _patched_get_memory_usage(self):
+            """Return ``(used_bytes, allocator._final_size)`` for lazy allocators.
+
+            Any other allocator type is delegated to the original method.
+            """
+            backing = getattr(self, "_allocator", None)
+            if not isinstance(backing, lazy_cls):
+                return upstream_get_memory_usage(self)
+
+            addr_mgr = backing.get_address_manager()
+            heap_bytes = addr_mgr.get_heap_size()
+            free_bytes = addr_mgr.get_free_size()
+            # Report the final size as the total, not the currently pinned size.
+            return heap_bytes - free_bytes, backing._final_size
+
+        manager_cls.get_memory_usage = _patched_get_memory_usage
+        manager_cls._agentic_cuda_final_capacity_patch = True
+
+    def _maybe_patch_lazy_memory_allocator() -> None:
+        """Patch the lazy allocator module if it is already in ``sys.modules``."""
+        loaded = sys.modules.get("lmcache.v1.lazy_memory_allocator")
+        # Skip when the module is absent or does not (yet) define the class.
+        if loaded is None or not hasattr(loaded, "LazyMemoryAllocator"):
+            return
+        _patch_lazy_memory_allocator(loaded)
+
+    def _maybe_patch_l1_memory_manager() -> None:
+        """Patch the L1 memory manager module if it is already in ``sys.modules``."""
+        loaded = sys.modules.get("lmcache.v1.distributed.memory_manager")
+        # Skip when the module is absent or does not (yet) define the class.
+        if loaded is None or not hasattr(loaded, "L1MemoryManager"):
+            return
+        _patch_l1_memory_manager(loaded)
+
+    def _agentic_cuda_import(name, globals=None, locals=None, fromlist=(), level=0):
+        """``builtins.__import__`` replacement that patches LMCache once loaded.
+
+        Delegates to the original ``__import__`` with the same arguments and
+        returns its result unchanged. After an LMCache-related import
+        succeeds, both ``_maybe_patch_*`` helpers run; each one looks up its
+        target module in ``sys.modules`` and does nothing if it is absent.
+        """
+        module = _orig_import(name, globals, locals, fromlist, level)
+        if level > 0:
+            # Relative import: ``name`` is package-relative (possibly empty),
+            # so identify the importing package from its module globals.
+            importer = globals or {}
+            origin = importer.get("__package__") or importer.get("__name__") or ""
+        else:
+            origin = name
+        if origin.startswith("lmcache"):
+            _maybe_patch_lazy_memory_allocator()
+            _maybe_patch_l1_memory_manager()
+        return module
+
+    builtins.__import__ = _agentic_cuda_import
+    _maybe_patch_lazy_memory_allocator()
+    _maybe_patch_l1_memory_manager()
+PY
+}
+
 case "$OFFLOADING" in
     none)
         ;;
@@ -114,6 +274,10 @@ case "$OFFLOADING" in
         unset VLLM_USE_SIMPLE_KV_OFFLOAD
 
         agentic_pip_install --quiet --no-cache-dir lmcache
+        # Write the sitecustomize.py patch under the results directory and put
+        # that directory first on PYTHONPATH. The patch body only activates
+        # when LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR=1, which is exported here.
+        LMCACHE_CUDA_PATCH_DIR="$RESULT_DIR/lmcache_cuda_patch"
+        write_lmcache_cuda_mp_patch "$LMCACHE_CUDA_PATCH_DIR"
+        export LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR=1
+        export PYTHONPATH="$LMCACHE_CUDA_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
         python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
 
         # Keep the semantic CPU KV pool at 2.5 TB for every TP shape. MP mode