Skip to content

Commit 806b3c9

Browse files
committed
manual
Signed-off-by: seungrokj <seungrok.jung@amd.com>
1 parent 461bbe7 commit 806b3c9

1 file changed

Lines changed: 164 additions & 0 deletions

File tree

benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,166 @@ wait_for_lmcache_ready() {
9090
exit 1
9191
}
9292

93+
write_lmcache_cuda_mp_patch() {
94+
local patch_dir="$1"
95+
mkdir -p "$patch_dir"
96+
cat > "$patch_dir/sitecustomize.py" <<'PY'
97+
"""Runtime compatibility for LMCache MP on CUDA Kimi MLA KV caches."""
98+
99+
import os
100+
import threading
101+
102+
if os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR") == "1":
103+
import builtins
104+
import sys
105+
106+
_orig_import = builtins.__import__
107+
108+
def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
    """Switch ``LazyMemoryAllocator`` to demand-driven pinned-memory expansion.

    Wraps ``__init__``, ``allocate`` and ``batched_allocate`` on the class
    exposed by ``_lazy_memory_allocator``. The wrapped constructor stops the
    upstream background expansion thread; the wrapped allocators grow the
    pinned pool step by step, and only after an allocation attempt has
    returned ``None``.

    The call is idempotent: a class that already carries the
    ``_agentic_cuda_demand_patch`` marker is left untouched.

    Args:
        _lazy_memory_allocator: The imported module object that provides
            ``LazyMemoryAllocator``, ``align_to``, ``MemoryFormat`` and
            ``logger``.
    """
    _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator

    if getattr(_LazyMemoryAllocator, "_agentic_cuda_demand_patch", False):
        return

    _orig_init = _LazyMemoryAllocator.__init__
    _orig_allocate = _LazyMemoryAllocator.allocate
    _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate

    default_step_gb = 64.0

    def _demand_step_bytes(self) -> int:
        """Return the growth step in bytes, never smaller than COMMIT_SIZE.

        Reads LMCACHE_CUDA_DEMAND_PINNED_STEP_GB. A value that cannot be
        turned into a byte count falls back to the default instead of
        raising from inside an allocation call.
        """
        raw = os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_STEP_GB", "64")
        try:
            # float() rejects garbage; int() rejects nan (ValueError) and
            # inf (OverflowError).
            step = int(float(raw) * (1024**3))
        except (ValueError, OverflowError):
            _lazy_memory_allocator.logger.warning(
                "Agentic CUDA patch: ignoring invalid "
                "LMCACHE_CUDA_DEMAND_PINNED_STEP_GB=%r; using %s GB",
                raw,
                default_step_gb,
            )
            step = int(default_step_gb * (1024**3))
        return max(self.COMMIT_SIZE, step)

    def _expand_to(self, target_size: int) -> None:
        """Pin and commit host memory until ``_curr_size`` reaches the target.

        The target is rounded up to a PIN_CHUNK_SIZE multiple and capped at
        ``_final_size``. Work is serialized by the per-instance lock that
        the patched constructor installs.
        """
        target_size = min(
            self._final_size,
            _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
        )
        with self._agentic_cuda_demand_expand_lock:
            # Another caller may have grown the pool while we waited.
            if target_size <= self._curr_size:
                return

            start_size = self._curr_size
            while self._curr_size < target_size:
                commit_start = self._curr_size
                commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
                try:
                    while self._curr_size < commit_target:
                        self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
                        self._curr_size += self.PIN_CHUNK_SIZE
                finally:
                    # Commit whatever was pinned even if a later chunk
                    # failed; otherwise _curr_size has moved past memory
                    # that never gets committed, and the next expansion
                    # would start beyond it.
                    pinned = self._curr_size - commit_start
                    if pinned:
                        self._commit_expansion(pinned)

            self._log_expansion_progress(self._curr_size - start_size)

    def _retry_with_demand_expansion(self, allocate_once):
        """Run ``allocate_once``; on ``None`` grow the pool and try again.

        Returns the first non-``None`` result, or ``None`` once the pool has
        reached ``_final_size`` and the allocation still fails.
        """
        obj = allocate_once()
        if obj is not None or self._curr_size >= self._final_size:
            # Fast path: nothing to expand, so do not touch the environment.
            return obj

        step_bytes = _demand_step_bytes(self)
        while obj is None and self._curr_size < self._final_size:
            _expand_to(self, self._curr_size + step_bytes)
            obj = allocate_once()

        return obj

    def _patched_init(self, *args, **kwargs):
        """Run the upstream constructor, then stop its background expansion."""
        _orig_init(self, *args, **kwargs)
        self._agentic_cuda_demand_expand_lock = threading.Lock()

        # LMCache MP's upstream LazyMemoryAllocator currently expands to
        # the final pinned size in a background thread. On CUDA Kimi TP4,
        # vLLM reaches KV-cache registration only after that 1.5 TB pool
        # is fully pinned, and the server-side IPC open path can stall
        # before acknowledging register_kv_caches. Keep the same final
        # capacity, but pin/commit extra host memory only when L1
        # allocations actually need it.
        self._stop_expand.set()
        self._expand_thread.join()
        _lazy_memory_allocator.logger.info(
            "Agentic CUDA patch: using demand-driven LMCache pinned "
            "memory expansion; final capacity remains %s MB",
            self._final_size >> 20,
        )

    def _patched_allocate(
        self,
        shapes,
        dtypes,
        fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
        allocator_type=None,
    ):
        """Upstream ``allocate`` with demand-driven expansion on failure."""
        return _retry_with_demand_expansion(
            self,
            lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
        )

    def _patched_batched_allocate(
        self,
        shapes,
        dtypes,
        batch_size,
        fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
        allocator_type=None,
    ):
        """Upstream ``batched_allocate`` with demand-driven expansion."""
        return _retry_with_demand_expansion(
            self,
            lambda: _orig_batched_allocate(
                self, shapes, dtypes, batch_size, fmt, allocator_type
            ),
        )

    _LazyMemoryAllocator.__init__ = _patched_init
    _LazyMemoryAllocator.allocate = _patched_allocate
    _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
    _LazyMemoryAllocator._agentic_cuda_demand_patch = True
200+
201+
def _patch_l1_memory_manager(_memory_manager) -> None:
    """Make ``L1MemoryManager.get_memory_usage`` report the final capacity.

    When the manager's ``_allocator`` is a ``LazyMemoryAllocator``, the
    patched method returns ``(used, allocator._final_size)``, where ``used``
    is the address manager's heap size minus its free size. Any other
    allocator is delegated to the original method.

    Does nothing if ``_memory_manager`` lacks either class, or if the
    ``_agentic_cuda_final_capacity_patch`` marker is already set.

    Args:
        _memory_manager: The imported module object expected to expose
            ``L1MemoryManager`` and ``LazyMemoryAllocator``.
    """
    manager_cls = getattr(_memory_manager, "L1MemoryManager", None)
    lazy_cls = getattr(_memory_manager, "LazyMemoryAllocator", None)
    if manager_cls is None or lazy_cls is None:
        return
    if getattr(manager_cls, "_agentic_cuda_final_capacity_patch", False):
        return

    upstream_usage = manager_cls.get_memory_usage

    def _patched_get_memory_usage(self):
        """Return ``(used_bytes, total_bytes)`` for this manager."""
        backing = getattr(self, "_allocator", None)
        if not isinstance(backing, lazy_cls):
            return upstream_usage(self)

        addresses = backing.get_address_manager()
        heap_bytes = addresses.get_heap_size()
        free_bytes = addresses.get_free_size()
        # Total is the allocator's final size, not the currently pinned heap.
        return heap_bytes - free_bytes, backing._final_size

    manager_cls.get_memory_usage = _patched_get_memory_usage
    manager_cls._agentic_cuda_final_capacity_patch = True
223+
224+
def _maybe_patch_lazy_memory_allocator() -> None:
    """Patch the lazy allocator module if it is already in ``sys.modules``.

    No-op while ``lmcache.v1.lazy_memory_allocator`` is absent from
    ``sys.modules`` or does not yet define ``LazyMemoryAllocator``.
    """
    loaded = sys.modules.get("lmcache.v1.lazy_memory_allocator")
    if loaded is None:
        return
    # The module may be registered before its class statement has run.
    if hasattr(loaded, "LazyMemoryAllocator"):
        _patch_lazy_memory_allocator(loaded)
228+
229+
def _maybe_patch_l1_memory_manager() -> None:
    """Patch the L1 memory manager module if it is already in ``sys.modules``.

    No-op while ``lmcache.v1.distributed.memory_manager`` is absent from
    ``sys.modules`` or does not yet define ``L1MemoryManager``.
    """
    loaded = sys.modules.get("lmcache.v1.distributed.memory_manager")
    if loaded is None:
        return
    # The module may be registered before its class statement has run.
    if hasattr(loaded, "L1MemoryManager"):
        _patch_l1_memory_manager(loaded)
233+
234+
def _agentic_cuda_import(name, globals=None, locals=None, fromlist=(), level=0):
    """Drop-in ``builtins.__import__`` that applies the LMCache patches.

    Delegates to the original ``__import__`` and, after any lmcache-related
    import, calls the two ``_maybe_patch_*`` helpers. Those helpers check
    ``sys.modules`` themselves and are idempotent, so calling them early or
    repeatedly is harmless.

    An import counts as lmcache-related when ``name`` starts with
    ``"lmcache"``, or when it is a relative import (``level > 0``) issued
    from a module whose ``__package__``/``__name__`` starts with
    ``"lmcache"``. The second case matters because a relative import passes
    a package-relative (possibly empty) ``name``, which the name check alone
    would never match.

    Args:
        name, globals, locals, fromlist, level: Same meaning as for
            ``builtins.__import__``; forwarded unchanged.

    Returns:
        Whatever the original ``__import__`` returned.
    """
    module = _orig_import(name, globals, locals, fromlist, level)

    touches_lmcache = name.startswith("lmcache")
    if not touches_lmcache and level > 0 and globals:
        importer = globals.get("__package__") or globals.get("__name__") or ""
        touches_lmcache = importer.startswith("lmcache")

    if touches_lmcache:
        _maybe_patch_lazy_memory_allocator()
        _maybe_patch_l1_memory_manager()
    return module
246+
247+
builtins.__import__ = _agentic_cuda_import
248+
_maybe_patch_lazy_memory_allocator()
249+
_maybe_patch_l1_memory_manager()
250+
PY
251+
}
252+
93253
case "$OFFLOADING" in
94254
none)
95255
;;
@@ -114,6 +274,10 @@ case "$OFFLOADING" in
114274
unset VLLM_USE_SIMPLE_KV_OFFLOAD
115275

116276
agentic_pip_install --quiet --no-cache-dir lmcache
277+
LMCACHE_CUDA_PATCH_DIR="$RESULT_DIR/lmcache_cuda_patch"
278+
write_lmcache_cuda_mp_patch "$LMCACHE_CUDA_PATCH_DIR"
279+
export LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR=1
280+
export PYTHONPATH="$LMCACHE_CUDA_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
117281
python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
118282

119283
# Keep the semantic CPU KV pool at 2.5 TB for every TP shape. MP mode

0 commit comments

Comments
 (0)