@@ -90,165 +90,42 @@ wait_for_lmcache_ready() {
9090 exit 1
9191}
9292
93- write_lmcache_cuda_mp_patch () {
94- local patch_dir=" $1 "
95- mkdir -p " $patch_dir "
96- cat > " $patch_dir /sitecustomize.py" << 'PY '
97- """Runtime compatibility for LMCache MP on CUDA Kimi MLA KV caches."""
98-
99- import os
100- import threading
101-
102- if os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR") == "1":
103- import builtins
104- import sys
105-
106- _orig_import = builtins.__import__
107-
108- def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
109- _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator
110-
111- if getattr(_LazyMemoryAllocator, "_agentic_cuda_demand_patch", False):
112- return
113-
114- _orig_init = _LazyMemoryAllocator.__init__
115- _orig_allocate = _LazyMemoryAllocator.allocate
116- _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate
117-
118- def _expand_to(self, target_size: int) -> None:
119- target_size = min(
120- self._final_size,
121- _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
122- )
123- lock = self._agentic_cuda_demand_expand_lock
124- with lock:
125- if target_size <= self._curr_size:
126- return
127-
128- start_size = self._curr_size
129- while self._curr_size < target_size:
130- commit_start = self._curr_size
131- commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
132- while self._curr_size < commit_target:
133- self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
134- self._curr_size += self.PIN_CHUNK_SIZE
135- self._commit_expansion(self._curr_size - commit_start)
136-
137- self._log_expansion_progress(self._curr_size - start_size)
138-
139- def _retry_with_demand_expansion(self, allocate_once):
140- obj = allocate_once()
141- step_gb = float(os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_STEP_GB", "64"))
142- step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3)))
143-
144- while obj is None and self._curr_size < self._final_size:
145- _expand_to(self, self._curr_size + step_bytes)
146- obj = allocate_once()
147-
148- return obj
149-
150- def _patched_init(self, *args, **kwargs):
151- _orig_init(self, *args, **kwargs)
152- self._agentic_cuda_demand_expand_lock = threading.Lock()
153-
154- # LMCache MP's upstream LazyMemoryAllocator currently expands to
155- # the final pinned size in a background thread. On CUDA Kimi TP4,
156- # vLLM reaches KV-cache registration only after that 1.5 TB pool
157- # is fully pinned, and the server-side IPC open path can stall
158- # before acknowledging register_kv_caches. Keep the same final
159- # capacity, but pin/commit extra host memory only when L1
160- # allocations actually need it.
161- self._stop_expand.set()
162- self._expand_thread.join()
163- _lazy_memory_allocator.logger.info(
164- "Agentic CUDA patch: using demand-driven LMCache pinned "
165- "memory expansion; final capacity remains %s MB",
166- self._final_size >> 20,
167- )
168-
169- def _patched_allocate(
170- self,
171- shapes,
172- dtypes,
173- fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
174- allocator_type=None,
175- ):
176- return _retry_with_demand_expansion(
177- self,
178- lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
179- )
180-
181- def _patched_batched_allocate(
182- self,
183- shapes,
184- dtypes,
185- batch_size,
186- fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
187- allocator_type=None,
188- ):
189- return _retry_with_demand_expansion(
190- self,
191- lambda: _orig_batched_allocate(
192- self, shapes, dtypes, batch_size, fmt, allocator_type
193- ),
194- )
195-
196- _LazyMemoryAllocator.__init__ = _patched_init
197- _LazyMemoryAllocator.allocate = _patched_allocate
198- _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
199- _LazyMemoryAllocator._agentic_cuda_demand_patch = True
200-
201- def _patch_l1_memory_manager(_memory_manager) -> None:
202- _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None)
203- _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None)
204- if _L1MemoryManager is None or _LazyMemoryAllocator is None:
205- return
206- if getattr(_L1MemoryManager, "_agentic_cuda_final_capacity_patch", False):
207- return
208-
209- _orig_get_memory_usage = _L1MemoryManager.get_memory_usage
210-
211- def _patched_get_memory_usage(self):
212- allocator = getattr(self, "_allocator", None)
213- if isinstance(allocator, _LazyMemoryAllocator):
214- address_manager = allocator.get_address_manager()
215- used_size = (
216- address_manager.get_heap_size() - address_manager.get_free_size()
217- )
218- return used_size, allocator._final_size
219- return _orig_get_memory_usage(self)
220-
221- _L1MemoryManager.get_memory_usage = _patched_get_memory_usage
222- _L1MemoryManager._agentic_cuda_final_capacity_patch = True
223-
224- def _maybe_patch_lazy_memory_allocator() -> None:
225- module = sys.modules.get("lmcache.v1.lazy_memory_allocator")
226- if module is not None and hasattr(module, "LazyMemoryAllocator"):
227- _patch_lazy_memory_allocator(module)
228-
229- def _maybe_patch_l1_memory_manager() -> None:
230- module = sys.modules.get("lmcache.v1.distributed.memory_manager")
231- if module is not None and hasattr(module, "L1MemoryManager"):
232- _patch_l1_memory_manager(module)
93+ echo " Starting vllm server..."
94+ export TORCH_CUDA_ARCH_LIST=" 10.0"
95+ export PYTHONNOUSERSITE=1
96+ # Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation
97+ # eats ~32% of HBM upfront which, combined with FP4 weights at TP=4
98+ # (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory
99+ # trips before the engine starts. Our --gpu-memory-utilization=0.90 already
100+ # leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety
101+ # net the estimator provides, so disabling it is redundant rather than
102+ # unsafe.
103+ export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
233104
234- def _agentic_cuda_import(name, globals=None, locals=None, fromlist=(), level=0):
235- module = _orig_import(name, globals, locals, fromlist, level)
236- if name == "lmcache.v1.lazy_memory_allocator" or (
237- name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules
238- ):
239- _maybe_patch_lazy_memory_allocator()
240- if name == "lmcache.v1.distributed.memory_manager" or (
241- name.startswith("lmcache")
242- and "lmcache.v1.distributed.memory_manager" in sys.modules
243- ):
244- _maybe_patch_l1_memory_manager()
245- return module
105+ { set +x; } 2> /dev/null
106+ VLLM_CMD=(
107+ vllm serve " $MODEL "
108+ --host 0.0.0.0
109+ --port " $PORT "
110+ --tensor-parallel-size=" $TP "
111+ --gpu-memory-utilization 0.90
112+ --max-num-seqs " $CONC "
113+ --reasoning-parser kimi_k2
114+ --tool-call-parser kimi_k2
115+ --compilation_config.pass_config.fuse_allreduce_rms true
116+ --kv-cache-dtype fp8
117+ --max-cudagraph-capture-size 2048
118+ --stream-interval 20
119+ --trust-remote-code
120+ " ${PREFIX_CACHE_ARGS[@]} "
121+ " ${OFFLOAD_ARGS[@]} "
122+ )
123+ printf ' %q ' " ${VLLM_CMD[@]} " | tee " $RESULT_DIR /vllm_command.txt"
124+ printf ' \n' | tee -a " $RESULT_DIR /vllm_command.txt"
125+ " ${VLLM_CMD[@]} " > " $SERVER_LOG " 2>&1 &
126+ SERVER_PID=$!
127+ echo " Server PID: $SERVER_PID "
246128
247- builtins.__import__ = _agentic_cuda_import
248- _maybe_patch_lazy_memory_allocator()
249- _maybe_patch_l1_memory_manager()
250- PY
251- }
252129
253130case " $OFFLOADING " in
254131 none)
@@ -274,10 +151,6 @@ case "$OFFLOADING" in
274151 unset VLLM_USE_SIMPLE_KV_OFFLOAD
275152
276153 agentic_pip_install --quiet --no-cache-dir lmcache
277- LMCACHE_CUDA_PATCH_DIR=" $RESULT_DIR /lmcache_cuda_patch"
278- write_lmcache_cuda_mp_patch " $LMCACHE_CUDA_PATCH_DIR "
279- export LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR=1
280- export PYTHONPATH=" $LMCACHE_CUDA_PATCH_DIR ${PYTHONPATH: +: $PYTHONPATH } "
281154 python3 -c " import lmcache.integration.vllm.lmcache_mp_connector" > /dev/null
282155
283156 # Keep the semantic CPU KV pool at 2.5 TB for every TP shape. MP mode
@@ -337,41 +210,6 @@ case "$OFFLOADING" in
337210 ;;
338211esac
339212
340- echo " Starting vllm server..."
341- export TORCH_CUDA_ARCH_LIST=" 10.0"
342- export PYTHONNOUSERSITE=1
343- # Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation
344- # eats ~32% of HBM upfront which, combined with FP4 weights at TP=4
345- # (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory
346- # trips before the engine starts. Our --gpu-memory-utilization=0.90 already
347- # leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety
348- # net the estimator provides, so disabling it is redundant rather than
349- # unsafe.
350- export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
351-
352- { set +x; } 2> /dev/null
353- VLLM_CMD=(
354- vllm serve " $MODEL "
355- --host 0.0.0.0
356- --port " $PORT "
357- --tensor-parallel-size=" $TP "
358- --gpu-memory-utilization 0.90
359- --max-num-seqs " $CONC "
360- --reasoning-parser kimi_k2
361- --tool-call-parser kimi_k2
362- --compilation_config.pass_config.fuse_allreduce_rms true
363- --kv-cache-dtype fp8
364- --max-cudagraph-capture-size 2048
365- --stream-interval 20
366- --trust-remote-code
367- " ${PREFIX_CACHE_ARGS[@]} "
368- " ${OFFLOAD_ARGS[@]} "
369- )
370- printf ' %q ' " ${VLLM_CMD[@]} " | tee " $RESULT_DIR /vllm_command.txt"
371- printf ' \n' | tee -a " $RESULT_DIR /vllm_command.txt"
372- " ${VLLM_CMD[@]} " > " $SERVER_LOG " 2>&1 &
373- SERVER_PID=$!
374- echo " Server PID: $SERVER_PID "
375213
376214wait_for_server_ready --port " $PORT " --server-log " $SERVER_LOG " --server-pid " $SERVER_PID "
377215
0 commit comments