Skip to content

Commit 18eb2d5

Browse files
committed
manual
Signed-off-by: seungrokj <seungrok.jung@amd.com>
1 parent 806b3c9 commit 18eb2d5

1 file changed

Lines changed: 34 additions & 196 deletions

File tree

benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh

Lines changed: 34 additions & 196 deletions
Original file line numberDiff line numberDiff line change
@@ -90,165 +90,42 @@ wait_for_lmcache_ready() {
9090
exit 1
9191
}
9292

93-
write_lmcache_cuda_mp_patch() {
94-
local patch_dir="$1"
95-
mkdir -p "$patch_dir"
96-
cat > "$patch_dir/sitecustomize.py" <<'PY'
97-
"""Runtime compatibility for LMCache MP on CUDA Kimi MLA KV caches."""
98-
99-
import os
100-
import threading
101-
102-
if os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR") == "1":
103-
import builtins
104-
import sys
105-
106-
_orig_import = builtins.__import__
107-
108-
def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
    """Monkey-patch ``LazyMemoryAllocator`` to expand pinned memory on demand.

    Replaces ``__init__``, ``allocate`` and ``batched_allocate`` on the
    ``LazyMemoryAllocator`` class found in ``_lazy_memory_allocator``.  After
    patching, construction stops the allocator's expansion thread, and a failed
    allocation triggers step-wise expansion (pin + commit) followed by a retry
    until the allocation succeeds or ``_final_size`` is reached.

    The patch is idempotent: a class already carrying the
    ``_agentic_cuda_demand_patch`` marker is left untouched.

    Args:
        _lazy_memory_allocator: Module-like object exposing
            ``LazyMemoryAllocator``, ``align_to``, ``MemoryFormat`` and
            ``logger``.
    """
    _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator

    if getattr(_LazyMemoryAllocator, "_agentic_cuda_demand_patch", False):
        return

    _orig_init = _LazyMemoryAllocator.__init__
    _orig_allocate = _LazyMemoryAllocator.allocate
    _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate

    # Step used when LMCACHE_CUDA_DEMAND_PINNED_STEP_GB is unset or unusable.
    _default_step_gb = 64.0

    def _expand_to(self, target_size: int) -> None:
        """Pin and commit memory until ``_curr_size`` reaches ``target_size``.

        The target is aligned to ``PIN_CHUNK_SIZE`` and capped at
        ``_final_size``.  Chunks are pinned one at a time and committed in
        batches of at most ``COMMIT_SIZE`` bytes.
        """
        target_size = min(
            self._final_size,
            _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
        )
        # Serialise expansions so concurrent allocators never pin the same
        # range twice.
        with self._agentic_cuda_demand_expand_lock:
            if target_size <= self._curr_size:
                return

            start_size = self._curr_size
            while self._curr_size < target_size:
                commit_start = self._curr_size
                commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
                while self._curr_size < commit_target:
                    self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
                    self._curr_size += self.PIN_CHUNK_SIZE
                self._commit_expansion(self._curr_size - commit_start)

            self._log_expansion_progress(self._curr_size - start_size)

    def _demand_step_bytes(self) -> int:
        """Return the per-retry expansion step in bytes (>= ``COMMIT_SIZE``).

        Reads ``LMCACHE_CUDA_DEMAND_PINNED_STEP_GB``; a value that cannot be
        converted to a finite number falls back to the 64 GB default with a
        warning instead of failing the allocation.
        """
        raw = os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_STEP_GB", "64")
        try:
            step = int(float(raw) * (1024**3))
        except (ValueError, OverflowError):
            _lazy_memory_allocator.logger.warning(
                "Agentic CUDA patch: ignoring invalid "
                "LMCACHE_CUDA_DEMAND_PINNED_STEP_GB=%r; using %s GB",
                raw,
                _default_step_gb,
            )
            step = int(_default_step_gb * (1024**3))
        return max(self.COMMIT_SIZE, step)

    def _retry_with_demand_expansion(self, allocate_once):
        """Call ``allocate_once``; on ``None`` expand and retry until full."""
        obj = allocate_once()
        # Fast path: most allocations succeed (or the pool is already at its
        # final size), so skip the environment lookup and parsing entirely.
        if obj is not None or self._curr_size >= self._final_size:
            return obj

        step_bytes = _demand_step_bytes(self)
        while obj is None and self._curr_size < self._final_size:
            _expand_to(self, self._curr_size + step_bytes)
            obj = allocate_once()

        return obj

    def _patched_init(self, *args, **kwargs):
        _orig_init(self, *args, **kwargs)
        self._agentic_cuda_demand_expand_lock = threading.Lock()

        # LMCache MP's upstream LazyMemoryAllocator currently expands to
        # the final pinned size in a background thread. On CUDA Kimi TP4,
        # vLLM reaches KV-cache registration only after that 1.5 TB pool
        # is fully pinned, and the server-side IPC open path can stall
        # before acknowledging register_kv_caches. Keep the same final
        # capacity, but pin/commit extra host memory only when L1
        # allocations actually need it.
        self._stop_expand.set()
        self._expand_thread.join()
        _lazy_memory_allocator.logger.info(
            "Agentic CUDA patch: using demand-driven LMCache pinned "
            "memory expansion; final capacity remains %s MB",
            self._final_size >> 20,
        )

    def _patched_allocate(
        self,
        shapes,
        dtypes,
        fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
        allocator_type=None,
    ):
        return _retry_with_demand_expansion(
            self,
            lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
        )

    def _patched_batched_allocate(
        self,
        shapes,
        dtypes,
        batch_size,
        fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
        allocator_type=None,
    ):
        return _retry_with_demand_expansion(
            self,
            lambda: _orig_batched_allocate(
                self, shapes, dtypes, batch_size, fmt, allocator_type
            ),
        )

    _LazyMemoryAllocator.__init__ = _patched_init
    _LazyMemoryAllocator.allocate = _patched_allocate
    _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
    # Marker checked at the top of this function to keep patching idempotent.
    _LazyMemoryAllocator._agentic_cuda_demand_patch = True
200-
201-
def _patch_l1_memory_manager(_memory_manager) -> None:
    """Make ``L1MemoryManager.get_memory_usage`` report the final capacity.

    When the manager's ``_allocator`` is a ``LazyMemoryAllocator``, the patched
    method returns ``(heap_size - free_size, allocator._final_size)``; any other
    allocator is delegated to the original method.  Does nothing if either
    class is missing from ``_memory_manager`` or the class already carries the
    ``_agentic_cuda_final_capacity_patch`` marker.

    Args:
        _memory_manager: Module-like object expected to expose
            ``L1MemoryManager`` and ``LazyMemoryAllocator``.
    """
    manager_cls = getattr(_memory_manager, "L1MemoryManager", None)
    lazy_cls = getattr(_memory_manager, "LazyMemoryAllocator", None)
    if manager_cls is None or lazy_cls is None:
        return
    if getattr(manager_cls, "_agentic_cuda_final_capacity_patch", False):
        return

    upstream_get_memory_usage = manager_cls.get_memory_usage

    def _patched_get_memory_usage(self):
        allocator = getattr(self, "_allocator", None)
        if not isinstance(allocator, lazy_cls):
            return upstream_get_memory_usage(self)
        addresses = allocator.get_address_manager()
        in_use = addresses.get_heap_size() - addresses.get_free_size()
        return in_use, allocator._final_size

    manager_cls.get_memory_usage = _patched_get_memory_usage
    # Marker consulted above so repeated calls do not wrap the method twice.
    manager_cls._agentic_cuda_final_capacity_patch = True
223-
224-
def _maybe_patch_lazy_memory_allocator() -> None:
    """Patch ``lmcache.v1.lazy_memory_allocator`` if it is already imported.

    Only acts when the module is present in ``sys.modules`` and already
    exposes ``LazyMemoryAllocator``; otherwise returns without side effects.
    """
    loaded = sys.modules.get("lmcache.v1.lazy_memory_allocator")
    if loaded is None or not hasattr(loaded, "LazyMemoryAllocator"):
        return
    _patch_lazy_memory_allocator(loaded)
228-
229-
def _maybe_patch_l1_memory_manager() -> None:
    """Patch ``lmcache.v1.distributed.memory_manager`` if already imported.

    Only acts when the module is present in ``sys.modules`` and already
    exposes ``L1MemoryManager``; otherwise returns without side effects.
    """
    loaded = sys.modules.get("lmcache.v1.distributed.memory_manager")
    if loaded is None or not hasattr(loaded, "L1MemoryManager"):
        return
    _patch_l1_memory_manager(loaded)
93+
echo "Starting vllm server..."
94+
export TORCH_CUDA_ARCH_LIST="10.0"
95+
export PYTHONNOUSERSITE=1
96+
# Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation
97+
# eats ~32% of HBM upfront which, combined with FP4 weights at TP=4
98+
# (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory
99+
# trips before the engine starts. Our --gpu-memory-utilization=0.90 already
100+
# leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety
101+
# net the estimator provides, so the estimator is redundant here and
102+
# disabling it is safe.
103+
export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
233104

234-
def _agentic_cuda_import(name, globals=None, locals=None, fromlist=(), level=0):
    """Import hook that applies the LMCache patches once targets are loaded.

    Delegates to the saved original ``__import__`` and returns its result
    unchanged.  After the import, each patcher runs if ``name`` is exactly its
    target module, or if ``name`` starts with ``"lmcache"`` and the target is
    already present in ``sys.modules``.  The parameter names mirror
    ``builtins.__import__`` because the hook is called with its signature.
    """
    module = _orig_import(name, globals, locals, fromlist, level)
    is_lmcache_import = name.startswith("lmcache")
    hooks = (
        ("lmcache.v1.lazy_memory_allocator", _maybe_patch_lazy_memory_allocator),
        ("lmcache.v1.distributed.memory_manager", _maybe_patch_l1_memory_manager),
    )
    for target, apply_patch in hooks:
        if name == target or (is_lmcache_import and target in sys.modules):
            apply_patch()
    return module
105+
{ set +x; } 2>/dev/null
106+
VLLM_CMD=(
107+
vllm serve "$MODEL"
108+
--host 0.0.0.0
109+
--port "$PORT"
110+
--tensor-parallel-size="$TP"
111+
--gpu-memory-utilization 0.90
112+
--max-num-seqs "$CONC"
113+
--reasoning-parser kimi_k2
114+
--tool-call-parser kimi_k2
115+
--compilation_config.pass_config.fuse_allreduce_rms true
116+
--kv-cache-dtype fp8
117+
--max-cudagraph-capture-size 2048
118+
--stream-interval 20
119+
--trust-remote-code
120+
"${PREFIX_CACHE_ARGS[@]}"
121+
"${OFFLOAD_ARGS[@]}"
122+
)
123+
printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
124+
printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
125+
"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
126+
SERVER_PID=$!
127+
echo "Server PID: $SERVER_PID"
246128

247-
builtins.__import__ = _agentic_cuda_import
248-
_maybe_patch_lazy_memory_allocator()
249-
_maybe_patch_l1_memory_manager()
250-
PY
251-
}
252129

253130
case "$OFFLOADING" in
254131
none)
@@ -274,10 +151,6 @@ case "$OFFLOADING" in
274151
unset VLLM_USE_SIMPLE_KV_OFFLOAD
275152

276153
agentic_pip_install --quiet --no-cache-dir lmcache
277-
LMCACHE_CUDA_PATCH_DIR="$RESULT_DIR/lmcache_cuda_patch"
278-
write_lmcache_cuda_mp_patch "$LMCACHE_CUDA_PATCH_DIR"
279-
export LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR=1
280-
export PYTHONPATH="$LMCACHE_CUDA_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
281154
python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
282155

283156
# Keep the semantic CPU KV pool at 2.5 TB for every TP shape. MP mode
@@ -337,41 +210,6 @@ case "$OFFLOADING" in
337210
;;
338211
esac
339212

340-
echo "Starting vllm server..."
341-
export TORCH_CUDA_ARCH_LIST="10.0"
342-
export PYTHONNOUSERSITE=1
343-
# Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation
344-
# eats ~32% of HBM upfront which, combined with FP4 weights at TP=4
345-
# (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory
346-
# trips before the engine starts. Our --gpu-memory-utilization=0.90 already
347-
# leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety
348-
# net the estimator provides, so disabling it is redundant rather than
349-
# unsafe.
350-
export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
351-
352-
{ set +x; } 2>/dev/null
353-
VLLM_CMD=(
354-
vllm serve "$MODEL"
355-
--host 0.0.0.0
356-
--port "$PORT"
357-
--tensor-parallel-size="$TP"
358-
--gpu-memory-utilization 0.90
359-
--max-num-seqs "$CONC"
360-
--reasoning-parser kimi_k2
361-
--tool-call-parser kimi_k2
362-
--compilation_config.pass_config.fuse_allreduce_rms true
363-
--kv-cache-dtype fp8
364-
--max-cudagraph-capture-size 2048
365-
--stream-interval 20
366-
--trust-remote-code
367-
"${PREFIX_CACHE_ARGS[@]}"
368-
"${OFFLOAD_ARGS[@]}"
369-
)
370-
printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
371-
printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
372-
"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
373-
SERVER_PID=$!
374-
echo "Server PID: $SERVER_PID"
375213

376214
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
377215

0 commit comments

Comments
 (0)