Skip to content

Commit b089e28

Browse files
seungrokj and claude committed
fix(agentic): add CUDA LMCache MP patch for Kimi FP4 B200
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 2af7377 commit b089e28

2 files changed

Lines changed: 4 additions & 344 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -616,10 +616,12 @@ kimik2.5-fp4-mi355x-vllm-agentic:
616616
agentic-coding:
617617
- duration: 1800
618618
search-space:
619-
- { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
620619
- { tp: 8, offloading: lmcache, conc-list: [32, 40, 48, 56] }
621-
- { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
622620
- { tp: 4, offloading: lmcache, conc-list: [16, 24, 32, 40] }
621+
#- { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
622+
#- { tp: 8, offloading: lmcache, conc-list: [32, 40, 48, 56] }
623+
#- { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
624+
#- { tp: 4, offloading: lmcache, conc-list: [16, 24, 32, 40] }
623625

624626
kimik2.5-fp4-mi355x-atom:
625627
image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2

benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh

Lines changed: 0 additions & 342 deletions
Original file line number | Diff line number | Diff line change
@@ -55,290 +55,6 @@ if [ "${TP}" -lt 8 ]; then
5555
export VLLM_ROCM_USE_AITER_RMSNORM=0
5656
fi
5757

58-
#write_lmcache_rocm_mp_patch() {
59-
# local patch_dir="$1"
60-
# mkdir -p "$patch_dir"
61-
# cat > "$patch_dir/sitecustomize.py" <<'PY'
62-
#"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches."""
63-
#
64-
#import os
65-
#import threading
66-
#
67-
#if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1":
68-
# import builtins
69-
# import sys
70-
#
71-
# _orig_import = builtins.__import__
72-
#
73-
# def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
74-
# _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator
75-
#
76-
# if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False):
77-
# return
78-
#
79-
# _orig_init = _LazyMemoryAllocator.__init__
80-
# _orig_allocate = _LazyMemoryAllocator.allocate
81-
# _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate
82-
#
83-
# def _expand_to(self, target_size: int) -> None:
84-
# target_size = min(
85-
# self._final_size,
86-
# _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
87-
# )
88-
# lock = self._agentic_rocm_demand_expand_lock
89-
# with lock:
90-
# if target_size <= self._curr_size:
91-
# return
92-
#
93-
# start_size = self._curr_size
94-
# while self._curr_size < target_size:
95-
# commit_start = self._curr_size
96-
# commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
97-
# while self._curr_size < commit_target:
98-
# self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
99-
# self._curr_size += self.PIN_CHUNK_SIZE
100-
# self._commit_expansion(self._curr_size - commit_start)
101-
#
102-
# self._log_expansion_progress(self._curr_size - start_size)
103-
#
104-
# def _retry_with_demand_expansion(self, allocate_once):
105-
# obj = allocate_once()
106-
# step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64"))
107-
# step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3)))
108-
#
109-
# while obj is None and self._curr_size < self._final_size:
110-
# _expand_to(self, self._curr_size + step_bytes)
111-
# obj = allocate_once()
112-
#
113-
# return obj
114-
#
115-
# def _patched_init(self, *args, **kwargs):
116-
# _orig_init(self, *args, **kwargs)
117-
# self._agentic_rocm_demand_expand_lock = threading.Lock()
118-
#
119-
# # LMCache MP's upstream LazyMemoryAllocator currently expands to
120-
# # the final pinned size in a background thread. On ROCm Kimi TP4,
121-
# # vLLM reaches KV-cache registration only after that 2.5 TB pool
122-
# # is fully pinned, and the server-side IPC open path can stall
123-
# # before acknowledging register_kv_caches. Keep the same final
124-
# # capacity, but pin/commit extra host memory only when L1
125-
# # allocations actually need it.
126-
# self._stop_expand.set()
127-
# self._expand_thread.join()
128-
# _lazy_memory_allocator.logger.info(
129-
# "Agentic ROCm patch: using demand-driven LMCache pinned "
130-
# "memory expansion; final capacity remains %s MB",
131-
# self._final_size >> 20,
132-
# )
133-
#
134-
# def _patched_allocate(
135-
# self,
136-
# shapes,
137-
# dtypes,
138-
# fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
139-
# allocator_type=None,
140-
# ):
141-
# return _retry_with_demand_expansion(
142-
# self,
143-
# lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
144-
# )
145-
#
146-
# def _patched_batched_allocate(
147-
# self,
148-
# shapes,
149-
# dtypes,
150-
# batch_size,
151-
# fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
152-
# allocator_type=None,
153-
# ):
154-
# return _retry_with_demand_expansion(
155-
# self,
156-
# lambda: _orig_batched_allocate(
157-
# self, shapes, dtypes, batch_size, fmt, allocator_type
158-
# ),
159-
# )
160-
#
161-
# _LazyMemoryAllocator.__init__ = _patched_init
162-
# _LazyMemoryAllocator.allocate = _patched_allocate
163-
# _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
164-
# _LazyMemoryAllocator._agentic_rocm_demand_patch = True
165-
#
166-
# def _patch_l1_memory_manager(_memory_manager) -> None:
167-
# _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None)
168-
# _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None)
169-
# if _L1MemoryManager is None or _LazyMemoryAllocator is None:
170-
# return
171-
# if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False):
172-
# return
173-
#
174-
# _orig_get_memory_usage = _L1MemoryManager.get_memory_usage
175-
#
176-
# def _patched_get_memory_usage(self):
177-
# allocator = getattr(self, "_allocator", None)
178-
# if isinstance(allocator, _LazyMemoryAllocator):
179-
# address_manager = allocator.get_address_manager()
180-
# used_size = (
181-
# address_manager.get_heap_size() - address_manager.get_free_size()
182-
# )
183-
# return used_size, allocator._final_size
184-
# return _orig_get_memory_usage(self)
185-
#
186-
# _L1MemoryManager.get_memory_usage = _patched_get_memory_usage
187-
# _L1MemoryManager._agentic_rocm_final_capacity_patch = True
188-
#
189-
# def _maybe_patch_lazy_memory_allocator() -> None:
190-
# module = sys.modules.get("lmcache.v1.lazy_memory_allocator")
191-
# if module is not None and hasattr(module, "LazyMemoryAllocator"):
192-
# _patch_lazy_memory_allocator(module)
193-
#
194-
# def _maybe_patch_l1_memory_manager() -> None:
195-
# module = sys.modules.get("lmcache.v1.distributed.memory_manager")
196-
# if module is not None and hasattr(module, "L1MemoryManager"):
197-
# _patch_l1_memory_manager(module)
198-
#
199-
# def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0):
200-
# module = _orig_import(name, globals, locals, fromlist, level)
201-
# if name == "lmcache.v1.lazy_memory_allocator" or (
202-
# name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules
203-
# ):
204-
# _maybe_patch_lazy_memory_allocator()
205-
# if name == "lmcache.v1.distributed.memory_manager" or (
206-
# name.startswith("lmcache")
207-
# and "lmcache.v1.distributed.memory_manager" in sys.modules
208-
# ):
209-
# _maybe_patch_l1_memory_manager()
210-
# return module
211-
#
212-
# builtins.__import__ = _agentic_rocm_import
213-
# _maybe_patch_lazy_memory_allocator()
214-
# _maybe_patch_l1_memory_manager()
215-
#
216-
#if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1":
217-
# import torch
218-
# import lmcache.non_cuda_equivalents as lmc
219-
#
220-
# if not hasattr(lmc, "multi_layer_block_kv_transfer"):
221-
# _DTYPE_BY_NAME = {
222-
# "bfloat16": torch.bfloat16,
223-
# "float16": torch.float16,
224-
# "float32": torch.float32,
225-
# }
226-
#
227-
# def _dtype_from_env() -> torch.dtype:
228-
# name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16")
229-
# try:
230-
# return _DTYPE_BY_NAME[name]
231-
# except KeyError as exc:
232-
# raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc
233-
#
234-
# def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
235-
# block_stride = shape_desc.block_stride_elems or (
236-
# shape_desc.bs * shape_desc.nh * shape_desc.hs
237-
# )
238-
# base = lmc._tensor_from_ptr(
239-
# ptr,
240-
# (shape_desc.nb * block_stride,),
241-
# dtype,
242-
# device,
243-
# )
244-
# return torch.as_strided(
245-
# base,
246-
# (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs),
247-
# (block_stride, shape_desc.nh * shape_desc.hs, 1),
248-
# )
249-
#
250-
# def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
251-
# return lmc._tensor_from_ptr(
252-
# ptr,
253-
# (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs),
254-
# dtype,
255-
# device,
256-
# )
257-
#
258-
# def multi_layer_block_kv_transfer(
259-
# group_kv_pointers,
260-
# tmp_buffer_ptrs,
261-
# block_ids,
262-
# paged_memory_device,
263-
# direction,
264-
# shape_desc,
265-
# lmcache_chunk_size,
266-
# gpu_kv_format,
267-
# skip_blocks=0,
268-
# ) -> None:
269-
# # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with
270-
# # shape [num_blocks, block_size, hidden_size]. LMCache's Python
271-
# # fallback has no block-transfer entrypoint yet, so implement the
272-
# # same gather/scatter contract with torch indexing on ROCm.
273-
# if shape_desc.kv_size != 1:
274-
# raise NotImplementedError(
275-
# "ROCm LMCache MP block fallback currently supports MLA KV caches only"
276-
# )
277-
#
278-
# dtype = _dtype_from_env()
279-
# device = (
280-
# paged_memory_device
281-
# if isinstance(paged_memory_device, torch.device)
282-
# else torch.device(paged_memory_device)
283-
# )
284-
# num_layers = int(group_kv_pointers.numel())
285-
# blocks_per_chunk = lmcache_chunk_size // shape_desc.bs
286-
# direction_name = getattr(direction, "name", str(direction))
287-
#
288-
# for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs):
289-
# start = chunk_idx * blocks_per_chunk
290-
# end = start + blocks_per_chunk
291-
# chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long)
292-
#
293-
# dest_slot_offset = 0
294-
# if skip_blocks and chunk_idx == 0:
295-
# chunk_blocks = chunk_blocks[int(skip_blocks):]
296-
# dest_slot_offset = int(skip_blocks) * shape_desc.bs
297-
# if chunk_blocks.numel() == 0:
298-
# continue
299-
#
300-
# num_slots = int(chunk_blocks.numel()) * shape_desc.bs
301-
# tmp = _tmp_view(
302-
# int(tmp_ptr),
303-
# shape_desc,
304-
# num_layers,
305-
# lmcache_chunk_size,
306-
# dtype,
307-
# device,
308-
# )
309-
#
310-
# for layer_idx in range(num_layers):
311-
# paged = _paged_view(
312-
# int(group_kv_pointers[layer_idx].item()),
313-
# shape_desc,
314-
# dtype,
315-
# device,
316-
# )
317-
# tmp_slice = tmp[
318-
# 0,
319-
# layer_idx,
320-
# dest_slot_offset : dest_slot_offset + num_slots,
321-
# :,
322-
# ]
323-
# if direction_name == "D2H":
324-
# gathered = paged.index_select(0, chunk_blocks).reshape(
325-
# num_slots, shape_desc.nh * shape_desc.hs
326-
# )
327-
# tmp_slice.copy_(gathered)
328-
# elif direction_name == "H2D":
329-
# src = tmp_slice.reshape(
330-
# int(chunk_blocks.numel()),
331-
# shape_desc.bs,
332-
# shape_desc.nh * shape_desc.hs,
333-
# )
334-
# paged.index_copy_(0, chunk_blocks, src)
335-
# else:
336-
# raise ValueError(f"Unsupported transfer direction: {direction}")
337-
#
338-
# lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer
339-
#PY
340-
#}
341-
34258
# Workaround for MEC FW <177 RCCL memory reclaim issue
34359
version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
34460
if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
@@ -431,64 +147,6 @@ case "$OFFLOADING" in
431147
{ set +x; } 2>/dev/null
432148
unset VLLM_USE_SIMPLE_KV_OFFLOAD
433149

434-
#agentic_pip_install --quiet --no-cache-dir lmcache
435-
## LMCache's current dependency chain can install NVIDIA/CUDA NIXL and
436-
## CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and
437-
## during Kimi fused-MoE model inspection it imports nixl_ep whenever
438-
## that module is importable, even when this run is not using EP/NIXL
439-
## kernels. The CUDA extension then fails immediately on AMD nodes with
440-
## "ImportError: libcuda.so.1".
441-
##
442-
## LMCache MP also uses CuPy stream APIs while registering vLLM's KV
443-
## caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime
444-
## with cudaErrorInsufficientDriver when LMCache touches the stream. Use
445-
## the ROCm 7 CuPy wheel so the same API dispatches through HIP.
446-
#python3 -m pip uninstall -y \
447-
# nixl nixl-cu12 nixl-cu13 nixl_ep \
448-
# >/dev/null 2>&1 || true
449-
#python3 -m pip uninstall -y \
450-
# cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \
451-
# >/dev/null 2>&1 || true
452-
#agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0
453-
454-
455-
456-
# python3 - <<'PY'
457-
#import importlib.util
458-
#import sys
459-
#
460-
#spec = importlib.util.find_spec("nixl_ep")
461-
#if spec is not None:
462-
# locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"])
463-
# print(
464-
# "Error: nixl_ep is still importable after LMCache install; "
465-
# "this ROCm Kimi run would import a CUDA-only nixl_ep module. "
466-
# f"location={locations}",
467-
# file=sys.stderr,
468-
# )
469-
# sys.exit(1)
470-
#
471-
#try:
472-
# from cupy_backends.cuda.api import runtime as cupy_runtime
473-
#except Exception as exc:
474-
# print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr)
475-
# sys.exit(1)
476-
#
477-
#if not getattr(cupy_runtime, "is_hip", False):
478-
# print(
479-
# "Error: CuPy is still using the CUDA backend after installing "
480-
# "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.",
481-
# file=sys.stderr,
482-
# )
483-
# sys.exit(1)
484-
#PY
485-
#LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch"
486-
#write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR"
487-
#export LMCACHE_ROCM_MP_BLOCK_FALLBACK=0
488-
#export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16
489-
#export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=0
490-
#export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
491-
492150
git clone https://github.com/seungrokj/LMCache.git
493151
cd LMCache
494152
pip install -r requirements/build.txt

0 commit comments

Comments
 (0)