@@ -55,290 +55,6 @@ if [ "${TP}" -lt 8 ]; then
5555 export VLLM_ROCM_USE_AITER_RMSNORM=0
5656fi
5757
58- # write_lmcache_rocm_mp_patch() {
59- # local patch_dir="$1"
60- # mkdir -p "$patch_dir"
61- # cat > "$patch_dir/sitecustomize.py" <<'PY'
62- # """Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches."""
63- #
64- # import os
65- # import threading
66- #
67- # if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1":
68- # import builtins
69- # import sys
70- #
71- # _orig_import = builtins.__import__
72- #
73- # def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
74- # _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator
75- #
76- # if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False):
77- # return
78- #
79- # _orig_init = _LazyMemoryAllocator.__init__
80- # _orig_allocate = _LazyMemoryAllocator.allocate
81- # _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate
82- #
83- # def _expand_to(self, target_size: int) -> None:
84- # target_size = min(
85- # self._final_size,
86- # _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
87- # )
88- # lock = self._agentic_rocm_demand_expand_lock
89- # with lock:
90- # if target_size <= self._curr_size:
91- # return
92- #
93- # start_size = self._curr_size
94- # while self._curr_size < target_size:
95- # commit_start = self._curr_size
96- # commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
97- # while self._curr_size < commit_target:
98- # self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
99- # self._curr_size += self.PIN_CHUNK_SIZE
100- # self._commit_expansion(self._curr_size - commit_start)
101- #
102- # self._log_expansion_progress(self._curr_size - start_size)
103- #
104- # def _retry_with_demand_expansion(self, allocate_once):
105- # obj = allocate_once()
106- # step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64"))
107- # step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3)))
108- #
109- # while obj is None and self._curr_size < self._final_size:
110- # _expand_to(self, self._curr_size + step_bytes)
111- # obj = allocate_once()
112- #
113- # return obj
114- #
115- # def _patched_init(self, *args, **kwargs):
116- # _orig_init(self, *args, **kwargs)
117- # self._agentic_rocm_demand_expand_lock = threading.Lock()
118- #
119- # # LMCache MP's upstream LazyMemoryAllocator currently expands to
120- # # the final pinned size in a background thread. On ROCm Kimi TP4,
121- # # vLLM reaches KV-cache registration only after that 2.5 TB pool
122- # # is fully pinned, and the server-side IPC open path can stall
123- # # before acknowledging register_kv_caches. Keep the same final
124- # # capacity, but pin/commit extra host memory only when L1
125- # # allocations actually need it.
126- # self._stop_expand.set()
127- # self._expand_thread.join()
128- # _lazy_memory_allocator.logger.info(
129- # "Agentic ROCm patch: using demand-driven LMCache pinned "
130- # "memory expansion; final capacity remains %s MB",
131- # self._final_size >> 20,
132- # )
133- #
134- # def _patched_allocate(
135- # self,
136- # shapes,
137- # dtypes,
138- # fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
139- # allocator_type=None,
140- # ):
141- # return _retry_with_demand_expansion(
142- # self,
143- # lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
144- # )
145- #
146- # def _patched_batched_allocate(
147- # self,
148- # shapes,
149- # dtypes,
150- # batch_size,
151- # fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
152- # allocator_type=None,
153- # ):
154- # return _retry_with_demand_expansion(
155- # self,
156- # lambda: _orig_batched_allocate(
157- # self, shapes, dtypes, batch_size, fmt, allocator_type
158- # ),
159- # )
160- #
161- # _LazyMemoryAllocator.__init__ = _patched_init
162- # _LazyMemoryAllocator.allocate = _patched_allocate
163- # _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
164- # _LazyMemoryAllocator._agentic_rocm_demand_patch = True
165- #
166- # def _patch_l1_memory_manager(_memory_manager) -> None:
167- # _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None)
168- # _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None)
169- # if _L1MemoryManager is None or _LazyMemoryAllocator is None:
170- # return
171- # if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False):
172- # return
173- #
174- # _orig_get_memory_usage = _L1MemoryManager.get_memory_usage
175- #
176- # def _patched_get_memory_usage(self):
177- # allocator = getattr(self, "_allocator", None)
178- # if isinstance(allocator, _LazyMemoryAllocator):
179- # address_manager = allocator.get_address_manager()
180- # used_size = (
181- # address_manager.get_heap_size() - address_manager.get_free_size()
182- # )
183- # return used_size, allocator._final_size
184- # return _orig_get_memory_usage(self)
185- #
186- # _L1MemoryManager.get_memory_usage = _patched_get_memory_usage
187- # _L1MemoryManager._agentic_rocm_final_capacity_patch = True
188- #
189- # def _maybe_patch_lazy_memory_allocator() -> None:
190- # module = sys.modules.get("lmcache.v1.lazy_memory_allocator")
191- # if module is not None and hasattr(module, "LazyMemoryAllocator"):
192- # _patch_lazy_memory_allocator(module)
193- #
194- # def _maybe_patch_l1_memory_manager() -> None:
195- # module = sys.modules.get("lmcache.v1.distributed.memory_manager")
196- # if module is not None and hasattr(module, "L1MemoryManager"):
197- # _patch_l1_memory_manager(module)
198- #
199- # def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0):
200- # module = _orig_import(name, globals, locals, fromlist, level)
201- # if name == "lmcache.v1.lazy_memory_allocator" or (
202- # name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules
203- # ):
204- # _maybe_patch_lazy_memory_allocator()
205- # if name == "lmcache.v1.distributed.memory_manager" or (
206- # name.startswith("lmcache")
207- # and "lmcache.v1.distributed.memory_manager" in sys.modules
208- # ):
209- # _maybe_patch_l1_memory_manager()
210- # return module
211- #
212- # builtins.__import__ = _agentic_rocm_import
213- # _maybe_patch_lazy_memory_allocator()
214- # _maybe_patch_l1_memory_manager()
215- #
216- # if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1":
217- # import torch
218- # import lmcache.non_cuda_equivalents as lmc
219- #
220- # if not hasattr(lmc, "multi_layer_block_kv_transfer"):
221- # _DTYPE_BY_NAME = {
222- # "bfloat16": torch.bfloat16,
223- # "float16": torch.float16,
224- # "float32": torch.float32,
225- # }
226- #
227- # def _dtype_from_env() -> torch.dtype:
228- # name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16")
229- # try:
230- # return _DTYPE_BY_NAME[name]
231- # except KeyError as exc:
232- # raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc
233- #
234- # def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
235- # block_stride = shape_desc.block_stride_elems or (
236- # shape_desc.bs * shape_desc.nh * shape_desc.hs
237- # )
238- # base = lmc._tensor_from_ptr(
239- # ptr,
240- # (shape_desc.nb * block_stride,),
241- # dtype,
242- # device,
243- # )
244- # return torch.as_strided(
245- # base,
246- # (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs),
247- # (block_stride, shape_desc.nh * shape_desc.hs, 1),
248- # )
249- #
250- # def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
251- # return lmc._tensor_from_ptr(
252- # ptr,
253- # (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs),
254- # dtype,
255- # device,
256- # )
257- #
258- # def multi_layer_block_kv_transfer(
259- # group_kv_pointers,
260- # tmp_buffer_ptrs,
261- # block_ids,
262- # paged_memory_device,
263- # direction,
264- # shape_desc,
265- # lmcache_chunk_size,
266- # gpu_kv_format,
267- # skip_blocks=0,
268- # ) -> None:
269- # # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with
270- # # shape [num_blocks, block_size, hidden_size]. LMCache's Python
271- # # fallback has no block-transfer entrypoint yet, so implement the
272- # # same gather/scatter contract with torch indexing on ROCm.
273- # if shape_desc.kv_size != 1:
274- # raise NotImplementedError(
275- # "ROCm LMCache MP block fallback currently supports MLA KV caches only"
276- # )
277- #
278- # dtype = _dtype_from_env()
279- # device = (
280- # paged_memory_device
281- # if isinstance(paged_memory_device, torch.device)
282- # else torch.device(paged_memory_device)
283- # )
284- # num_layers = int(group_kv_pointers.numel())
285- # blocks_per_chunk = lmcache_chunk_size // shape_desc.bs
286- # direction_name = getattr(direction, "name", str(direction))
287- #
288- # for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs):
289- # start = chunk_idx * blocks_per_chunk
290- # end = start + blocks_per_chunk
291- # chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long)
292- #
293- # dest_slot_offset = 0
294- # if skip_blocks and chunk_idx == 0:
295- # chunk_blocks = chunk_blocks[int(skip_blocks):]
296- # dest_slot_offset = int(skip_blocks) * shape_desc.bs
297- # if chunk_blocks.numel() == 0:
298- # continue
299- #
300- # num_slots = int(chunk_blocks.numel()) * shape_desc.bs
301- # tmp = _tmp_view(
302- # int(tmp_ptr),
303- # shape_desc,
304- # num_layers,
305- # lmcache_chunk_size,
306- # dtype,
307- # device,
308- # )
309- #
310- # for layer_idx in range(num_layers):
311- # paged = _paged_view(
312- # int(group_kv_pointers[layer_idx].item()),
313- # shape_desc,
314- # dtype,
315- # device,
316- # )
317- # tmp_slice = tmp[
318- # 0,
319- # layer_idx,
320- # dest_slot_offset : dest_slot_offset + num_slots,
321- # :,
322- # ]
323- # if direction_name == "D2H":
324- # gathered = paged.index_select(0, chunk_blocks).reshape(
325- # num_slots, shape_desc.nh * shape_desc.hs
326- # )
327- # tmp_slice.copy_(gathered)
328- # elif direction_name == "H2D":
329- # src = tmp_slice.reshape(
330- # int(chunk_blocks.numel()),
331- # shape_desc.bs,
332- # shape_desc.nh * shape_desc.hs,
333- # )
334- # paged.index_copy_(0, chunk_blocks, src)
335- # else:
336- # raise ValueError(f"Unsupported transfer direction: {direction}")
337- #
338- # lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer
339- # PY
340- # }
341-
34258# Workaround for MEC FW <177 RCCL memory reclaim issue
34359version=$( rocm-smi --showfw 2> /dev/null | grep MEC | head -n 1 | awk ' {print $NF}' )
34460if [[ " $version " == " " || ${version:- 0} -lt 177 ]]; then
@@ -431,64 +147,6 @@ case "$OFFLOADING" in
431147 { set +x; } 2> /dev/null
432148 unset VLLM_USE_SIMPLE_KV_OFFLOAD
433149
434- # agentic_pip_install --quiet --no-cache-dir lmcache
435- # # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and
436- # # CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and
437- # # during Kimi fused-MoE model inspection it imports nixl_ep whenever
438- # # that module is importable, even when this run is not using EP/NIXL
439- # # kernels. The CUDA extension then fails immediately on AMD nodes with
440- # # "ImportError: libcuda.so.1".
441- # #
442- # # LMCache MP also uses CuPy stream APIs while registering vLLM's KV
443- # # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime
444- # # with cudaErrorInsufficientDriver when LMCache touches the stream. Use
445- # # the ROCm 7 CuPy wheel so the same API dispatches through HIP.
446- # python3 -m pip uninstall -y \
447- # nixl nixl-cu12 nixl-cu13 nixl_ep \
448- # >/dev/null 2>&1 || true
449- # python3 -m pip uninstall -y \
450- # cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \
451- # >/dev/null 2>&1 || true
452- # agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0
453-
454-
455-
456- # python3 - <<'PY'
457- # import importlib.util
458- # import sys
459- #
460- # spec = importlib.util.find_spec("nixl_ep")
461- # if spec is not None:
462- # locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"])
463- # print(
464- # "Error: nixl_ep is still importable after LMCache install; "
465- # "this ROCm Kimi run would import a CUDA-only nixl_ep module. "
466- # f"location={locations}",
467- # file=sys.stderr,
468- # )
469- # sys.exit(1)
470- #
471- # try:
472- # from cupy_backends.cuda.api import runtime as cupy_runtime
473- # except Exception as exc:
474- # print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr)
475- # sys.exit(1)
476- #
477- # if not getattr(cupy_runtime, "is_hip", False):
478- # print(
479- # "Error: CuPy is still using the CUDA backend after installing "
480- # "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.",
481- # file=sys.stderr,
482- # )
483- # sys.exit(1)
484- # PY
485- # LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch"
486- # write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR"
487- # export LMCACHE_ROCM_MP_BLOCK_FALLBACK=0
488- # export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16
489- # export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=0
490- # export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
491-
492150 git clone https://github.com/seungrokj/LMCache.git
493151 cd LMCache
494152 pip install -r requirements/build.txt
0 commit comments