From 7a8600179c880313a634aecd73553c0292444257 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 3 Jun 2026 14:49:30 +0900 Subject: [PATCH 01/14] [AMD] agentx-v0.4: add MiniMax/Kimi lmcache agentic entries, refactor Kimi/Qwen scripts Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 35 +- .../agentic/kimik2.5_fp4_mi355x.sh | 674 ++---------------- .../agentic/minimaxm2.5_fp4_mi355x.sh | 256 +++++++ .../single_node/agentic/qwen3.5_fp8_mi355x.sh | 112 ++- 4 files changed, 425 insertions(+), 652 deletions(-) create mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 7f1c8192d..134af929a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -872,6 +872,21 @@ minimaxm2.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 16 } +minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache: + image: vllm/vllm-openai-rocm:v0.22.0 + model: amd/MiniMax-M2.5-MXFP4 + model-prefix: minimaxm2.5 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 1, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] } + - { tp: 1, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48] } + minimaxm2.5-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: amd/MiniMax-M2.5-MXFP4 @@ -2518,6 +2533,16 @@ kimik2.5-fp4-mi355x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } +kimik2.5-fp4-mi355x-vllm-agentic-lmcache: + image: vllm/vllm-openai-rocm:v0.22.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 72] } + - { tp: 4, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 72] 
} + minimaxm2.5-fp8-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -2574,19 +2599,15 @@ minimaxm2.5-fp8-mi325x-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } qwen3.5-fp8-mi355x-sglang-agentic-hicache: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: agentic-coding: - duration: 1800 search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + - { tp: 4, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] } + - { tp: 4, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] } dsv4-fp4-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index 139b12256..d05b27253 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -14,15 +14,11 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE - -# Kimi-K2.5 advertises a 262144-token context window in vLLM 0.21.0. -# Matrix defaults may export MAX_MODEL_LEN=0 to mean "server default"; for this -# script we need the concrete value so AgentX filters prompt+max_tokens against -# the same limit vLLM enforces. 
-if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then - MAX_MODEL_LEN=262144 -fi +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -33,557 +29,22 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi rocm-smi || true amd-smi || true +# ---- Resolve traces and install deps ---------------------------------------- +# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the +# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf +# signal at high concurrency. 
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k + # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps -# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) -pip install amd-quark - -# Disable AITER RMSNorm for TP < 8 due to accuracy issues -if [ "${TP}" -lt 8 ]; then - export VLLM_ROCM_USE_AITER_RMSNORM=0 -fi - -write_lmcache_rocm_mp_patch() { - local patch_dir="$1" - mkdir -p "$patch_dir" - cat > "$patch_dir/sitecustomize.py" <<'PY' -"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches.""" - -import os -import threading - -if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1": - import builtins - import sys - - _orig_import = builtins.__import__ - - def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None: - _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator - - if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False): - return - - _orig_init = _LazyMemoryAllocator.__init__ - _orig_allocate = _LazyMemoryAllocator.allocate - _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate - - def _expand_to(self, target_size: int) -> None: - target_size = min( - self._final_size, - _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE), - ) - lock = self._agentic_rocm_demand_expand_lock - with lock: - if target_size <= self._curr_size: - return - - start_size = self._curr_size - while self._curr_size < target_size: - commit_start = self._curr_size - commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE) - while self._curr_size < commit_target: - self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE) - self._curr_size += self.PIN_CHUNK_SIZE - self._commit_expansion(self._curr_size - commit_start) - - self._log_expansion_progress(self._curr_size - start_size) - - def 
_retry_with_demand_expansion(self, allocate_once): - obj = allocate_once() - step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64")) - step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3))) - - while obj is None and self._curr_size < self._final_size: - _expand_to(self, self._curr_size + step_bytes) - obj = allocate_once() - - return obj - - def _patched_init(self, *args, **kwargs): - _orig_init(self, *args, **kwargs) - self._agentic_rocm_demand_expand_lock = threading.Lock() - - # LMCache MP's upstream LazyMemoryAllocator currently expands to - # the final pinned size in a background thread. On ROCm Kimi TP4, - # vLLM reaches KV-cache registration only after that 2.5 TB pool - # is fully pinned, and the server-side IPC open path can stall - # before acknowledging register_kv_caches. Keep the same final - # capacity, but pin/commit extra host memory only when L1 - # allocations actually need it. - self._stop_expand.set() - self._expand_thread.join() - _lazy_memory_allocator.logger.info( - "Agentic ROCm patch: using demand-driven LMCache pinned " - "memory expansion; final capacity remains %s MB", - self._final_size >> 20, - ) - - def _patched_allocate( - self, - shapes, - dtypes, - fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, - allocator_type=None, - ): - return _retry_with_demand_expansion( - self, - lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type), - ) - - def _patched_batched_allocate( - self, - shapes, - dtypes, - batch_size, - fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, - allocator_type=None, - ): - return _retry_with_demand_expansion( - self, - lambda: _orig_batched_allocate( - self, shapes, dtypes, batch_size, fmt, allocator_type - ), - ) - - _LazyMemoryAllocator.__init__ = _patched_init - _LazyMemoryAllocator.allocate = _patched_allocate - _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate - _LazyMemoryAllocator._agentic_rocm_demand_patch = True - - def 
_patch_l1_memory_manager(_memory_manager) -> None: - _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None) - _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None) - if _L1MemoryManager is None or _LazyMemoryAllocator is None: - return - if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False): - return - - _orig_get_memory_usage = _L1MemoryManager.get_memory_usage - - def _patched_get_memory_usage(self): - allocator = getattr(self, "_allocator", None) - if isinstance(allocator, _LazyMemoryAllocator): - address_manager = allocator.get_address_manager() - used_size = ( - address_manager.get_heap_size() - address_manager.get_free_size() - ) - return used_size, allocator._final_size - return _orig_get_memory_usage(self) - - _L1MemoryManager.get_memory_usage = _patched_get_memory_usage - _L1MemoryManager._agentic_rocm_final_capacity_patch = True - - def _maybe_patch_lazy_memory_allocator() -> None: - module = sys.modules.get("lmcache.v1.lazy_memory_allocator") - if module is not None and hasattr(module, "LazyMemoryAllocator"): - _patch_lazy_memory_allocator(module) - - def _maybe_patch_l1_memory_manager() -> None: - module = sys.modules.get("lmcache.v1.distributed.memory_manager") - if module is not None and hasattr(module, "L1MemoryManager"): - _patch_l1_memory_manager(module) - - def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0): - module = _orig_import(name, globals, locals, fromlist, level) - if name == "lmcache.v1.lazy_memory_allocator" or ( - name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules - ): - _maybe_patch_lazy_memory_allocator() - if name == "lmcache.v1.distributed.memory_manager" or ( - name.startswith("lmcache") - and "lmcache.v1.distributed.memory_manager" in sys.modules - ): - _maybe_patch_l1_memory_manager() - return module - - builtins.__import__ = _agentic_rocm_import - _maybe_patch_lazy_memory_allocator() - 
_maybe_patch_l1_memory_manager() - -if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1": - import torch - import lmcache.non_cuda_equivalents as lmc - - if not hasattr(lmc, "multi_layer_block_kv_transfer"): - _DTYPE_BY_NAME = { - "bfloat16": torch.bfloat16, - "float16": torch.float16, - "float32": torch.float32, - } - - def _dtype_from_env() -> torch.dtype: - name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16") - try: - return _DTYPE_BY_NAME[name] - except KeyError as exc: - raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc - - def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor: - block_stride = shape_desc.block_stride_elems or ( - shape_desc.bs * shape_desc.nh * shape_desc.hs - ) - base = lmc._tensor_from_ptr( - ptr, - (shape_desc.nb * block_stride,), - dtype, - device, - ) - return torch.as_strided( - base, - (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs), - (block_stride, shape_desc.nh * shape_desc.hs, 1), - ) - - def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: - return lmc._tensor_from_ptr( - ptr, - (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs), - dtype, - device, - ) - - def multi_layer_block_kv_transfer( - group_kv_pointers, - tmp_buffer_ptrs, - block_ids, - paged_memory_device, - direction, - shape_desc, - lmcache_chunk_size, - gpu_kv_format, - skip_blocks=0, - ) -> None: - # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with - # shape [num_blocks, block_size, hidden_size]. LMCache's Python - # fallback has no block-transfer entrypoint yet, so implement the - # same gather/scatter contract with torch indexing on ROCm. 
- if shape_desc.kv_size != 1: - raise NotImplementedError( - "ROCm LMCache MP block fallback currently supports MLA KV caches only" - ) - - dtype = _dtype_from_env() - device = ( - paged_memory_device - if isinstance(paged_memory_device, torch.device) - else torch.device(paged_memory_device) - ) - num_layers = int(group_kv_pointers.numel()) - blocks_per_chunk = lmcache_chunk_size // shape_desc.bs - direction_name = getattr(direction, "name", str(direction)) - - for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs): - start = chunk_idx * blocks_per_chunk - end = start + blocks_per_chunk - chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long) - - dest_slot_offset = 0 - if skip_blocks and chunk_idx == 0: - chunk_blocks = chunk_blocks[int(skip_blocks):] - dest_slot_offset = int(skip_blocks) * shape_desc.bs - if chunk_blocks.numel() == 0: - continue - - num_slots = int(chunk_blocks.numel()) * shape_desc.bs - tmp = _tmp_view( - int(tmp_ptr), - shape_desc, - num_layers, - lmcache_chunk_size, - dtype, - device, - ) - - for layer_idx in range(num_layers): - paged = _paged_view( - int(group_kv_pointers[layer_idx].item()), - shape_desc, - dtype, - device, - ) - tmp_slice = tmp[ - 0, - layer_idx, - dest_slot_offset : dest_slot_offset + num_slots, - :, - ] - if direction_name == "D2H": - gathered = paged.index_select(0, chunk_blocks).reshape( - num_slots, shape_desc.nh * shape_desc.hs - ) - tmp_slice.copy_(gathered) - elif direction_name == "H2D": - src = tmp_slice.reshape( - int(chunk_blocks.numel()), - shape_desc.bs, - shape_desc.nh * shape_desc.hs, - ) - paged.index_copy_(0, chunk_blocks, src) - else: - raise ValueError(f"Unsupported transfer direction: {direction}") - - lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer - -# ---- Chunked KV loading (prevents GPU block exhaustion at high concurrency) ---- -if os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "0") != "0": - import chunked_connector_patch # noqa: F401 - -# ---- vLLM 
scheduler assertion fix (stale KV transfer notifications) ---- -import scheduler_assertion_patch # noqa: F401 -PY -} - -write_chunked_connector_patch() { - local patch_dir="$1" - mkdir -p "$patch_dir" - cat > "$patch_dir/chunked_connector_patch.py" <<'PY' -""" -Monkey-patch for LMCacheMPConnector to add chunked KV loading. - -Fixes GPU block exhaustion deadlock at high concurrency by capping -the number of external tokens reported AND retrieved per scheduling step. - -Usage: set CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD= and import this -module from sitecustomize.py before LMCache is loaded. -""" - -import logging -import os -import sys -import builtins - -logger = logging.getLogger("chunked_lmcache_patch") - -_MAX_TOKENS = int(os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "32768")) - -# Per-request chunk tracking (module-level, survives across calls) -_chunk_state: dict[str, dict] = {} - - -def _apply_patch(): - """Patch LMCacheMPConnector in-place.""" - mod = sys.modules.get("lmcache.integration.vllm.lmcache_mp_connector") - if mod is None: - return - cls = getattr(mod, "LMCacheMPConnector", None) - if cls is None or getattr(cls, "_chunked_patch_applied", False): - return - - LMCacheMPRequestState = getattr(mod, "LMCacheMPRequestState", None) - _orig_get_matched = cls.get_num_new_matched_tokens - _orig_get_finished = cls.get_finished - - def _get_blocks_per_chunk(self): - block_size = getattr(self, "block_size", 1) - return max(1, _MAX_TOKENS // block_size) - - def _patched_get_num_new_matched_tokens(self, request, num_computed_tokens): - full_match = _orig_get_matched(self, request, num_computed_tokens) - if full_match <= 0 or _MAX_TOKENS <= 0: - return full_match - - req_id = request.request_id - block_size = getattr(self, "block_size", 1) - blocks_per_chunk = _get_blocks_per_chunk(self) - full_match_blocks = full_match // block_size - - state = _chunk_state.get(req_id) - if state is None or state.get("num_computed_at_start") != num_computed_tokens: - state = 
{ - "full_match_blocks": full_match_blocks, - "chunk_end_blocks": 0, - "num_computed_at_start": num_computed_tokens, - "lookup_done": False, - } - _chunk_state[req_id] = state - - if state["lookup_done"]: - return 0 - - remaining = state["full_match_blocks"] - state["chunk_end_blocks"] - if remaining <= 0: - state["lookup_done"] = True - return 0 - - this_chunk = min(remaining, blocks_per_chunk) - state["chunk_end_blocks"] += this_chunk - if state["chunk_end_blocks"] >= state["full_match_blocks"]: - state["lookup_done"] = True - - capped = this_chunk * block_size - if capped < full_match: - logger.debug( - "Chunked LMCache: req %s capped %d -> %d tokens " - "(chunk %d/%d blocks)", - req_id, full_match, capped, this_chunk, full_match_blocks, - ) - - # Cap the tracker's hit blocks to match what we report - tracker = getattr(request, "kv_transfer_params", None) - if tracker is not None: - orig_hits = getattr(tracker, "num_lmcache_hit_blocks", 0) - if orig_hits > this_chunk: - tracker.num_lmcache_hit_blocks = this_chunk - - return capped - - def _patched_get_finished(self, scheduler_output): - result = _orig_get_finished(self, scheduler_output) - # Clean up chunk state for finished requests. - # vLLM passes scheduler_output as a set of request-ID strings - # (not a SchedulerOutput object), so iterate directly when it - # is a set/frozenset; fall back to the attribute path for - # forward compatibility. 
- if isinstance(scheduler_output, (set, frozenset)): - finished = scheduler_output - else: - finished = getattr(scheduler_output, "finished_req_ids", []) - for req in finished: - _chunk_state.pop(req, None) - return result - - cls.get_num_new_matched_tokens = _patched_get_num_new_matched_tokens - cls.get_finished = _patched_get_finished - cls._chunked_patch_applied = True - logger.info( - "Chunked LMCache connector patch applied " - "(max_tokens_per_load=%d)", _MAX_TOKENS, - ) - - -_orig_import = builtins.__import__ - - -def _patching_import(name, *args, **kwargs): - module = _orig_import(name, *args, **kwargs) - if ( - name == "lmcache.integration.vllm.lmcache_mp_connector" - or ( - name.startswith("lmcache") - and "lmcache.integration.vllm.lmcache_mp_connector" in sys.modules - ) - ): - _apply_patch() - return module - - -builtins.__import__ = _patching_import -_apply_patch() -PY -} - -write_scheduler_assertion_patch() { - local patch_dir="$1" - mkdir -p "$patch_dir" - cat > "$patch_dir/scheduler_assertion_patch.py" <<'PY' -""" -Patch vLLM scheduler to handle stale finished_recving gracefully. - -The assertion at scheduler.py crashes when a KV transfer reports -"finished recving" but the request is already in RUNNING state. -This happens when transfers complete asynchronously and the scheduler -has already moved the request forward. - -Fix: Instead of asserting, log a warning and skip. 
-""" - -import logging -import sys -import builtins - -logger = logging.getLogger("scheduler_assertion_patch") - - -def _apply_patch(): - """Patch vLLM scheduler's _update_from_kv_xfer_finished.""" - sched_mod = sys.modules.get("vllm.v1.core.sched.scheduler") - if sched_mod is None: - return - req_mod = sys.modules.get("vllm.v1.request") - if req_mod is None: - return - Scheduler = getattr(sched_mod, "Scheduler", None) - RequestStatus = getattr(req_mod, "RequestStatus", None) - if Scheduler is None or RequestStatus is None: - return - if getattr(Scheduler, "_kv_xfer_patch_applied", False): - return - - _orig_update = Scheduler._update_from_kv_xfer_finished - - def _patched_update(self, kv_connector_output): - if self.connector is not None: - self.connector.update_connector_output(kv_connector_output) - for req_id in kv_connector_output.finished_recving or (): - if req_id not in self.requests: - continue - req = self.requests[req_id] - if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: - self.finished_recving_kv_req_ids.add(req_id) - elif RequestStatus.is_finished(req.status): - self._free_blocks(self.requests[req_id]) - else: - logger.warning( - "Stale finished_recving for req %s in status %s; skipping.", - req_id, req.status.name, - ) - for req_id in kv_connector_output.finished_sending or (): - if req_id not in self.requests: - continue - self._free_blocks(self.requests[req_id]) - - Scheduler._update_from_kv_xfer_finished = _patched_update - Scheduler._kv_xfer_patch_applied = True - logger.info("Scheduler KV transfer assertion patch applied") - - -_orig_import = builtins.__import__ - - -def _patching_import(name, *args, **kwargs): - module = _orig_import(name, *args, **kwargs) - if ( - name == "vllm.v1.core.sched.scheduler" - or ( - name.startswith("vllm") - and "vllm.v1.core.sched.scheduler" in sys.modules - ) - ): - _apply_patch() - return module - - -builtins.__import__ = _patching_import -_apply_patch() -PY -} - -# Workaround for MEC FW <177 RCCL memory 
reclaim issue -version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') -if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" @@ -591,6 +52,8 @@ mkdir -p "$RESULT_DIR" OFFLOAD_ARGS=() PREFIX_CACHE_ARGS=() + +# ---- Lmcache config ---------------------------------------------------------- LMCACHE_PID="" cleanup_lmcache_server() { @@ -648,7 +111,9 @@ case "$OFFLOADING" in # MI355X nodes have ~2.7 TiB of host DRAM available for offload; # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for # worker RSS / page cache / slurm cgroup). - TOTAL_CPU_DRAM_GB=2500 + #TODO: fix + TOTAL_CPU_DRAM_GB=3000 + TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" # Use vLLM's regular native KV-offload path (OffloadingConnector), # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1 @@ -659,7 +124,7 @@ case "$OFFLOADING" in # (vllm/config/vllm.py:662). OFFLOAD_ARGS=( --kv_offloading_backend native - --kv_offloading_size "$TOTAL_CPU_DRAM_GB" + --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB" --disable-hybrid-kv-cache-manager ) ;; @@ -667,74 +132,20 @@ case "$OFFLOADING" in { set +x; } 2>/dev/null unset VLLM_USE_SIMPLE_KV_OFFLOAD - agentic_pip_install --quiet --no-cache-dir lmcache - # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and - # CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and - # during Kimi fused-MoE model inspection it imports nixl_ep whenever - # that module is importable, even when this run is not using EP/NIXL - # kernels. 
The CUDA extension then fails immediately on AMD nodes with - # "ImportError: libcuda.so.1". - # - # LMCache MP also uses CuPy stream APIs while registering vLLM's KV - # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime - # with cudaErrorInsufficientDriver when LMCache touches the stream. Use - # the ROCm 7 CuPy wheel so the same API dispatches through HIP. - python3 -m pip uninstall -y \ - nixl nixl-cu12 nixl-cu13 nixl_ep \ - >/dev/null 2>&1 || true - python3 -m pip uninstall -y \ - cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \ - >/dev/null 2>&1 || true - agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0 - python3 - <<'PY' -import importlib.util -import sys - -spec = importlib.util.find_spec("nixl_ep") -if spec is not None: - locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"]) - print( - "Error: nixl_ep is still importable after LMCache install; " - "this ROCm Kimi run would import a CUDA-only nixl_ep module. " - f"location={locations}", - file=sys.stderr, - ) - sys.exit(1) - -try: - from cupy_backends.cuda.api import runtime as cupy_runtime -except Exception as exc: - print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr) - sys.exit(1) - -if not getattr(cupy_runtime, "is_hip", False): - print( - "Error: CuPy is still using the CUDA backend after installing " - "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.", - file=sys.stderr, - ) - sys.exit(1) -PY - LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch" - write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR" - write_chunked_connector_patch "$LMCACHE_ROCM_PATCH_DIR" - write_scheduler_assertion_patch "$LMCACHE_ROCM_PATCH_DIR" - export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1 - export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16 - export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1 - # Cap external KV tokens loaded per scheduling step to prevent GPU - # block exhaustion deadlock at high concurrency (c>=32). 
Default - # 32768 keeps peak block demand within the GPU KV pool. Set to 0 to - # disable chunking (only safe at low concurrency). - export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD="${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:-32768}" - export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}" + git clone https://github.com/LMCache/LMCache.git + cd LMCache + pip install -r requirements/build.txt + CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation + cd .. + python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV # pool, but let the external MP server own that pool so vLLM does not # split --kv-offloading-size across TP ranks through the integrated # LMCache backend. - TOTAL_CPU_DRAM_GB=2500 + #TODO: fix + TOTAL_CPU_DRAM_GB=3000 LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" LMCACHE_PORT="${LMCACHE_PORT:-5555}" LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" @@ -742,7 +153,7 @@ PY # ZMQ endpoint. Bind the server to a raw host, but pass the connector a # ZMQ-style host string. LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" - LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}" + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" # LMCache read locks are leases on chunks that lookup has promised # vLLM can retrieve. The default 300s TTL is too short for this @@ -750,10 +161,11 @@ PY # lookup and retrieve while GPU KV is saturated, which leaves the # object present in L1 but no longer readable. Keep the 2.5 TB pool # size unchanged and only extend the lookup-to-retrieve lease. 
- LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}" + LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" + export LMCACHE_BLOCKING_TIMEOUT_SECS=120 echo "Starting LMCache MP server..." LMCACHE_CMD=( @@ -786,6 +198,7 @@ PY *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; esac +# ---- LLM server config ---------------------------------------------------------- EP_ARGS=() if [ "$EP_SIZE" -gt 1 ]; then EP_ARGS=(--enable-expert-parallel) @@ -794,17 +207,34 @@ fi echo "Starting vllm server..." export PYTHONNOUSERSITE=1 +# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) +pip install amd-quark + +# Disable AITER RMSNorm for TP < 8 due to accuracy issues +if [ "${TP}" -lt 8 ]; then + export VLLM_ROCM_USE_AITER_RMSNORM=0 +fi + +# Workaround for MEC FW <177 RCCL memory reclaim issue +version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL_PATH" --served-model-name "$MODEL" + vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" "${EP_ARGS[@]}" --gpu-memory-utilization 0.90 + --kv-cache-dtype fp8 \ --block-size=1 --trust-remote-code - --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$CONC" --mm-encoder-tp-mode data "${PREFIX_CACHE_ARGS[@]}" @@ -821,4 +251,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file 
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh new file mode 100755 index 000000000..f36fc59e9 --- /dev/null +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh @@ -0,0 +1,256 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for MiniMax-M2.5 FP4 on MI355X using vLLM. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION +# +# OFFLOADING values: +# none - vLLM GPU KV only. +# cpu - vLLM native CPU offload. +# lmcache - LMCache MP server + vLLM LMCacheMPConnector. + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ROCR/HIP visibility for vLLM 0.14+ +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi || true +amd-smi || true + +# ---- Resolve traces and install deps ---------------------------------------- +# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 +# corpus has requests up to ~1M proxy tokens that would be rejected. +# Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k + +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" +mkdir -p "$RESULT_DIR" + +OFFLOAD_ARGS=() +PREFIX_CACHE_ARGS=() + +# ---- Lmcache config ---------------------------------------------------------- +LMCACHE_PID="" + +cleanup_lmcache_server() { + if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then + kill "$LMCACHE_PID" 2>/dev/null || true + wait "$LMCACHE_PID" 2>/dev/null || true + fi +} + +trap cleanup_lmcache_server EXIT + +wait_for_lmcache_ready() { + { set +x; } 2>/dev/null + local attempts="${LMCACHE_READY_ATTEMPTS:-120}" + local tail_pid="" + + while [ ! -f "$LMCACHE_LOG" ]; do + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before creating log file. Exiting." >&2 + exit 1 + fi + sleep 1 + done + + tail -f -n +1 "$LMCACHE_LOG" & + tail_pid=$! + + for ((i = 1; i <= attempts; i++)); do + if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + return 0 + fi + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before becoming healthy. Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 + fi + sleep 1 + done + + echo "Timed out waiting for LMCache server healthcheck. 
Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 +} + +case "$OFFLOADING" in + none) ;; + cpu) + unset VLLM_USE_SIMPLE_KV_OFFLOAD + # MI355X nodes have ~2.7 TiB of host DRAM available for offload; + # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for + # worker RSS / page cache / slurm cgroup). + TOTAL_CPU_DRAM_GB=3000 + TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + # Use vLLM's regular native KV-offload path (OffloadingConnector), + # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to + # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1 + # would switch it to SimpleCPUOffloadConnector. We intentionally leave + # that env var UNSET here so the regular OffloadingConnector path is + # used. The shortcut --kv_offloading_backend native + --kv_offloading_size + # form constructs the KVTransferConfig at engine startup + # (vllm/config/vllm.py:662). + + # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default) + # This gives extra cache hit than disabling hybrid kv cache manager + # srok, + # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma + # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60 + OFFLOAD_ARGS=( + --kv_offloading_backend native + --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB" + --disable-hybrid-kv-cache-manager + ) + ;; + lmcache) + { set +x; } 2>/dev/null + unset VLLM_USE_SIMPLE_KV_OFFLOAD + + git clone https://github.com/LMCache/LMCache.git + cd LMCache + pip install -r requirements/build.txt + CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation + cd .. 
+ + python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null + + # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV + # pool, but let the external MP server own that pool so vLLM does not + # split --kv-offloading-size across TP ranks through the integrated + # LMCache backend. + TOTAL_CPU_DRAM_GB=3000 + LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" + LMCACHE_PORT="${LMCACHE_PORT:-5555}" + LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" + # LMCacheMPConnector concatenates lmcache.mp.host and port into the + # ZMQ endpoint. Bind the server to a raw host, but pass the connector a + # ZMQ-style host string. + LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" + # LMCache read locks are leases on chunks that lookup has promised + # vLLM can retrieve. The default 300s TTL is too short for this + # long-context agentic queue: TP8/conc32 can spend >300s between + # lookup and retrieve while GPU KV is saturated, which leaves the + # object present in L1 but no longer readable. Keep the 2.5 TB pool + # size unchanged and only extend the lookup-to-retrieve lease. + LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" + LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" + LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" + export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" + export LMCACHE_BLOCKING_TIMEOUT_SECS=120 + + set -x + echo "Starting LMCache MP server..." 
+ LMCACHE_CMD=( + lmcache server + --host "$LMCACHE_HOST" + --port "$LMCACHE_PORT" + --http-host "$LMCACHE_HOST" + --http-port "$LMCACHE_HTTP_PORT" + --l1-size-gb "$LMCACHE_L1_SIZE_GB" + --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB" + --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS" + --chunk-size "$LMCACHE_CHUNK_SIZE" + --max-workers "$LMCACHE_MAX_WORKERS" + --eviction-policy LRU + ) + printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt" + printf '\n' >> "$RESULT_DIR/lmcache_command.txt" + "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 & + LMCACHE_PID=$! + echo "LMCache server PID: $LMCACHE_PID" + wait_for_lmcache_ready + + PREFIX_CACHE_ARGS=(--enable-prefix-caching) + # srok, + # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma + # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60 + OFFLOAD_ARGS=( + --kv-transfer-config + "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}" + --disable-hybrid-kv-cache-manager + ) + ;; + *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; +esac + +# ---- LLM server config ---------------------------------------------------------- +EP_ARGS=() +if [ "$EP_SIZE" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi + +echo "Starting vllm server..." 
+export PYTHONNOUSERSITE=1 + +# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) +pip install -q amd-quark + +# Workaround for MEC FW <177 RCCL memory reclaim issue +version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +{ set +x; } 2>/dev/null +VLLM_CMD=( + vllm serve "$MODEL" + --host 0.0.0.0 + --port "$PORT" + --tensor-parallel-size="$TP" + "${EP_ARGS[@]}" + --gpu-memory-utilization 0.95 + --kv-cache-dtype fp8 \ + --block-size=32 + --trust-remote-code + --attention-backend "ROCM_AITER_FA" + --max-num-seqs "$CONC" + "${PREFIX_CACHE_ARGS[@]}" + "${OFFLOAD_ARGS[@]}" +) +printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" +printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" +"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh index ff901b674..656e924dc 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh @@ -2,51 +2,117 @@ set -euo pipefail set -x -# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang. +# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang. +# +# Base server recipe follows the upstream MI300X reference +# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe): +# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75. 
+# The agentic harness (resolve_trace_source / build_replay_cmd / +# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and +# --disable-radix-cache is dropped because agentic replay needs prefix reuse. # # Required env vars: -# MODEL, TP, CONC, RESULT_DIR +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION +# +# OFFLOADING values: +# none - SGLang GPU KV with the default RadixAttention prefix cache. +# hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix. source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} + +SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi rocm-smi || true amd-smi || true +# ---- Select trace corpus ---------------------------------------------------- +# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the +# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf +# signal at high concurrency. 
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k + # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps -# ---- Start SGLang server ---------------------------------------------------- +# ---- Cache / offload config ------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" mkdir -p "$RESULT_DIR" +CACHE_ARGS=() +WARMUP_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per + # TP rank (one hierarchical KV, one hierarchical Mamba), so the + # node-total DRAM budget divides by TP and the host-pool count. + TOTAL_CPU_DRAM_GB=3000 + HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" + HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" + # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which + # requires page_size=1. Keep the safer direct/layer_first copy path; + # kernel/page_first faults on first prefill in this mode on ROCm. 
+ HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" + HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" + if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then + HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" + fi + if [ "$HICACHE_SIZE_GB" -lt 1 ]; then + echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 + exit 1 + fi + echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" + CACHE_ARGS=( + --page-size "$HICACHE_PAGE_SIZE" + --enable-hierarchical-cache + --hicache-size "$HICACHE_SIZE_GB" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + ) + # HiCache startup reaches API readiness but SGLang's internal warmup + # request can time out on this path; let aiperf own benchmark traffic. + WARMUP_ARGS=(--skip-server-warmup) + # Don't force ROCm graph capture at every high concurrency point; conc=16 + # is the highest known-good capture size for this model/server path. + HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}" + if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then + CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" + fi + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + echo "Starting SGLang server..." 
export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --attention-backend triton \ - --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ + --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ @@ -56,10 +122,10 @@ python3 -m sglang.launch_server \ --enable-aiter-allreduce-fusion \ --cuda-graph-max-bs $CONC \ --max-running-requests $CONC \ - --max-prefill-tokens 32768 \ - --scheduler-recv-interval 30 \ + --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ --mem-fraction-static 0.8 \ - --context-length $MAX_MODEL_LEN \ + "${CACHE_ARGS[@]}" \ + "${WARMUP_ARGS[@]}" \ --enable-metrics > "$SERVER_LOG" 2>&1 & SERVER_PID=$! echo "Server PID: $SERVER_PID" @@ -69,4 +135,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file From 19d1ca5fbcc075c42399fbc0d70df2b7a61f52fa Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 3 Jun 2026 14:51:04 +0900 Subject: [PATCH 02/14] [AMD] minimaxm2.5 agentic: change LMCACHE_CHUNK_SIZE default from 256 to 32 Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh index f36fc59e9..ea276ef38 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh @@ -165,7 +165,9 @@ case "$OFFLOADING" in # object present in L1 but no longer readable. Keep the 2.5 TB pool # size unchanged and only extend the lookup-to-retrieve lease. 
LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" - LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" + # (srok) check 256 vs 32 + #LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" + LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-32}" LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" export LMCACHE_BLOCKING_TIMEOUT_SECS=120 From d1840788b73f2881c6917a6da9b4a96ef1ca2e8a Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 3 Jun 2026 14:56:26 +0900 Subject: [PATCH 03/14] [AMD] agentx-v0.4: add MODEL_PATH support and --served-model-name for Kimi/MiniMax scripts Co-Authored-By: Claude Sonnet 4.6 --- .../single_node/agentic/kimik2.5_fp4_mi355x.sh | 15 +++++++++++++-- .../single_node/agentic/minimaxm2.5_fp4_mi355x.sh | 15 +++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index d05b27253..b3211ff49 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -29,7 +29,18 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + rocm-smi || true amd-smi || true @@ -226,7 +237,7 @@ export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh index ea276ef38..3d645e7ad 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh @@ -29,7 +29,18 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + rocm-smi || true amd-smi || true @@ -230,7 +241,7 @@ export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" From d2b2826bf49d816a057af8adbdcb147df4448f37 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 3 Jun 2026 15:00:51 +0900 Subject: [PATCH 04/14] [AMD] kimik2.5-fp4-mi355x-vllm-agentic-lmcache: fix config, use v0.21.0, expand conc list Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 134af929a..8a133fe42 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2534,14 +2534,19 @@ kimik2.5-fp4-mi355x-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } kimik2.5-fp4-mi355x-vllm-agentic-lmcache: - image: vllm/vllm-openai-rocm:v0.22.0 + image: vllm/vllm-openai-rocm:v0.21.0 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: agentic-coding: - duration: 1800 search-space: - - { tp: 4, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 72] } - - { tp: 4, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 72] } + - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } + - { tp: 4, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } minimaxm2.5-fp8-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 From dc999ef10c335d0e73a95f71e8c4cc2916060d95 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 3 Jun 2026 15:05:12 +0900 Subject: 
[PATCH 05/14] [AMD] agentx-v0.4: fix configs for MiniMax/Kimi/Qwen agentic entries Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8a133fe42..1d8ac466b 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -872,6 +872,7 @@ minimaxm2.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 16 } +# target minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache: image: vllm/vllm-openai-rocm:v0.22.0 model: amd/MiniMax-M2.5-MXFP4 @@ -884,8 +885,8 @@ minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache: agentic-coding: - duration: 1800 search-space: - - { tp: 1, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] } - - { tp: 1, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48] } + - { tp: 1, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } + - { tp: 1, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } minimaxm2.5-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 @@ -2533,6 +2534,7 @@ kimik2.5-fp4-mi355x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } +# target kimik2.5-fp4-mi355x-vllm-agentic-lmcache: image: vllm/vllm-openai-rocm:v0.21.0 model: amd/Kimi-K2.5-MXFP4 @@ -2603,16 +2605,21 @@ minimaxm2.5-fp8-mi325x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } +# target qwen3.5-fp8-mi355x-sglang-agentic-hicache: image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 - runner: mi355x + runner: b300 + precision: fp8 + framework: sglang + multinode: false + scenarios: agentic-coding: - duration: 1800 search-space: - - { tp: 4, ep: 1, 
offloading: none, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] } - - { tp: 4, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] } + - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } + - { tp: 4, ep: 1, offloading: hicache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } dsv4-fp4-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 From fc0d0d430165afa83e108118563d91cae63b753d Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 3 Jun 2026 15:43:56 +0900 Subject: [PATCH 06/14] [AMD] qwen3.5-fp8-mi355x-sglang-agentic-hicache: fix runner to mi355x Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 1d8ac466b..23920070c 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2610,7 +2610,7 @@ qwen3.5-fp8-mi355x-sglang-agentic-hicache: image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 - runner: b300 + runner: mi355x precision: fp8 framework: sglang multinode: false From fe3afa91002a07bc8ff8e8f70fdf5ae86604d336 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 3 Jun 2026 23:15:28 +0900 Subject: [PATCH 07/14] [AMD] qwen3.5-fp8-mi355x-sglang-agentic-hicache: update conc-list to higher range Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 23920070c..596ea049d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2618,8 +2618,10 @@ qwen3.5-fp8-mi355x-sglang-agentic-hicache: agentic-coding: - duration: 1800 search-space: - - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } - - { tp: 4, ep: 1, offloading: hicache, conc-list: [1, 2, 4, 8, 16, 32, 
40, 48] } + #- { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } + #- { tp: 4, ep: 1, offloading: hicache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } + - { tp: 4, ep: 1, offloading: none, conc-list: [56, 72, 88, 104, 120] } + - { tp: 4, ep: 1, offloading: hicache, conc-list: [56, 72, 88, 104, 120] } dsv4-fp4-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 From dc25a0bb0857acf6a0f2cf6577498e163c41d5e3 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 3 Jun 2026 23:26:02 +0900 Subject: [PATCH 08/14] [AMD] minimaxm2.5-fp8-mi355x-vllm-agentic: add lmcache variant config and update script Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 21 +- .../agentic/minimaxm2.5_fp8_mi355x.sh | 255 +++++++++++++++--- 2 files changed, 241 insertions(+), 35 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 596ea049d..390c0c112 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2536,7 +2536,7 @@ kimik2.5-fp4-mi355x-vllm-agentic: # target kimik2.5-fp4-mi355x-vllm-agentic-lmcache: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x @@ -2568,6 +2568,25 @@ minimaxm2.5-fp8-mi355x-vllm-agentic: - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] } - { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] } +# target +minimaxm2.5-fp8-mi355x-vllm-agentic-lmcache: + image: vllm/vllm-openai-rocm:v0.22.0 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi355x + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical). + # Compute saturates first; cpu offload likely won't help, but worth confirming. + # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). 
+ - duration: 1800 + search-space: + - { tp: 1, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } + - { tp: 1, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } + minimaxm2.5-fp8-mi300x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh index cd114fe96..d62eb38d1 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -2,18 +2,23 @@ set -euo pipefail set -x -# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI355X using vLLM. +# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI355X using vLLM. # # Required env vars: -# MODEL, TP, CONC, RESULT_DIR +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION +# +# OFFLOADING values: +# none - vLLM GPU KV only. +# cpu - vLLM native CPU offload. +# lmcache - LMCache MP server + vLLM LMCacheMPConnector. source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -24,6 +29,10 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi || true +amd-smi || true + # `hf download` creates the target dir if missing and is itself idempotent. # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE # Either way, MODEL_PATH is what the server is launched with. 
# ---- LMCache server lifecycle ------------------------------------------------
# PID of the background LMCache MP server. Stays empty until the `lmcache`
# offloading arm launches the server.
LMCACHE_PID=""

#######################################
# Stop the LMCache MP server if one was started and is still alive.
# Installed as the EXIT trap so the server never outlives this script.
# Globals:
#   LMCACHE_PID (read)
#######################################
cleanup_lmcache_server() {
    if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then
        kill "$LMCACHE_PID" 2>/dev/null || true
        wait "$LMCACHE_PID" 2>/dev/null || true
    fi
}

trap cleanup_lmcache_server EXIT

#######################################
# Block until the LMCache HTTP healthcheck answers, streaming the server log
# to stdout while waiting.
# Globals:
#   LMCACHE_LOG            (read) path of the server log file
#   LMCACHE_PID            (read) server PID; liveness is only checked if set
#   LMCACHE_HTTP_PORT      (read) healthcheck port, defaults to 8080
#   LMCACHE_READY_ATTEMPTS (read) 1-second polls per phase, defaults to 120
# Returns:
#   0 once the healthcheck succeeds. On failure (server died, or a phase
#   timed out) the log is dumped to stderr and the SCRIPT exits with 1.
#######################################
wait_for_lmcache_ready() {
    # Silence xtrace while polling, but remember the caller's setting so a
    # successful return does not leave tracing off for the rest of the script.
    local restore_xtrace=0
    if [[ $- == *x* ]]; then
        restore_xtrace=1
        { set +x; } 2>/dev/null
    fi

    local attempts="${LMCACHE_READY_ATTEMPTS:-120}"
    local health_url="http://127.0.0.1:${LMCACHE_HTTP_PORT:-8080}/healthcheck"
    local tail_pid=""
    local failure=""
    local i

    # Phase 1: wait for the log file to appear. Bounded by `attempts` so a
    # missing PID cannot turn this into an endless loop.
    for ((i = 1; i <= attempts; i++)); do
        if [[ -f "$LMCACHE_LOG" ]]; then
            break
        fi
        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
            echo "LMCache server died before creating log file. Exiting." >&2
            exit 1
        fi
        sleep 1
    done
    if [[ ! -f "$LMCACHE_LOG" ]]; then
        echo "Timed out waiting for LMCache server log file. Exiting." >&2
        exit 1
    fi

    tail -f -n +1 "$LMCACHE_LOG" &
    tail_pid=$!

    # Phase 2: poll the healthcheck. `failure` stays non-empty unless the
    # healthcheck succeeds, so zero attempts also reports a timeout.
    failure="Timed out waiting for LMCache server healthcheck. Log follows:"
    for ((i = 1; i <= attempts; i++)); do
        if curl --output /dev/null --silent --fail "$health_url"; then
            failure=""
            break
        fi
        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
            failure="LMCache server died before becoming healthy. Log follows:"
            break
        fi
        sleep 1
    done

    # Single cleanup point for the log follower, whatever the outcome.
    kill "$tail_pid" 2>/dev/null || true
    wait "$tail_pid" 2>/dev/null || true

    if [[ -n "$failure" ]]; then
        echo "$failure" >&2
        cat "$LMCACHE_LOG" >&2 || true
        exit 1
    fi

    if (( restore_xtrace )); then
        set -x
    fi
    return 0
}
# ---- KV offload configuration -------------------------------------------------
# Translate $OFFLOADING into vLLM flags:
#   OFFLOAD_ARGS      - offload / connector flags appended to `vllm serve`
#   PREFIX_CACHE_ARGS - prefix-caching flags (only populated for lmcache)
# Both arrays are (re)initialised here so they are defined for every arm.
OFFLOAD_ARGS=()
PREFIX_CACHE_ARGS=()

case "$OFFLOADING" in
    none)
        ;;
    cpu)
        # Use vLLM's regular native KV-offload path (OffloadingConnector),
        # NOT SimpleCPUOffloadConnector: the env var that would switch
        # connectors is deliberately left unset.
        unset VLLM_USE_SIMPLE_KV_OFFLOAD
        # Node-wide host-DRAM budget for the offload pool, in GB.
        # NOTE: this overrides the TOTAL_CPU_DRAM_GB passed in by the workflow.
        TOTAL_CPU_DRAM_GB=3000
        # This job's share of the budget: TP GPUs out of (presumably) the
        # 8 GPUs on the node. Multiplying before dividing keeps the result
        # right when TP does not divide 8 and cannot divide by zero.
        TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB * TP / 8))}"
        # --disable-hybrid-kv-cache-manager is intentionally absent: the
        # hybrid KV cache manager (default) gives extra cache hits.
        OFFLOAD_ARGS=(
            --kv_offloading_backend native
            --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
        )
        ;;
    lmcache)
        { set +x; } 2>/dev/null
        unset VLLM_USE_SIMPLE_KV_OFFLOAD

        # Build LMCache from source with HIP. Clone only when the checkout is
        # missing so a rerun in the same directory does not abort under
        # `set -e`, and build in a subshell so the cwd is never left changed.
        if [[ ! -d LMCache/.git ]]; then
            git clone https://github.com/LMCache/LMCache.git
        fi
        (
            cd LMCache
            pip install -r requirements/build.txt
            CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation
        )

        # Fail fast if the connector module is not importable.
        python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null

        # The external MP server owns the CPU KV pool, so vLLM does not split
        # --kv-offloading-size across TP ranks through the integrated backend.
        # NOTE: this overrides the TOTAL_CPU_DRAM_GB passed in by the workflow.
        TOTAL_CPU_DRAM_GB=3000
        LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
        LMCACHE_PORT="${LMCACHE_PORT:-5555}"
        LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
        # LMCacheMPConnector concatenates lmcache.mp.host and port into the
        # ZMQ endpoint: bind the server to a raw host, but hand the connector
        # a ZMQ-style host string.
        LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
        # Same TP/8 share of the node budget as the cpu arm.
        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB * TP / 8))}"
        LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
        # LMCache read locks are leases on chunks that lookup has promised
        # vLLM can retrieve. The default 300s TTL is too short for this
        # long-context agentic queue (lookup-to-retrieve can exceed 300s
        # while GPU KV is saturated), so extend the lease only.
        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}"
        # TODO(srok): compare chunk size 256 vs 32.
        LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-32}"
        LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
        export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
        export LMCACHE_BLOCKING_TIMEOUT_SECS=120

        set -x
        echo "Starting LMCache MP server..."
        LMCACHE_CMD=(
            lmcache server
            --host "$LMCACHE_HOST"
            --port "$LMCACHE_PORT"
            --http-host "$LMCACHE_HOST"
            --http-port "$LMCACHE_HTTP_PORT"
            --l1-size-gb "$LMCACHE_L1_SIZE_GB"
            --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB"
            --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS"
            --chunk-size "$LMCACHE_CHUNK_SIZE"
            --max-workers "$LMCACHE_MAX_WORKERS"
            --eviction-policy LRU
        )
        # Record the exact command line next to the results for reproducibility.
        printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt"
        printf '\n' >> "$RESULT_DIR/lmcache_command.txt"
        "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 &
        LMCACHE_PID=$!
        echo "LMCache server PID: $LMCACHE_PID"
        wait_for_lmcache_ready

        PREFIX_CACHE_ARGS=(--enable-prefix-caching)
        # --disable-hybrid-kv-cache-manager is intentionally absent: the
        # hybrid KV cache manager (default) gives extra cache hits.
        OFFLOAD_ARGS=(
            --kv-transfer-config
            "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}"
        )
        ;;
    *)
        echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2
        exit 1
        ;;
esac

# ---- LLM server config ----------------------------------------------------------
# Expert parallelism is only requested when more than one EP rank is configured.
EP_ARGS=()
if [ "$EP_SIZE" -gt 1 ]; then
    EP_ARGS=(--enable-expert-parallel)
fi

echo "Starting vllm server..."
# ---- vLLM environment, tuning and launch ----------------------------------------
export PYTHONNOUSERSITE=1

# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug).
# NOTE(review): this script benchmarks an FP8 model; presumably carried over
# from the MXFP4 recipes - confirm it is still needed here.
pip install -q amd-quark

# Workaround for MEC FW <177 RCCL memory reclaim issue.
# A single awk replaces grep|head|awk, and `|| true` keeps a missing rocm-smi
# or a missing MEC line from aborting the script under `set -euo pipefail`.
version=$(rocm-smi --showfw 2>/dev/null | awk '/MEC/ { print $NF; exit }' || true)
# Apply the workaround when the version is unknown, non-numeric, or below 177.
if [[ ! "$version" =~ ^[0-9]+$ ]] || (( 10#$version < 177 )); then
    export HSA_NO_SCRATCH_RECLAIM=1
fi

export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0
VLLM_BLOCK_SIZE=32
# Array (not a string) so an empty value expands to zero arguments.
ASYNC_SCHEDULING_ARGS=()

# Per-configuration tuning:
#   TP8/EP8   - block 32, shuffle off, AITER MoE off, async scheduling off
#   conc < 64 - block 32, shuffle off, async scheduling off
#   conc = 64 - block 16, shuffle on,  async scheduling off
#   conc > 64 - block 16, shuffle on,  async scheduling on (vLLM default)
if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
    export VLLM_ROCM_USE_AITER_MOE=0
    ASYNC_SCHEDULING_ARGS=(--no-async-scheduling)
    echo "TP8/EP8: using block size 32, shuffle disabled, AITER MoE disabled, async scheduling disabled."
elif (( CONC < 64 )); then
    ASYNC_SCHEDULING_ARGS=(--no-async-scheduling)
    echo "c${CONC}: using block size 32, shuffle disabled, async scheduling disabled."
elif (( CONC == 64 )); then
    ASYNC_SCHEDULING_ARGS=(--no-async-scheduling)
    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
    VLLM_BLOCK_SIZE=16
    echo "c64: using block size 16, shuffle enabled, async scheduling disabled."
else
    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
    VLLM_BLOCK_SIZE=16
    echo "c${CONC}: using block size 16, shuffle enabled, async scheduling enabled."
fi

{ set +x; } 2>/dev/null
# Serve from MODEL_PATH (the directory the download step populated) and keep
# $MODEL as the public model name; fall back to $MODEL if MODEL_PATH is unset.
VLLM_CMD=(
    vllm serve "${MODEL_PATH:-$MODEL}"
    --served-model-name "$MODEL"
    --host 0.0.0.0
    --port "$PORT"
    --tensor-parallel-size="$TP"
    "${EP_ARGS[@]}"
    --gpu-memory-utilization 0.95
    --kv-cache-dtype fp8
    --block-size="$VLLM_BLOCK_SIZE"
    --trust-remote-code
    --attention-backend "ROCM_AITER_FA"
    --max-num-seqs "$CONC"
    "${ASYNC_SCHEDULING_ARGS[@]}"
    "${PREFIX_CACHE_ARGS[@]}"
    "${OFFLOAD_ARGS[@]}"
)
# Record the exact, shell-quoted command line next to the results.
printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
echo "Server PID: $SERVER_PID"
model: amd/GLM-5.1-MXFP4 + model-prefix: glm5.1 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively + - { tp: 2, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] } + - { tp: 2, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48] } + kimik2.5-fp4-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 model: amd/Kimi-K2.5-MXFP4 diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh index 3b85a31cd..6bea8dddd 100755 --- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh @@ -2,18 +2,29 @@ set -euo pipefail set -x -# Agentic trace replay benchmark for GLM-5.1 FP4 on MI355X using SGLang. +# Agentic trace replay benchmark for GLM-5.1 FP4 on MI355X using SGLang. +# +# Base server recipe (see the launch command below): tilelang NSA +# prefill/decode backends, fp8_e4m3 KV cache, glm47 tool-call parser, +# mem-fraction-static 0.85. +# The agentic harness (resolve_trace_source / build_replay_cmd / +# run_agentic_replay_and_write_outputs) drives the benchmark traffic, and the +# radix cache stays enabled because agentic replay needs prefix reuse. # # Required env vars: -# MODEL, TP, CONC, RESULT_DIR +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION +# +# OFFLOADING values: +# none - SGLang GPU KV with the default RadixAttention prefix cache. +# hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix. 
# ---- Cache / offload config -------------------------------------------------
# CACHE_ARGS        - HiCache flags appended to the SGLang launch command
# WARMUP_ARGS       - warmup flags appended to the SGLang launch command
# CUDA_GRAPH_MAX_BS - graph-capture batch size; starts at CONC and is capped
#                     for HiCache runs
CACHE_ARGS=()
WARMUP_ARGS=()
CUDA_GRAPH_MAX_BS="$CONC"
case "$OFFLOADING" in
    none)
        # Leave SGLang's default RadixAttention prefix cache on - agentic
        # replay needs it; --disable-radix-cache would zero the hit rate.
        ;;
    hicache)
        # One HiCache host pool per TP rank for this model, so the node-total
        # DRAM budget divides by TP and the host-pool count.
        # NOTE: this overrides the TOTAL_CPU_DRAM_GB passed in by the workflow.
        TOTAL_CPU_DRAM_GB=3000
        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}"
        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-500}}"
        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
        # Keep the safer direct/layer_first copy path on ROCm.
        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
        # Per-rank, per-pool size, clamped to the configured maximum.
        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
        fi
        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
            exit 1
        fi
        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
        CACHE_ARGS=(
            --page-size "$HICACHE_PAGE_SIZE"
            --enable-hierarchical-cache
            --hicache-size "$HICACHE_SIZE_GB"
            --hicache-io-backend "$HICACHE_IO_BACKEND"
            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
            --hicache-write-policy "$HICACHE_WRITE_POLICY"
        )
        # HiCache startup reaches API readiness but SGLang's internal warmup
        # request can time out on this path; let aiperf own benchmark traffic.
        WARMUP_ARGS=(--skip-server-warmup)
        # Don't force ROCm graph capture at every high concurrency point:
        # cap the capture size (default 16) while CONC stays the swept
        # request concurrency.
        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}"
        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
        fi
        ;;
    *)
        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
        exit 1
        ;;
esac

echo "Starting SGLang server..."
export PYTHONNOUSERSITE=1
# Upgrade transformers once, right before launch.
pip install -U transformers

# --cuda-graph-max-bs takes the capped CUDA_GRAPH_MAX_BS computed above;
# --max-running-requests keeps the full swept concurrency.
python3 -m sglang.launch_server \
    --model-path "$MODEL_PATH" \
    --served-model-name "$MODEL" \
    --host=0.0.0.0 \
    --port "$PORT" \
    --tensor-parallel-size "$TP" \
    --trust-remote-code \
    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" \
    --max-running-requests "$CONC" \
    --mem-fraction-static 0.85 \
    --tool-call-parser glm47 \
    --reasoning-parser glm45 \
    --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \
    --nsa-prefill-backend tilelang \
    --nsa-decode-backend tilelang \
    --watchdog-timeout 1200 \
    --kv-cache-dtype fp8_e4m3 \
    --tokenizer-worker-num "$((TP * 2))" \
    "${CACHE_ARGS[@]}" \
    "${WARMUP_ARGS[@]}" \
    --enable-metrics > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
echo "Server PID: $SERVER_PID" @@ -77,4 +155,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh index 656e924dc..8c6f82410 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh @@ -32,7 +32,18 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + rocm-smi || true amd-smi || true From 3707963e7777ce910840f7684e2083af3eac571d Mon Sep 17 00:00:00 2001 From: seungrokj Date: Thu, 4 Jun 2026 10:10:05 +0900 Subject: [PATCH 10/14] [AMD] qwen3.5-fp8-mi355x-sglang-agentic: update conc-list and remove standalone script Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 6 +- .../agentic/qwen3.5_fp8_mi355x_sglang.sh | 152 ------------------ 2 files changed, 2 insertions(+), 156 deletions(-) delete mode 100755 benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 71353540c..cd3e9c358 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2654,10 +2654,8 @@ qwen3.5-fp8-mi355x-sglang-agentic-hicache: agentic-coding: - duration: 1800 search-space: - #- { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } - #- { tp: 4, ep: 1, offloading: hicache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } - - { tp: 4, ep: 1, offloading: none, conc-list: [56, 72, 88, 104, 120] } - - { tp: 4, ep: 1, offloading: hicache, conc-list: [56, 72, 88, 104, 120] } + - { tp: 4, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] } + - { tp: 4, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] } dsv4-fp4-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh deleted file mode 100755 index cdded8860..000000000 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang. 
-# -# Required env vars: -# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR -# -# OFFLOADING values: -# none - SGLang GPU KV only with radix cache disabled. -# hicache - SGLang HiCache with local CPU hierarchical cache. - -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE - -SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30} -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi -rocm-smi || true -amd-smi || true - -# ---- Resolve traces and install deps ---------------------------------------- -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -CACHE_ARGS=() -WARMUP_ARGS=() -CUDA_GRAPH_MAX_BS="$CONC" -case "$OFFLOADING" in - none) - # Leave SGLang's default RadixAttention prefix cache on — agentic - # replay needs it; --disable-radix-cache would zero the hit rate. - ;; - hicache) - # MI355X nodes have about 3 TB of host DRAM, but Qwen3.5's hybrid - # GDN/Mamba path allocates two HiCache host pools per TP rank: one for - # hierarchical KV cache and one for hierarchical Mamba cache. A 2 TB - # node-total target at TP=8 is therefore 2000 / (8 * 2) = 125 GB per - # host pool, not 250 GB. Keep overrides for one-off tuning. 
- TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-2000}" - HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" - HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-180}}" - HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" - # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler on - # MI355X, which requires page_size=1. The kernel/page_first HiCache - # transfer path faults on first prefill in this mode on ROCm, so keep - # the default on the safer direct/layer_first copy path. These remain - # env-overridable for future SGLang/ROCm fixes. - HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}" - HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" - HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" - # SGLang --hicache-size is per rank per host pool, while the workflow - # input is a node-total DRAM budget. Divide by TP and the number of - # host pools unless HICACHE_SIZE_GB is set directly for one-off tuning. - HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" - if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then - HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" - fi - if [ "$HICACHE_SIZE_GB" -lt 1 ]; then - echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 - exit 1 - fi - echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" - CACHE_ARGS=( - --page-size "$HICACHE_PAGE_SIZE" - --enable-hierarchical-cache - --hicache-size "$HICACHE_SIZE_GB" - --hicache-io-backend "$HICACHE_IO_BACKEND" - --hicache-mem-layout "$HICACHE_MEM_LAYOUT" - --hicache-write-policy "$HICACHE_WRITE_POLICY" - ) - # HiCache startup reaches API readiness, but SGLang's internal warmup - # request has timed out after 600s on this Qwen MI355X path. 
Let aiperf - # own benchmark traffic instead of blocking server readiness on it. - WARMUP_ARGS=(--skip-server-warmup) - # Keep request concurrency as the swept variable, but do not force - # HiCache runs to capture ROCm graphs at every high concurrency point. - # The conc=32 HiCache job crashed after startup readiness, before any - # aiperf traffic, while conc=16 is the highest known-good capture size - # for this model/server path. Requests above the capture size can still - # run; they just do not require a larger captured graph at startup. - HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}" - if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then - CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" - fi - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 - exit 1 - ;; -esac - -echo "Starting SGLang server..." -export PYTHONNOUSERSITE=1 - -{ set +x; } 2>/dev/null -SGLANG_CMD=( - python3 -m sglang.launch_server - --attention-backend triton - --model-path "$MODEL_PATH" --served-model-name "$MODEL" - --host=0.0.0.0 - --port "$PORT" - --tensor-parallel-size "$TP" - --ep-size "$EP_SIZE" - --trust-remote-code - --tokenizer-worker-num 6 - --enable-aiter-allreduce-fusion - --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" - --max-running-requests "$CONC" - --max-prefill-tokens 32768 - --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" - --mem-fraction-static 0.8 - --context-length "$MAX_MODEL_LEN" - --enable-metrics - "${CACHE_ARGS[@]}" - "${WARMUP_ARGS[@]}" -) -printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt" -printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt" -"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" From b3b34760074bb558d6f5eb003397c79622391a0c Mon Sep 17 00:00:00 2001 From: seungrokj Date: Thu, 4 Jun 2026 10:19:47 +0900 Subject: [PATCH 11/14] [AMD] qwen3.5-fp4-mi355x-sglang-agentic: add hicache variant config and script Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 16 ++ .../single_node/agentic/qwen3.5_fp4_mi355x.sh | 150 ++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100755 benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index cd3e9c358..8254c5bbd 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -433,6 +433,22 @@ qwen3.5-fp4-mi355x-sglang: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 16 } +# target +qwen3.5-fp4-mi355x-sglang-hicache: + image: lmsysorg/sglang:v0.5.12-rocm720-mi35x + model: amd/Qwen3.5-397B-A17B-MXFP4 + model-prefix: qwen3.5 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 2, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] } + - { tp: 2, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] } + qwen3.5-fp4-mi355x-atom: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: amd/Qwen3.5-397B-A17B-MXFP4 diff --git a/benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh new file mode 100755 index 000000000..fe85b05ab --- /dev/null +++ b/benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh @@ -0,0 +1,150 @@ +#!/usr/bin/env bash +set -euo pipefail 
+set -x + +# Agentic trace replay benchmark for Qwen3.5 FP8 on MI300X using SGLang. +# +# Base server recipe follows the upstream MI300X reference +# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe): +# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75. +# The agentic harness (resolve_trace_source / build_replay_cmd / +# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and +# --disable-radix-cache is dropped because agentic replay needs prefix reuse. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE +# +# OFFLOADING values: +# none - SGLang GPU KV with the default RadixAttention prefix cache. +# hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix. + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} + +SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + +rocm-smi || true +amd-smi || true + +# ---- Resolve traces and install deps ---------------------------------------- +# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the +# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf +# signal at high concurrency. 
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Cache / offload config ------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +CACHE_ARGS=() +WARMUP_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per + # TP rank (one hierarchical KV, one hierarchical Mamba), so the + # node-total DRAM budget divides by TP and the host-pool count. + TOTAL_CPU_DRAM_GB=3000 + HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" + HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" + # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which + # requires page_size=1. Keep the safer direct/layer_first copy path; + # kernel/page_first faults on first prefill in this mode on ROCm. 
+ HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" + HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" + if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then + HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" + fi + if [ "$HICACHE_SIZE_GB" -lt 1 ]; then + echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 + exit 1 + fi + echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" + CACHE_ARGS=( + --page-size "$HICACHE_PAGE_SIZE" + --enable-hierarchical-cache + --hicache-size "$HICACHE_SIZE_GB" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + ) + # HiCache startup reaches API readiness but SGLang's internal warmup + # request can time out on this path; let aiperf own benchmark traffic. + WARMUP_ARGS=(--skip-server-warmup) + # Don't force ROCm graph capture at every high concurrency point; conc=16 + # is the highest known-good capture size for this model/server path. + HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}" + if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then + CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" + fi + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +echo "Starting SGLang server..." 
+export PYTHONNOUSERSITE=1 +# Graph capture size uses CUDA_GRAPH_MAX_BS so the HiCache cap computed above is honored. It equals CONC when no cap applies. +python3 -m sglang.launch_server \ + --attention-backend aiter \ + --model-path $MODEL \ + --host=0.0.0.0 \ + --port $PORT \ + --tensor-parallel-size $TP \ + --ep-size $EP_SIZE \ + --trust-remote-code \ + --model-loader-extra-config '{"enable_multithread_load": true}' \ + --watchdog-timeout 1200 \ + --tokenizer-worker-num 6 \ + --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" \ + --max-running-requests $CONC \ + --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ + --mem-fraction-static 0.8 \ + "${CACHE_ARGS[@]}" \ + "${WARMUP_ARGS[@]}" \ + --enable-metrics > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file From 56d9dc5f709a282c5dcd2243da696486c008224a Mon Sep 17 00:00:00 2001 From: seungrokj Date: Thu, 4 Jun 2026 11:22:38 +0900 Subject: [PATCH 12/14] [AMD] amd-master: fix entry names and tp for qwen3.5-fp4 and minimaxm2.5 agentic configs Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8254c5bbd..ef4ff772d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -434,7 +434,7 @@ qwen3.5-fp4-mi355x-sglang: - { tp: 4, conc-start: 4, conc-end: 16 } # target -qwen3.5-fp4-mi355x-sglang-hicache: +qwen3.5-fp4-mi355x-sglang-agentic-hicache: image: lmsysorg/sglang:v0.5.12-rocm720-mi35x model: amd/Qwen3.5-397B-A17B-MXFP4 model-prefix: qwen3.5 @@ -2617,8 +2617,8 @@ minimaxm2.5-fp8-mi355x-vllm-agentic-lmcache: # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
- duration: 1800 search-space: - - { tp: 1, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } - - { tp: 1, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } + - { tp: 2, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } + - { tp: 2, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } minimaxm2.5-fp8-mi300x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 From c697c169177243f63578f803c3d84991b1a66059 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Thu, 4 Jun 2026 12:56:05 +0900 Subject: [PATCH 13/14] [AMD] minimaxm2.5-fp4-mi355x-agentic: increase lmcache DRAM size Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh index 3d645e7ad..640fe7f65 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh @@ -167,7 +167,10 @@ case "$OFFLOADING" in # ZMQ endpoint. Bind the server to a raw host, but pass the connector a # ZMQ-style host string. LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" - LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + #LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + # (srok)TODO: intentionally increased DRAM size + TOTAL_CPU_DRAM_GB=2000 + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB))}" LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" # LMCache read locks are leases on chunks that lookup has promised # vLLM can retrieve. 
The default 300s TTL is too short for this From 6dbef816d431c9768293dd979cad471076b9f85f Mon Sep 17 00:00:00 2001 From: seungrokj Date: Thu, 4 Jun 2026 12:59:26 +0900 Subject: [PATCH 14/14] [AMD] minimaxm2.5-fp8-mi355x-agentic: increase lmcache DRAM size Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh index d62eb38d1..9f1f79a3f 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -164,7 +164,10 @@ case "$OFFLOADING" in # ZMQ endpoint. Bind the server to a raw host, but pass the connector a # ZMQ-style host string. LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" - LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + #LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + # (srok)TODO: intentionally increased DRAM size + TOTAL_CPU_DRAM_GB=2000 + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB))}" LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" # LMCache read locks are leases on chunks that lookup has promised # vLLM can retrieve. The default 300s TTL is too short for this