dphnAI
diff --git a/‎aphrodite/metal/config.py‎
Lines changed: 30 additions & 0 deletions b/‎aphrodite/metal/config.py‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎aphrodite/metal/metal_kernel_backend/attention_sdpa.py‎
Lines changed: 26 additions & 11 deletions b/‎aphrodite/metal/metal_kernel_backend/attention_sdpa.py‎
Lines changed: 26 additions & 11 deletions
diff --git a/‎aphrodite/metal/paged_attention_common.py‎
Lines changed: 19 additions & 8 deletions b/‎aphrodite/metal/paged_attention_common.py‎
Lines changed: 19 additions & 8 deletions
diff --git a/‎aphrodite/metal/platform.py‎
Lines changed: 19 additions & 2 deletions b/‎aphrodite/metal/platform.py‎
Lines changed: 19 additions & 2 deletions
diff --git a/‎aphrodite/metal/v1/cache_policy.py‎
Lines changed: 6 additions & 2 deletions b/‎aphrodite/metal/v1/cache_policy.py‎
Lines changed: 6 additions & 2 deletions
@@ -160,3 +160,33 @@ def reset_config() -> None:
     """Reset the global config (useful for testing)."""
     global _config
     _config = None
+
+
+def should_use_contiguous_kv_fast_path(
+    config: MetalConfig,
+    *,
+    model_config: object | None,
+    scheduler_config: object,
+) -> bool:
+    """Return whether Metal should prefer MLX's contiguous KV cache.
+
+    Paged attention is still the default for higher concurrency and features
+    that need block-managed KV state.  For dense, low-concurrency text serving,
+    MLX's contiguous cache is currently much faster on decode and does not
+    require an environment variable from the user.
+    """
+    return (
+        "APHRODITE_METAL_USE_PAGED_ATTENTION" not in os.environ
+        and config.use_paged_attention
+        and config.is_auto_memory
+        and not config.turboquant
+        and model_config is not None
+        and not getattr(model_config, "is_hybrid", False)
+        and getattr(scheduler_config, "max_num_seqs") <= 2
+    )
+
+
+def enable_contiguous_kv_fast_path(config: MetalConfig) -> None:
+    """Switch a Metal config to the contiguous MLX KV cache path."""
+    config.use_paged_attention = False
+    config.kv_sharing_fast_prefill = False
@@ -104,7 +104,7 @@ def _pick_kernel_block_size(cache_block_size: int) -> int:
 
 
 def _build_block_tables(
-    raw_block_tables: list[list[int]],
+    ctx: PagedAttentionContext,
     cache_block_size: int,
 ) -> tuple[mx.array, int]:
     """Build kernel-compatible block tables, translating if necessary.
@@ -117,14 +117,23 @@ def _build_block_tables(
     Returns:
         (block_tables, kernel_block_size)
     """
+    cached = ctx.block_tables_cache.get(cache_block_size)
+    if cached is not None:
+        return cached
+
+    raw_block_tables = ctx.block_tables
     if not raw_block_tables:
-        return mx.zeros((0, 0), dtype=mx.int32), cache_block_size
+        result = (mx.zeros((0, 0), dtype=mx.int32), cache_block_size)
+        ctx.block_tables_cache[cache_block_size] = result
+        return result
 
     if cache_block_size in _KERNEL_BLOCK_SIZES:
         # Fast path — no translation needed.
         max_blocks = max(len(bt) for bt in raw_block_tables)
         padded = [bt + [0] * (max_blocks - len(bt)) for bt in raw_block_tables]
-        return mx.array(padded, dtype=mx.int32), cache_block_size
+        result = (mx.array(padded, dtype=mx.int32), cache_block_size)
+        ctx.block_tables_cache[cache_block_size] = result
+        return result
 
     # Hybrid path — translate large block_size to a kernel-compatible one.
     # Vectorized: each vLLM block b → [b*ratio, b*ratio+1, …, b*ratio+ratio-1].
@@ -139,7 +148,9 @@ def _build_block_tables(
     expanded = (bt_arr[:, :, None] * ratio + offsets[None, None, :]).reshape(
         bt_arr.shape[0], -1
     )
-    return expanded, kernel_bs
+    result = (expanded, kernel_bs)
+    ctx.block_tables_cache[cache_block_size] = result
+    return result
 
 
 # === Q/K/V preparation (YOCO, K-eq-V, v_norm variants) ===
@@ -424,20 +435,24 @@ def sdpa_forward(
     k_3d = mx.contiguous(keys[0].transpose(1, 0, 2).astype(kv_cache.dtype))
     v_3d = mx.contiguous(values[0].transpose(1, 0, 2).astype(kv_cache.dtype))
 
-    slot_mapping = mx.array(ctx.slot_mapping, dtype=mx.int64)
-    seq_lens = mx.array(ctx.context_lens, dtype=mx.int32)
-    cu_seqlens_q = mx.array(ctx.cu_seqlens, dtype=mx.int32)
-    max_seq_len = max(ctx.context_lens)
+    slot_mapping = ctx.slot_mapping_mx
+    if slot_mapping is None:
+        slot_mapping = mx.array(ctx.slot_mapping, dtype=mx.int64)
+    seq_lens = ctx.context_lens_mx
+    if seq_lens is None:
+        seq_lens = mx.array(ctx.context_lens, dtype=mx.int32)
+    cu_seqlens_q = ctx.cu_seqlens_mx
+    if cu_seqlens_q is None:
+        cu_seqlens_q = mx.array(ctx.cu_seqlens, dtype=mx.int32)
+    max_seq_len = ctx.max_context_len or max(ctx.context_lens)
 
     # --- Block tables (with hybrid block-size translation) ---
     # vLLM may inflate block_size (e.g. 544) to align attention pages with
     # mamba pages in hybrid models.  The Metal kernel only supports small
     # block sizes (8, 16, 32).  _build_block_tables handles the translation:
     # it expands each vLLM block into multiple kernel blocks and returns the
     # kernel-compatible block_size.  The cache is reshaped to match (zero-copy).
-    block_tables, kernel_block_size = _build_block_tables(
-        ctx.block_tables, kv_cache.block_size
-    )
+    block_tables, kernel_block_size = _build_block_tables(ctx, kv_cache.block_size)
 
     if shared_kv is not None:
         # YOCO shared layer: the reference layer already scattered the
 
@@ -17,6 +17,7 @@
 from dataclasses import dataclass, field
 from typing import Any
 
+import mlx.core as mx
 from mlx_lm.models.base import create_causal_mask
 
 # ---------------------------------------------------------------------------
@@ -52,6 +53,13 @@ class PagedAttentionContext:
     # GDN state pool slot mapping: request batch position → stable slot ID.
     # Populated by model_runner for hybrid models; None for non-hybrid.
     gdn_slot_mapping: list[int] | None = None
+    # MLX forms of per-step metadata.  These are shared by all layers in the
+    # same forward pass to avoid rebuilding identical arrays per layer.
+    slot_mapping_mx: mx.array | None = None
+    context_lens_mx: mx.array | None = None
+    cu_seqlens_mx: mx.array | None = None
+    max_context_len: int = 0
+    block_tables_cache: dict[int, tuple[mx.array, int]] = field(default_factory=dict)
 
 
 def set_context(ctx: PagedAttentionContext) -> None:
@@ -200,12 +208,15 @@ def prepare_unified(
         context_lens.append(start_pos + num_tokens)
         offsets.append(start_pos)
 
-    set_context(
-        PagedAttentionContext(
-            slot_mapping=slot_mapping,
-            block_tables=block_tables,
-            context_lens=context_lens,
-            cu_seqlens=cu_seqlens,
-            offsets=offsets,
-        )
+    ctx = PagedAttentionContext(
+        slot_mapping=slot_mapping,
+        block_tables=block_tables,
+        context_lens=context_lens,
+        cu_seqlens=cu_seqlens,
+        offsets=offsets,
+        slot_mapping_mx=mx.array(slot_mapping, dtype=mx.int64),
+        context_lens_mx=mx.array(context_lens, dtype=mx.int32),
+        cu_seqlens_mx=mx.array(cu_seqlens, dtype=mx.int32),
+        max_context_len=max(context_lens, default=0),
     )
+    set_context(ctx)
@@ -9,7 +9,11 @@
 import torch
 from aphrodite.platforms.interface import DeviceCapability, Platform, PlatformEnum
 
-from aphrodite.metal.config import get_config
+from aphrodite.metal.config import (
+    enable_contiguous_kv_fast_path,
+    get_config,
+    should_use_contiguous_kv_fast_path,
+)
 
 if TYPE_CHECKING:
     from aphrodite.config import AphroditeConfig
@@ -253,6 +257,20 @@ def check_and_update_config(cls, aphrodite_config: "AphroditeConfig") -> None:
                 f"k_quant={config.k_quant}, v_quant={config.v_quant}"
             )
 
+        scheduler_config = aphrodite_config.scheduler_config
+        if should_use_contiguous_kv_fast_path(
+            config,
+            model_config=model_config,
+            scheduler_config=scheduler_config,
+        ):
+            enable_contiguous_kv_fast_path(config)
+            logger.info(
+                "Metal: using contiguous MLX KV cache for low-concurrency "
+                "dense serving (max_num_seqs=%d). Set "
+                "APHRODITE_METAL_USE_PAGED_ATTENTION=1 to force paged attention.",
+                scheduler_config.max_num_seqs,
+            )
+
         if config.debug:
             logger.info(f"Metal config: {config}")
 
@@ -267,7 +285,6 @@ def check_and_update_config(cls, aphrodite_config: "AphroditeConfig") -> None:
         # Disable features not supported on Metal
         parallel_config.disable_custom_all_reduce = True
 
-        scheduler_config = aphrodite_config.scheduler_config
         if getattr(scheduler_config, "enable_chunked_prefill", False):
             if config.use_paged_attention:
                 # The paged path uses a unified varlen Metal kernel that
 
@@ -646,11 +646,15 @@ def determine_available_memory(self) -> int:
             )
             return available
 
-        available = self._worker._one_sequence_kv_bytes()
+        one_sequence_bytes = self._worker._one_sequence_kv_bytes()
+        max_num_seqs = self._worker.model_runner.scheduler_config.max_num_seqs
+        available = one_sequence_bytes * max_num_seqs
         logger.info(
             "MLX path: reporting %.2f GB for scheduler admission control "
-            "(one max-length sequence, max_model_len=%d)",
+            "(%d max-length sequence%s, max_model_len=%d)",
             available / 1e9,
+            max_num_seqs,
+            "" if max_num_seqs == 1 else "s",
             self._worker.model_config.max_model_len,
         )
         return available