Skip to content

Commit ba92ee5

Browse files
committed
fix(vllm): Fix PagedAttention memory aliasing and unrolled loop compilation for scan_layers=True
1 parent 21c4433 commit ba92ee5

12 files changed

Lines changed: 206 additions & 58 deletions

File tree

src/maxtext/integration/vllm/maxtext_vllm_adapter/adapter.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ class MaxTextForCausalLM(nnx.Module):
104104
of the decoding step.
105105
"""
106106

107+
# Signal to tpu-inference model_loader that this class manages its own
108+
# JIT-sharded initialization (via create_nnx_model with out_shardings).
109+
# When True, model_loader skips wrapping __init__ in an outer bare @jax.jit.
110+
_self_manages_sharding: bool = True
111+
107112
def __init__(self, vllm_config: VllmConfig, rng_key: jax.Array, mesh: Mesh):
108113
"""Initializes the MaxTextForCausalLM model.
109114
@@ -250,7 +255,7 @@ def load_weights(self, rng_key: jax.Array) -> None:
250255
if self.model is not None:
251256
return
252257

253-
with self.mesh, nn.logical_axis_rules(""):
258+
with self.mesh, nn.logical_axis_rules(self.maxtext_config.logical_axis_rules):
254259
model, _ = model_creation_utils.create_nnx_model(
255260
self.maxtext_config, mesh=self.mesh, model_mode=self.model_mode, rng_key=rng_key
256261
)

src/maxtext/layers/attentions.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -992,13 +992,14 @@ def forward_serve_vllm(
992992
"vLLM RPA attention ops require the vllm-tpu package. Please install it with `pip install vllm-tpu`."
993993
) from e
994994

995-
if rpa_kv_cache is None or rpa_metadata is None:
996-
raise ValueError("kv_cache and attention_metadata must be provided when using vLLM.")
997-
998995
query = query.reshape(-1, query.shape[2], query.shape[3])
999996
key = key.reshape(-1, key.shape[2], key.shape[3])
1000997
value = value.reshape(-1, value.shape[2], value.shape[3])
1001998

999+
if rpa_kv_cache is None or rpa_metadata is None:
1000+
# Return dummy values for dry runs (e.g. during model initialization or JIT tracing)
1001+
return [], query
1002+
10021003
if self.config.sliding_window_size > 0:
10031004
attention_chunk_size = self.config.sliding_window_size
10041005
else:

src/maxtext/layers/nnx_decoders.py

Lines changed: 84 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -428,8 +428,23 @@ def pure_layer_fn(state_in, y_in):
428428

429429
return out
430430

431-
def _apply_layers_sequentially(self, layers, x_in, *args, length: int, **kwargs):
432-
"""Runs the layer stack using nnx.scan."""
431+
def _apply_layers_sequentially(self, layers, x_in, *args, length: int, kv_caches_stacked=None, **kwargs):
432+
"""Runs the layer stack using nnx.scan.
433+
434+
Args:
435+
layers: The stacked NNX module whose params are scanned over.
436+
x_in: The carry (hidden state) fed into the first layer.
437+
*args: Positional args broadcast to every layer call.
438+
length: Number of scan iterations (= number of layers).
439+
kv_caches_stacked: Optional pytree whose leaves have shape [num_layers, ...].
440+
When provided, the i-th slice is passed as `kv_cache=` to layer i and the
441+
updated caches are returned as a third element of the tuple.
442+
**kwargs: Keyword args forwarded to the layer (filtered by the layer signature).
443+
444+
Returns:
445+
(final_carry, updated_layers) when kv_caches_stacked is None.
446+
(final_carry, updated_layers, returned_kv_stacked) otherwise.
447+
"""
433448
policy = self.get_remat_policy()
434449
prevent_cse = maxtext_utils.should_prevent_cse_in_remat(self.config)
435450
graphdef, params, state = nnx.split(
@@ -450,35 +465,83 @@ def _apply_layers_sequentially(self, layers, x_in, *args, length: int, **kwargs)
450465
# Filter kwargs to only include keys that exist in the layer's signature
451466
valid_kwargs = {k: v for k, v in kwargs.items() if k in sig.parameters or "kwargs" in sig.parameters}
452467

468+
use_kv = kv_caches_stacked is not None
469+
453470
def layer_fn(carry, scanned_vars):
454471
# Unpack the sliced variables for THIS layer
455-
current_params, current_state = scanned_vars
472+
if use_kv:
473+
current_params, current_state, kv_cache_layer = scanned_vars
474+
else:
475+
current_params, current_state = scanned_vars
476+
kv_cache_layer = None
456477

457478
if self.config.parameter_memory_host_offload:
458479
current_params = jax.tree.map(lambda x: jax.device_put(x, max_utils.device_space()), current_params)
459480

460481
# Merge using the SLICED state
461482
layer = nnx.merge(graphdef, current_params, current_state)
462483

463-
# Run the layer (Filter kwargs if using the solution from previous turn)
464-
layer_out = layer(carry, *args, **valid_kwargs)
484+
# Build call kwargs, injecting per-layer kv_cache when available
485+
call_kwargs = dict(valid_kwargs)
486+
if kv_cache_layer is not None:
487+
call_kwargs["kv_cache"] = kv_cache_layer
465488

466-
new_carry = layer_out[0] if isinstance(layer_out, tuple) else layer_out
489+
layer_out = layer(carry, *args, **call_kwargs)
490+
491+
if isinstance(layer_out, tuple):
492+
new_carry = layer_out[0]
493+
updated_kv = layer_out[1] if len(layer_out) > 1 else None
494+
else:
495+
new_carry = layer_out
496+
updated_kv = None
467497

468498
# Extract the updated state to return it
469-
# _, new_current_state = nnx.split(layer, nnx.Param, ...)
470499
new_current_state = nnx.state(layer)
500+
501+
if use_kv:
502+
return new_carry, (new_current_state, updated_kv)
471503
return new_carry, new_current_state
472504

473505
layer_fn = jax.checkpoint(layer_fn, policy=policy, prevent_cse=prevent_cse)
474506

475-
final_carry, scanned_state = jax.lax.scan(layer_fn, x_in, (params, state))
507+
if use_kv:
508+
# If kv_caches is provided (e.g., from vLLM), we CANNOT use jax.lax.scan
509+
# because scanning requires stacking the kv_caches list, which creates a copy
510+
# and breaks the in-place memory updates required by vLLM's PagedAttention.
511+
# Therefore, we must unroll the loop statically when kv_caches is provided.
512+
513+
# kv_caches_stacked is actually the original kv_caches list in this new flow
514+
kv_caches_list = kv_caches_stacked
515+
516+
current_carry = x_in
517+
518+
for i in range(length):
519+
# Statically slice the parameters and state for this layer
520+
current_params = jax.tree.map(lambda x: x[i], params)
521+
current_state = jax.tree.map(lambda x: x[i], state)
522+
523+
# Call the layer
524+
current_carry, (new_current_state, updated_kv) = layer_fn(
525+
current_carry, (current_params, current_state, kv_caches_list[i])
526+
)
527+
528+
# Update the list in-place (mutates the list passed by reference)
529+
kv_caches_list[i] = updated_kv
530+
531+
# We don't need to rebuild scanned_state or return it because during
532+
# inference with vLLM, parameters do not change and we don't need intermediates.
533+
return current_carry, layers, None
534+
else:
535+
final_carry, scanned_state = jax.lax.scan(layer_fn, x_in, (params, state))
536+
returned_kv_stacked = None
476537

477538
if scan_axis != 0:
478539
scanned_params, scanned_other = scanned_state.split(nnx.Param, ...)
479540
scanned_params = jax.tree.map(lambda x: jnp.moveaxis(x, 0, scan_axis), scanned_params)
480541
scanned_state = nnx.State.merge(scanned_params, scanned_other)
481542

543+
if use_kv:
544+
return final_carry, nnx.merge(graphdef, scanned_state), returned_kv_stacked
482545
return final_carry, nnx.merge(graphdef, scanned_state)
483546

484547
def get_decoder_layers(self):
@@ -1001,7 +1064,19 @@ def __call__(
10011064
)
10021065
else:
10031066
scan_length = int(cfg.num_decoder_layers / cfg.inhomogeneous_layer_cycle_interval)
1004-
y, self.layers = self._apply_layers_sequentially(self.layers, y, *layer_args, length=scan_length, **layer_kwargs)
1067+
if kv_caches is not None:
1068+
# Pass the kv_caches list directly to avoid copying in jnp.stack,
1069+
# which breaks vLLM PagedAttention in-place memory updates.
1070+
# The _apply_layers_sequentially function will handle it by statically unrolling.
1071+
y, self.layers, returned_kv = self._apply_layers_sequentially(
1072+
self.layers, y, *layer_args, length=scan_length,
1073+
kv_caches_stacked=kv_caches, **layer_kwargs
1074+
)
1075+
# kv_caches list is updated in-place inside _apply_layers_sequentially
1076+
else:
1077+
y, self.layers = self._apply_layers_sequentially(
1078+
self.layers, y, *layer_args, length=scan_length, **layer_kwargs
1079+
)
10051080
else:
10061081
prevent_cse = maxtext_utils.should_prevent_cse_in_remat(cfg)
10071082

src/maxtext/models/gemma.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from maxtext.layers.linears import Dropout, MlpBlock
3131
from maxtext.layers.normalizations import RMSNorm
3232
from maxtext.layers.quantizations import AqtQuantization as Quant
33+
from maxtext.inference import page_manager
3334
from maxtext.utils import max_utils
3435

3536

@@ -126,8 +127,7 @@ def __call__(
126127
deterministic,
127128
model_mode,
128129
previous_chunk=None,
129-
page_manager=None,
130-
page_state=None,
130+
page_state: None | page_manager.PageState = None,
131131
slot=None,
132132
kv_cache=None,
133133
attention_metadata=None,

src/maxtext/models/gpt_oss.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from maxtext.layers.attentions import Attention
3535
from maxtext.layers.normalizations import RMSNorm
3636
from maxtext.layers.quantizations import AqtQuantization as Quant
37+
from maxtext.inference import page_manager
3738
from maxtext.utils import max_utils
3839

3940
# -----------------------------------------
@@ -138,7 +139,7 @@ def __call__(
138139
deterministic,
139140
model_mode,
140141
previous_chunk=None,
141-
page_state=None,
142+
page_state: None | page_manager.PageState = None,
142143
slot=None,
143144
kv_cache=None,
144145
attention_metadata=None,
@@ -258,6 +259,11 @@ def __call__(
258259
decoder_positions,
259260
deterministic,
260261
model_mode,
262+
previous_chunk=None,
263+
page_state: None | page_manager.PageState = None,
264+
slot=None,
265+
kv_cache=None,
266+
attention_metadata=None,
261267
):
262268
cfg = self.config
263269

@@ -267,19 +273,19 @@ def __call__(
267273
for layer_id in range(cfg.inhomogeneous_layer_cycle_interval):
268274
layer_name = f"layers_{layer_id}"
269275
layer = getattr(self, layer_name)
270-
y = layer(
276+
y, kv_cache = layer(
271277
y,
272278
decoder_segment_ids,
273279
decoder_positions,
274280
deterministic,
275281
model_mode,
282+
previous_chunk=previous_chunk,
283+
page_state=page_state,
284+
slot=slot,
285+
kv_cache=kv_cache,
286+
attention_metadata=attention_metadata,
276287
)
277-
if cfg.scan_layers:
278-
y = y[0]
279-
if cfg.scan_layers:
280-
return y, None
281-
else:
282-
return y
288+
return y, kv_cache
283289

284290

285291
GptOssScannableBlockToLinen = nnx_wrappers.to_linen_class(

src/maxtext/models/llama2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,9 +143,9 @@ def __call__(
143143
decoder_positions,
144144
deterministic,
145145
model_mode,
146+
previous_chunk=None,
146147
slot: None | int = None,
147148
page_state: None | page_manager.PageState = None,
148-
previous_chunk=None,
149149
kv_cache=None,
150150
attention_metadata=None,
151151
):

src/maxtext/models/llama4.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -442,9 +442,9 @@ def __call__(
442442
decoder_positions,
443443
deterministic,
444444
model_mode,
445+
previous_chunk=None,
445446
slot: None | int = None,
446447
page_state: None | page_manager.PageState = None,
447-
previous_chunk=None,
448448
kv_cache=None,
449449
attention_metadata=None,
450450
):
@@ -570,9 +570,11 @@ def __call__(
570570
decoder_positions,
571571
deterministic,
572572
model_mode,
573+
previous_chunk=None,
573574
slot: None | int = None,
574575
page_state: None | page_manager.PageState = None,
575-
previous_chunk=None,
576+
kv_cache=None,
577+
attention_metadata=None,
576578
):
577579

578580
cfg = self.config
@@ -590,6 +592,8 @@ def __call__(
590592
previous_chunk=previous_chunk,
591593
page_state=page_state,
592594
slot=slot,
595+
kv_cache=kv_cache,
596+
attention_metadata=attention_metadata,
593597
)
594598
if cfg.scan_layers:
595599
y = y[0]

src/maxtext/models/mistral.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import jax.numpy as jnp
2323
from jax.sharding import Mesh
2424
from maxtext.common.common_types import Config
25+
from maxtext.inference import page_manager
2526
from maxtext.layers import initializers, nnx_wrappers
2627
from maxtext.layers import quantizations
2728
from maxtext.layers.attentions import Attention
@@ -126,9 +127,9 @@ def __call__(
126127
decoder_positions,
127128
deterministic,
128129
model_mode,
129-
page_state: None | int = None,
130-
slot: None | int = None,
131130
previous_chunk=None,
131+
slot: None | int = None,
132+
page_state: None | page_manager.PageState = None,
132133
kv_cache=None,
133134
attention_metadata=None,
134135
):

src/maxtext/models/olmo3.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,11 @@ def __call__(
267267
decoder_positions,
268268
deterministic,
269269
model_mode,
270+
previous_chunk=None,
271+
page_state=None,
272+
slot=None,
273+
kv_cache=None,
274+
attention_metadata=None,
270275
):
271276
cfg = self.config
272277

@@ -282,6 +287,11 @@ def __call__(
282287
decoder_positions,
283288
deterministic,
284289
model_mode,
290+
previous_chunk=previous_chunk,
291+
page_state=page_state,
292+
slot=slot,
293+
kv_cache=kv_cache,
294+
attention_metadata=attention_metadata,
285295
)
286296
if cfg.scan_layers:
287297
y = y[0]

src/maxtext/models/qwen3.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,8 @@ def __call__(
896896
previous_chunk=None,
897897
page_state: None | page_manager.PageState = None,
898898
slot: None | int = None,
899+
kv_cache=None,
900+
attention_metadata=None,
899901
) -> tuple[Array, None]:
900902
"""Applies the block of decoder layers to the input carry.
901903
@@ -924,6 +926,8 @@ def __call__(
924926
previous_chunk,
925927
page_state,
926928
slot,
929+
kv_cache=kv_cache,
930+
attention_metadata=attention_metadata,
927931
)
928932

929933
# The output of the block is the carry for the next scan iteration.
@@ -1235,10 +1239,7 @@ def __call__(
12351239
layer_output = intermediate_inputs + mlp_lnx
12361240
layer_output = nn.with_logical_constraint(layer_output, self.activation_axis_names)
12371241

1238-
if self.config.scan_layers:
1239-
return layer_output, None
1240-
else:
1241-
return layer_output, kv_cache
1242+
return layer_output, kv_cache
12421243

12431244

12441245
# -----------------------------------------
@@ -1304,10 +1305,7 @@ def __call__(
13041305
layer_output = intermediate_inputs + mlp_lnx
13051306
layer_output = nn.with_logical_constraint(layer_output, self.activation_axis_names)
13061307

1307-
if self.config.scan_layers:
1308-
return layer_output, None
1309-
else:
1310-
return layer_output, kv_cache
1308+
return layer_output, kv_cache
13111309

13121310

13131311
class Qwen3OmniMoeVisionPatchMerger(nnx.Module):

0 commit comments

Comments
 (0)