Skip to content

Commit bfbbc5e

Browse files
committed
pass kv_cache through scanned decoder layers
1 parent 47c6d0c commit bfbbc5e

4 files changed

Lines changed: 40 additions & 15 deletions

File tree

src/maxtext/integration/vllm/maxtext_vllm_adapter/adapter.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,10 @@ class MaxTextForCausalLM(nnx.Module):
8585
tasks. It handles configuration generation, model initialization, and execution
8686
of the decoding step.
8787
"""
88+
# Signal to tpu-inference model_loader that this class manages its own
89+
# JIT-sharded initialization (via create_nnx_model with out_shardings).
90+
# When True, model_loader skips wrapping __init__ in an outer bare @jax.jit.
91+
_self_manages_sharding: bool = True
8892

8993
def __init__(self, vllm_config: VllmConfig, rng_key: jax.Array, mesh: Mesh):
9094
"""Initializes the MaxTextForCausalLM model.
@@ -232,7 +236,7 @@ def load_weights(self, rng_key: jax.Array) -> None:
232236
if self.model is not None:
233237
return
234238

235-
with self.mesh, nn.logical_axis_rules(""):
239+
with self.mesh, nn.logical_axis_rules(self.maxtext_config.logical_axis_rules):
236240
model, _ = model_creation_utils.create_nnx_model(
237241
self.maxtext_config, mesh=self.mesh, model_mode=self.model_mode, rng_key=rng_key
238242
)

src/maxtext/layers/attentions.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -956,13 +956,14 @@ def forward_serve_vllm(
956956
"vLLM RPA attention ops require the vllm-tpu package. Please install it with `pip install vllm-tpu`."
957957
) from e
958958

959-
if rpa_kv_cache is None or rpa_metadata is None:
960-
raise ValueError("kv_cache and attention_metadata must be provided when using vLLM.")
961-
962959
query = query.reshape(-1, query.shape[2], query.shape[3])
963960
key = key.reshape(-1, key.shape[2], key.shape[3])
964961
value = value.reshape(-1, value.shape[2], value.shape[3])
965962

963+
if rpa_kv_cache is None or rpa_metadata is None:
964+
# Return dummy values for dry runs (e.g. during model initialization or JIT tracing)
965+
return [], query
966+
966967
if self.config.sliding_window_size > 0:
967968
attention_chunk_size = self.config.sliding_window_size
968969
else:

src/maxtext/layers/decoders.py

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -792,7 +792,11 @@ def __call__(
792792
decoder_positions,
793793
deterministic,
794794
model_mode,
795+
previous_chunk,
796+
page_state,
797+
slot,
795798
)
799+
in_axes_tuple = (nn.broadcast,) * len(broadcast_args)
796800
if cfg.using_pipeline_parallelism:
797801
if cfg.pipeline_fsdp_ag_once:
798802
logical_partition_spec = self.pipeline_module.get_weight_sharding(
@@ -954,16 +958,38 @@ def __call__(
954958
"nope_layer_interval": self.config.nope_layer_interval,
955959
"interleave_moe_layer_step": self.config.interleave_moe_layer_step,
956960
}
957-
y, _ = self.scan_decoder_layers(
961+
962+
# Update broadcast_args and in_axes_tuple for vLLM RPA
963+
current_broadcast_args = list(broadcast_args)
964+
current_in_axes_tuple = list(in_axes_tuple)
965+
966+
if kv_caches is not None:
967+
# Stack kv_caches for scan: [num_layers, ...]
968+
stacked_kv_cache = jnp.stack(kv_caches, axis=0)
969+
current_broadcast_args.append(stacked_kv_cache)
970+
current_in_axes_tuple.append(0) # Scan over the layer dimension
971+
else:
972+
current_broadcast_args.append(None)
973+
current_in_axes_tuple.append(nn.broadcast)
974+
975+
current_broadcast_args.append(attention_metadata)
976+
current_in_axes_tuple.append(nn.broadcast)
977+
978+
y, returned_kv_cache = self.scan_decoder_layers(
958979
cfg,
959980
RemattedBlockLayer,
960981
scan_length,
961982
"layers",
962983
mesh,
963-
in_axes_tuple=(nn.broadcast,) * len(broadcast_args),
984+
in_axes_tuple=tuple(current_in_axes_tuple),
964985
model_mode=model_mode,
965986
**layer_kwargs,
966-
)(y, *broadcast_args)
987+
)(y, *current_broadcast_args)
988+
989+
if kv_caches is not None and returned_kv_cache is not None:
990+
# Update the list of KV caches from the scanned results
991+
for i in range(len(kv_caches)):
992+
kv_caches[i] = returned_kv_cache[i]
967993
else:
968994
if cfg.decoder_block == DecoderBlockType.DEEPSEEK:
969995
assert len(RemattedBlockLayers) == 2, "Unscanned layers must have a length of 2 using deepseek."

src/maxtext/models/qwen3.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1235,10 +1235,7 @@ def __call__(
12351235
layer_output = intermediate_inputs + mlp_lnx
12361236
layer_output = nn.with_logical_constraint(layer_output, self.activation_axis_names)
12371237

1238-
if self.config.scan_layers:
1239-
return layer_output, None
1240-
else:
1241-
return layer_output, kv_cache
1238+
return layer_output, kv_cache
12421239

12431240

12441241
# -----------------------------------------
@@ -1304,10 +1301,7 @@ def __call__(
13041301
layer_output = intermediate_inputs + mlp_lnx
13051302
layer_output = nn.with_logical_constraint(layer_output, self.activation_axis_names)
13061303

1307-
if self.config.scan_layers:
1308-
return layer_output, None
1309-
else:
1310-
return layer_output, kv_cache
1304+
return layer_output, kv_cache
13111305

13121306

13131307
class Qwen3OmniMoeVisionPatchMerger(nnx.Module):

0 commit comments

Comments (0)