
Commit 4eca564 ("working for llama")
Parent: 6176367

9 files changed: 126 additions & 17 deletions

src/maxtext/inference/vllm_decode.py

Lines changed: 2 additions & 0 deletions
@@ -48,6 +48,8 @@
 from vllm import LLM
 from vllm.sampling_params import SamplingParams
 from maxtext.configs import pyconfig
+import maxtext.integration.vllm.maxtext_vllm_adapter as adapter
+adapter.register()

 os.environ["SKIP_JAX_PRECOMPILE"] = "1"
 os.environ["NEW_MODEL_DESIGN"] = "1"
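
These two lines hook MaxText into vLLM before the `LLM` engine is constructed. The adapter module itself is not part of this commit, so the following is only a hypothetical sketch of what its `register()` could do, built on vLLM's public out-of-tree model hook `ModelRegistry.register_model`; the architecture string and the `MaxTextForCausalLM` class are placeholders, not MaxText's actual API.

# Hypothetical sketch of maxtext_vllm_adapter.register(); names are assumptions.
from vllm import ModelRegistry

class MaxTextForCausalLM:  # placeholder: the real adapter class is assumed
  ...

def register():
  # Map the HF `architectures` entry to the MaxText-backed implementation so
  # that vLLM resolves it at model-load time.
  ModelRegistry.register_model("MaxTextForCausalLM", MaxTextForCausalLM)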

src/maxtext/layers/decoders.py

Lines changed: 7 additions & 3 deletions
@@ -986,9 +986,6 @@ def __call__(
 current_broadcast_args = list(broadcast_args)
 current_in_axes_tuple = list(in_axes_tuple)

-current_broadcast_args.append(attention_metadata)
-current_in_axes_tuple.append(nn.broadcast)
-
 if kv_caches is not None:
   # Stack kv_caches for scan: [num_layers, ...]
   stacked_kv_cache = jnp.stack(kv_caches, axis=0)

@@ -998,6 +995,13 @@ def __call__(

 # We don't pass kv_cache as a scanned argument anymore

+# Pass None for previous_chunk, slot, page_state, kv_cache to align with __call__ signature
+current_broadcast_args.extend([None, None, None, None, attention_metadata])
+current_in_axes_tuple.extend([nn.broadcast] * 5)
+
+max_logging.info(f"DEBUG: len(current_broadcast_args)={len(current_broadcast_args)}")
+max_logging.info(f"DEBUG: current_broadcast_args={[type(a) for a in current_broadcast_args]}")
+
 final_carry, _ = self.scan_decoder_layers(
     cfg,
     RemattedBlockLayer,
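
The change replaces the single `attention_metadata` broadcast argument with five entries, so the broadcast list matches the layer `__call__` signature positionally (the two `DEBUG` log lines are temporary instrumentation). In `flax.linen.scan`, each `in_axes` entry maps to one positional argument after the carry, and `nn.broadcast` hands the same value to every scanned layer, so the two lists must stay in lockstep. A minimal sketch of that contract, with a simplified stand-in layer rather than MaxText's real decoder layer:

# Toy illustration of the broadcast-argument contract in flax.linen.scan.
import jax
import jax.numpy as jnp
import flax.linen as nn

class ToyLayer(nn.Module):
  @nn.compact
  def __call__(self, carry, previous_chunk, slot, page_state, kv_cache, attention_metadata):
    # Broadcast args arrive unchanged at every step; unused slots are None.
    new_carry = carry + nn.Dense(carry.shape[-1])(carry)
    return new_carry, None

class ToyStack(nn.Module):
  num_layers: int = 4

  @nn.compact
  def __call__(self, x, attention_metadata):
    scanned = nn.scan(
        ToyLayer,
        variable_axes={"params": 0},   # stack each layer's params on axis 0
        split_rngs={"params": True},
        in_axes=(nn.broadcast,) * 5,   # one entry per non-carry argument
        length=self.num_layers,
    )
    # Positions must line up with ToyLayer.__call__; None fills unused slots.
    y, _ = scanned()(x, None, None, None, None, attention_metadata)
    return y

out, variables = ToyStack().init_with_output(jax.random.PRNGKey(0), jnp.ones((2, 8)), None)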

src/maxtext/models/gemma3.py

Lines changed: 16 additions & 2 deletions
@@ -194,7 +194,13 @@ def __call__(
 ):
   cfg = self.config
   # Unpack inputs if it's a tuple (e.g. from a previous layer returning (hidden_states, kv_cache))
-  if isinstance(inputs, tuple):
+  is_scan_carry = False
+  if isinstance(inputs, tuple) and len(inputs) == 3:
+    hidden_states, stacked_kv_cache, layer_idx = inputs
+    kv_cache = stacked_kv_cache[layer_idx]
+    inputs = hidden_states
+    is_scan_carry = True
+  elif isinstance(inputs, tuple):
     inputs = inputs[0]
   inputs = nn.with_logical_constraint(inputs, self.activation_axis_names)
   inputs = checkpoint_name(inputs, "decoder_layer_input")

@@ -244,7 +250,15 @@ def __call__(
       jnp.sum(layer_output == 0) / jnp.size(layer_output),
   )

-  if cfg.scan_layers:
+  if is_scan_carry:
+    def update_cache(cache, val):
+      if jnp.size(val) > 0:
+        return cache.at[layer_idx].set(val)
+      return cache
+
+    stacked_kv_cache = jax.tree_util.tree_map(update_cache, stacked_kv_cache, kv_cache)
+    return (layer_output, stacked_kv_cache, layer_idx + 1), None
+  elif cfg.scan_layers:
     return layer_output, None
   else:
     return layer_output, kv_cache
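
The same pattern recurs in every model file below: when the layer is driven as a scan body, the carry is the triple (hidden_states, stacked_kv_cache, layer_idx); each step reads its slice of the stacked cache, runs the layer, writes the updated slice back functionally, and increments the index. A self-contained sketch of that carry threading with `jax.lax.scan`; the shapes and the "attention" body are illustrative assumptions, not the real Gemma3 layer (the commit applies the update across a cache pytree via `jax.tree_util.tree_map`, shown separately after the llama2 diff):

# Minimal sketch of the scan-carry KV-cache pattern added in this commit.
import jax
import jax.numpy as jnp

num_layers, batch, seq, embed = 4, 2, 8, 16

def layer_step(carry, _):
  hidden, stacked_kv, layer_idx = carry
  kv = stacked_kv[layer_idx]             # this layer's cache slice
  hidden = hidden + jnp.tanh(hidden)     # stand-in for attention + MLP
  new_kv = kv + hidden.mean()            # stand-in for the refreshed cache
  # Functional write-back of the slice, as in update_cache above.
  stacked_kv = stacked_kv.at[layer_idx].set(new_kv)
  return (hidden, stacked_kv, layer_idx + 1), None

init_carry = (
    jnp.ones((batch, seq, embed)),
    jnp.zeros((num_layers, batch, seq, embed)),  # stacked KV cache
    jnp.array(0, dtype=jnp.int32),               # layer_idx
)
(final_hidden, final_kv, _), _ = jax.lax.scan(layer_step, init_carry, None, length=num_layers)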

src/maxtext/models/gemma4.py

Lines changed: 16 additions & 2 deletions
@@ -322,7 +322,13 @@ def __call__(
 ):
   cfg = self.config
   # Unpack inputs if it's a tuple (e.g. from a previous layer returning (hidden_states, kv_cache))
-  if isinstance(inputs, tuple):
+  is_scan_carry = False
+  if isinstance(inputs, tuple) and len(inputs) == 3:
+    hidden_states, stacked_kv_cache, layer_idx = inputs
+    kv_cache = stacked_kv_cache[layer_idx]
+    inputs = hidden_states
+    is_scan_carry = True
+  elif isinstance(inputs, tuple):
     inputs = inputs[0]
   inputs = nn.with_logical_constraint(inputs, self.activation_axis_names)
   inputs = checkpoint_name(inputs, "decoder_layer_input")

@@ -383,7 +389,15 @@ def __call__(
       jnp.sum(layer_output == 0) / jnp.size(layer_output),
   )

-  if cfg.scan_layers:
+  if is_scan_carry:
+    def update_cache(cache, val):
+      if jnp.size(val) > 0:
+        return cache.at[layer_idx].set(val)
+      return cache
+
+    stacked_kv_cache = jax.tree_util.tree_map(update_cache, stacked_kv_cache, kv_cache)
+    return (layer_output, stacked_kv_cache, layer_idx + 1), None
+  elif cfg.scan_layers:
     return layer_output, None
   else:
     return layer_output, kv_cache

src/maxtext/models/gpt_oss.py

Lines changed: 17 additions & 2 deletions
@@ -23,6 +23,7 @@
 from flax import linen as nn
 from flax import nnx
 from jax.ad_checkpoint import checkpoint_name
+import jax
 import jax.numpy as jnp
 from jax.sharding import Mesh
 from maxtext.common.common_types import AttentionType, Config

@@ -146,7 +147,13 @@ def __call__(
 ):
   cfg = self.config
   # Unpack inputs if it's a tuple (e.g. from a previous layer returning (hidden_states, kv_cache))
-  if isinstance(inputs, tuple):
+  is_scan_carry = False
+  if isinstance(inputs, tuple) and len(inputs) == 3:
+    hidden_states, stacked_kv_cache, layer_idx = inputs
+    kv_cache = stacked_kv_cache[layer_idx]
+    inputs = hidden_states
+    is_scan_carry = True
+  elif isinstance(inputs, tuple):
     inputs = inputs[0]

   inputs = nn.with_logical_constraint(inputs, ("activation_batch", "activation_norm_length", "activation_embed"))

@@ -201,7 +208,15 @@ def __call__(
       jnp.sum(layer_output == 0) / jnp.size(layer_output),
   )

-  if cfg.scan_layers:
+  if is_scan_carry:
+    def update_cache(cache, val):
+      if jnp.size(val) > 0:
+        return cache.at[layer_idx].set(val)
+      return cache
+
+    stacked_kv_cache = jax.tree_util.tree_map(update_cache, stacked_kv_cache, kv_cache)
+    return (layer_output, stacked_kv_cache, layer_idx + 1), None
+  elif cfg.scan_layers:
     return layer_output, None
   else:
     return layer_output, kv_cache

src/maxtext/models/llama2.py

Lines changed: 17 additions & 2 deletions
@@ -19,6 +19,7 @@
 import functools
 from flax import nnx
 from jax.ad_checkpoint import checkpoint_name
+import jax
 import jax.numpy as jnp
 from jax.sharding import Mesh
 from maxtext.common.common_types import Config

@@ -152,7 +153,13 @@ def __call__(
   cfg = self.config

   # Unpack inputs if it's a tuple (e.g. from a previous layer returning (hidden_states, kv_cache))
-  if isinstance(inputs, tuple):
+  is_scan_carry = False
+  if isinstance(inputs, tuple) and len(inputs) == 3:
+    hidden_states, stacked_kv_cache, layer_idx = inputs
+    kv_cache = stacked_kv_cache[layer_idx]
+    inputs = hidden_states
+    is_scan_carry = True
+  elif isinstance(inputs, tuple):
     inputs = inputs[0]
   inputs = self._maybe_shard_with_logical(inputs, self.activation_axis_names)
   inputs = checkpoint_name(inputs, "decoder_layer_input")

@@ -206,7 +213,15 @@ def __call__(
       jnp.sum(layer_output == 0) / jnp.size(layer_output),
   )

-  if cfg.scan_layers:
+  if is_scan_carry:
+    def update_cache(cache, val):
+      if jnp.size(val) > 0:
+        return cache.at[layer_idx].set(val)
+      return cache
+
+    stacked_kv_cache = jax.tree_util.tree_map(update_cache, stacked_kv_cache, kv_cache)
+    return (layer_output, stacked_kv_cache, layer_idx + 1), None
+  elif cfg.scan_layers:
     return layer_output, None
   else:
     return layer_output, kv_cache
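
The `update_cache` helper repeated in these files guards against empty leaves: `jnp.size(val) > 0` is a static, shape-based check, so zero-element leaves pass through untouched while every other leaf gets a functional `.at[layer_idx].set(...)` update. A small sketch with an assumed pytree layout; the real cache structure is model-specific:

# Sketch of the empty-leaf guard over a pytree cache; layout is an assumption.
import jax
import jax.numpy as jnp

layer_idx = 1
stacked_cache = {                      # leaves lead with a num_layers axis
    "key": jnp.zeros((4, 2, 8, 16)),
    "value": jnp.zeros((4, 2, 8, 16)),
    "meta": jnp.zeros((4, 0)),         # empty leaf: left untouched
}
layer_cache = {
    "key": jnp.ones((2, 8, 16)),
    "value": jnp.ones((2, 8, 16)),
    "meta": jnp.zeros((0,)),
}

def update_cache(cache, val):
  if jnp.size(val) > 0:
    return cache.at[layer_idx].set(val)
  return cache

stacked_cache = jax.tree_util.tree_map(update_cache, stacked_cache, layer_cache)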

src/maxtext/models/llama4.py

Lines changed: 17 additions & 2 deletions
@@ -19,6 +19,7 @@

 from flax import linen as nn
 from flax import nnx
+import jax
 from jax import lax
 from jax.ad_checkpoint import checkpoint_name
 import jax.numpy as jnp

@@ -452,7 +453,13 @@ def __call__(
   assert cfg.num_experts >= 1, "Expected the Llama4 config to have `num_experts > 1`."

   # Unpack inputs if it's a tuple (e.g. from a previous layer returning (hidden_states, kv_cache))
-  if isinstance(inputs, tuple):
+  is_scan_carry = False
+  if isinstance(inputs, tuple) and len(inputs) == 3:
+    hidden_states, stacked_kv_cache, layer_idx = inputs
+    kv_cache = stacked_kv_cache[layer_idx]
+    inputs = hidden_states
+    is_scan_carry = True
+  elif isinstance(inputs, tuple):
     inputs = inputs[0]
   inputs = nn.with_logical_constraint(inputs, self.activation_axis_names)
   inputs = checkpoint_name(inputs, "decoder_layer_input")

@@ -504,7 +511,15 @@ def __call__(
       jnp.sum(layer_output == 0) / jnp.size(layer_output),
   )

-  if cfg.scan_layers:
+  if is_scan_carry:
+    def update_cache(cache, val):
+      if jnp.size(val) > 0:
+        return cache.at[layer_idx].set(val)
+      return cache
+
+    stacked_kv_cache = jax.tree_util.tree_map(update_cache, stacked_kv_cache, kv_cache)
+    return (layer_output, stacked_kv_cache, layer_idx + 1), None
+  elif cfg.scan_layers:
     return layer_output, None
   else:
     return layer_output, kv_cache

src/maxtext/models/mistral.py

Lines changed: 17 additions & 2 deletions
@@ -19,6 +19,7 @@
 from flax import linen as nn
 from flax import nnx
 from jax.ad_checkpoint import checkpoint_name
+import jax
 import jax.numpy as jnp
 from jax.sharding import Mesh
 from maxtext.common.common_types import Config

@@ -136,7 +137,13 @@ def __call__(
   cfg = self.config

   # Unpack inputs if it's a tuple (e.g. from a previous layer returning (hidden_states, kv_cache))
-  if isinstance(inputs, tuple):
+  is_scan_carry = False
+  if isinstance(inputs, tuple) and len(inputs) == 3:
+    hidden_states, stacked_kv_cache, layer_idx = inputs
+    kv_cache = stacked_kv_cache[layer_idx]
+    inputs = hidden_states
+    is_scan_carry = True
+  elif isinstance(inputs, tuple):
     inputs = inputs[0]
   inputs = nn.with_logical_constraint(inputs, self.activation_axis_names)
   inputs = checkpoint_name(inputs, "decoder_layer_input")

@@ -181,7 +188,15 @@ def __call__(
       jnp.sum(layer_output == 0) / jnp.size(layer_output),
   )

-  if cfg.scan_layers:
+  if is_scan_carry:
+    def update_cache(cache, val):
+      if jnp.size(val) > 0:
+        return cache.at[layer_idx].set(val)
+      return cache
+
+    stacked_kv_cache = jax.tree_util.tree_map(update_cache, stacked_kv_cache, kv_cache)
+    return (layer_output, stacked_kv_cache, layer_idx + 1), None
+  elif cfg.scan_layers:
     return layer_output, None
   else:
     return layer_output, kv_cache

src/maxtext/models/olmo3.py

Lines changed: 17 additions & 2 deletions
@@ -24,6 +24,7 @@
 from flax import linen as nn
 from flax import nnx
 from jax.ad_checkpoint import checkpoint_name
+import jax
 import jax.numpy as jnp
 from jax.sharding import Mesh
 from maxtext.common.common_types import AttentionType, Config

@@ -155,7 +156,13 @@ def __call__(
 ):
   cfg = self.config
   # Unpack inputs if it's a tuple (e.g. from a previous layer returning (hidden_states, kv_cache))
-  if isinstance(inputs, tuple):
+  is_scan_carry = False
+  if isinstance(inputs, tuple) and len(inputs) == 3:
+    hidden_states, stacked_kv_cache, layer_idx = inputs
+    kv_cache = stacked_kv_cache[layer_idx]
+    inputs = hidden_states
+    is_scan_carry = True
+  elif isinstance(inputs, tuple):
     inputs = inputs[0]

   inputs = nn.with_logical_constraint(inputs, ("activation_batch", "activation_norm_length", "activation_embed"))

@@ -209,7 +216,15 @@ def __call__(
       jnp.sum(layer_output == 0) / jnp.size(layer_output),
   )

-  if cfg.scan_layers:
+  if is_scan_carry:
+    def update_cache(cache, val):
+      if jnp.size(val) > 0:
+        return cache.at[layer_idx].set(val)
+      return cache
+
+    stacked_kv_cache = jax.tree_util.tree_map(update_cache, stacked_kv_cache, kv_cache)
+    return (layer_output, stacked_kv_cache, layer_idx + 1), None
+  elif cfg.scan_layers:
     return layer_output, None
   else:
     return layer_output, kv_cache
