Skip to content

Commit 5c4341b

Browse files
committed
trial
1 parent 41267f2 commit 5c4341b

5 files changed

Lines changed: 70 additions & 13 deletions

File tree

src/maxtext/configs/base.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1173,6 +1173,7 @@ use_tokamax_splash: false
11731173
use_jax_splash: false
11741174

11751175
# vLLM Adapter Configurations
1176+
hbm_utilization_vllm: 0.5
11761177
# Path to the HuggingFace-style config directory for the adapter (e.g. src/maxtext/integration/vllm/maxtext_vllm_adapter)
11771178
vllm_hf_config_path: ""
11781179
# A JSON string of overrides to apply to the HuggingFace-style config for the vLLM adapter.

src/maxtext/configs/inference/vllm.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ logical_axis_rules: [
7878
['cache_heads', ['model']],
7979
['exp', ['expert', 'attn_dp_expert']],
8080
['paged_kv_heads', ['model']],
81+
['layers', []],
8182
]
8283
data_sharding: [['data', 'attn_dp', 'model', 'expert', 'attn_dp_expert']]
8384
input_data_sharding_logical_axes: ['activation_embed_and_logits_batch']

src/maxtext/inference/vllm_decode.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ def decode_with_vllm(config: Config) -> None:
8282
"weight_dtype": "bfloat16",
8383
"allow_split_physical_axes": True,
8484
"debug_sharding": config.debug_sharding,
85+
"scan_layers": config.scan_layers,
8586
},
8687
"sharding": {
8788
"sharding_strategy": {
@@ -140,11 +141,15 @@ def decode_with_vllm(config: Config) -> None:
140141
f"max_target_length ({config.max_target_length}) must be greater than max_prompt_length ({max_prompt_length})"
141142
)
142143

144+
# MaxText uses -1 to mean "disabled"; vLLM requires top_p in (0, 1].
145+
top_p = config.decode_sampling_nucleus_p if config.decode_sampling_nucleus_p > 0 else 1.0
146+
top_k = config.decode_sampling_top_k if config.decode_sampling_top_k > 0 else -1
147+
143148
sampling_params = SamplingParams(
144149
temperature=config.decode_sampling_temperature,
145150
max_tokens=max_tokens_to_generate,
146-
top_k=config.decode_sampling_top_k,
147-
top_p=config.decode_sampling_nucleus_p,
151+
top_k=top_k,
152+
top_p=top_p,
148153
)
149154

150155
outputs = llm.generate(prompts, sampling_params)

src/maxtext/layers/decoders.py

Lines changed: 48 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -981,16 +981,54 @@ def __call__(
981981
"nope_layer_interval": self.config.nope_layer_interval,
982982
"interleave_moe_layer_step": self.config.interleave_moe_layer_step,
983983
}
984-
y, _ = self.scan_decoder_layers(
985-
cfg,
986-
RemattedBlockLayer,
987-
scan_length,
988-
"layers",
989-
mesh,
990-
in_axes_tuple=(nn.broadcast,) * len(broadcast_args),
991-
model_mode=model_mode,
992-
**layer_kwargs,
993-
)(y, *broadcast_args)
984+
# Update broadcast_args and in_axes_tuple for vLLM RPA
985+
in_axes_tuple = (nn.broadcast,) * len(broadcast_args)
986+
current_broadcast_args = list(broadcast_args)
987+
current_in_axes_tuple = list(in_axes_tuple)
988+
989+
current_broadcast_args.append(attention_metadata)
990+
current_in_axes_tuple.append(nn.broadcast)
991+
992+
if kv_caches is not None:
993+
# Stack kv_caches for scan: [num_layers, ...]
994+
stacked_kv_cache = jnp.stack(kv_caches, axis=0)
995+
996+
# We pass (y, stacked_kv_cache, 0) as the carry
997+
carry = (y, stacked_kv_cache, 0)
998+
999+
# We don't pass kv_cache as a scanned argument anymore
1000+
1001+
final_carry, _ = self.scan_decoder_layers(
1002+
cfg,
1003+
RemattedBlockLayer,
1004+
scan_length,
1005+
"layers",
1006+
mesh,
1007+
in_axes_tuple=tuple(current_in_axes_tuple),
1008+
model_mode=model_mode,
1009+
**layer_kwargs,
1010+
)(carry, *current_broadcast_args)
1011+
1012+
y, returned_kv_cache, _ = final_carry
1013+
1014+
# Update the list of KV caches from the scanned results
1015+
for i in range(cfg.num_decoder_layers):
1016+
kv_caches[i] = returned_kv_cache[i]
1017+
else:
1018+
# Fallback to old behavior if kv_caches is None (not vLLM RPA)
1019+
current_broadcast_args.append(None)
1020+
current_in_axes_tuple.append(nn.broadcast)
1021+
1022+
y, _ = self.scan_decoder_layers(
1023+
cfg,
1024+
RemattedBlockLayer,
1025+
scan_length,
1026+
"layers",
1027+
mesh,
1028+
in_axes_tuple=tuple(current_in_axes_tuple),
1029+
model_mode=model_mode,
1030+
**layer_kwargs,
1031+
)(y, *current_broadcast_args)
9941032
else:
9951033
if cfg.decoder_block == DecoderBlockType.DEEPSEEK:
9961034
assert len(RemattedBlockLayers) == 2, "Unscanned layers must have a length of 2 using deepseek."

src/maxtext/models/qwen3.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1285,6 +1285,14 @@ def __call__(
12851285
attention_metadata: None | dict[str, Any] = None,
12861286
):
12871287
# Unpack inputs if it's a tuple (e.g. from a previous layer returning (hidden_states, kv_cache))
1288+
is_scan_carry = False
1289+
if isinstance(inputs, tuple) and len(inputs) == 3:
1290+
hidden_states, stacked_kv_cache, layer_idx = inputs
1291+
kv_cache = stacked_kv_cache[layer_idx]
1292+
inputs = hidden_states
1293+
is_scan_carry = True
1294+
elif isinstance(inputs, tuple):
1295+
inputs = inputs[0]
12881296
if isinstance(inputs, tuple):
12891297
inputs = inputs[0]
12901298
hidden_states, intermediate_inputs, kv_cache = self.apply_attention_with_norm(
@@ -1305,7 +1313,11 @@ def __call__(
13051313
layer_output = intermediate_inputs + mlp_lnx
13061314
layer_output = nn.with_logical_constraint(layer_output, self.activation_axis_names)
13071315

1308-
return layer_output, kv_cache
1316+
if is_scan_carry:
1317+
stacked_kv_cache = stacked_kv_cache.at[layer_idx].set(kv_cache)
1318+
return (layer_output, stacked_kv_cache, layer_idx + 1), None
1319+
else:
1320+
return layer_output, kv_cache
13091321

13101322

13111323
class Qwen3OmniMoeVisionPatchMerger(nnx.Module):

0 commit comments

Comments (0)