Update vLLM logical axis rules

NuojCheng · NuojCheng · commit 92cc4b5a8531 · 2026-04-22T16:32:10.000Z
diff --git a/src/maxtext/configs/inference/vllm.yml b/src/maxtext/configs/inference/vllm.yml
@@ -29,56 +29,77 @@ weight_dtype: bfloat16
 # -------------- Logical Axis Rules --------------
 mesh_axes: ['data', 'attn_dp', 'model', 'expert', 'attn_dp_expert']
 logical_axis_rules: [
-                      ['activation_batch', ['data']],
-                      ['activation_batch_moe', ['data']],
-                      ['activation_batch_attn', ['data']],
-                      ['activation_embed_and_logits_batch', ['data', 'expert']],
-                      ['activation_embed_and_logits_batch_sequence', ['data', 'expert']],
-                      ['activation_heads', ['model', 'expert']],
-                      ['activation_kv_heads', ['model', 'expert']],
-                      ['activation_length_attn', []],
-                      ['activation_length', []],
-                      ['activation_length_moe', []],
-                      ['activation_q_length', ['expert', 'attn_dp_expert']],
-                      ['activation_embed_attn', 'model'],
-                      # Expert is missing explicitly from activation_embed despite using TP.
-                      # We are going for a replicate-AR style of TP as opposed to our typical AG-RS style of TP
-                      # due to the output sharding of the fused_moe_gmm kernel in tpu-inference.
-                      ['activation_embed', ['model', 'attn_dp']],
-                      ['activation_embed_moe', ['model', 'attn_dp']],
-                      ['activation_mlp', ['model', 'attn_dp']],
-                      ['activation_mlp_moe', ['model', 'attn_dp']],
-                      ['activation_kv', ['model']],
-                      ['activation_prefill_kv_batch', ['expert', 'attn_dp_expert']],
-                      ['activation_kv_batch', ['data']],
-                      ['activation_kv_head_dim', ['model']],
-                      ['activation_vocab', ['model', 'attn_dp']],
-                      ['activation_norm_length', []],
-                      ['activation_norm_length_moe', []],
-                      ['activation_exp', ['expert', 'attn_dp_expert']],
-                      ['decode_batch', ['data']],
-                      ['decode_batch_moe', ['data']],
-                      ['decode_length', []],
-                      ['mlp', ['model', 'attn_dp']],
-                      ['mlp_moe', ['model', 'attn_dp']],
-                      ['mlp_no_fsdp', ['model', 'attn_dp']],
-                      ['vocab', ['model', 'attn_dp']],
-                      # Expert is intended to act like TP for attention.
-                      # We target two all-reduces, one at the end of attention out projection and one at the end of the feedforward.
-                      ['heads', ['model', 'expert']],
-                      ['q_heads', ['model', 'expert']],
-                      ['kv_heads', ['model', 'expert']],
-                      ['kv_head_dim', []],
+                      # ==========================================
+                      # Vocabulary Embedding
+                      # ==========================================
+                      # Vocab Activations
+                      ['activation_embed_and_logits_batch', ['data', 'attn_dp', 'attn_dp_expert']],
+                      ['activation_embed_and_logits_batch_sequence', ['data', 'attn_dp', 'attn_dp_expert']],
+                      ['activation_vocab', ['expert', 'model']],
+                      # Vocab Weights
+                      ['vocab', []],
+                      ['embed_vocab', []],
+                      # ==========================================
+                      # Attention
+                      # ==========================================
+                      # Attention Activations
+                      ['activation_batch_attn', ['data', 'attn_dp', 'attn_dp_expert']],
+                      ['activation_heads', ['expert', 'model']],
+                      ['activation_kv_heads', ['expert', 'model']],
+                      ['activation_embed_attn', []],
+                      ['activation_kv', []],
+                      ['activation_kv_batch', ['data', 'attn_dp', 'attn_dp_expert']],
+                      ['activation_kv_head_dim', []],
+                      # Attention Weights
+                      ['heads', ['expert', 'model']],
+                      ['q_heads', ['expert', 'model']],
+                      ['kv_heads', ['expert', 'model']],
+                      ['qkv', []],
                       ['kv', []],
-                      ['embed', []],
+                      ['kv_head_dim', []],
+                      ['q_lora', []],
+                      ["q_lora_up_proj", []],
+                      ['kv_lora', []],
+                      ["kv_lora_up_proj", []],
+                      # ==========================================
+                      # Mixture of Experts (MoE)
+                      # ==========================================
+                      # MoE Activations
+                      ['activation_batch_moe', ['data', 'attn_dp', 'attn_dp_expert']],
+                      ['activation_embed_moe', ['attn_dp', 'model']],
+                      ['activation_mlp_moe', ['attn_dp', 'model']],
+                      ['activation_exp', ['attn_dp_expert', 'expert']],
+                      # MoE Weights
+                      ['exp', ['attn_dp_expert', 'expert']],
+                      ['mlp_moe', ['attn_dp', 'model']],
                       ['embed_moe', []],
-                      ['embed_tensor_transpose', ['attn_dp', 'model']],
-                      ['q_lora', ['expert', 'attn_dp_expert']],
-                      ['kv_lora', ['expert', 'attn_dp_expert']],
+                      # ==========================================
+                      # Standard MLP / Dense Layers / Model Structure
+                      # ==========================================
+                      # Dense Activations
+                      ['activation_mlp', ['attn_dp', 'model']],
+                      # Note: the activation batch and length axes are also used by attention and vocab.
+                      ['activation_batch', ['data', 'attn_dp', 'attn_dp_expert']],
+                      ['activation_embed', []],
+                      # General Weights
+                      ['mlp', ['attn_dp', 'model']],
+                      ['embed', []],
                       ['norm', []],
-                      ['cache_heads', ['model']],
-                      ['exp', ['expert', 'attn_dp_expert']],
-                      ['paged_kv_heads', ['model']],
-                    ]
+                      # ==========================================
+                      # Inference (Prefill, Decode, Cache)
+                      # ==========================================
+                      ['activation_prefill_kv_batch', ['data', 'attn_dp', 'attn_dp_expert']],
+                      ['decode_batch', ['data', 'attn_dp', 'attn_dp_expert']],
+                      ['cache_heads', ['expert', 'model']],
+                      ['paged_kv_heads', ['expert', 'model']],
+                      ['cache_batch_prefill', []],
+                      ['cache_batch', []],
+                      ['cache_heads_none', []],
+                      ['cache_kv', []],
+                      ['cache_sequence', []],
+                      ['num_pages', []],
+                      ['tokens_per_page', []],
+                      ['paged_kv_head_dim_size', []],
+                  ]
 data_sharding: [['data', 'attn_dp', 'model', 'expert', 'attn_dp_expert']]
 input_data_sharding_logical_axes: ['activation_embed_and_logits_batch']
diff --git a/src/maxtext/integration/vllm/maxtext_vllm_adapter/adapter.py b/src/maxtext/integration/vllm/maxtext_vllm_adapter/adapter.py
@@ -254,7 +254,7 @@ def load_weights(self, rng_key: jax.Array) -> None:
       return
 
     with self.mesh, nn.logical_axis_rules(self.maxtext_config.logical_axis_rules):
-      model, _ = model_creation_utils.create_nnx_model(
+      model = model_creation_utils.from_pretrained(
           self.maxtext_config, mesh=self.mesh, model_mode=self.model_mode, rng_key=rng_key
       )
       self.model = nnx.data(model)

Original file line number	Diff line number	Diff line change
`@@ -254,7 +254,7 @@ def load_weights(self, rng_key: jax.Array) -> None:`
`254`	`254`	`return`
`255`	`255`
`256`	`256`	`with self.mesh, nn.logical_axis_rules(self.maxtext_config.logical_axis_rules):`
`257`		`- model, _ = model_creation_utils.create_nnx_model(`
	`257`	`+ model = model_creation_utils.from_pretrained(`
`258`	`258`	`self.maxtext_config, mesh=self.mesh, model_mode=self.model_mode, rng_key=rng_key`
`259`	`259`	`)`
`260`	`260`	`self.model = nnx.data(model)`