feat: Gemma4 LoRA Extension

RexBearIU · RexBearIU · commit 2bc86324bd07 · 2026-05-22T06:46:39.000Z
diff --git a/src/maxtext/configs/post_train/lora_module_path.yml b/src/maxtext/configs/post_train/lora_module_path.yml
@@ -21,6 +21,7 @@ mistral: "decoder/layers/.*(attention/(query|key|value|out)|mlp/(wi_0|wi_1|wo))"
 deepseek2: "decoder/(dense_layers|moe_stack)/self_attention/(query|out|wkv_a|wkv_b)|decoder/(dense_layers|moe_stack)/(mlp|shared_experts)/(wi_0|wi_1|wo)"
 gemma2: "decoder/layers/(self_attention_local|self_attention_global)/(query|key|value|out)|decoder/layers/(mlp_local|mlp_global)/(wi_0|wi_1|wo)"
 gemma3: "decoder/layers/.*(self_attention/(query|key|value|out)|mlp/(wi_0|wi_1|wo|gate|up|down))"
+gemma4: "decoder/(scanned_blocks|layers_remainder)/layers.*/.*(self_attention/(query|key|value|out)|mlp/.*(MoeBlock_0|wi_0|wi_1|wo|shared_experts/(wi_0|wi_1|wo)))"
 olmo3: "decoder/layers/.*(attention/(query|key|value|out)|mlp/(wi_0|wi_1|wo))"
 gpt3: "decoder/layers/(self_attention/(qkv_proj|out)|mlp/(wi|wo))"
 
diff --git a/src/maxtext/input_pipeline/input_pipeline_utils.py b/src/maxtext/input_pipeline/input_pipeline_utils.py
@@ -267,6 +267,16 @@ def verify_chat_template_generation_prompt_logic(tokenizer_model):
   actual_prefix_in_full_turn = full_turn_ids[len(prompt_wo_gen_ids) : len(prompt_wo_gen_ids) + len(assistant_prefix)]
 
   if actual_prefix_in_full_turn != assistant_prefix:
+    # Allow the generation prompt to include a thought channel block (e.g., for Gemma 4).
+    thought_channel = "<|channel>thought\n<channel|>"
+    thought_ids = extract_token_ids(tokenizer_model.encode(thought_channel, add_special_tokens=False))
+    if len(assistant_prefix) >= len(thought_ids) and assistant_prefix[-len(thought_ids) :] == thought_ids:
+      true_prefix_ids = assistant_prefix[: -len(thought_ids)]
+      actual_prefix = full_turn_ids[len(prompt_wo_gen_ids) : len(prompt_wo_gen_ids) + len(true_prefix_ids)]
+      if actual_prefix == true_prefix_ids:
+        max_logging.info("Chat template generation prompt mismatch resolved via thought channel bypass.")
+        return
+
     expected_str = tokenizer_model.decode(assistant_prefix)
     actual_str = tokenizer_model.decode(actual_prefix_in_full_turn)
     raise ValueError(
@@ -298,6 +308,12 @@ def _get_completion_in_chat_template(tokenizer_model, round_msgs):
   prompt_completion_ids = extract_token_ids(prompt_completion_tokens)
   prompt_ids = extract_token_ids(prompt_tokens)
 
+  # Gemma 4 bypass: its generation prompt ends with a thought channel block that normal assistant turns omit; strip it.
+  thought_channel = "<|channel>thought\n<channel|>"
+  thought_ids = extract_token_ids(tokenizer_model.encode(thought_channel, add_special_tokens=False))
+  if len(prompt_ids) >= len(thought_ids) and prompt_ids[-len(thought_ids) :] == thought_ids:
+    prompt_ids = prompt_ids[: -len(thought_ids)]
+
   completion_tokens = prompt_completion_ids[len(prompt_ids) :]
   completion_in_chat_template = tokenizer_model.decode(completion_tokens, skip_special_tokens=False)
   return completion_in_chat_template
diff --git a/src/maxtext/trainers/post_train/sft/train_sft.py b/src/maxtext/trainers/post_train/sft/train_sft.py
@@ -264,9 +264,14 @@ def setup_trainer_state(mt_config, goodput_recorder=None):
 def train_model(mt_config, trainer, mesh):
   """Runs the SFT training loop in Tunix."""
   with mesh, nn_partitioning.axis_rules(mt_config.logical_axis_rules):
+    # Disable NNX graph caching for MoE models (num_experts > 1) so that the dynamic metadata
+    # synchronization needed during forward passes (e.g., in jax.lax.scan) can still take place.
+    enable_nnx_cache = getattr(mt_config, "num_experts", 1) <= 1
+
     trainer.train(
         trainer.data_hooks.train_data_iterator,
         trainer.data_hooks.eval_data_iterator,
+        cache_nnx_graph=enable_nnx_cache,
     )
   return trainer
 
diff --git a/tests/post_training/unit/sft_data_processing_test.py b/tests/post_training/unit/sft_data_processing_test.py
@@ -512,13 +512,17 @@ def setUp(self):
     super().setUp()
     self.qwen3_tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
     self.llama2_tokenizer = transformers.AutoTokenizer.from_pretrained(self.LLAMA_TOKENIZER_PATH)
+    self.gemma4_tokenizer = transformers.AutoTokenizer.from_pretrained("google/gemma-4-26b-a4b-it")
 
   def test_tokenizer_w_generation_prompt(self):
     verify_chat_template_generation_prompt_logic(self.qwen3_tokenizer)
 
   def test_tokenizer_wo_generation_prompt(self):
     verify_chat_template_generation_prompt_logic(self.llama2_tokenizer)
 
+  def test_tokenizer_gemma4_thought_channel_bypass(self):
+    verify_chat_template_generation_prompt_logic(self.gemma4_tokenizer)
+
   def test_failure_path_with_modified_template(self):
     """Verifies the function correctly raises a ValueError on a bad template."""
     # Replace the role within the existing add_generation_prompt block with a deliberately faulty one.