Commit fa7ba13

feat: Enable global attention for Gemma3/Gemma4 drafter models
1 parent 9567c0a

3 files changed: 55 additions & 33 deletions

examples/run_gemma3_27b_eagle3_online.sh

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ torchrun \
     --draft-model-config $ROOT_DIR/configs/gemma3-27b-eagle3.json \
     --train-data-path $ROOT_DIR/cache/dataset/ultrachat_train.jsonl \
     --output-dir $ROOT_DIR/outputs/gemma3-27b-eagle3-ultrachat \
+    --eval-holdout-ratio 0.03 \
     --num-epochs 10 \
     --batch-size 8 \
     --tp-size $TP_SIZE \
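
The added --eval-holdout-ratio flag presumably carves a small evaluation split out of the training JSONL instead of requiring a separate eval file. As a rough sketch of what a 0.03 holdout means (the helper below is illustrative only, not part of SpecForge):

import json
import random

def split_holdout(jsonl_path: str, ratio: float = 0.03, seed: int = 0):
    # Illustrative helper: hold out a fraction of a JSONL dataset for evaluation.
    with open(jsonl_path) as f:
        rows = [json.loads(line) for line in f]
    random.Random(seed).shuffle(rows)
    n_eval = max(1, int(len(rows) * ratio))
    return rows[n_eval:], rows[:n_eval]  # (train split, eval split)

train_rows, eval_rows = split_holdout("ultrachat_train.jsonl", ratio=0.03)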

scripts/train_eagle3.py

Lines changed: 6 additions & 0 deletions
@@ -402,6 +402,11 @@ def build_draft_model(args: Namespace) -> Tuple[AutoDraftModelConfig, nn.Module]
         # Use provided config file
         draft_model_config = AutoDraftModelConfig.from_file(args.draft_model_config)

+        # if the target model is gemma, we should use global attention for the draft model
+        if "gemma" in getattr(draft_model_config, "target_model_type", "").lower():
+            draft_model_config.use_global_attention = True
+            print_on_rank0("Using global attention for draft model.")
+
     # Handle base ckpt, config file
     draft_model_last_checkpoint = None
     is_resume_checkpoint = False
@@ -427,6 +432,7 @@ def build_draft_model(args: Namespace) -> Tuple[AutoDraftModelConfig, nn.Module]
     if draft_model_last_checkpoint:
         draft_model = AutoEagle3DraftModel.from_pretrained(
             draft_model_last_checkpoint,
+            config=draft_model_config,
             attention_backend=args.attention_backend,
             torch_dtype=torch.bfloat16,
         ).cuda()
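
The net effect is that global attention is toggled entirely through the draft-model config: build_draft_model() inspects target_model_type, sets a flag, and passing config=draft_model_config into from_pretrained ensures a resumed checkpoint also sees that flag. A minimal sketch of the pattern, using a simplified stand-in for AutoDraftModelConfig:

class DraftConfig:
    # Simplified stand-in for the draft-model config object; not the SpecForge class.
    def __init__(self, target_model_type: str):
        self.target_model_type = target_model_type

config = DraftConfig(target_model_type="gemma3_text")  # illustrative model type string

# Mirrors the check added in build_draft_model().
if "gemma" in getattr(config, "target_model_type", "").lower():
    config.use_global_attention = True

print(getattr(config, "use_global_attention", False))  # True for Gemma targets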

specforge/modeling/draft/llama3_eagle.py

Lines changed: 48 additions & 33 deletions
@@ -523,6 +523,7 @@ def __init__(self, config):
         self.num_key_value_heads = config.num_key_value_heads
         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
         self.max_position_embeddings = config.max_position_embeddings
+        self.use_global_attention = getattr(config, "use_global_attention", False)

         self.q_proj = nn.Linear(
             self.hidden_size * 2, self.num_heads * self.head_dim, bias=False
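
Reading the flag with a getattr default of False keeps older draft-model configs working unchanged: any config that never defines use_global_attention falls back to the existing causal behavior. A tiny illustration:

from types import SimpleNamespace

old_config = SimpleNamespace()                           # flag absent, as in existing configs
new_config = SimpleNamespace(use_global_attention=True)  # flag set by the Gemma path

print(getattr(old_config, "use_global_attention", False))  # False -> causal mask as before
print(getattr(new_config, "use_global_attention", False))  # True  -> global attention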
@@ -760,6 +761,10 @@ class LlamaFlexAttention(LlamaAttention):
     - past_key_values: dynamic cache used for storing past key and value states.
     """

+    def __init__(self, config):
+        super().__init__(config)
+        self.use_global_attention = getattr(config, "use_global_attention", False)
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -821,39 +826,45 @@ def forward(
                 cache_kwargs=cache_kwargs,
             )

-        seq_lengths = attention_mask.sum(dim=-1)
-        # Shrink the attention mask to align with the padding to the right.
-        # This is equivalent to the shrinking logic in eagle3.py
-        seq_lengths -= lck
-        # TODO: Remove the usage of uncompiled create_block_mask after
-        # https://github.com/pytorch/pytorch/issues/160018
-        if q_len <= 128:
-            create_block_mask_func = create_block_mask
-            flex_attention_func = flex_attention
+        if self.use_global_attention:
+            block_mask = None  # Enables full attention
         else:
-            create_block_mask_func = compile_friendly_create_block_mask
-            flex_attention_func = compile_friendly_flex_attention
-
-        block_mask = create_block_mask_func(
-            mask_mod=generate_eagle3_mask(
-                seq_lengths=seq_lengths,
-                Q_LEN=q_len,
-                KV_LEN=key_cache.shape[-2],
-                lck=lck,
-            ),
-            B=bsz,
-            H=1,  # Rely on broadcast
-            Q_LEN=q_len,
-            KV_LEN=key_cache.shape[-2],
-            device=query_states.device,
-        )
-        attn_output = flex_attention_func(
-            query=query_states,
-            key=key_cache.contiguous(),
-            value=value_cache.contiguous(),
-            block_mask=block_mask,
-            enable_gqa=True,
-        )
+            seq_lengths = attention_mask.sum(dim=-1)
+            # Shrink the attention mask to align with the padding to the right.
+            # This is equivalent to the shrinking logic in eagle3.py
+            seq_lengths -= lck
+            # TODO: Remove the usage of uncompiled create_block_mask after
+            # https://github.com/pytorch/pytorch/issues/160018
+            if q_len <= 128:
+                create_block_mask_func = create_block_mask
+                flex_attention_func = flex_attention
+            else:
+                create_block_mask_func = compile_friendly_create_block_mask
+                flex_attention_func = compile_friendly_flex_attention
+
+            if self.use_global_attention:
+                block_mask = None  # This will result in dense attention
+            else:
+                block_mask = create_block_mask_func(
+                    mask_mod=generate_eagle3_mask(
+                        seq_lengths=seq_lengths,
+                        Q_LEN=q_len,
+                        KV_LEN=key_cache.shape[-2],
+                        lck=lck,
+                    ),
+                    B=bsz,
+                    H=1,  # Rely on broadcast
+                    Q_LEN=q_len,
+                    KV_LEN=key_cache.shape[-2],
+                    device=query_states.device,
+                )
+            attn_output = flex_attention_func(
+                query=query_states,
+                key=key_cache.contiguous(),
+                value=value_cache.contiguous(),
+                block_mask=block_mask,
+                enable_gqa=True,
+            )
         attn_output = attn_output.transpose(1, 2).contiguous()
         attn_output = attn_output.reshape(bsz, q_len, self.head_dim * self.num_heads)
         attn_output = self.o_proj(attn_output)
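
For the FlexAttention backend, the key detail is that a block_mask of None makes flex_attention attend densely over the whole key/value sequence, so no Eagle3 mask is built in that case. A small standalone sketch of the difference, assuming a recent PyTorch (2.5+) where FlexAttention is available:

import torch
from torch.nn.attention.flex_attention import create_block_mask, flex_attention

device = "cuda" if torch.cuda.is_available() else "cpu"
B, H, S, D = 1, 2, 128, 16
q = torch.randn(B, H, S, D, device=device)
k = torch.randn(B, H, S, D, device=device)
v = torch.randn(B, H, S, D, device=device)

# block_mask=None: every query attends to every key (full / "global" attention).
full_out = flex_attention(q, k, v, block_mask=None)

# A causal block mask restricts each query to keys at or before its own position.
def causal_mod(b, h, q_idx, kv_idx):
    return q_idx >= kv_idx

causal_mask = create_block_mask(causal_mod, B=B, H=H, Q_LEN=S, KV_LEN=S, device=device)
causal_out = flex_attention(q, k, v, block_mask=causal_mask)
print(full_out.shape, causal_out.shape)  # both (1, 2, 128, 16)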
@@ -869,6 +880,10 @@ class LlamaFlashAttention(LlamaAttention):
     - cache_hidden: manual cache used for storing past key and value states
     """

+    def __init__(self, config):
+        super().__init__(config)
+        self.use_global_attention = getattr(config, "use_global_attention", False)
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -934,7 +949,7 @@ def forward(
                 v0,
                 dropout_p=0.0,
                 softmax_scale=1.0 / math.sqrt(self.head_dim),
-                causal=True,
+                causal=not self.use_global_attention,  # Set causal based on the flag
                 return_attn_probs=True,
             )
             lse = lse.transpose(1, 2)
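
For the flash-attention backend the same switch is expressed through the causal flag: causal=True applies the usual lower-triangular mask, while causal=False lets every query position attend to the full sequence. The snippet below illustrates that semantics with torch's scaled_dot_product_attention rather than the flash_attn kernel used in this file, purely to show what flipping the flag changes:

import torch
import torch.nn.functional as F

B, H, S, D = 1, 2, 8, 16
q, k, v = (torch.randn(B, H, S, D) for _ in range(3))

causal_out = F.scaled_dot_product_attention(q, k, v, is_causal=True)   # default drafter behavior
global_out = F.scaled_dot_product_attention(q, k, v, is_causal=False)  # use_global_attention=True

# At the first query position the causal mask allows only one key, so the two
# results generally differ there.
print(torch.allclose(causal_out[..., 0, :], global_out[..., 0, :]))  # typically False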
