Fix qwen3_vl eagle3 speculative decoding: pass image inputs to the target model and fix draft training loop state (Tencent#176)

irisliu10 · web-flow · commit 0725bc8d30d5 · 2025-12-15T22:17:24.000+08:00
diff --git a/angelslim/compressor/speculative/train/models/target/target_model_wrapper.py b/angelslim/compressor/speculative/train/models/target/target_model_wrapper.py
@@ -280,15 +280,14 @@ class VLMTransformersBackend(BaseBackend):
     def load_model(self):
         from transformers import AutoModelForImageTextToText, AutoProcessor
 
-        default_kwargs = {
-            "dtype": torch.bfloat16,
-            "device_map": "auto",
-            "trust_remote_code": True,
-        }
-        default_kwargs.update(self.kwargs)
+        device = decide_device_for_distributed()
+        print_with_rank(f"Loading model to device: {device}")
+
+        # Prepare model loading configuration
+        model_kwargs = self._prepare_model_kwargs(device)
 
         self.model = AutoModelForImageTextToText.from_pretrained(
-            self.model_path, **default_kwargs
+            self.model_path, **model_kwargs
         )
 
         # Freeze the base model
@@ -300,6 +299,24 @@ def load_model(self):
             self.model_path, trust_remote_code=True
         )
 
+    def _prepare_model_kwargs(self, device: str) -> dict:
+        """
+        Build model-loading kwargs: defaults overridden by ``self.kwargs``.
+
+        Args:
+            device: Default ``device_map`` value (``self.kwargs`` may replace it)
+
+        Returns:
+            Dict with ``dtype``, ``device_map``, ``trust_remote_code`` + overrides
+        """
+        default_kwargs = {
+            "dtype": torch.bfloat16,
+            "device_map": device,
+            "trust_remote_code": True,
+        }
+        default_kwargs.update(self.kwargs)
+        return default_kwargs
+
     def get_hidden_states_and_logits(
         self,
         input_ids: torch.Tensor,
@@ -317,6 +334,12 @@ def get_hidden_states_and_logits(
         Returns:
             Tuple of (concatenated_hidden_states, logits)
         """
+        pixel_values = None
+        image_grid_thw = None
+        if "pixel_values" in kwargs:
+            pixel_values = kwargs["pixel_values"].squeeze(0)
+        if "image_grid_thw" in kwargs:
+            image_grid_thw = kwargs["image_grid_thw"].squeeze(0)
         inputs_embeds_list, position_ids_list = [], []
 
         def hook(module, args, kwargs):
@@ -336,6 +359,8 @@ def hook(module, args, kwargs):
             outputs = self.model(
                 input_ids,
                 attention_mask=attention_mask,
+                pixel_values=pixel_values,
+                image_grid_thw=image_grid_thw,
                 output_hidden_states=True,
                 output_logits=True,
             )
@@ -375,6 +400,12 @@ def get_aux_and_target_hiddens(
         Returns:
             Tuple of (auxiliary_hidden_states, final_hidden_states)
         """
+        pixel_values = None
+        image_grid_thw = None
+        if "pixel_values" in kwargs:
+            pixel_values = kwargs["pixel_values"].squeeze(0)
+        if "image_grid_thw" in kwargs:
+            image_grid_thw = kwargs["image_grid_thw"].squeeze(0)
         inputs_embeds_list, position_ids_list = [], []
 
         def hook(module, args, kwargs):
@@ -393,6 +424,8 @@ def hook(module, args, kwargs):
         with torch.no_grad():
             outputs = self.model(
                 input_ids,
+                pixel_values=pixel_values,
+                image_grid_thw=image_grid_thw,
                 attention_mask=attention_mask,
                 output_hidden_states=True,
                 output_logits=True,
diff --git a/angelslim/compressor/speculative/train/trainer/online_vlm_eagle3_trainer.py b/angelslim/compressor/speculative/train/trainer/online_vlm_eagle3_trainer.py
@@ -150,13 +150,12 @@ def draft_model_training_time_test(
         # Iterative speculative decoding training loop
         for idx in range(self.length):
             # Get input embeddings with gradient tracking
-            if inputs_embeds is None:
-                inputs_embeds = self.draft_model.get_input_embeddings(input_ids)
+            inputs_embeds = self.draft_model.get_input_embeddings(input_ids)
             if not inputs_embeds.requires_grad:
                 inputs_embeds.requires_grad = True
 
             # Encode through draft model layers
-            hidden_states = self.draft_model.encode_layers(
+            hidden_states, cache_hidden = self.draft_model.encode_layers(
                 inputs_embeds=inputs_embeds,
                 hidden_states=hidden_states,
                 cache_hidden=cache_hidden,
@@ -198,14 +197,6 @@ def draft_model_training_time_test(
                 target_logits = padding(target_logits, left=False)
                 loss_mask = padding(loss_mask, left=False)
 
-                # Update attention mask to prevent attending to future positions
-                ind = torch.arange(seq_length, device=attention_mask.device)
-                new_attention_mask = attention_mask.clone()
-                new_attention_mask[:, :, ind[idx:], ind[: seq_length - idx]] = (
-                    torch.finfo(attention_mask.dtype).min
-                )
-                attention_mask = new_attention_mask
-
         # Compute weighted loss
         ploss_weight = [0.8**i for i in range(len(plosses))]
         ploss = sum([ploss_weight[i] * plosses[i] for i in range(len(plosses))])