feat: speculative decoding supports image size control via max_pixels/min_pixels

dawnranger · dawnranger · commit fc0a99568f4a · 2026-03-24T11:15:30.000+08:00
diff --git a/angelslim/compressor/speculative/train/data/data_utils.py b/angelslim/compressor/speculative/train/data/data_utils.py
@@ -18,6 +18,8 @@
 import torch
 from transformers.image_utils import load_image
 
+from angelslim.utils import rank0_print
+
 __all__ = [
     "process_token_dict_to_mappings",
     "convert_sharegpt_data",
@@ -27,9 +29,46 @@
     "VLMHunyuanDataCollatorWithPadding",
     "AudioDataCollatorWithPadding",
     "CosyVoice3DataCollatorWithPadding",
+    "build_image_processor_kwargs",
 ]
 
 
+def build_image_processor_kwargs(image_processor, max_pixels=None, min_pixels=None):
+    """
+    Convert max_pixels/min_pixels into the kwargs format of the given image_processor.
+      - "Qwen3" in class name: size={"longest_edge": max_pixels, "shortest_edge": min_pixels}
+      - any other class (e.g. Qwen2.5-VL): max_pixels / min_pixels are passed through as-is
+
+    Args:
+        image_processor: model's image_processor instance; only its class name is inspected
+        max_pixels: maximum pixels (total area); None leaves the key out of the result
+        min_pixels: minimum pixels (total area); None leaves the key out of the result
+
+    Returns:
+        dict: kwargs to unpack into image_processor(...); empty when both limits are None
+    """
+    if max_pixels is None and min_pixels is None:
+        return {}
+
+    processor_class = type(image_processor).__name__
+    # Qwen3-VL takes the limits as size={"longest_edge": ..., "shortest_edge": ...}
+    if "Qwen3" in processor_class:
+        size = {}
+        if max_pixels is not None:
+            size["longest_edge"] = max_pixels
+        if min_pixels is not None:
+            size["shortest_edge"] = min_pixels
+        return {"size": size}
+    else:
+        # Other processors (e.g. Qwen2.5-VL) are given max_pixels / min_pixels directly
+        kwargs = {}
+        if max_pixels is not None:
+            kwargs["max_pixels"] = max_pixels
+        if min_pixels is not None:
+            kwargs["min_pixels"] = min_pixels
+        return kwargs
+
+
 def convert_sharegpt_data(row, dataset_column="conversations"):
     converted_messages = []
 
@@ -78,19 +117,19 @@ def process_token_dict_to_mappings(
             token_dict[token] = 0
             if len(token_dict) >= draft_vocab_size:
                 break
-    print(f"Added missing tokens to reach draft vocab size: {draft_vocab_size}")
-    print(f"Total tokens after addition: {len(token_dict)}")
+    rank0_print(f"Added missing tokens to reach draft vocab size: {draft_vocab_size}")
+    rank0_print(f"Total tokens after addition: {len(token_dict)}")
     total_frequency = sum(token_dict.values())
     top_N = token_dict.most_common(draft_vocab_size)
     top_N_frequency_sum = sum(freq for key, freq in top_N)
 
     if total_frequency == 0:
-        print("Warning: Total token frequency is zero. All tokens will have zero ratio.")
+        rank0_print("Warning: Total token frequency is zero. All tokens will have zero ratio.")
         top_N_ratio = 0.0
     else:
         top_N_ratio = top_N_frequency_sum / total_frequency
 
-    print(f"top {draft_vocab_size} token frequency ratio: {top_N_ratio:.2%}")
+    rank0_print(f"top {draft_vocab_size} token frequency ratio: {top_N_ratio:.2%}")
     used_tokens = [key for key, freq in top_N]
     used_tokens.sort()
 
@@ -199,14 +238,29 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
 
 class VLMDataCollatorWithPadding:
 
-    def __init__(self, processor=None):
+    def __init__(self, processor=None, image_processor_kwargs=None):
         """
         Args:
             processor: VLM processor (e.g. AutoProcessor for qwen3_vl).
                        When provided, image_paths in features will be decoded
                        on-the-fly to pixel_values (used in online training).
+            image_processor_kwargs: Optional dict; only its "max_pixels"/"min_pixels" keys
+                       are read, e.g. {"max_pixels": 1003520, "min_pixels": 200704}.
         """
         self.processor = processor
+        # image_processor_kwargs defaults to None, so guard before calling .get().
+        max_pixels = (image_processor_kwargs or {}).get("max_pixels")
+        min_pixels = (image_processor_kwargs or {}).get("min_pixels")
+        self._resolved_image_processor_kwargs = {}
+        if (
+            processor is not None
+            and (max_pixels is not None or min_pixels is not None)
+            and hasattr(processor, "image_processor")
+        ):
+            self._resolved_image_processor_kwargs = build_image_processor_kwargs(
+                processor.image_processor, max_pixels, min_pixels
+            )
+        rank0_print(f"_resolved_image_processor_kwargs: {self._resolved_image_processor_kwargs}")
 
     def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
         max_length = max(item["input_ids"].shape[1] for item in features)
@@ -238,7 +292,18 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
                 image_paths = json.loads(item["image_paths"])
                 if image_paths:
                     images = [load_image(p) for p in image_paths]
-                    vision_enc = self.processor.image_processor(images=images, return_tensors="pt")
+                    if hasattr(self.processor, "image_processor"):
+                        vision_enc = self.processor.image_processor(
+                            images=images,
+                            return_tensors="pt",
+                            **self._resolved_image_processor_kwargs,
+                        )
+                    else:
+                        vision_enc = self.processor(
+                            images=images,
+                            return_tensors="pt",
+                            **self._resolved_image_processor_kwargs,
+                        )
                     all_pixel_values.append(vision_enc["pixel_values"])
                     if "image_grid_thw" in vision_enc:
                         all_image_grid_thw.append(vision_enc["image_grid_thw"])
@@ -300,14 +365,29 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
 
 class VLMHunyuanDataCollatorWithPadding:
 
-    def __init__(self, processor=None):
+    def __init__(self, processor=None, image_processor_kwargs=None):
         """
         Args:
             processor: VLM processor (e.g. AutoProcessor for hunyuan_vl).
                        When provided, image_paths in features will be decoded
                        on-the-fly to pixel_values (used in online training).
+            image_processor_kwargs: Optional dict; only its "max_pixels"/"min_pixels" keys
+                       are read, e.g. {"max_pixels": 1003520, "min_pixels": 200704}.
         """
         self.processor = processor
+        # image_processor_kwargs defaults to None, so guard before calling .get().
+        max_pixels = (image_processor_kwargs or {}).get("max_pixels")
+        min_pixels = (image_processor_kwargs or {}).get("min_pixels")
+        self._resolved_image_processor_kwargs = {}
+        if (
+            processor is not None
+            and (max_pixels is not None or min_pixels is not None)
+            and hasattr(processor, "image_processor")
+        ):
+            self._resolved_image_processor_kwargs = build_image_processor_kwargs(
+                processor.image_processor, max_pixels, min_pixels
+            )
+        rank0_print(f"_resolved_image_processor_kwargs: {self._resolved_image_processor_kwargs}")
 
     def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
         max_length = max(item["input_ids"].shape[1] for item in features)
@@ -338,7 +418,18 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
                 image_paths = json.loads(item["image_paths"])
                 if image_paths:
                     images = [load_image(p) for p in image_paths]
-                    vision_enc = self.processor(images=images, return_tensors="pt")
+                    if hasattr(self.processor, "image_processor"):
+                        vision_enc = self.processor.image_processor(
+                            images=images,
+                            return_tensors="pt",
+                            **self._resolved_image_processor_kwargs,
+                        )
+                    else:
+                        vision_enc = self.processor(
+                            images=images,
+                            return_tensors="pt",
+                            **self._resolved_image_processor_kwargs,
+                        )
                     all_pixel_values.append(vision_enc["pixel_values"])
                     if "image_grid_thw" in vision_enc:
                         all_image_grid_thw.append(vision_enc["image_grid_thw"])
diff --git a/angelslim/compressor/speculative/train/data/dataset_builder/online_dataset_builder.py b/angelslim/compressor/speculative/train/data/dataset_builder/online_dataset_builder.py
@@ -41,6 +41,7 @@
     DataCollatorWithPadding,
     VLMDataCollatorWithPadding,
     VLMHunyuanDataCollatorWithPadding,
+    build_image_processor_kwargs,
 )
 from .base_dataset_builder import OnlineDatasetBuilder
 from .dataset_builder_factory import DatasetBuilderFactory
@@ -87,6 +88,11 @@ def __init__(
             chat_template_type,
             display,
         )
+        _max_pixels = os.environ.get("MAX_PIXELS")
+        _min_pixels = os.environ.get("MIN_PIXELS")
+        self.max_pixels = int(_max_pixels) if _max_pixels is not None else None
+        self.min_pixels = int(_min_pixels) if _min_pixels is not None else None
+        rank0_print(f"max_pixels: {self.max_pixels}, min_pixels: {self.min_pixels}")
 
     def build_dataset(
         self,
@@ -168,7 +174,15 @@ def build_dataset(
 
     def get_data_collator(self) -> Any:
         # for online vlm training: dynamically compute pixel_values during collate stage
-        return VLMDataCollatorWithPadding(processor=self.tokenizer)
+        # Pass the dict itself (possibly empty) rather than collapsing it to None.
+        image_processor_kwargs = {}
+        if self.max_pixels is not None:
+            image_processor_kwargs["max_pixels"] = self.max_pixels
+        if self.min_pixels is not None:
+            image_processor_kwargs["min_pixels"] = self.min_pixels
+        return VLMDataCollatorWithPadding(
+            processor=self.tokenizer, image_processor_kwargs=image_processor_kwargs
+        )
 
     def _preprocess_function(self, examples: Dict[str, List]) -> Dict[str, List]:
         new_examples = {
@@ -258,6 +272,12 @@ def _process_single_conversation(self, conversation_data: List[Dict]) -> Optiona
                 del message["content"]
                 message["content"] = new_content
 
+            image_kwargs = {}
+            if image_paths and hasattr(self.tokenizer, "image_processor"):
+                image_kwargs = build_image_processor_kwargs(
+                    self.tokenizer.image_processor, self.max_pixels, self.min_pixels
+                )
+
             encoding = self.tokenizer.apply_chat_template(
                 messages,
                 tokenize=True,
@@ -268,6 +288,7 @@ def _process_single_conversation(self, conversation_data: List[Dict]) -> Optiona
                 max_length=self.max_length,
                 truncation=True,
                 padding=False,
+                **image_kwargs,
             )
 
             input_ids = encoding["input_ids"]
@@ -326,6 +347,11 @@ def __init__(
             chat_template_type,
             display,
         )
+        _max_pixels = os.environ.get("MAX_PIXELS")
+        _min_pixels = os.environ.get("MIN_PIXELS")
+        self.max_pixels = int(_max_pixels) if _max_pixels is not None else None
+        self.min_pixels = int(_min_pixels) if _min_pixels is not None else None
+        rank0_print(f"max_pixels: {self.max_pixels}, min_pixels: {self.min_pixels}")
 
     def build_dataset(
         self,
@@ -404,7 +430,15 @@ def build_dataset(
 
     def get_data_collator(self) -> Any:
         # for online training, we need to use VLMHunyuanDataCollatorWithPadding
-        return VLMHunyuanDataCollatorWithPadding(processor=self.tokenizer)
+        # Pass the dict itself (possibly empty) rather than collapsing it to None.
+        image_processor_kwargs = {}
+        if self.max_pixels is not None:
+            image_processor_kwargs["max_pixels"] = self.max_pixels
+        if self.min_pixels is not None:
+            image_processor_kwargs["min_pixels"] = self.min_pixels
+        return VLMHunyuanDataCollatorWithPadding(
+            processor=self.tokenizer, image_processor_kwargs=image_processor_kwargs
+        )
 
     def _preprocess_function(self, examples: Dict[str, List]) -> Dict[str, List]:
         new_examples = {
@@ -482,6 +516,11 @@ def _process_single_conversation(self, conversation_data: List[Dict]) -> Optiona
             )
             image_inputs, _ = self._extract_vision_info(messages)
 
+            image_kwargs = {}
+            if image_inputs and hasattr(self.tokenizer, "image_processor"):
+                image_kwargs = build_image_processor_kwargs(
+                    self.tokenizer.image_processor, self.max_pixels, self.min_pixels
+                )
             encoding = self.tokenizer(
                 text=[text],
                 images=image_inputs,
@@ -490,6 +529,7 @@ def _process_single_conversation(self, conversation_data: List[Dict]) -> Optiona
                 max_length=self.max_length,
                 truncation=True,
                 padding=False,
+                **image_kwargs,
             )
             input_ids = encoding["input_ids"]
             offsets = encoding["offset_mapping"]
diff --git a/tools/generate_hidden_for_draft_model.py b/tools/generate_hidden_for_draft_model.py
@@ -33,6 +33,7 @@
     infer_model_params,
 )
 from angelslim.compressor.speculative.train.data.data_utils import (
+    build_image_processor_kwargs,
     process_token_dict_to_mappings,
 )
 from angelslim.utils import decide_device_for_distributed
@@ -102,6 +103,10 @@ def __init__(
         self.rank = rank
         self.draft_vocab_size = draft_vocab_size
         self.target_vocab_size = target_vocab_size
+        _max_pixels = os.environ.get("MAX_PIXELS")
+        _min_pixels = os.environ.get("MIN_PIXELS")
+        self.max_pixels = int(_max_pixels) if _max_pixels is not None else None
+        self.min_pixels = int(_min_pixels) if _min_pixels is not None else None
         self.output_dir.mkdir(parents=True, exist_ok=True)
         self.token_dict = Counter()
 
@@ -151,11 +156,17 @@ def _process_single_sample(self, idx: int, row: Dict[str, Any]) -> bool:
                     images = [load_image(p) for p in image_paths]
                     processor = self.target_model.tokenizer
                     if hasattr(processor, "image_processor"):
+                        kwargs = build_image_processor_kwargs(
+                            processor.image_processor, self.max_pixels, self.min_pixels
+                        )
                         vision_encoding = processor.image_processor(
-                            images=images, return_tensors="pt"
+                            images=images, return_tensors="pt", **kwargs
                         )
                     else:
-                        vision_encoding = processor(images=images, return_tensors="pt")
+                        kwargs = build_image_processor_kwargs(
+                            processor, self.max_pixels, self.min_pixels
+                        )
+                        vision_encoding = processor(images=images, return_tensors="pt", **kwargs)
                     row["pixel_values"] = vision_encoding["pixel_values"].to(device)
                     if "video_pixel_values" in vision_encoding:
                         row["video_pixel_values"] = vision_encoding["video_pixel_values"].to(
@@ -406,7 +417,6 @@ def parse_arguments() -> argparse.Namespace:
         help="Path to draft model config file, used to read draft_vocab_size and vocab_size "
         "for computing vocab mapping",
     )
-
     return parser.parse_args()