fix pyarrow offset overflow problem (#254)

dawnranger · web-flow · commit d3199fd05394 · 2026-03-12T11:29:39.000+08:00
diff --git a/angelslim/compressor/speculative/train/configs/qwen3-vl-4b-eagle3.json b/angelslim/compressor/speculative/train/configs/qwen3-vl-4b-eagle3.json
@@ -3,6 +3,7 @@
     "Eagle3LlamaForCausalLM"
   ],
   "model_type": "llama",
+  "target_model_type": "qwen3_vl",
   "attention_bias": false,
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
diff --git a/angelslim/compressor/speculative/train/data/data_utils.py b/angelslim/compressor/speculative/train/data/data_utils.py
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
 from typing import Any, Dict, List
 
 import torch
+from transformers.image_utils import load_image
 
 __all__ = [
     "process_token_dict_to_mappings",
@@ -195,6 +197,15 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
 
 class VLMDataCollatorWithPadding:
 
+    def __init__(self, processor=None):
+        """
+        Args:
+            processor: VLM processor (e.g. AutoProcessor for qwen3_vl).
+                       When provided, image_paths in features will be decoded
+                       on-the-fly to pixel_values (used in online training).
+        """
+        self.processor = processor
+
     def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
         max_length = max(item["input_ids"].shape[1] for item in features)
         batch_input_ids = torch.cat(
@@ -217,27 +228,53 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
             "position_ids": None,
         }
 
-        if "pixel_values" in features[0]:
-            batch["pixel_values"] = paddingtensor3D_BHW(
-                [item["pixel_values"] for item in features]
-            )
-        if "video_pixel_values" in features[0]:
-            batch["video_pixel_values"] = paddingtensor3D_BHW(
-                [item["video_pixel_values"] for item in features]
-            )
-
-        if all(
-            "image_grid_thw" in item and item["image_grid_thw"] is not None for item in features
-        ):
-            batch["image_grid_thw"] = torch.cat(
-                [item["image_grid_thw"] for item in features], dim=0
-            )
-        if all(
-            "video_grid_thw" in item and item["video_grid_thw"] is not None for item in features
-        ):
-            batch["video_grid_thw"] = torch.cat(
-                [item["video_grid_thw"] for item in features], dim=0
-            )
+        # Online training: decode image_paths -> pixel_values on-the-fly
+        if self.processor is not None and "image_paths" in features[0]:
+            all_pixel_values, all_image_grid_thw = [], []
+            all_video_pixel_values, all_video_grid_thw = [], []
+            for item in features:
+                image_paths = json.loads(item["image_paths"])
+                if image_paths:
+                    images = [load_image(p) for p in image_paths]
+                    vision_enc = self.processor.image_processor(images=images, return_tensors="pt")
+                    all_pixel_values.append(vision_enc["pixel_values"])
+                    if "image_grid_thw" in vision_enc:
+                        all_image_grid_thw.append(vision_enc["image_grid_thw"])
+                    if "video_pixel_values" in vision_enc:
+                        all_video_pixel_values.append(vision_enc["video_pixel_values"])
+                    if "video_grid_thw" in vision_enc:
+                        all_video_grid_thw.append(vision_enc["video_grid_thw"])
+            if all_pixel_values:
+                batch["pixel_values"] = paddingtensor3D_BHW(all_pixel_values)
+            if all_image_grid_thw:
+                batch["image_grid_thw"] = torch.cat(all_image_grid_thw, dim=0)
+            if all_video_pixel_values:
+                batch["video_pixel_values"] = paddingtensor3D_BHW(all_video_pixel_values)
+            if all_video_grid_thw:
+                batch["video_grid_thw"] = torch.cat(all_video_grid_thw, dim=0)
+        else:
+            if "pixel_values" in features[0]:
+                batch["pixel_values"] = paddingtensor3D_BHW(
+                    [item["pixel_values"] for item in features]
+                )
+            if "video_pixel_values" in features[0]:
+                batch["video_pixel_values"] = paddingtensor3D_BHW(
+                    [item["video_pixel_values"] for item in features]
+                )
+            if all(
+                "image_grid_thw" in item and item["image_grid_thw"] is not None
+                for item in features
+            ):
+                batch["image_grid_thw"] = torch.cat(
+                    [item["image_grid_thw"] for item in features], dim=0
+                )
+            if all(
+                "video_grid_thw" in item and item["video_grid_thw"] is not None
+                for item in features
+            ):
+                batch["video_grid_thw"] = torch.cat(
+                    [item["video_grid_thw"] for item in features], dim=0
+                )
 
         # Check if both hidden_states and target_hiddens exist in all features
         if all("hidden_states" in item and "target_hiddens" in item for item in features):
@@ -261,6 +298,15 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
 
 class VLMHunyuanDataCollatorWithPadding:
 
+    def __init__(self, processor=None):
+        """
+        Args:
+            processor: VLM processor (e.g. AutoProcessor for hunyuan_vl).
+                       When provided, image_paths in features will be decoded
+                       on-the-fly to pixel_values (used in online training).
+        """
+        self.processor = processor
+
     def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
         max_length = max(item["input_ids"].shape[1] for item in features)
         batch_input_ids = torch.cat(
@@ -283,17 +329,33 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
             "input_position_ids": None,
         }
 
-        if "pixel_values" in features[0]:
-            batch["pixel_values"] = paddingtensor3D_BHW(
-                [item["pixel_values"] for item in features]
-            )
-
-        if all(
-            "image_grid_thw" in item and item["image_grid_thw"] is not None for item in features
-        ):
-            batch["image_grid_thw"] = torch.cat(
-                [item["image_grid_thw"] for item in features], dim=0
-            )
+        # Online training: decode image_paths -> pixel_values on-the-fly
+        if self.processor is not None and "image_paths" in features[0]:
+            all_pixel_values, all_image_grid_thw = [], []
+            for item in features:
+                image_paths = json.loads(item["image_paths"])
+                if image_paths:
+                    images = [load_image(p) for p in image_paths]
+                    vision_enc = self.processor(images=images, return_tensors="pt")
+                    all_pixel_values.append(vision_enc["pixel_values"])
+                    if "image_grid_thw" in vision_enc:
+                        all_image_grid_thw.append(vision_enc["image_grid_thw"])
+            if all_pixel_values:
+                batch["pixel_values"] = paddingtensor3D_BHW(all_pixel_values)
+            if all_image_grid_thw:
+                batch["image_grid_thw"] = torch.cat(all_image_grid_thw, dim=0)
+        else:
+            if "pixel_values" in features[0]:
+                batch["pixel_values"] = paddingtensor3D_BHW(
+                    [item["pixel_values"] for item in features]
+                )
+            if all(
+                "image_grid_thw" in item and item["image_grid_thw"] is not None
+                for item in features
+            ):
+                batch["image_grid_thw"] = torch.cat(
+                    [item["image_grid_thw"] for item in features], dim=0
+                )
 
         # Check if both hidden_states and target_hiddens exist in all features
         if all("hidden_states" in item and "target_hiddens" in item for item in features):
diff --git a/angelslim/compressor/speculative/train/data/dataset_builder/online_dataset_builder.py b/angelslim/compressor/speculative/train/data/dataset_builder/online_dataset_builder.py
@@ -146,25 +146,27 @@ def build_dataset(
                 num_proc=num_proc,
                 desc="Filtering empty input_ids",
             )
-            processed_ds.set_format(type="torch")
+            torch_columns = [c for c in processed_ds.column_names if c != "image_paths"]
+            processed_ds.set_format(type="torch", columns=torch_columns, output_all_columns=True)
+            rank0_print(
+                f"processed_ds size:{len(processed_ds)}, columns: {processed_ds.column_names}"
+            )
 
             return processed_ds
 
         except Exception as e:
             raise RuntimeError(f"Dataset building failed for {datapath}") from e
 
     def get_data_collator(self) -> Any:
-        return VLMDataCollatorWithPadding()
+        # for online vlm training: dynamically compute pixel_values during collate stage
+        return VLMDataCollatorWithPadding(processor=self.tokenizer)
 
     def _preprocess_function(self, examples: Dict[str, List]) -> Dict[str, List]:
         new_examples = {
             "input_ids": [],
             "attention_mask": [],
             "loss_mask": [],
-            "pixel_values": [],
-            "video_pixel_values": [],
-            "image_grid_thw": [],
-            "video_grid_thw": [],
+            "image_paths": [],
         }
 
         for i in range(len(examples["id"])):
@@ -189,7 +191,7 @@ def _preprocess_function(self, examples: Dict[str, List]) -> Dict[str, List]:
             if any(v is not None for v in value):
                 cleaned_new_examples[key] = value
 
-        return new_examples
+        return cleaned_new_examples
 
     def _visualize_loss_mask(
         self, input_ids: torch.Tensor, loss_mask: torch.Tensor, conversation: str
@@ -222,6 +224,16 @@ def _process_single_conversation(self, conversation_data: List[Dict]) -> Optiona
             if not messages:
                 return None
 
+            # extract image paths before apply_chat_template modifies messages in-place
+            image_paths = []
+            for message in messages:
+                content = message.get("content", [])
+                if not isinstance(content, list):
+                    continue
+                for item in content:
+                    if item.get("type") == "image" and item.get("image"):
+                        image_paths.append(item["image"])
+
             # Apply chat template
             assert isinstance(messages, list), f"type(messages)={type(messages)} is not list"
             for message in messages:
@@ -277,17 +289,9 @@ def _process_single_conversation(self, conversation_data: List[Dict]) -> Optiona
                 "input_ids": input_ids.view(1, -1),
                 "attention_mask": attention_mask.view(1, -1),
                 "loss_mask": loss_mask.view(1, -1),
+                "image_paths": json.dumps(image_paths),
             }
 
-            if "pixel_values" in encoding:
-                result_dict["pixel_values"] = encoding["pixel_values"].unsqueeze(0)
-            if "video_pixel_values" in encoding:
-                result_dict["video_pixel_values"] = encoding["video_pixel_values"].unsqueeze(0)
-            if "image_grid_thw" in encoding:
-                result_dict["image_grid_thw"] = encoding["image_grid_thw"]
-            if "video_grid_thw" in encoding:
-                result_dict["video_grid_thw"] = encoding["video_grid_thw"]
-
             return result_dict
 
         except Exception as e:
@@ -370,24 +374,24 @@ def build_dataset(
                 num_proc=num_proc,
                 desc="Filtering empty input_ids",
             )
-            processed_ds.set_format(type="torch")
+            torch_columns = [c for c in processed_ds.column_names if c != "image_paths"]
+            processed_ds.set_format(type="torch", columns=torch_columns, output_all_columns=True)
 
             return processed_ds
 
         except Exception as e:
             raise RuntimeError(f"Dataset building failed for {datapath}") from e
 
     def get_data_collator(self) -> Any:
-        return VLMHunyuanDataCollatorWithPadding()
+        # for online training: pass the processor so the collator can decode image_paths into pixel_values at collate time
+        return VLMHunyuanDataCollatorWithPadding(processor=self.tokenizer)
 
     def _preprocess_function(self, examples: Dict[str, List]) -> Dict[str, List]:
         new_examples = {
             "input_ids": [],
             "attention_mask": [],
             "loss_mask": [],
-            "pixel_values": [],
-            "image_grid_thw": [],
-            "position_ids": [],
+            "image_paths": [],
             "input_position_ids": [],
         }
         for i in range(len(examples["id"])):
@@ -409,7 +413,7 @@ def _preprocess_function(self, examples: Dict[str, List]) -> Dict[str, List]:
         for key, value in new_examples.items():
             if any(v is not None for v in value):
                 cleaned_new_examples[key] = value
-        return new_examples
+        return cleaned_new_examples
 
     def _visualize_loss_mask(
         self, input_ids: torch.Tensor, loss_mask: torch.Tensor, conversation: str
@@ -499,10 +503,16 @@ def _process_single_conversation(self, conversation_data: List[Dict]) -> Optiona
                 "input_position_ids": input_position_ids,
             }
 
-            if "pixel_values" in encoding:
-                result_dict["pixel_values"] = encoding["pixel_values"].unsqueeze(0)
-            if "image_grid_thw" in encoding:
-                result_dict["image_grid_thw"] = encoding["image_grid_thw"]
+            # get image_paths
+            image_paths = []
+            for message in messages:
+                content = message.get("content", [])
+                if not isinstance(content, list):
+                    continue
+                for item in content:
+                    if item.get("type") == "image" and item.get("image"):
+                        image_paths.append(item["image"])
+            result_dict["image_paths"] = json.dumps(image_paths)
 
             return result_dict
 
diff --git a/angelslim/compressor/speculative/train/trainer/eagle3_trainer.py b/angelslim/compressor/speculative/train/trainer/eagle3_trainer.py
@@ -291,7 +291,7 @@ def prediction_step(
         """
         Perform an evaluation step on `model` using `inputs`.
         """
-        data_for_draft_model = self.prepare_data_for_draft_model(**inputs)
+        data_for_draft_model = self.prepare_data_for_draft_model(inputs)
 
         attention_mask = data_for_draft_model["attention_mask"]
         # inputs_embeds = data_for_draft_model["inputs_embeds"]
diff --git a/tools/generate_hidden_for_draft_model.py b/tools/generate_hidden_for_draft_model.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import argparse
+import json
 import logging
 import os
 from datetime import timedelta
@@ -22,6 +23,7 @@
 import torch
 import torch.distributed as dist
 from tqdm import tqdm
+from transformers.image_utils import load_image
 
 from angelslim.compressor.speculative import DatasetManager, create_target_model
 from angelslim.utils import decide_device_for_distributed
@@ -120,6 +122,30 @@ def _process_single_sample(self, idx: int, row: Dict[str, Any]) -> bool:
         try:
             # Generate aux and target hiddens
             device = decide_device_for_distributed()
+
+            if "image_paths" in row:
+                image_paths = json.loads(row.pop("image_paths"))
+                if image_paths:
+                    images = [load_image(p) for p in image_paths]
+                    processor = self.target_model.tokenizer
+                    if hasattr(processor, "image_processor"):
+                        vision_encoding = processor.image_processor(
+                            images=images, return_tensors="pt"
+                        )
+                    else:
+                        vision_encoding = processor(images=images, return_tensors="pt")
+                    row["pixel_values"] = vision_encoding["pixel_values"].to(device)
+                    if "video_pixel_values" in vision_encoding:
+                        row["video_pixel_values"] = vision_encoding["video_pixel_values"].to(
+                            device
+                        )
+                    if "image_grid_thw" in vision_encoding:
+                        row["image_grid_thw"] = vision_encoding["image_grid_thw"].to(device)
+                    if "video_grid_thw" in vision_encoding:
+                        row["video_grid_thw"] = vision_encoding["video_grid_thw"].to(device)
+                else:
+                    row.pop("image_paths", None)
+
             for k, v in row.items():
                 if isinstance(v, torch.Tensor) and v is not None:
                     row[k] = v.to(device)
@@ -422,15 +448,24 @@ def main():
             trust_remote_code=args.trust_remote_code,
             target_model_type=args.target_model_type,
         )
+        logger.info(
+            f"Target model loaded: {args.target_model_name_or_path or args.model_name}",
+            extra={"rank": rank},
+        )
+        logger.info(f"tokenizer: {target_model.tokenizer}")
 
         # Load dataset
         dataset = load_dataset(args, target_model.tokenizer, rank)
+        if len(dataset) == 0:
+            logger.warning("No samples to process after loading dataset", extra={"rank": rank})
+            return
 
         # Split dataset for this rank
         dataset_slice = split_dataset_for_rank(dataset, rank, world_size, args.start, args.end)
 
         # Generate hidden states
         output_dir = f"{args.outdir}/rank_{rank}"
+        logger.info(f"writing hidden states to {output_dir}", extra={"rank": rank})
         generator = HiddenStateGenerator(target_model, output_dir, rank=rank)
         successful, failed = generator.generate(dataset_slice)
 
diff --git a/tools/train_eagle3_offline.py b/tools/train_eagle3_offline.py
@@ -309,6 +309,7 @@ def train():
     )
 
     target_model_type = getattr(draft_model_config, "target_model_type", None)
+    rank0_print(f"target_model_type: {target_model_type}")
 
     dataset_manager = DatasetManager(
         data_args=args,

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -309,6 +309,7 @@ def train():` |
| 309 | 309 | `    )` |
| 310 | 310 |  |
| 311 | 311 | `    target_model_type = getattr(draft_model_config, "target_model_type", None)` |
|  | 312 | `+    rank0_print(f"target_model_type: {target_model_type}")` |
| 312 | 313 |  |
| 313 | 314 | `    dataset_manager = DatasetManager(` |
| 314 | 315 | `        data_args=args,` |