feat: support Hunyuan Eagle3 training (#126)

liusong1222 · web-flow · commit 18f472c43257 · 2025-11-04T20:21:56.000+08:00
diff --git a/angelslim/compressor/speculative/__init__.py b/angelslim/compressor/speculative/__init__.py
@@ -13,5 +13,31 @@
 # limitations under the License.
 
 from .benchmark import BenchmarkConfig, BenchmarkEngine, BenchmarkMode
+from .train import (
+    DataCollatorWithPadding,
+    DatasetManager,
+    DraftModelConfig,
+    OnlineEagle3Trainer,
+    convert_sharegpt_data,
+    convert_ultrachat_data,
+    create_draft_model,
+    create_target_model,
+    data_generation_work_flow,
+    get_supported_chat_template_type_strings,
+)
 
-__all__ = ["BenchmarkEngine", "BenchmarkConfig", "BenchmarkMode"]
+__all__ = [
+    "BenchmarkEngine",
+    "BenchmarkConfig",
+    "BenchmarkMode",
+    "create_draft_model",
+    "DraftModelConfig",
+    "create_target_model",
+    "OnlineEagle3Trainer",
+    "data_generation_work_flow",
+    "DataCollatorWithPadding",
+    "convert_sharegpt_data",
+    "convert_ultrachat_data",
+    "DatasetManager",
+    "get_supported_chat_template_type_strings",
+]
diff --git a/angelslim/compressor/speculative/train/__init__.py b/angelslim/compressor/speculative/train/__init__.py
@@ -0,0 +1,23 @@
+from .data import (
+    DataCollatorWithPadding,
+    DatasetManager,
+    convert_sharegpt_data,
+    convert_ultrachat_data,
+    data_generation_work_flow,
+    get_supported_chat_template_type_strings,
+)
+from .models import DraftModelConfig, create_draft_model, create_target_model
+from .trainer import OnlineEagle3Trainer
+
+__all__ = [
+    "create_draft_model",
+    "DraftModelConfig",
+    "create_target_model",
+    "OnlineEagle3Trainer",
+    "data_generation_work_flow",
+    "DataCollatorWithPadding",
+    "convert_sharegpt_data",
+    "convert_ultrachat_data",
+    "DatasetManager",
+    "get_supported_chat_template_type_strings",
+]
diff --git a/angelslim/compressor/speculative/train/configs/hunyuan-4b-eagle3.json b/angelslim/compressor/speculative/train/configs/hunyuan-4b-eagle3.json
@@ -0,0 +1,53 @@
+{
+  "add_classification_head": false,
+  "architectures": [
+    "Eagle3LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_head_dim": 128,
+  "bos_token_id": 1,
+  "cla_share_factor": 2,
+  "class_num": 0,
+  "dense_list": [
+    3072,
+    0
+  ],
+  "eod_token_id": 120026,
+  "eos_token_id": 120020,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "im_end_id": 5,
+  "im_newline_id": 11,
+  "im_start_id": 4,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "mask_init_id": 12,
+  "max_position_embeddings": 262144,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "norm_type": "rms",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 1,
+  "num_key_value_heads": 8,
+  "org_vocab_size": 120818,
+  "pad_id": 120002,
+  "pad_token_id": 120002,
+  "pool_type": "last",
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 10000.0,
+  "sep_token_id": 120007,
+  "text_end_id": 7,
+  "text_start_id": 6,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.41.2",
+  "use_cache": true,
+  "use_cla": false,
+  "use_qk_norm": true,
+  "use_rotary_pos_emb": true,
+  "vocab_size": 120818,
+  "draft_vocab_size": 32000
+}
diff --git a/angelslim/compressor/speculative/train/data/__init__.py b/angelslim/compressor/speculative/train/data/__init__.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .chat_templates import get_supported_chat_template_type_strings
 from .data_generation import data_generation_work_flow
 from .data_utils import (
     DataCollatorWithPadding,
@@ -26,4 +27,5 @@
     "convert_sharegpt_data",
     "convert_ultrachat_data",
     "data_generation_work_flow",
+    "get_supported_chat_template_type_strings",
 ]
diff --git a/angelslim/compressor/speculative/train/data/chat_templates.py b/angelslim/compressor/speculative/train/data/chat_templates.py
@@ -27,11 +27,13 @@ class ChatTemplateType(Enum):
     """Supported chat template types."""
 
     QWEN3 = "qwen3"
+    HUNYUAN = "hunyuan"
 
 
 # String to ChatTemplateType mapping
 CHAT_TEMPLATE_TYPE_MAPPING = {
     "qwen3": ChatTemplateType.QWEN3,
+    "hunyuan": ChatTemplateType.HUNYUAN,
 }
 
 
@@ -75,7 +77,22 @@ def _initialize_templates(self) -> Dict[ChatTemplateType, ChatTemplate]:
                     "correct. If you don't know the answer to a question, "
                     "please don't share false information."
                 ),
-            )
+            ),
+            ChatTemplateType.HUNYUAN: ChatTemplate(
+                user_header="<｜hy_User｜>",
+                assistant_header="<｜hy_Assistant｜>",
+                system_prompt=(
+                    "You are a helpful, respectful and honest assistant. "
+                    "Always answer as helpfully as possible, while being safe. "
+                    "Your answers should not include any harmful, unethical, racist, "
+                    "sexist, toxic, dangerous, or illegal content. Please ensure that "
+                    "your responses are socially unbiased and positive in nature.\n\n"
+                    "If a question does not make any sense, or is not factually "
+                    "coherent, explain why instead of answering something not "
+                    "correct. If you don't know the answer to a question, "
+                    "please don't share false information."
+                ),
+            ),
         }
 
     def get_template(self, chat_template_type: ChatTemplateType) -> ChatTemplate:
diff --git a/angelslim/compressor/speculative/train/data/dataset.py b/angelslim/compressor/speculative/train/data/dataset.py
@@ -36,18 +36,82 @@ def __init__(
         max_length: int = 2048,
         shuffle_seed: int = 42,
         chat_template_type: ChatTemplateType = ChatTemplateType.QWEN3,
+        display: bool = False,
     ):
         self.tokenizer = tokenizer
         self.max_length = max_length
         self.shuffle_seed = shuffle_seed
         self.chat_template_type = chat_template_type
+        self.display = display
+        self.display_count = 0  # Track how many samples have been displayed
 
         # Get chat template
         template = template_manager.get_template_dict(chat_template_type)
         self.user_header = template["user_header"]
         self.assistant_header = template["assistant_header"]
         self.system_prompt = template["system_prompt"]
 
+    def _visualize_loss_mask(
+        self, input_ids: torch.Tensor, loss_mask: torch.Tensor, conversation: str
+    ) -> None:
+        """
+        Print loss_mask statistics and a color-coded token view via rank0_print.
+
+        Args:
+            input_ids: Token IDs; each is decoded on its own, special tokens kept
+            loss_mask: Loss mask tensor (1 = trained/green, otherwise ignored/red)
+            conversation: Original conversation text, printed last for reference
+        """
+        # ANSI terminal escape sequences used to color the output
+        RED = "\033[91m"  # Tokens whose mask value is not 1 (ignored)
+        GREEN = "\033[92m"  # Tokens whose mask value is 1 (trained)
+        RESET = "\033[0m"  # Restore default terminal style
+        BOLD = "\033[1m"  # Section headings
+
+        rank0_print("\n" + "=" * 80)
+        rank0_print(f"{BOLD}Loss Mask Visualization{RESET}")
+        rank0_print("=" * 80)
+
+        # Legend explaining the two colors
+        rank0_print(f"\n{BOLD}Legend:{RESET}")
+        rank0_print(f"{GREEN}■ Green: Training tokens (loss_mask=1){RESET}")
+        rank0_print(f"{RED}■ Red: Ignored tokens (loss_mask=0){RESET}")
+
+        # Summary counts; the ratio falls back to 0 when the mask is empty
+        total_tokens = len(loss_mask)
+        training_tokens = loss_mask.sum().item()
+        ignored_tokens = total_tokens - training_tokens
+        training_ratio = training_tokens / total_tokens * 100 if total_tokens > 0 else 0
+
+        rank0_print(f"\n{BOLD}Statistics:{RESET}")
+        rank0_print(f"Total tokens: {total_tokens}")
+        rank0_print(f"Training tokens: {training_tokens} ({training_ratio:.2f}%)")
+        rank0_print(f"Ignored tokens: {ignored_tokens} ({100-training_ratio:.2f}%)")
+
+        # Per-token view: ids and mask values are paired positionally by zip
+        rank0_print(f"\n{BOLD}Token-by-token visualization:{RESET}")
+        rank0_print("-" * 80)
+
+        decoded_tokens = []
+        for token_id, mask_value in zip(input_ids, loss_mask):
+            token_text = self.tokenizer.decode([token_id], skip_special_tokens=False)
+
+            # Green only when the mask value equals 1; any other value is red
+            color = GREEN if mask_value == 1 else RED
+
+            # Wrap the token text in its color and reset the style afterwards
+            colored_token = f"{color}{token_text}{RESET}"
+            decoded_tokens.append(colored_token)
+
+        # Emit the colored tokens as one concatenated string (no separators)
+        rank0_print("".join(decoded_tokens))
+
+        # Print the uncolored source conversation for comparison
+        rank0_print(f"\n{BOLD}Original conversation:{RESET}")
+        rank0_print("-" * 80)
+        rank0_print(conversation)
+        rank0_print("=" * 80 + "\n")
+
+
     def build_dataset(self, datapath: str, num_proc: int = 8) -> Dataset:
         try:
             # Load and shuffle dataset
@@ -67,8 +131,10 @@ def build_dataset(self, datapath: str, num_proc: int = 8) -> Dataset:
                 desc="Processing conversations",
             )
 
-            # Filter out None results
-            processed_ds = processed_ds.filter(lambda x: x["input_ids"] is not None)
+            # Filter out None results with multiprocessing support
+            processed_ds = processed_ds.filter(
+                lambda x: x["input_ids"] is not None, num_proc=num_proc
+            )
             processed_ds.set_format(type="torch")
 
             return processed_ds
@@ -134,6 +200,11 @@ def _process_single_conversation(
             input_ids = torch.tensor(input_ids)
             attention_mask = torch.ones_like(input_ids)
 
+            # Visualize loss mask if display mode is enabled
+            if self.display and self.display_count == 0:
+                self._visualize_loss_mask(input_ids, loss_mask, conversation)
+                self.display_count += 1
+
             return {
                 "input_ids": input_ids[None, :],
                 "attention_mask": attention_mask[None, :],
@@ -262,6 +333,7 @@ def __init__(
         tokenizer: AutoTokenizer,
         model_max_length: int = 2048,
         chat_template_type: Optional[Union[str, ChatTemplateType]] = None,
+        display: bool = False,
     ):
         """
         Initialize DatasetManager with DataArguments.
@@ -274,10 +346,12 @@ def __init__(
                 - ChatTemplateType enum value (e.g., ChatTemplateType.QWEN3)
                 - String (e.g., "llama", "qwen")
                 - None (will default to LLAMA)
+            display: Visualize the first sample's loss mask (num_proc is then None)
         """
         self.data_args = data_args
         self.tokenizer = tokenizer
         self.model_max_length = model_max_length
+        self.display = display
 
         # Convert chat_template_type to ChatTemplateType enum
         if chat_template_type is None:
@@ -293,6 +367,7 @@ def __init__(
             max_length=model_max_length,
             shuffle_seed=data_args.shuffle_seed,
             chat_template_type=chat_template_type,
+            display=display,
         )
 
     def create_datasets(self) -> Tuple[Dataset, Optional[Dataset]]:
@@ -305,8 +380,8 @@ def create_datasets(self) -> Tuple[Dataset, Optional[Dataset]]:
         """
         # Determine number of processes
         num_proc = self.data_args.num_proc
-        if self.data_args.preprocessing_num_workers is not None:
-            num_proc = self.data_args.preprocessing_num_workers
+        if self.display:
+            num_proc = None
 
         # Create train dataset
         train_dataset = self.dataset_builder.build_dataset(
diff --git a/angelslim/compressor/speculative/train/models/__init__.py b/angelslim/compressor/speculative/train/models/__init__.py
@@ -0,0 +1,4 @@
+from .draft import DraftModelConfig, create_draft_model
+from .target import create_target_model
+
+__all__ = ["create_draft_model", "DraftModelConfig", "create_target_model"]
diff --git a/angelslim/compressor/speculative/train/models/target/target_model_wrapper.py b/angelslim/compressor/speculative/train/models/target/target_model_wrapper.py
@@ -65,7 +65,9 @@ def load_model(self):
             param.requires_grad = False
 
         self.model.eval()
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
 
     def get_hidden_states_and_logits(
         self,
@@ -122,8 +124,8 @@ def __init__(self, backend: str, model_path: str, **kwargs):
         Initialize TargetModel with specified backend
 
         Args:
-            backend: One of ["hf", "vllm_local", "vllm_serving"]
-            model_path: Path to model or serving endpoint
+            backend: One of ["hf"]
+            model_path: Path to model
             **kwargs: Additional arguments for backend initialization
         """
         if backend not in self.BACKENDS:
@@ -148,8 +150,6 @@ def get_hidden_states_and_logits(
         Args:
             input_ids: Input token ids, shape [batch_size, seq_len]
             attention_mask: Attention mask, shape [batch_size, seq_len]
-            position_ids: Position ids, shape [batch_size, seq_len]
-            past_key_values: Past key values for generation
 
         Returns:
             Tuple of (hidden_states, logits)
diff --git a/tools/train_eagle3_online.py b/tools/train_eagle3_online.py