Tencent
diff --git a/‎README.md‎
Lines changed: 5 additions & 5 deletions b/‎README.md‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎README_cn.md‎
Lines changed: 5 additions & 5 deletions b/‎README_cn.md‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎angelslim/compressor/speculative/train/configs/hunyuan_ocr-eagle3.json‎
Lines changed: 34 additions & 0 deletions b/‎angelslim/compressor/speculative/train/configs/hunyuan_ocr-eagle3.json‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎angelslim/compressor/speculative/train/configs/qwen2-audio-7b-eagle3.json‎
Lines changed: 30 additions & 0 deletions b/‎angelslim/compressor/speculative/train/configs/qwen2-audio-7b-eagle3.json‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎angelslim/compressor/speculative/train/configs/qwen3-vl-2b-eagle3-mrope.json‎
Lines changed: 43 additions & 0 deletions b/‎angelslim/compressor/speculative/train/configs/qwen3-vl-2b-eagle3-mrope.json‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎angelslim/compressor/speculative/train/configs/qwen3-vl-30b-a3b-eagle3-mrope.json‎
Lines changed: 43 additions & 0 deletions b/‎angelslim/compressor/speculative/train/configs/qwen3-vl-30b-a3b-eagle3-mrope.json‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎angelslim/compressor/speculative/train/configs/qwen3-vl-4b-eagle3-mrope.json‎
Lines changed: 1 addition & 0 deletions b/‎angelslim/compressor/speculative/train/configs/qwen3-vl-4b-eagle3-mrope.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎angelslim/compressor/speculative/train/data/chat_templates.py‎
Lines changed: 24 additions & 0 deletions b/‎angelslim/compressor/speculative/train/data/chat_templates.py‎
Lines changed: 24 additions & 0 deletions
@@ -636,12 +636,12 @@ Benchmark results for Qwen3-VL series models using Eagle3 speculative decoding o
 
 ##### 1.2.2 HunyuanOCR Model
 
-Benchmark results for HunyuanOCR using Eagle3 speculative decoding on vLLM (v0.13.0) across OCR tasks, using a single NVIDIA H20 GPU (**tp=1, ep=1, num_speculative_tokens=4, batch_size=1, output_len=1024**).
+Benchmark results for HunyuanOCR using Eagle3 speculative decoding on vLLM (v0.13.0) on the **[OmniDocBench](https://huggingface.co/datasets/opendatalab/OmniDocBench)** dataset, using a single NVIDIA H20 GPU (**tp=1, ep=1, num_speculative_tokens=4, batch_size=1, output_len=1024**).
 <table><thead>
   <tr>
     <th>Model</th>
     <th>Method</th>
-    <th colspan="2">OCR-Bench-Internal</th>
+    <th colspan="2">OmniDocBench</th>
   </tr></thead>
 <tbody>
   <tr>
@@ -653,13 +653,13 @@ Benchmark results for HunyuanOCR using Eagle3 speculative decoding on vLLM (v0.1
   <tr>
     <td rowspan="2">Hunyuan-OCR</td>
     <td>Vanilla</td>
-    <td>71.21</td>
+    <td>70.12</td>
     <td>1</td>
   </tr>
   <tr>
     <td>Eagle3</td>
-    <td>120.75</td>
-    <td>2.2</td>
+    <td>108.1</td>
+    <td>2.08</td>
   </tr>
 </tbody>
 </table>
 
@@ -640,13 +640,13 @@ bash scripts/deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --ta
 
 ##### 1.2.2 HunyuanOCR模型
 
-我们使用(v0.13.0)评测了HunyuanOCR Eagle3模型在 **OCR-Bench** 上的接收长度和吞吐。结果是在单张H20上用以下设置测得：**tp=1, ep=1, num_speculative_tokens=4, batch_size=1, output_len=1024**。
+我们使用vLLM(v0.13.0)评测了HunyuanOCR Eagle3模型在[OmniDocBench](https://huggingface.co/datasets/opendatalab/OmniDocBench)上的接收长度和吞吐。结果是在单张H20上用以下设置测得：**tp=1, ep=1, num_speculative_tokens=4, batch_size=1, output_len=1024**。
 
 <table><thead>
   <tr>
     <th>Model</th>
     <th>Method</th>
-    <th colspan="2">OCR-Bench-Internal</th>
+    <th colspan="2">OmniDocBench</th>
   </tr></thead>
 <tbody>
   <tr>
@@ -658,13 +658,13 @@ bash scripts/deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --ta
   <tr>
     <td rowspan="2">Hunyuan-OCR</td>
     <td>Vanilla</td>
-    <td>71.21</td>
+    <td>70.12</td>
     <td>1</td>
   </tr>
   <tr>
     <td>Eagle3</td>
-    <td>120.75</td>
-    <td>2.2</td>
+    <td>108.1</td>
+    <td>2.08</td>
   </tr>
 </tbody>
 </table>
 
@@ -0,0 +1,34 @@
+{
+  "architectures": [
+    "Eagle3LlamaForCausalLM"
+  ],
+  "model_type": "llama",
+  "target_model_type": "hunyuan_vl",
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 120000,
+  "eod_token_id": 120020,
+  "eos_token_id": 120020,
+  "dtype": "bfloat16",
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "image_start_token_id": 120118,
+  "image_end_token_id": 120119,
+  "image_token_id": 120120,
+  "image_newline_token_id": 120121,
+  "initializer_range": 0.02,
+  "intermediate_size": 3584,
+  "max_position_embeddings": 32768,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 1,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 10000.0,
+  "use_cache": true,
+  "vocab_size": 120818,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.1",
+  "draft_vocab_size": 32000,
+  "modal_type": "VLM"
+}
@@ -0,0 +1,30 @@
+{
+  "architectures": [
+    "Eagle3LlamaForCausalLM"
+  ],
+  "model_type": "llama",
+  "target_model_type": "qwen2_audio",
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 8192,
+  "num_attention_heads": 32,
+  "num_hidden_layers": 1,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "use_cache": true,
+  "vocab_size": 156032,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.57.1",
+  "draft_vocab_size": 32000,
+  "modal_type": "Audio"
+}
@@ -0,0 +1,43 @@
+{
+  "architectures": [
+    "Eagle3LlamaForCausalLM"
+  ],
+  "model_type": "llama",
+  "target_model_type": "qwen3_vl",
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "max_position_embeddings": 262144,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 1,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "type": "default",
+    "rope_type": "default",
+    "mrope_interleaved": true,
+    "mrope_section": [
+      24,
+      20,
+      20
+    ]
+  },
+  "rope_theta": 5000000,
+  "use_cache": true,
+  "vocab_size": 151936,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.1",
+  "image_token_id": 151655,
+  "video_token_id": 151656,
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652,
+  "draft_vocab_size": 32000,
+  "modal_type": "VLM"
+}
@@ -0,0 +1,43 @@
+{
+  "architectures": [
+    "Eagle3LlamaForCausalLM"
+  ],
+  "model_type": "llama",
+  "target_model_type": "qwen3_vl",
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "max_position_embeddings": 262144,
+  "num_attention_heads": 32,
+  "num_hidden_layers": 1,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "type": "default",
+    "rope_type": "default",
+    "mrope_interleaved": true,
+    "mrope_section": [
+      24,
+      20,
+      20
+    ]
+  },
+  "rope_theta": 5000000,
+  "use_cache": true,
+  "vocab_size": 151936,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.57.1",
+  "image_token_id": 151655,
+  "video_token_id": 151656,
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652,
+  "draft_vocab_size": 32000,
+  "modal_type": "VLM"
+}
@@ -3,6 +3,7 @@
     "Eagle3LlamaForCausalLM"
   ],
   "model_type": "llama",
+  "target_model_type": "qwen3_vl",
   "attention_bias": false,
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
 
@@ -26,18 +26,22 @@
 class ChatTemplateType(Enum):
     """Supported chat template types."""
 
+    QWEN2_AUDIO = "qwen2_audio"
     QWEN3 = "qwen3"
     HUNYUAN = "hunyuan"
     QWEN3_VL = "qwen3_vl"
     HUNYUAN_7B = "hunyuan_7b"
+    HUNYUAN_VL = "hunyuan_vl"
 
 
 # String to ChatTemplateType mapping
 CHAT_TEMPLATE_TYPE_MAPPING = {
+    "qwen2_audio": ChatTemplateType.QWEN2_AUDIO,
     "qwen3": ChatTemplateType.QWEN3,
     "hunyuan": ChatTemplateType.HUNYUAN,
     "hunyuan_7b": ChatTemplateType.HUNYUAN_7B,
     "qwen3_vl": ChatTemplateType.QWEN3_VL,
+    "hunyuan_vl": ChatTemplateType.HUNYUAN_VL,
 }
 
 
@@ -133,6 +137,26 @@ def _initialize_templates(self) -> Dict[ChatTemplateType, ChatTemplate]:
                     }
                 ],
             ),
+            ChatTemplateType.QWEN2_AUDIO: ChatTemplate(
+                user_header="<|im_start|>user\n",
+                assistant_header="<|im_start|>assistant\n",
+                system_prompt=[
+                    {
+                        "type": "text",
+                        "text": ("You are a helpful assistant."),
+                    }
+                ],
+            ),
+            ChatTemplateType.HUNYUAN_VL: ChatTemplate(
+                user_header="<｜hy_Assistant｜>",
+                assistant_header="<｜hy_User｜>",
+                system_prompt=[
+                    {
+                        "type": "text",
+                        "text": "",
+                    }
+                ],
+            ),
         }
 
     def get_template(self, chat_template_type: ChatTemplateType) -> ChatTemplate: