Tencent · irisliu10 · Jan 12, 2026 · Jan 12, 2026 · Jan 12, 2026
diff --git a/README.md b/README.md
@@ -636,12 +636,12 @@ Benchmark results for Qwen3-VL series models using Eagle3 speculative decoding o
 
 ##### 1.2.2 HunyuanOCR Model
 
-Benchmark results for HunyuanOCR using Eagle3 speculative decoding on vLLM (v0.13.0) across OCR tasks, using a single NVIDIA H20 GPU (**tp=1, ep=1, num_speculative_tokens=4, batch_size=1, output_len=1024**).
+Benchmark results for HunyuanOCR using Eagle3 speculative decoding on vLLM (v0.13.0) on the **[OmniDocBench](https://huggingface.co/datasets/opendatalab/OmniDocBench)** dataset, using a single NVIDIA H20 GPU (**tp=1, ep=1, num_speculative_tokens=4, batch_size=1, output_len=1024**).
 <table><thead>
   <tr>
     <th>Model</th>
     <th>Method</th>
-    <th colspan="2">OCR-Bench-Internal</th>
+    <th colspan="2">OmniDocBench</th>
   </tr></thead>
 <tbody>
   <tr>
@@ -653,13 +653,13 @@ Benchmark results for HunyuanOCR using Eagle3 speculative decoding on vLLM (v0.1
   <tr>
     <td rowspan="2">Hunyuan-OCR</td>
     <td>Vanilla</td>
-    <td>71.21</td>
+    <td>70.12</td>
     <td>1</td>
   </tr>
   <tr>
     <td>Eagle3</td>
-    <td>120.75</td>
-    <td>2.2</td>
+    <td>108.1</td>
+    <td>2.08</td>
   </tr>
 </tbody>
 </table>

diff --git a/README_cn.md b/README_cn.md
@@ -640,13 +640,13 @@ bash scripts/deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --ta
 
 ##### 1.2.2 HunyuanOCR模型
 
-我们使用(v0.13.0)评测了HunyuanOCR Eagle3模型在 **OCR-Bench** 上的接收长度和吞吐。结果是在单张H20上用以下设置测得：**tp=1, ep=1, num_speculative_tokens=4, batch_size=1, output_len=1024**。
+我们使用vLLM(v0.13.0)评测了HunyuanOCR Eagle3模型在[OmniDocBench](https://huggingface.co/datasets/opendatalab/OmniDocBench)上的接收长度和吞吐。结果是在单张H20上用以下设置测得：**tp=1, ep=1, num_speculative_tokens=4, batch_size=1, output_len=1024**。
 
 <table><thead>
   <tr>
     <th>Model</th>
     <th>Method</th>
-    <th colspan="2">OCR-Bench-Internal</th>
+    <th colspan="2">OmniDocBench</th>
   </tr></thead>
 <tbody>
   <tr>
@@ -658,13 +658,13 @@ bash scripts/deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --ta
   <tr>
     <td rowspan="2">Hunyuan-OCR</td>
     <td>Vanilla</td>
-    <td>71.21</td>
+    <td>70.12</td>
     <td>1</td>
   </tr>
   <tr>
     <td>Eagle3</td>
-    <td>120.75</td>
-    <td>2.2</td>
+    <td>108.1</td>
+    <td>2.08</td>
   </tr>
 </tbody>
 </table>

diff --git a/angelslim/compressor/speculative/train/configs/qwen2-audio-7b-eagle3.json b/angelslim/compressor/speculative/train/configs/qwen2-audio-7b-eagle3.json
@@ -0,0 +1,30 @@
+{
+  "architectures": [
+    "Eagle3LlamaForCausalLM"
+  ],
+  "model_type": "llama",
+  "target_model_type": "qwen2_audio",
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 8192,
+  "num_attention_heads": 32,
+  "num_hidden_layers": 1,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "use_cache": true,
+  "vocab_size": 156032,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.57.1",
+  "draft_vocab_size": 32000,
+  "modal_type": "Audio"
+}
diff --git a/angelslim/compressor/speculative/train/data/data_utils.py b/angelslim/compressor/speculative/train/data/data_utils.py
@@ -341,3 +341,71 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
                 [paddingtensor2D(item["position_ids"], max_length) for item in features]
             )
         return batch
+
+
+class AudioDataCollatorWithPadding:
+    """Collate per-sample feature dicts into a single batch dict."""
+    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
+        max_length = max(item["input_ids"].shape[1] for item in features)
+        batch_input_ids = torch.cat(
+            [paddingtensor2D(item["input_ids"], max_length) for item in features]
+        )
+        batch_attention_mask = torch.cat(
+            [paddingtensor2D(item["attention_mask"], max_length) for item in features]
+        )
+        batch_loss_mask = torch.cat(
+            [paddingtensor2D(item["loss_mask"], max_length) for item in features]
+        )
+        # Optional keys start as None; each is filled below only if every feature has it.
+        batch = {
+            "input_ids": batch_input_ids,
+            "attention_mask": batch_attention_mask,
+            "loss_mask": batch_loss_mask,
+            "feature_attention_mask": None,
+            "input_features": None,
+            "hidden_states": None,
+            "target_hiddens": None,
+            "inputs_embeds": None,
+            "position_ids": None,
+        }
+
+        # Requires both keys in every feature; only presence is tested, None is not checked
+        if all(
+            "hidden_states" in item and "target_hiddens" in item for item in features
+        ):
+            batch["hidden_states"] = torch.cat(
+                [paddingtensor(item["hidden_states"], max_length) for item in features]
+            )
+            batch["target_hiddens"] = torch.cat(
+                [paddingtensor(item["target_hiddens"], max_length) for item in features]
+            )
+        if all(
+            "inputs_embeds" in item and item["inputs_embeds"] is not None
+            for item in features
+        ):
+            batch["inputs_embeds"] = torch.cat(
+                [paddingtensor(item["inputs_embeds"], max_length) for item in features]
+            )
+        if all(
+            "position_ids" in item and item["position_ids"] is not None
+            for item in features
+        ):
+            batch["position_ids"] = torch.cat(
+                [paddingtensor2D(item["position_ids"], max_length) for item in features]
+            )
+        if all(  # concatenated as-is below, without going through a padding helper
+            "feature_attention_mask" in item
+            and item["feature_attention_mask"] is not None
+            for item in features
+        ):
+            batch["feature_attention_mask"] = torch.cat(
+                [(item["feature_attention_mask"]) for item in features]
+            )
+        if all(  # concatenated as-is below, without going through a padding helper
+            "input_features" in item and item["input_features"] is not None
+            for item in features
+        ):
+            batch["input_features"] = torch.cat(
+                [(item["input_features"]) for item in features]
+            )
+        return batch
diff --git a/angelslim/compressor/speculative/train/data/dataset_builder/__init__.py b/angelslim/compressor/speculative/train/data/dataset_builder/__init__.py
@@ -19,6 +19,7 @@
     OfflineVLMHunyuanVLDatasetBuilder,
 )
 from .online_dataset_builder import (
+    OnlineAudioDatasetBuilder,
     OnlineLLMDatasetBuilder,
     OnlineVLMDatasetBuilder,
     OnlineVLMHunyuanVLDatasetBuilder,
@@ -32,4 +33,5 @@
     "OfflineVLMDatasetBuilder",
     "OfflineVLMHunyuanVLDatasetBuilder",
     "DatasetBuilderFactory",
+    "OnlineAudioDatasetBuilder",
 ]