Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -636,12 +636,12 @@ Benchmark results for Qwen3-VL series models using Eagle3 speculative decoding o

##### 1.2.2 HunyuanOCR Model

Benchmark results for HunyuanOCR using Eagle3 speculative decoding on vLLM (v0.13.0) across OCR tasks, using a single NVIDIA H20 GPU (**tp=1, ep=1, num_speculative_tokens=4, batch_size=1, output_len=1024**).
Benchmark results for HunyuanOCR using Eagle3 speculative decoding on vLLM (v0.13.0), evaluated on the **[OmniDocBench](https://huggingface.co/datasets/opendatalab/OmniDocBench)** dataset using a single NVIDIA H20 GPU (**tp=1, ep=1, num_speculative_tokens=4, batch_size=1, output_len=1024**).
<table><thead>
<tr>
<th>Model</th>
<th>Method</th>
<th colspan="2">OCR-Bench-Internal</th>
<th colspan="2">OmniDocBench</th>
</tr></thead>
<tbody>
<tr>
Expand All @@ -653,13 +653,13 @@ Benchmark results for HunyuanOCR using Eagle3 speculative decoding on vLLM (v0.1
<tr>
<td rowspan="2">Hunyuan-OCR</td>
<td>Vanilla</td>
<td>71.21</td>
<td>70.12</td>
<td>1</td>
</tr>
<tr>
<td>Eagle3</td>
<td>120.75</td>
<td>2.2</td>
<td>108.1</td>
<td>2.08</td>
</tr>
</tbody>
</table>
Expand Down
10 changes: 5 additions & 5 deletions README_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -640,13 +640,13 @@ bash scripts/deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --ta

##### 1.2.2 HunyuanOCR模型

我们使用(v0.13.0)评测了HunyuanOCR Eagle3模型在 **OCR-Bench** 上的接收长度和吞吐。结果是在单张H20上用以下设置测得:**tp=1, ep=1, num_speculative_tokens=4, batch_size=1, output_len=1024**。
我们使用vLLM (v0.13.0)评测了HunyuanOCR Eagle3模型在[OmniDocBench](https://huggingface.co/datasets/opendatalab/OmniDocBench)上的接收长度和吞吐。结果是在单张H20上用以下设置测得:**tp=1, ep=1, num_speculative_tokens=4, batch_size=1, output_len=1024**。

<table><thead>
<tr>
<th>Model</th>
<th>Method</th>
<th colspan="2">OCR-Bench-Internal</th>
<th colspan="2">OmniDocBench</th>
</tr></thead>
<tbody>
<tr>
Expand All @@ -658,13 +658,13 @@ bash scripts/deploy/lm_eval.sh -d 0,1 -t 2 -g 0.8 -r $RESULT_PATH -b "auto" --ta
<tr>
<td rowspan="2">Hunyuan-OCR</td>
<td>Vanilla</td>
<td>71.21</td>
<td>70.12</td>
<td>1</td>
</tr>
<tr>
<td>Eagle3</td>
<td>120.75</td>
<td>2.2</td>
<td>108.1</td>
<td>2.08</td>
</tr>
</tbody>
</table>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"architectures": [
"Eagle3LlamaForCausalLM"
],
"model_type": "llama",
"target_model_type": "qwen2_audio",
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 151643,
"dtype": "bfloat16",
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 11008,
"max_position_embeddings": 8192,
"num_attention_heads": 32,
"num_hidden_layers": 1,
"num_key_value_heads": 4,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 10000,
"use_cache": true,
"vocab_size": 156032,
"tie_word_embeddings": false,
"transformers_version": "4.57.1",
"draft_vocab_size": 32000,
"modal_type": "Audio"
}
68 changes: 68 additions & 0 deletions angelslim/compressor/speculative/train/data/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,3 +341,71 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
[paddingtensor2D(item["position_ids"], max_length) for item in features]
)
return batch


class AudioDataCollatorWithPadding:
    """Collate per-sample feature dicts into a single padded batch dict.

    ``input_ids``, ``attention_mask`` and ``loss_mask`` are always padded and
    concatenated. Every other entry of the returned dict defaults to ``None``
    and is only filled in when all samples provide it.
    """

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Build the batch dict from a list of sample dicts.

        Args:
            features: Sample dicts. Each must contain ``input_ids``,
                ``attention_mask`` and ``loss_mask``; the remaining keys are
                optional.

        Returns:
            A dict with the keys ``input_ids``, ``attention_mask``,
            ``loss_mask``, ``feature_attention_mask``, ``input_features``,
            ``hidden_states``, ``target_hiddens``, ``inputs_embeds`` and
            ``position_ids``. Optional entries are ``None`` unless every
            sample supplies them.
        """
        # Padding target: the largest second dimension of any ``input_ids``.
        target_len = max(sample["input_ids"].shape[1] for sample in features)

        def padded_cat(key, pad_fn):
            # Pad each sample's ``key`` entry to ``target_len``, then join.
            return torch.cat(
                [pad_fn(sample[key], target_len) for sample in features]
            )

        def raw_cat(key):
            # Join each sample's ``key`` entry as-is (no padding applied).
            return torch.cat([sample[key] for sample in features])

        def provided_everywhere(key):
            # True when every sample has ``key`` and its value is not None.
            return all(
                key in sample and sample[key] is not None for sample in features
            )

        batch = {
            "input_ids": padded_cat("input_ids", paddingtensor2D),
            "attention_mask": padded_cat("attention_mask", paddingtensor2D),
            "loss_mask": padded_cat("loss_mask", paddingtensor2D),
            "feature_attention_mask": None,
            "input_features": None,
            "hidden_states": None,
            "target_hiddens": None,
            "inputs_embeds": None,
            "position_ids": None,
        }

        # The hidden-state pair is gated on key presence only (values are not
        # checked against None) and is filled together or not at all.
        missing_hidden_pair = any(
            "hidden_states" not in sample or "target_hiddens" not in sample
            for sample in features
        )
        if not missing_hidden_pair:
            batch["hidden_states"] = padded_cat("hidden_states", paddingtensor)
            batch["target_hiddens"] = padded_cat("target_hiddens", paddingtensor)

        if provided_everywhere("inputs_embeds"):
            batch["inputs_embeds"] = padded_cat("inputs_embeds", paddingtensor)

        if provided_everywhere("position_ids"):
            batch["position_ids"] = padded_cat("position_ids", paddingtensor2D)

        # Audio-side tensors are concatenated without padding to target_len.
        for audio_key in ("feature_attention_mask", "input_features"):
            if provided_everywhere(audio_key):
                batch[audio_key] = raw_cat(audio_key)

        return batch
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
OfflineVLMHunyuanVLDatasetBuilder,
)
from .online_dataset_builder import (
OnlineAudioDatasetBuilder,
OnlineLLMDatasetBuilder,
OnlineVLMDatasetBuilder,
OnlineVLMHunyuanVLDatasetBuilder,
Expand All @@ -32,4 +33,5 @@
"OfflineVLMDatasetBuilder",
"OfflineVLMHunyuanVLDatasetBuilder",
"DatasetBuilderFactory",
"OnlineAudioDatasetBuilder",
]
Loading
Loading