Skip to content

Commit 1c786f2

Browse files
committed
speculative: add qwen3-vl-8b-eagle3-mrope config
1 parent 4fe1230 commit 1c786f2

10 files changed

Lines changed: 58 additions & 15 deletions

angelslim/compressor/speculative/train/configs/qwen3-1.7b-eagle3.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
"architectures": [
33
"Eagle3LlamaForCausalLM"
44
],
5+
"model_type": "llama",
6+
"torch_dtype": "bfloat16",
57
"attention_bias": false,
68
"attention_dropout": 0.0,
79
"bos_token_id": 151643,
@@ -13,7 +15,6 @@
1315
"intermediate_size": 6144,
1416
"max_position_embeddings": 40960,
1517
"max_window_layers": 28,
16-
"model_type": "llama",
1718
"num_attention_heads": 16,
1819
"num_hidden_layers": 1,
1920
"num_key_value_heads": 8,
@@ -22,7 +23,6 @@
2223
"rope_theta": 1000000,
2324
"sliding_window": null,
2425
"tie_word_embeddings": true,
25-
"torch_dtype": "bfloat16",
2626
"transformers_version": "4.51.0",
2727
"use_cache": true,
2828
"use_sliding_window": false,

angelslim/compressor/speculative/train/configs/qwen3-14b-eagle3.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
"architectures": [
33
"Eagle3LlamaForCausalLM"
44
],
5+
"model_type": "llama",
6+
"torch_dtype": "bfloat16",
57
"attention_bias": false,
68
"attention_dropout": 0.0,
79
"bos_token_id": 151643,
@@ -13,7 +15,6 @@
1315
"intermediate_size": 17408,
1416
"max_position_embeddings": 40960,
1517
"max_window_layers": 40,
16-
"model_type": "llama",
1718
"num_attention_heads": 40,
1819
"num_hidden_layers": 1,
1920
"num_key_value_heads": 8,
@@ -22,7 +23,6 @@
2223
"rope_theta": 1000000,
2324
"sliding_window": null,
2425
"tie_word_embeddings": false,
25-
"torch_dtype": "bfloat16",
2626
"transformers_version": "4.51.0",
2727
"use_cache": true,
2828
"use_sliding_window": false,

angelslim/compressor/speculative/train/configs/qwen3-30b-a3b-eagle3.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
"architectures": [
33
"Eagle3LlamaForCausalLM"
44
],
5+
"model_type": "llama",
6+
"torch_dtype": "bfloat16",
57
"attention_bias": false,
68
"attention_dropout": 0.0,
79
"bos_token_id": 151643,
@@ -14,7 +16,6 @@
1416
"intermediate_size": 6144,
1517
"max_position_embeddings": 40960,
1618
"max_window_layers": 48,
17-
"model_type": "llama",
1819
"num_attention_heads": 32,
1920
"num_hidden_layers": 1,
2021
"num_key_value_heads": 4,
@@ -25,7 +26,6 @@
2526
"router_aux_loss_coef": 0.001,
2627
"sliding_window": null,
2728
"tie_word_embeddings": false,
28-
"torch_dtype": "bfloat16",
2929
"transformers_version": "4.51.0",
3030
"use_cache": true,
3131
"use_sliding_window": false,

angelslim/compressor/speculative/train/configs/qwen3-32b-eagle3.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
"architectures": [
33
"Eagle3LlamaForCausalLM"
44
],
5+
"model_type": "llama",
6+
"torch_dtype": "bfloat16",
57
"attention_bias": false,
68
"attention_dropout": 0.0,
79
"bos_token_id": 151643,
@@ -13,7 +15,6 @@
1315
"intermediate_size": 25600,
1416
"max_position_embeddings": 40960,
1517
"max_window_layers": 64,
16-
"model_type": "llama",
1718
"num_attention_heads": 64,
1819
"num_hidden_layers": 1,
1920
"num_key_value_heads": 8,
@@ -22,7 +23,6 @@
2223
"rope_theta": 1000000,
2324
"sliding_window": null,
2425
"tie_word_embeddings": false,
25-
"torch_dtype": "bfloat16",
2626
"transformers_version": "4.51.0",
2727
"use_cache": true,
2828
"use_sliding_window": false,

angelslim/compressor/speculative/train/configs/qwen3-4b-eagle3.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
"architectures": [
33
"Eagle3LlamaForCausalLM"
44
],
5+
"model_type": "llama",
6+
"torch_dtype": "bfloat16",
57
"attention_bias": false,
68
"attention_dropout": 0.0,
79
"bos_token_id": 151643,
@@ -13,7 +15,6 @@
1315
"intermediate_size": 9728,
1416
"max_position_embeddings": 40960,
1517
"max_window_layers": 36,
16-
"model_type": "llama",
1718
"num_attention_heads": 32,
1819
"num_hidden_layers": 1,
1920
"num_key_value_heads": 8,
@@ -22,7 +23,6 @@
2223
"rope_theta": 1000000,
2324
"sliding_window": null,
2425
"tie_word_embeddings": false,
25-
"torch_dtype": "bfloat16",
2626
"transformers_version": "4.51.0",
2727
"use_cache": true,
2828
"use_sliding_window": false,

angelslim/compressor/speculative/train/configs/qwen3-8b-eagle3.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
"architectures": [
33
"Eagle3LlamaForCausalLM"
44
],
5+
"model_type": "llama",
6+
"torch_dtype": "bfloat16",
57
"attention_bias": false,
68
"attention_dropout": 0.0,
79
"bos_token_id": 151643,
@@ -13,7 +15,6 @@
1315
"intermediate_size": 12288,
1416
"max_position_embeddings": 40960,
1517
"max_window_layers": 36,
16-
"model_type": "llama",
1718
"num_attention_heads": 32,
1819
"num_hidden_layers": 1,
1920
"num_key_value_heads": 8,
@@ -22,7 +23,6 @@
2223
"rope_theta": 1000000,
2324
"sliding_window": null,
2425
"tie_word_embeddings": false,
25-
"torch_dtype": "bfloat16",
2626
"transformers_version": "4.51.0",
2727
"use_cache": true,
2828
"use_sliding_window": false,

angelslim/compressor/speculative/train/configs/qwen3-vl-2b-eagle3-mrope.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
],
55
"model_type": "llama",
66
"target_model_type": "qwen3_vl",
7+
"dtype": "bfloat16",
78
"attention_bias": false,
89
"attention_dropout": 0.0,
910
"bos_token_id": 151643,
10-
"dtype": "bfloat16",
1111
"eos_token_id": 151645,
1212
"head_dim": 128,
1313
"hidden_act": "silu",

angelslim/compressor/speculative/train/configs/qwen3-vl-4b-eagle3-mrope.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
],
55
"model_type": "llama",
66
"target_model_type": "qwen3_vl",
7+
"dtype": "bfloat16",
78
"attention_bias": false,
89
"attention_dropout": 0.0,
910
"bos_token_id": 151643,
10-
"dtype": "bfloat16",
1111
"eos_token_id": 151645,
1212
"head_dim": 128,
1313
"hidden_act": "silu",

angelslim/compressor/speculative/train/configs/qwen3-vl-4b-eagle3.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
],
55
"model_type": "llama",
66
"target_model_type": "qwen3_vl",
7+
"dtype": "bfloat16",
78
"attention_bias": false,
89
"attention_dropout": 0.0,
910
"bos_token_id": 151643,
10-
"dtype": "bfloat16",
1111
"eos_token_id": 151645,
1212
"head_dim": 128,
1313
"hidden_act": "silu",

angelslim/compressor/speculative/train/configs/qwen3-vl-8b-eagle3-mrope.json

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
{
2+
"architectures": [
3+
"Eagle3LlamaForCausalLM"
4+
],
5+
"model_type": "llama",
6+
"target_model_type": "qwen3_vl",
7+
"dtype": "bfloat16",
8+
"attention_bias": false,
9+
"attention_dropout": 0.0,
10+
"bos_token_id": 151643,
11+
"eos_token_id": 151645,
12+
"head_dim": 128,
13+
"hidden_act": "silu",
14+
"hidden_size": 4096,
15+
"initializer_range": 0.02,
16+
"intermediate_size": 12288,
17+
"max_position_embeddings": 262144,
18+
"num_attention_heads": 32,
19+
"num_hidden_layers": 1,
20+
"num_key_value_heads": 8,
21+
"rms_norm_eps": 1e-06,
22+
"rope_scaling": {
23+
"type": "default",
24+
"rope_type": "default",
25+
"mrope_interleaved": true,
26+
"mrope_section": [
27+
24,
28+
20,
29+
20
30+
]
31+
},
32+
"rope_theta": 5000000,
33+
"tie_word_embeddings": false,
34+
"use_cache": true,
35+
"vocab_size": 151936,
36+
"transformers_version": "4.57.1",
37+
"image_token_id": 151655,
38+
"video_token_id": 151656,
39+
"vision_end_token_id": 151653,
40+
"vision_start_token_id": 151652,
41+
"draft_vocab_size": 32000,
42+
"modal_type": "VLM"
43+
}

0 commit comments

Comments
 (0)