Skip to content

Commit db164a1

Browse files
committed
spec: support eagle3
1 parent 0c7eeb3 commit db164a1

18 files changed

Lines changed: 1069 additions & 13 deletions

common/speculative.cpp

Lines changed: 426 additions & 10 deletions
Large diffs are not rendered by default.

common/speculative.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,10 @@ common_speculative * common_speculative_init(common_params_speculative & params,
2424

2525
void common_speculative_free(common_speculative * spec);
2626

27+
// Optional setup hook to call once after loading the draft model but before creating its context.
28+
// Inherits any missing weights from the target model (e.g. tok_embd / lm_head from target model for eagle3 / dflash)
29+
void common_speculative_setup_draft_model(struct llama_model * model_dft, const struct llama_model * model_tgt);
30+
2731
struct common_speculative_draft_params {
2832
// this flag is used to chain the drafts through all the available implementations
2933
// after the first successful draft from an implementation, we set it

conversion/base.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,7 @@ class ModelBase:
9494
metadata: gguf.Metadata
9595
dir_model_card: Path
9696
remote_hf_model_id: str | None
97+
target_model_dir: Path | None
9798

9899
# subclasses should define this!
99100
model_arch: gguf.MODEL_ARCH
@@ -119,6 +120,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
119120
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
120121
disable_mistral_community_chat_template: bool = False,
121122
sentence_transformers_dense_modules: bool = False,
123+
target_model_dir: Path | None = None,
122124
fuse_gate_up_exps: bool = False):
123125
if type(self) is ModelBase or \
124126
type(self) is TextModel or \
@@ -138,6 +140,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
138140
self.dry_run = dry_run
139141
self.remote_hf_model_id = remote_hf_model_id
140142
self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
143+
self.target_model_dir = target_model_dir
141144
self.fuse_gate_up_exps = fuse_gate_up_exps
142145
self._gate_exp_buffer: dict[int, Tensor] = {}
143146
self._up_exp_buffer: dict[int, Tensor] = {}
@@ -2340,6 +2343,7 @@ class LazyTorchTensor(gguf.LazyBase):
23402343
torch.float16: np.float16,
23412344
torch.float32: np.float32,
23422345
torch.uint8: np.uint8,
2346+
torch.int64: np.int64,
23432347
}
23442348

23452349
# only used when byteswapping data. Only correct size is needed

conversion/llama.py

Lines changed: 118 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
if TYPE_CHECKING:
1111
from torch import Tensor
1212

13-
from .base import ModelBase, TextModel, gguf
13+
from .base import ModelBase, TextModel, gguf, logger
1414

1515

1616
@ModelBase.register(
@@ -21,6 +21,9 @@
2121
"VLlama3ForCausalLM",
2222
"LlavaForConditionalGeneration",
2323
"VoxtralForConditionalGeneration",
24+
"LlamaForCausalLMEagle3",
25+
"Eagle3Speculator",
26+
"Eagle3DraftModel",
2427
"IQuestCoderForCausalLM",
2528
"LlamaModel")
2629
class LlamaModel(TextModel):
@@ -39,7 +42,57 @@ def __init__(self, *args, **kwargs):
3942
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
4043
self.origin_hf_arch = hparams.get('architectures', [None])[0]
4144

45+
# Detect eagle3 draft checkpoint by hparams (some models don't use a distinct HF arch name)
46+
if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
47+
self.is_eagle3 = True
48+
self.model_arch = gguf.MODEL_ARCH.EAGLE3
49+
logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture")
50+
# Re-initialize tensor_map with eagle3 architecture
51+
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
52+
# Update gguf_writer architecture
53+
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
54+
self.gguf_writer.add_architecture()
55+
if self.target_model_dir is None:
56+
raise ValueError(
57+
"EAGLE-3 model requires --target-model-dir to be specified. "
58+
"Please provide the path to the target model directory to read config.json"
59+
)
60+
# Read both eagle3 raw config and target model config
61+
with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f:
62+
eagle3_raw_config = json.load(f)
63+
with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f:
64+
target_config = json.load(f)
65+
66+
# extract_layers: derived from target model layer count (low/mid/high)
67+
target_num_layers = target_config["num_hidden_layers"]
68+
extract_layers = [2, target_num_layers // 2, target_num_layers - 3]
69+
logger.info(f"EAGLE-3: extract_layers = {extract_layers} (target model has {target_num_layers} layers)")
70+
self.gguf_writer.add_array(f"{self.gguf_writer.arch}.extract_layers", extract_layers)
71+
72+
# target_hidden_size: prefer eagle3 config, fallback to target config
73+
if eagle3_raw_config.get("target_hidden_size") is not None:
74+
target_hidden_size = eagle3_raw_config["target_hidden_size"]
75+
src = "EAGLE-3 config"
76+
else:
77+
target_hidden_size = target_config["hidden_size"]
78+
src = "target model config"
79+
logger.info(f"EAGLE-3: target_hidden_size = {target_hidden_size} (from {src})")
80+
self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size)
81+
82+
# norm_before_residual (RedHat-style eagle3 specific)
83+
norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
84+
logger.info(f"EAGLE-3: norm_before_residual = {norm_before_residual}")
85+
self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual)
86+
4287
def set_vocab(self):
88+
# eagle3: use tokenizer from target model if provided
89+
original_dir_model = None
90+
if getattr(self, 'is_eagle3', False):
91+
assert self.target_model_dir is not None
92+
logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}")
93+
original_dir_model = self.dir_model
94+
self.dir_model = self.target_model_dir
95+
4396
if self.origin_hf_arch == "GlmasrModel":
4497
return self._set_vocab_glmedge()
4598

@@ -83,6 +136,10 @@ def set_vocab(self):
83136
if self.hparams.get("vocab_size", 32000) == 49152:
84137
self.gguf_writer.add_add_bos_token(False)
85138

139+
# eagle3: Restore original dir_model
140+
if original_dir_model is not None:
141+
self.dir_model = original_dir_model
142+
86143
def set_gguf_parameters(self):
87144
super().set_gguf_parameters()
88145
hparams = self.hparams
@@ -127,7 +184,49 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
127184

128185
return super().filter_tensors((name, gen))
129186

187+
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
188+
tensors = super().index_tensors(remote_hf_model_id)
189+
190+
# Handle Eagle3Speculator nested config
191+
if "transformer_layer_config" in self.hparams:
192+
self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]}
193+
194+
# eagle3 detection
195+
if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
196+
logger.info("EAGLE-3: renaming midlayer.* / layers.0.* to model.layers.0.*")
197+
new_tensors = {}
198+
for name, gen in tensors.items():
199+
if name.startswith("midlayer."):
200+
new_name = "model.layers.0." + name[len("midlayer."):]
201+
new_tensors[new_name] = gen
202+
elif name.startswith("layers.0."): # Eagle3Speculator format
203+
new_name = "model." + name
204+
new_tensors[new_name] = gen
205+
else:
206+
new_tensors[name] = gen
207+
return new_tensors
208+
209+
return tensors
210+
130211
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
212+
# eagle3: special tensors that bypass standard llama mapping
213+
if getattr(self, 'is_eagle3', False):
214+
if name == "fc.weight":
215+
yield (name, data_torch)
216+
return
217+
if name == "d2t":
218+
# store for manual int64 handling in prepare_tensors (avoid F32 conversion)
219+
if not hasattr(self, '_eagle3_int_tensors'):
220+
self._eagle3_int_tensors = {}
221+
self._eagle3_int_tensors[name] = data_torch
222+
return
223+
if name == "t2d":
224+
# not used at runtime, skip
225+
return
226+
if name == "model.layers.0.hidden_norm.weight":
227+
yield ("blk.0.hidden_norm.weight", data_torch)
228+
return
229+
131230
n_head = self.find_hparam(["n_heads", "num_attention_heads"])
132231
n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])
133232

@@ -203,8 +302,26 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
203302
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
204303

205304
def prepare_tensors(self):
305+
# eagle3: collect d2t original dtype before parent converts tensors to F32
306+
eagle3_original_dtypes = {}
307+
if getattr(self, 'is_eagle3', False):
308+
for name, data_torch in self.get_tensors():
309+
if name == "d2t":
310+
eagle3_original_dtypes[name] = data_torch.dtype
311+
206312
super().prepare_tensors()
207313

314+
# eagle3: write d2t as int64 directly (not converted to F32)
315+
if getattr(self, 'is_eagle3', False) and hasattr(self, '_eagle3_int_tensors'):
316+
for name, data_torch in self._eagle3_int_tensors.items():
317+
old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype)
318+
data = data_torch.to(torch.int64).numpy()
319+
data_qtype = gguf.GGMLQuantizationType.I64
320+
321+
shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
322+
logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
323+
self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
324+
208325
if self._experts is not None:
209326
# flatten `list[dict[str, Tensor]]` into `list[str]`
210327
experts = [k for d in self._experts for k in d.keys()]

convert_hf_to_gguf.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -149,6 +149,15 @@ def parse_args() -> argparse.Namespace:
149149
help="Fuse gate_exps and up_exps tensors into a single gate_up_exps tensor for MoE models.",
150150
)
151151

152+
parser.add_argument(
153+
"--target-model-dir", type=str, default=None,
154+
help=(
155+
"path to the target model directory; required when converting a standalone draft model "
156+
"(e.g. EAGLE3 / DFlash) that needs target-model metadata such as tokenizer, hidden size, and "
157+
"layer count to populate its GGUF."
158+
),
159+
)
160+
152161
args = parser.parse_args()
153162
if not args.print_supported_models and args.model is None:
154163
parser.error("the following arguments are required: model")
@@ -264,6 +273,7 @@ def main() -> None:
264273
small_first_shard=args.no_tensor_first_split,
265274
remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
266275
sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
276+
target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None,
267277
fuse_gate_up_exps=args.fuse_gate_up_exps
268278
)
269279

gguf-py/gguf/constants.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -152,6 +152,9 @@ class LLM:
152152
SWIGLU_CLAMP_SHEXP = "{arch}.swiglu_clamp_shexp"
153153
DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in"
154154
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
155+
EAGLE3_EXTRACT_LAYERS = "{arch}.extract_layers"
156+
EAGLE3_TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size"
157+
EAGLE3_NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual"
155158

156159
class Attention:
157160
HEAD_COUNT = "{arch}.attention.head_count"
@@ -498,6 +501,7 @@ class MODEL_ARCH(IntEnum):
498501
RND1 = auto()
499502
PANGU_EMBED = auto()
500503
MISTRAL3 = auto()
504+
EAGLE3 = auto()
501505
MISTRAL4 = auto()
502506
PADDLEOCR = auto()
503507
MIMO2 = auto()
@@ -862,6 +866,10 @@ class MODEL_TENSOR(IntEnum):
862866
NEXTN_HNORM = auto()
863867
NEXTN_SHARED_HEAD_HEAD = auto()
864868
NEXTN_SHARED_HEAD_NORM = auto()
869+
# eagle3
870+
EAGLE3_FC = auto() # feature fusion layer
871+
EAGLE3_HIDDEN_NORM = auto() # hidden normalization
872+
EAGLE3_D2T = auto() # draft to target vocabulary mapping
865873
# lfm2 audio
866874
A_ENC_NORM_CONV = auto()
867875
A_ENC_LINEAR_POS = auto()
@@ -1014,6 +1022,7 @@ class MODEL_TENSOR(IntEnum):
10141022
MODEL_ARCH.RND1: "rnd1",
10151023
MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
10161024
MODEL_ARCH.MISTRAL3: "mistral3",
1025+
MODEL_ARCH.EAGLE3: "eagle3",
10171026
MODEL_ARCH.MISTRAL4: "mistral4",
10181027
MODEL_ARCH.PADDLEOCR: "paddleocr",
10191028
MODEL_ARCH.MIMO2: "mimo2",
@@ -1407,6 +1416,9 @@ class MODEL_TENSOR(IntEnum):
14071416
MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm",
14081417
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head",
14091418
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm",
1419+
MODEL_TENSOR.EAGLE3_FC: "fc",
1420+
MODEL_TENSOR.EAGLE3_HIDDEN_NORM: "blk.{bid}.hidden_norm",
1421+
MODEL_TENSOR.EAGLE3_D2T: "d2t",
14101422
}
14111423

14121424
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -3854,6 +3866,24 @@ class MODEL_TENSOR(IntEnum):
38543866
MODEL_TENSOR.FFN_DOWN_EXP,
38553867
MODEL_TENSOR.FFN_UP_EXP,
38563868
],
3869+
MODEL_ARCH.EAGLE3: [
3870+
MODEL_TENSOR.TOKEN_EMBD,
3871+
MODEL_TENSOR.OUTPUT_NORM,
3872+
MODEL_TENSOR.OUTPUT,
3873+
MODEL_TENSOR.ROPE_FREQS,
3874+
MODEL_TENSOR.ATTN_NORM,
3875+
MODEL_TENSOR.ATTN_Q,
3876+
MODEL_TENSOR.ATTN_K,
3877+
MODEL_TENSOR.ATTN_V,
3878+
MODEL_TENSOR.ATTN_OUT,
3879+
MODEL_TENSOR.FFN_NORM,
3880+
MODEL_TENSOR.FFN_GATE,
3881+
MODEL_TENSOR.FFN_DOWN,
3882+
MODEL_TENSOR.FFN_UP,
3883+
MODEL_TENSOR.EAGLE3_FC,
3884+
MODEL_TENSOR.EAGLE3_HIDDEN_NORM,
3885+
MODEL_TENSOR.EAGLE3_D2T,
3886+
],
38573887
MODEL_ARCH.MISTRAL4: [
38583888
MODEL_TENSOR.TOKEN_EMBD,
38593889
MODEL_TENSOR.OUTPUT_NORM,

src/llama-arch.cpp

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -126,6 +126,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
126126
{ LLM_ARCH_RND1, "rnd1" },
127127
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
128128
{ LLM_ARCH_MISTRAL3, "mistral3" },
129+
{ LLM_ARCH_EAGLE3, "eagle3" },
129130
{ LLM_ARCH_MISTRAL4, "mistral4" },
130131
{ LLM_ARCH_PADDLEOCR, "paddleocr" },
131132
{ LLM_ARCH_MIMO2, "mimo2" },
@@ -285,6 +286,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
285286

286287
{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
287288

289+
{ LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" },
290+
{ LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" },
291+
{ LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" },
292+
288293
{ LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
289294
// sentence-transformers dense modules feature dims
290295
{ LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" },
@@ -548,6 +553,9 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
548553
{ LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" },
549554
{ LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" },
550555
{ LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
556+
{ LLM_TENSOR_EAGLE3_HIDDEN_NORM, "blk.%d.hidden_norm" },
557+
{ LLM_TENSOR_EAGLE3_FC, "fc" },
558+
{ LLM_TENSOR_EAGLE3_D2T, "d2t" },
551559
};
552560

553561
// declare information about the model weight tensors:
@@ -769,6 +777,10 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
769777
// Nemotron 3 Super
770778
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
771779
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
780+
// eagle3
781+
{LLM_TENSOR_EAGLE3_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
782+
{LLM_TENSOR_EAGLE3_HIDDEN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
783+
{LLM_TENSOR_EAGLE3_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
772784
};
773785

774786
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

src/llama-arch.h

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,7 @@ enum llm_arch {
138138
LLM_ARCH_MAINCODER,
139139
LLM_ARCH_KIMI_LINEAR,
140140
LLM_ARCH_UNKNOWN,
141+
LLM_ARCH_EAGLE3,
141142
};
142143

143144
enum llm_kv {
@@ -327,6 +328,10 @@ enum llm_kv {
327328

328329
LLM_KV_CLASSIFIER_OUTPUT_LABELS,
329330

331+
LLM_KV_EAGLE3_EXTRACT_LAYERS,
332+
LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE,
333+
LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL,
334+
330335
LLM_KV_SHORTCONV_L_CACHE,
331336

332337
LLM_KV_XIELU_ALPHA_N,
@@ -555,6 +560,9 @@ enum llm_tensor {
555560
LLM_TENSOR_NEXTN_HNORM,
556561
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
557562
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
563+
LLM_TENSOR_EAGLE3_FC,
564+
LLM_TENSOR_EAGLE3_HIDDEN_NORM,
565+
LLM_TENSOR_EAGLE3_D2T,
558566
};
559567

560568
enum llm_tensor_layer {

0 commit comments

Comments
 (0)