@@ -15,6 +15,9 @@ void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) {
1515
1616 ml.get_key (LLM_KV_ATTENTION_SLIDING_WINDOW , hparams.n_swa , false );
1717 ml.get_key (LLM_KV_ATTENTION_LAYERNORM_RMS_EPS , hparams.f_norm_rms_eps );
18+ ml.get_key (LLM_KV_NEXTN_PREDICT_LAYERS , hparams.nextn_predict_layers , false );
19+ GGML_ASSERT (hparams.nextn_predict_layers < hparams.n_layer && " nextn_predict_layers must be < n_layer" );
20+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers ;
1821
1922 switch (hparams.n_layer ) {
2023 case 30 : type = LLM_TYPE_1_2B ; break ;
@@ -38,21 +41,37 @@ void llama_model_exaone4::load_arch_tensors(llama_model_loader &) {
3841 }
3942
4043 for (int i = 0 ; i < n_layer; ++i) {
44+ const bool is_nextn = hparams.nextn_predict_layers > 0 && static_cast <uint32_t >(i) >= n_layer - hparams.nextn_predict_layers ;
45+ int flags = 0 ;
46+ if (is_nextn) {
47+ // NextN/MTP layers are preserved in GGUF but are not executed yet.
48+ flags |= TENSOR_SKIP ;
49+ }
50+
4151 auto & layer = layers[i];
4252
43- create_tensor_qkv (layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0 );
44- layer.wo = create_tensor (tn (LLM_TENSOR_ATTN_OUT , " weight" , i), {n_embd, n_embd}, 0 );
53+ create_tensor_qkv (layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, flags);
54+ layer.wo = create_tensor (tn (LLM_TENSOR_ATTN_OUT , " weight" , i), {n_embd, n_embd}, flags);
55+
56+ if (!is_nextn) {
57+ layer.rope_freqs = create_tensor (tn (LLM_TENSOR_ROPE_FREQS , " weight" , i), {n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0 ));
58+ }
4559
46- layer.rope_freqs = create_tensor (tn (LLM_TENSOR_ROPE_FREQS , " weight" , i), {n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0 ));
60+ layer.attn_post_norm = create_tensor (tn (LLM_TENSOR_ATTN_POST_NORM , " weight" , i), {n_embd}, flags);
61+ layer.attn_q_norm = create_tensor (tn (LLM_TENSOR_ATTN_Q_NORM , " weight" , i), {n_embd_head_k}, flags);
62+ layer.attn_k_norm = create_tensor (tn (LLM_TENSOR_ATTN_K_NORM , " weight" , i), {n_embd_head_k}, flags);
4763
48- layer.attn_post_norm = create_tensor (tn (LLM_TENSOR_ATTN_POST_NORM , " weight" , i), {n_embd}, 0 );
49- layer.attn_q_norm = create_tensor (tn (LLM_TENSOR_ATTN_Q_NORM , " weight" , i), {n_embd_head_k}, 0 );
50- layer.attn_k_norm = create_tensor (tn (LLM_TENSOR_ATTN_K_NORM , " weight" , i), {n_embd_head_k}, 0 );
64+ layer.ffn_gate = create_tensor (tn (LLM_TENSOR_FFN_GATE , " weight" , i), {n_embd, n_ff}, flags);
65+ layer.ffn_down = create_tensor (tn (LLM_TENSOR_FFN_DOWN , " weight" , i), { n_ff, n_embd}, flags);
66+ layer.ffn_up = create_tensor (tn (LLM_TENSOR_FFN_UP , " weight" , i), {n_embd, n_ff}, flags);
67+ layer.ffn_post_norm = create_tensor (tn (LLM_TENSOR_FFN_POST_NORM , " weight" , i), {n_embd}, flags);
5168
52- layer.ffn_gate = create_tensor (tn (LLM_TENSOR_FFN_GATE , " weight" , i), {n_embd, n_ff}, 0 );
53- layer.ffn_down = create_tensor (tn (LLM_TENSOR_FFN_DOWN , " weight" , i), { n_ff, n_embd}, 0 );
54- layer.ffn_up = create_tensor (tn (LLM_TENSOR_FFN_UP , " weight" , i), {n_embd, n_ff}, 0 );
55- layer.ffn_post_norm = create_tensor (tn (LLM_TENSOR_FFN_POST_NORM , " weight" , i), {n_embd}, 0 );
69+ if (is_nextn) {
70+ layer.nextn .eh_proj = create_tensor (tn (LLM_TENSOR_NEXTN_EH_PROJ , " weight" , i), {2 * n_embd, n_embd}, flags);
71+ layer.nextn .enorm = create_tensor (tn (LLM_TENSOR_NEXTN_ENORM , " weight" , i), {n_embd}, flags);
72+ layer.nextn .hnorm = create_tensor (tn (LLM_TENSOR_NEXTN_HNORM , " weight" , i), {n_embd}, flags);
73+ layer.nextn .shared_head_norm = create_tensor (tn (LLM_TENSOR_NEXTN_SHARED_HEAD_NORM , " weight" , i), {n_embd}, flags | TENSOR_NOT_REQUIRED );
74+ }
5675 }
5776}
5877
@@ -90,7 +109,11 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
90109 }
91110 ggml_tensor * inp_out_ids = build_inp_out_ids ();
92111
93- for (int il = 0 ; il < n_layer; ++il) {
112+ // MTP / NextN tail blocks are flagged TENSOR_SKIP at tensor creation and are not executed yet (same as exaone-moe).
113+ const int n_layer_main = int (n_layer) - int (hparams.nextn_predict_layers );
114+ GGML_ASSERT (n_layer_main > 0 );
115+
116+ for (int il = 0 ; il < n_layer_main; ++il) {
94117 ggml_tensor * inpSA = inpL;
95118
96119 // use RoPE for SWA layers or non-SWA models
@@ -126,7 +149,7 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
126149 Qcur, Kcur, Vcur, nullptr , nullptr , nullptr , 1 .0f / sqrtf (float (n_embd_head)), il);
127150 cb (cur, " attn_out" , il);
128151 }
129- if (il == n_layer - 1 && inp_out_ids) {
152+ if (il == n_layer_main - 1 && inp_out_ids) {
130153 cur = ggml_get_rows (ctx0, cur, inp_out_ids);
131154 inpSA = ggml_get_rows (ctx0, inpSA, inp_out_ids);
132155 }
0 commit comments