Skip to content

Commit 48b88c3

Browse files
nuxlear and lgai-exaone authored
model: Add EXAONE 4.5 implementations (ggml-org#21733)
* Add EXAONE 4.5 and Add GQA for MMproj

* mtmd: EXAONE 4.5 vision markers and projector path

  EXAONE 4.5 uses <vision> and </vision> for image boundaries; Qwen keeps <|vision_start|> and <|vision_end|>. Route EXAONE 4.5 through the Qwen2.5-VL-style encode path (window attention pattern, optional mmproj input norm). Update exaone4_5 projector weights and convert_hf_to_gguf for mmproj export.

* mtmd: load EXAONE4 nextn tensors correctly

  Align EXAONE4 tensor registration with EXAONE_MOE for NextN/MTP slots and avoid skip-flag propagation on duplicated rope_freqs so model loading succeeds for EXAONE 4.5 GGUF.

* Minor fixes

* Address PR feedback

* Address PR feedback

* Fix EXAONE after merge

* Fix EXAONE 4.5 conversion

* Address PR feedback

* Refactor EXAONE 4.5 conversion

* Address PR feedback

* Fix unintended deletion

* Minor fix

---------

Co-authored-by: LG-AI-EXAONE <exaonemodels@lgresearch.ai>
1 parent 1962000 commit 48b88c3

11 files changed

Lines changed: 356 additions & 16 deletions

File tree

conversion/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
"Ernie4_5_ForCausalLM": "ernie",
5959
"Ernie4_5_MoeForCausalLM": "ernie",
6060
"EuroBertModel": "bert",
61+
"Exaone4_5_ForConditionalGeneration": "exaone",
6162
"Exaone4ForCausalLM": "exaone",
6263
"ExaoneForCausalLM": "exaone",
6364
"ExaoneMoEForCausalLM": "exaone",
@@ -240,6 +241,7 @@
240241
"DeepseekOCR2ForCausalLM": "deepseek",
241242
"DeepseekOCRForCausalLM": "deepseek",
242243
"DotsOCRForCausalLM": "dotsocr",
244+
"Exaone4_5_ForConditionalGeneration": "exaone",
243245
"Gemma3ForConditionalGeneration": "gemma",
244246
"Gemma3nForConditionalGeneration": "gemma",
245247
"Gemma4ForConditionalGeneration": "gemma",

conversion/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2593,7 +2593,7 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
25932593
# Step3-VL keeps text config under text_config but uses a custom top-level architecture.
25942594
# For text conversion we route to a dedicated text-only class.
25952595
# TODO: refactor this later to avoid adding exception here
2596-
if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM"):
2596+
if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM", "Exaone4_5_ForConditionalGeneration"):
25972597
return arch
25982598

25992599
# if "architectures" is found in the sub-config, use that instead

conversion/exaone.py

Lines changed: 97 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,15 @@
33
import math
44

55
from pathlib import Path
6-
from typing import Iterable, TYPE_CHECKING
6+
from typing import Callable, Iterable, TYPE_CHECKING
77

88
import torch
99

1010
if TYPE_CHECKING:
1111
from torch import Tensor
1212

13-
from .base import ModelBase, TextModel, gguf
13+
from .base import MmprojModel, ModelBase, TextModel, gguf
14+
from .qwenvl import Qwen2VLVisionModel
1415

1516

1617
@ModelBase.register("ExaoneForCausalLM")
@@ -208,3 +209,97 @@ def prepare_tensors(self):
208209
experts = [k for d in self._experts for k in d.keys()]
209210
if len(experts) > 0:
210211
raise ValueError(f"Unprocessed experts: {experts}")
212+
213+
214+
@ModelBase.register("Exaone4_5_ForConditionalGeneration")
class Exaone4_5_TextModel(Exaone4Model):
    """Text-side converter for EXAONE 4.5 checkpoints.

    Tensors are exported under the EXAONE4 architecture. When the config
    declares NextN/MTP layers (``num_nextn_predict_layers`` > 0), those layers
    are appended as extra blocks after the regular ``num_hidden_layers`` blocks.
    """

    model_arch = gguf.MODEL_ARCH.EXAONE4

    def _nextn_layer_count(self) -> int:
        """Return ``num_nextn_predict_layers`` as an int (missing, None or 0 -> 0)."""
        return int(self.hparams.get("num_nextn_predict_layers", 0) or 0)

    def __init__(self, *args, **kwargs):
        """Initialise the base converter, then widen the block range for NextN layers."""
        super().__init__(*args, **kwargs)
        extra_blocks = self._nextn_layer_count()
        if extra_blocks <= 0:
            return
        # The tensor-name map is built per block index, so it has to be rebuilt
        # once the block count grows to include the NextN tail blocks.
        self.block_count = self.hparams["num_hidden_layers"] + extra_blocks
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

    def set_gguf_parameters(self):
        """Write the inherited metadata plus the NextN layer count when it is positive."""
        super().set_gguf_parameters()
        extra_blocks = self._nextn_layer_count()
        if extra_blocks > 0:
            self.gguf_writer.add_nextn_predict_layers(extra_blocks)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Remap ``mtp.*`` tensors onto the trailing NextN blocks; delegate everything else.

        * Names not starting with ``mtp.`` go straight to the parent implementation.
        * ``mtp.*`` tensors are dropped (nothing is yielded) when no NextN layers
          are configured, or when a non-layer ``mtp.*`` tensor has no known mapping.
        * ``mtp.layers.<i>.*`` is renamed to ``model.layers.<num_hidden_layers + i>.*``;
          with ``mtp_share_layers`` set, the same tensor is emitted once for each
          NextN block instead.
        * The remaining ``mtp.*`` tensors (fc / norms) are emitted once per NextN block.
        """
        if not name.startswith("mtp."):
            yield from super().modify_tensors(data_torch, name, bid)
            return

        extra_blocks = self._nextn_layer_count()
        if extra_blocks <= 0:
            return

        base_layers = self.hparams["num_hidden_layers"]

        if ".layers." not in name:
            shared_targets = {
                "mtp.fc": gguf.MODEL_TENSOR.NEXTN_EH_PROJ,
                "mtp.pre_fc_norm_embedding": gguf.MODEL_TENSOR.NEXTN_ENORM,
                "mtp.pre_fc_norm_hidden": gguf.MODEL_TENSOR.NEXTN_HNORM,
                "mtp.norm": gguf.MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
            }
            # Path is used purely as a string splitter here:
            # "mtp.fc.weight" -> stem "mtp.fc", suffix ".weight".
            pieces = Path(name)
            if pieces.stem not in shared_targets:
                return
            target = shared_targets[pieces.stem]
            for block_idx in range(base_layers, self.block_count):
                gguf_name = self.format_tensor_name(target, block_idx, suffix=pieces.suffix)
                yield from ModelBase.modify_tensors(self, data_torch, gguf_name, block_idx)
            return

        src_idx = 0 if bid is None else bid
        src_prefix = f"mtp.layers.{src_idx}"

        if self.hparams.get("mtp_share_layers", False):
            for offset in range(extra_blocks):
                dst_idx = base_layers + offset
                renamed = name.replace(src_prefix, f"model.layers.{dst_idx}")
                yield from super().modify_tensors(data_torch, renamed, dst_idx)
            return

        # Non-shared case: only the name is shifted; `bid` is forwarded unchanged.
        renamed = name.replace(src_prefix, f"model.layers.{src_idx + base_layers}")
        yield from super().modify_tensors(data_torch, renamed, bid)
265+
266+
267+
@ModelBase.register("Exaone4_5_ForConditionalGeneration")
class Exaone4_5VisionModel(Qwen2VLVisionModel):
    """Vision tower (mmproj) converter for EXAONE 4.5.

    Built on the Qwen2-VL vision converter, with a dedicated projector type,
    a KV head count for grouped-query attention, and optional window-attention
    metadata.
    """

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        """Strip the leading ``model.`` from ``model.visual.*`` names, then apply the parent filter.

        Only the first occurrence of ``model.visual.`` is rewritten.
        """
        name, gen = item
        name = name.replace("model.visual.", "visual.", 1)
        return super().filter_tensors((name, gen))

    def set_gguf_parameters(self):
        """Write the EXAONE 4.5 vision metadata.

        Deliberately calls ``MmprojModel.set_gguf_parameters`` directly, skipping
        the ``Qwen2VLVisionModel`` override, and then adds the projector type,
        pixel limits, KV head count, norm epsilon and window-attention settings.

        Raises:
            KeyError: if ``min_pixels`` / ``max_pixels`` are missing from the
                preprocessor config.
            ValueError: if ``fullatt_block_indexes`` is not evenly spaced.
        """
        MmprojModel.set_gguf_parameters(self)
        assert self.hparams_vision is not None
        hparams = self.hparams_vision
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.EXAONE4_5)
        self.gguf_writer.add_vision_use_silu(True)
        self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
        self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])

        # The EXAONE4_5 clip loader reads the vision KV head count as a required
        # key, so always emit it. Without `num_key_value_heads` the tower is
        # treated as plain multi-head attention (KV heads == attention heads).
        # NOTE(review): the fallback key names are assumed; confirm them against
        # the keys MmprojModel uses for the vision head count.
        num_kv_head = self.find_vparam(["num_key_value_heads"], optional=True)
        if num_kv_head is None:
            num_kv_head = self.find_vparam(["num_attention_heads", "num_heads"])
        self.gguf_writer.add_vision_head_count_kv(num_kv_head)

        eps = hparams.get("rms_norm_eps", self.global_config.get("rms_norm_eps", 1e-6))
        self.gguf_writer.add_vision_attention_layernorm_eps(eps)

        if (window_size := hparams.get("window_size")) is not None:
            self.gguf_writer.add_vision_window_size(window_size)

        fullatt_block_indexes = hparams.get("fullatt_block_indexes")
        if fullatt_block_indexes:
            # Full-attention blocks must recur with a fixed period; that period
            # is stored as the window-attention pattern length.
            n_wa_pattern = fullatt_block_indexes[0] + 1
            gaps = (b - a for a, b in zip(fullatt_block_indexes, fullatt_block_indexes[1:]))
            if any(gap != n_wa_pattern for gap in gaps):
                raise ValueError(f"Invalid EXAONE4.5 fullatt_block_indexes: {fullatt_block_indexes}")
            self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Route fused QKV tensors through the generic path; everything else via Qwen2-VL.

        Tensors whose name contains ``.qkv.`` bypass ``Qwen2VLVisionModel``'s
        handling and go to ``ModelBase.modify_tensors`` directly.
        """
        if ".qkv." in name:
            yield from ModelBase.modify_tensors(self, data_torch, name, bid)
            return

        yield from Qwen2VLVisionModel.modify_tensors(self, data_torch, name, bid)

gguf-py/gguf/constants.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3310,6 +3310,13 @@ class MODEL_TENSOR(IntEnum):
33103310
MODEL_TENSOR.FFN_DOWN,
33113311
MODEL_TENSOR.FFN_UP,
33123312
MODEL_TENSOR.FFN_POST_NORM,
3313+
# NextN/MTP tensors - preserved but unused
3314+
MODEL_TENSOR.NEXTN_EH_PROJ,
3315+
MODEL_TENSOR.NEXTN_EMBED_TOKENS,
3316+
MODEL_TENSOR.NEXTN_ENORM,
3317+
MODEL_TENSOR.NEXTN_HNORM,
3318+
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
3319+
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
33133320
],
33143321
MODEL_ARCH.EXAONE_MOE: [
33153322
MODEL_TENSOR.TOKEN_EMBD,
@@ -4318,6 +4325,7 @@ class VisionProjectorType:
43184325
LLAMA4 = "llama4"
43194326
QWEN2VL = "qwen2vl_merger"
43204327
QWEN25VL = "qwen2.5vl_merger"
4328+
EXAONE4_5 = "exaone4_5"
43214329
QWEN3VL = "qwen3vl_merger"
43224330
STEP3VL = "step3vl"
43234331
ULTRAVOX = "ultravox"

src/models/exaone4.cpp

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) {
1515

1616
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1717
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
18+
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
19+
GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
20+
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
1821

1922
switch (hparams.n_layer) {
2023
case 30: type = LLM_TYPE_1_2B; break;
@@ -38,21 +41,37 @@ void llama_model_exaone4::load_arch_tensors(llama_model_loader &) {
3841
}
3942

4043
for (int i = 0; i < n_layer; ++i) {
44+
const bool is_nextn = hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers;
45+
int flags = 0;
46+
if (is_nextn) {
47+
// NextN/MTP layers are preserved in GGUF but are not executed yet.
48+
flags |= TENSOR_SKIP;
49+
}
50+
4151
auto & layer = layers[i];
4252

43-
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
44-
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
53+
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, flags);
54+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, flags);
55+
56+
if (!is_nextn) {
57+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
58+
}
4559

46-
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
60+
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, flags);
61+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
62+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
4763

48-
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
49-
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
50-
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
64+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
65+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
66+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
67+
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);
5168

52-
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
53-
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
54-
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
55-
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
69+
if (is_nextn) {
70+
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
71+
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags);
72+
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags);
73+
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
74+
}
5675
}
5776
}
5877

@@ -90,7 +109,11 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
90109
}
91110
ggml_tensor * inp_out_ids = build_inp_out_ids();
92111

93-
for (int il = 0; il < n_layer; ++il) {
112+
// MTP / NextN tail blocks are loaded for compatibility but not executed (same as exaone-moe).
113+
const int n_layer_main = int(n_layer) - int(hparams.nextn_predict_layers);
114+
GGML_ASSERT(n_layer_main > 0);
115+
116+
for (int il = 0; il < n_layer_main; ++il) {
94117
ggml_tensor * inpSA = inpL;
95118

96119
// use RoPE for SWA layers or non-SWA models
@@ -126,7 +149,7 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
126149
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
127150
cb(cur, "attn_out", il);
128151
}
129-
if (il == n_layer - 1 && inp_out_ids) {
152+
if (il == n_layer_main - 1 && inp_out_ids) {
130153
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
131154
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
132155
}

tools/mtmd/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ add_library(mtmd
1818
models/cogvlm.cpp
1919
models/conformer.cpp
2020
models/dotsocr.cpp
21+
models/exaone4_5.cpp
2122
models/gemma4a.cpp
2223
models/gemma4v.cpp
2324
models/glm4v.cpp

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,7 @@ enum projector_type {
347347
PROJECTOR_TYPE_KIMIK25,
348348
PROJECTOR_TYPE_NEMOTRON_V2_VL,
349349
PROJECTOR_TYPE_HUNYUANVL,
350+
PROJECTOR_TYPE_EXAONE4_5,
350351
PROJECTOR_TYPE_MINICPMV4_6,
351352
PROJECTOR_TYPE_GRANITE_SPEECH,
352353
PROJECTOR_TYPE_MIMOVL,
@@ -396,6 +397,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
396397
{ PROJECTOR_TYPE_YASA2, "yasa2"},
397398
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
398399
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
400+
{ PROJECTOR_TYPE_EXAONE4_5, "exaone4_5"},
399401
{ PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"},
400402
{ PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"},
401403
{ PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"},

tools/mtmd/clip.cpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -884,6 +884,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
884884
{
885885
builder = std::make_unique<clip_graph_qwen3vl>(ctx, img);
886886
} break;
887+
case PROJECTOR_TYPE_EXAONE4_5:
888+
{
889+
builder = std::make_unique<clip_graph_exaone4_5>(ctx, img);
890+
} break;
887891
case PROJECTOR_TYPE_MIMOVL:
888892
{
889893
builder = std::make_unique<clip_graph_mimovl>(ctx, img);
@@ -1556,6 +1560,19 @@ struct clip_model_loader {
15561560
hparams.audio_window_len = 400;
15571561
hparams.audio_hop_len = 160;
15581562
} break;
1563+
case PROJECTOR_TYPE_EXAONE4_5:
1564+
{
1565+
hparams.n_merge = 2;
1566+
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
1567+
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, false);
1568+
get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
1569+
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
1570+
hparams.set_warmup_n_tokens(46 * 46);
1571+
if (hparams.rope_theta <= 0.0f) {
1572+
hparams.rope_theta = 10000.0f;
1573+
}
1574+
get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv);
1575+
} break;
15591576
case PROJECTOR_TYPE_GEMMA4A:
15601577
{
15611578
// Gemma4 feature_extraction_gemma4.py:
@@ -1799,6 +1816,7 @@ struct clip_model_loader {
17991816
|| model.proj_type == PROJECTOR_TYPE_LDPV2
18001817
|| model.proj_type == PROJECTOR_TYPE_QWEN2VL
18011818
|| model.proj_type == PROJECTOR_TYPE_QWEN25VL
1819+
|| model.proj_type == PROJECTOR_TYPE_EXAONE4_5
18021820
|| model.proj_type == PROJECTOR_TYPE_GLM_EDGE
18031821
|| model.proj_type == PROJECTOR_TYPE_GEMMA3
18041822
|| model.proj_type == PROJECTOR_TYPE_IDEFICS3
@@ -1948,6 +1966,7 @@ struct clip_model_loader {
19481966
} break;
19491967
case PROJECTOR_TYPE_QWEN2VL:
19501968
case PROJECTOR_TYPE_QWEN25VL:
1969+
case PROJECTOR_TYPE_EXAONE4_5:
19511970
{
19521971
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
19531972
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
@@ -3079,6 +3098,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
30793098
case PROJECTOR_TYPE_QWEN2VL:
30803099
case PROJECTOR_TYPE_QWEN25VL:
30813100
case PROJECTOR_TYPE_QWEN3VL:
3101+
case PROJECTOR_TYPE_EXAONE4_5:
30823102
case PROJECTOR_TYPE_MIMOVL:
30833103
case PROJECTOR_TYPE_GLM4V:
30843104
case PROJECTOR_TYPE_PADDLEOCR:
@@ -3100,6 +3120,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
31003120
case PROJECTOR_TYPE_QWEN2VL:
31013121
case PROJECTOR_TYPE_QWEN25VL:
31023122
case PROJECTOR_TYPE_QWEN3VL:
3123+
case PROJECTOR_TYPE_EXAONE4_5:
31033124
case PROJECTOR_TYPE_MIMOVL:
31043125
case PROJECTOR_TYPE_GLM4V:
31053126
case PROJECTOR_TYPE_PADDLEOCR:
@@ -3179,6 +3200,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
31793200
case PROJECTOR_TYPE_QWEN2VL:
31803201
case PROJECTOR_TYPE_QWEN25VL:
31813202
case PROJECTOR_TYPE_QWEN3VL:
3203+
case PROJECTOR_TYPE_EXAONE4_5:
31823204
case PROJECTOR_TYPE_MIMOVL:
31833205
case PROJECTOR_TYPE_GLM4V:
31843206
case PROJECTOR_TYPE_YOUTUVL:
@@ -3675,11 +3697,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
36753697
set_input_i32("positions", positions);
36763698
} break;
36773699
case PROJECTOR_TYPE_QWEN25VL:
3700+
case PROJECTOR_TYPE_EXAONE4_5:
36783701
case PROJECTOR_TYPE_YOUTUVL:
36793702
{
36803703
// pw * ph = number of tokens output by ViT after apply patch merger
36813704
// ipw * ipw = number of vision token been processed inside ViT
3682-
const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
3705+
const bool use_window_attn =
3706+
(ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL || ctx->model.proj_type == PROJECTOR_TYPE_EXAONE4_5)
3707+
? hparams.n_wa_pattern > 0
3708+
: !hparams.wa_layer_indexes.empty();
36833709
const int merge_ratio = 2;
36843710
const int pw = image_size_width / patch_size / merge_ratio;
36853711
const int ph = image_size_height / patch_size / merge_ratio;
@@ -4262,6 +4288,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
42624288
return ctx->model.mm_model_mlp_3_w->ne[1];
42634289
case PROJECTOR_TYPE_QWEN2VL:
42644290
case PROJECTOR_TYPE_QWEN25VL:
4291+
case PROJECTOR_TYPE_EXAONE4_5:
42654292
case PROJECTOR_TYPE_JANUS_PRO:
42664293
case PROJECTOR_TYPE_YOUTUVL:
42674294
return ctx->model.mm_1_b->ne[0];

0 commit comments

Comments
 (0)