100 changes: 97 additions & 3 deletions convert_hf_to_gguf.py
@@ -1331,6 +1331,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
# ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
res = "qwen2"
if chkhsh == "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f":
# ref: https://huggingface.co/openbmb/MiniCPM-V-4_6
res = "qwen35"
if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
# ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
res = "grok-2"
@@ -1526,7 +1529,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f":
# ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B
res = "exaone-moe"
if chkhsh == "d30d75d9059f1aa2c19359de71047b3ae408c70875e8a3ccf8c5fba56c9d8af4":
if chkhsh == "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f":
# ref: https://huggingface.co/Qwen/Qwen3.5-9B-Instruct
res = "qwen35"
(review thread marked as resolved by tc-mb)
if chkhsh == "b4b8ca1f9769494fbd956ebc4c249de6131fb277a4a3345a7a92c7dd7a55808d":
@@ -5432,16 +5435,107 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
yield from super().modify_tensors(data_torch, name, bid)


class _Qwen35MRopeMixin:
# Qwen3.5 always applies interleaved MRoPE (see Qwen3_5RotaryEmbedding in transformers);
# the upstream default mrope_section is [11, 11, 10] and llama.cpp's QWEN35 / QWEN35MOE
# loaders treat qwen35.rope.dimension_sections as required, so make sure it is always
# written even when a particular checkpoint omits the field in `rope_parameters`.
_QWEN35_DEFAULT_MROPE_SECTION = [11, 11, 10, 0]

def set_gguf_parameters(self):
super().set_gguf_parameters() # type: ignore[misc]
if "mrope_section" not in self.rope_parameters: # type: ignore[attr-defined]
self.gguf_writer.add_rope_dimension_sections(self._QWEN35_DEFAULT_MROPE_SECTION) # type: ignore[attr-defined]


@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM")
class Qwen3_5TextModel(_LinearAttentionVReorderBase):
class Qwen3_5TextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase):
model_arch = gguf.MODEL_ARCH.QWEN35


@ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM")
class Qwen3_5MoeTextModel(_LinearAttentionVReorderBase):
class Qwen3_5MoeTextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase):
model_arch = gguf.MODEL_ARCH.QWEN35MOE


# MiniCPM-V 4.6: text tower is Qwen3.5 (linear+full hybrid attention) wrapped under
# `model.language_model.*`; vision tower is SigLIP + a window-attention ViT merger
# + a final DownsampleMLP merger. The same HF arch is registered twice below: once as
# the LM (text mode) and once as the mmproj (vision mode), mirroring the Qwen3-VL setup.

@ModelBase.register("MiniCPMV4_6ForConditionalGeneration")
class MiniCPMV4_6TextModel(Qwen3_5TextModel):
model_arch = gguf.MODEL_ARCH.QWEN35

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# drop vision tower / multimodal merger tensors -- they belong to the mmproj file
if name.startswith(("model.vision_tower.", "model.merger.")):
return
# MTP tensors are not used at inference yet; align with Qwen3Next behaviour
if name.startswith("mtp"):
return
# strip the language-model wrapper so the underlying Qwen3.5 tensor mapping matches
if name.startswith("model.language_model."):
name = "model." + name[len("model.language_model."):]
yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("MiniCPMV4_6ForConditionalGeneration")
class MiniCPMV4_6VisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.hparams_vision is not None:
# In MiniCPM-V 4.6 `vision_config.image_size` (980) describes the SigLIP
# positional embedding bucket grid (70 x 70), while the per-slice processing
# resolution is the preprocessor's `scale_resolution` (typically 448).
# The CLIP loader in tools/mtmd/clip.cpp consumes `clip.vision.image_size`
# as the slice size and warmup resolution, so report `scale_resolution` there
# to match the upstream MiniCPMV4_6ImageProcessorPil slicing rules.
scale_resolution = self.preprocessor_config.get("scale_resolution")
if scale_resolution is not None:
self.hparams_vision["image_size"] = int(scale_resolution)

def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None

# projector type string is consumed by clip_projector_type_from_string() in clip.cpp
# (mapped to PROJECTOR_TYPE_MINICPMV4_6).
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MINICPMV4_6)

# legacy version tag, used by mtmd.cpp to pick the slice template (MINICPMV_2_6).
# The clip loader reads this field via gguf_get_val_i32, so it must be written as int32.
self.gguf_writer.add_int32("clip.minicpmv_version", 46)
Review comment (Contributor): this is unused now
# fixed merger output token count per slice for the default 16x downsample mode.
self.gguf_writer.add_uint32("clip.minicpmv_query_num", 64)
Review comment (Contributor): also unused, right? otherwise you should create a new constant in constants.py

# ViT layer index after which the window-attention merger is applied
insert_layer_id = int(self.global_config.get(
"insert_layer_id", self.hparams_vision.get("insert_layer_id", 6)))
self.gguf_writer.add_uint32("clip.vision.insert_layer_id", insert_layer_id)
Review comment (Contributor): use ClipVision.N_WA_PATTERN or WA_LAYER_INDEXES

# SigLIP vision body uses gelu_pytorch_tanh, which matches ggml_gelu (tanh approx).
self.gguf_writer.add_vision_use_gelu(True)
self.gguf_writer.add_vision_attention_layernorm_eps(
self.hparams_vision.get("layer_norm_eps", 1e-6))

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# text tower / lm_head / MTP -> belong to the LM file
if name.startswith(("model.language_model.", "lm_head.")) or name.startswith("mtp"):
return

# final merger and ViT merger
if name.startswith(("model.merger.", "model.vision_tower.vit_merger.")):
yield from super().modify_tensors(data_torch, name, bid)
return

# SigLIP vision body
if name.startswith("model.vision_tower."):
name = "vision_tower.vision_model." + name[len("model.vision_tower."):]
Review comment (Contributor): use replace("model.vision_tower.", "vision_tower.vision_model.")
yield from super().modify_tensors(data_torch, name, bid)
return


@ModelBase.register("GPT2LMHeadModel")
class GPT2Model(TextModel):
model_arch = gguf.MODEL_ARCH.GPT2
1 change: 1 addition & 0 deletions convert_hf_to_gguf_update.py
@@ -175,6 +175,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
{"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM-V-4_6", "chkhsh": "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f"},
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
# jina-v2-de variants
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
49 changes: 49 additions & 0 deletions docs/multimodal/minicpmv4.6.md
@@ -0,0 +1,49 @@
## MiniCPM-V 4.6

### Prepare models and code

Download the [MiniCPM-V-4_6](https://huggingface.co/openbmb/MiniCPM-V-4_6) PyTorch model from Hugging Face into a "MiniCPM-V-4_6" folder.

The model must be the standard `transformers` v5.7.0+ checkpoint (no `trust_remote_code`); the architecture in `config.json` is `MiniCPMV4_6ForConditionalGeneration` with a `qwen3_5_text` text model and a SigLIP-based vision tower plus a window-attention `vit_merger`.
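
To confirm a downloaded checkpoint matches this layout before converting, a quick sanity check against `config.json` can help (a minimal sketch; the folder path and the `text_config` nesting are assumptions based on the description above, not something this PR prescribes):

```python
import json
from pathlib import Path

# hypothetical local path; adjust to wherever the checkpoint was downloaded
config = json.loads((Path("../MiniCPM-V-4_6") / "config.json").read_text())

archs = config.get("architectures", [])
assert "MiniCPMV4_6ForConditionalGeneration" in archs, f"unexpected architectures: {archs}"

# the text tower should report the qwen3_5_text model type; the exact key
# layout under `text_config` may differ between checkpoint revisions
print("text model type:", config.get("text_config", {}).get("model_type"))
```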

### Build llama.cpp

If your setup differs, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md).

Clone llama.cpp:
```bash
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```

Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```


### Usage of MiniCPM-V 4.6

Unlike older MiniCPM-V variants, MiniCPM-V 4.6 is converted directly through `convert_hf_to_gguf.py`. The same script is invoked twice on the original Hugging Face directory: once to produce the language-model GGUF and once with `--mmproj` to produce the multimodal projector GGUF.

```bash
# language model
python ./convert_hf_to_gguf.py ../MiniCPM-V-4_6 --outfile ../MiniCPM-V-4_6/ggml-model-f16.gguf

# multimodal projector (vision tower + window-attention vit_merger + DownsampleMLP merger)
python ./convert_hf_to_gguf.py ../MiniCPM-V-4_6 --mmproj --outfile ../MiniCPM-V-4_6/mmproj-model-f16.gguf

# optional: quantize to Q4_K_M
./build/bin/llama-quantize ../MiniCPM-V-4_6/ggml-model-f16.gguf ../MiniCPM-V-4_6/ggml-model-Q4_K_M.gguf Q4_K_M
```
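
To spot-check what the conversion wrote into the mmproj file, the metadata keys can be listed with the `gguf` Python package bundled in this repo under `gguf-py` (a minimal sketch, assuming the output path from the commands above):

```python
import sys
sys.path.insert(0, "gguf-py")  # run from the llama.cpp repo root, or `pip install gguf`

from gguf import GGUFReader

# print every clip.* key the converter wrote, e.g. the projector type and
# the clip.vision.insert_layer_id field discussed in this PR
reader = GGUFReader("../MiniCPM-V-4_6/mmproj-model-f16.gguf")
for name in reader.fields:
    if name.startswith("clip."):
        print(name)
```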


Inference on Linux or macOS:
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_6/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_6/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4_6/mmproj-model-f16.gguf
```
25 changes: 25 additions & 0 deletions gguf-py/gguf/constants.py
@@ -767,6 +767,14 @@ class MODEL_TENSOR(IntEnum):
V_DS_NORM = auto() # qwen3vl
V_DS_FC1 = auto() # qwen3vl
V_DS_FC2 = auto() # qwen3vl
V_MERGER_LN1 = auto() # minicpmv4_6
V_MERGER_ATTN_Q = auto() # minicpmv4_6
V_MERGER_ATTN_K = auto() # minicpmv4_6
V_MERGER_ATTN_V = auto() # minicpmv4_6
V_MERGER_ATTN_O = auto() # minicpmv4_6
V_MERGER_DS_LN = auto() # minicpmv4_6
V_MERGER_DS_UP = auto() # minicpmv4_6
V_MERGER_DS_DOWN = auto() # minicpmv4_6
V_MM_POST_FC_NORM = auto() # cogvlm
V_MM_UP = auto() # cogvlm
V_MM_DOWN = auto() # cogvlm
@@ -1251,6 +1259,14 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_DS_NORM: "v.deepstack.{bid}.norm",
MODEL_TENSOR.V_DS_FC1: "v.deepstack.{bid}.fc1",
MODEL_TENSOR.V_DS_FC2: "v.deepstack.{bid}.fc2",
MODEL_TENSOR.V_MERGER_LN1: "v.vit_merger.ln1",
MODEL_TENSOR.V_MERGER_ATTN_Q: "v.vit_merger.attn_q",
MODEL_TENSOR.V_MERGER_ATTN_K: "v.vit_merger.attn_k",
MODEL_TENSOR.V_MERGER_ATTN_V: "v.vit_merger.attn_v",
MODEL_TENSOR.V_MERGER_ATTN_O: "v.vit_merger.attn_out",
MODEL_TENSOR.V_MERGER_DS_LN: "v.vit_merger.ds_ln",
MODEL_TENSOR.V_MERGER_DS_UP: "v.vit_merger.ds_ffn_up",
MODEL_TENSOR.V_MERGER_DS_DOWN: "v.vit_merger.ds_ffn_down",
MODEL_TENSOR.V_MM_POST_FC_NORM: "mm.post_fc_norm", # cogvlm
MODEL_TENSOR.V_MM_UP: "mm.up",
MODEL_TENSOR.V_MM_DOWN: "mm.down",
@@ -1403,6 +1419,14 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_DS_NORM,
MODEL_TENSOR.V_DS_FC1,
MODEL_TENSOR.V_DS_FC2,
MODEL_TENSOR.V_MERGER_LN1,
MODEL_TENSOR.V_MERGER_ATTN_Q,
MODEL_TENSOR.V_MERGER_ATTN_K,
MODEL_TENSOR.V_MERGER_ATTN_V,
MODEL_TENSOR.V_MERGER_ATTN_O,
MODEL_TENSOR.V_MERGER_DS_LN,
MODEL_TENSOR.V_MERGER_DS_UP,
MODEL_TENSOR.V_MERGER_DS_DOWN,
MODEL_TENSOR.V_MM_POST_FC_NORM,
MODEL_TENSOR.V_MM_UP,
MODEL_TENSOR.V_MM_DOWN,
@@ -4158,6 +4182,7 @@ class VisionProjectorType:
NEMOTRON_V2_VL = "nemotron_v2_vl"
HUNYUANOCR = "hunyuanocr"
HUNYUANVL = "hunyuanvl"
MINICPMV4_6 = "minicpmv4_6"


# Items here are (block size, type size)
35 changes: 35 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
@@ -1690,6 +1690,7 @@ class TensorNameMap:
"mlp_AR.pre_norm", # PaddleOCR-VL
"merger.ln_q",
"vision_tower.merger.ln_q", # dots.ocr
"model.merger.mlp.0.pre_norm", # minicpmv4_6
),

MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
@@ -1763,6 +1764,38 @@ class TensorNameMap:
"model.visual.deepstack_merger_list.{bid}.linear_fc2", # deepstack in qwen3vl
),

MODEL_TENSOR.V_MERGER_LN1: (
"model.vision_tower.vit_merger.layer_norm1", # minicpmv4_6
),

MODEL_TENSOR.V_MERGER_ATTN_Q: (
"model.vision_tower.vit_merger.self_attn.q_proj", # minicpmv4_6
),

MODEL_TENSOR.V_MERGER_ATTN_K: (
"model.vision_tower.vit_merger.self_attn.k_proj", # minicpmv4_6
),

MODEL_TENSOR.V_MERGER_ATTN_V: (
"model.vision_tower.vit_merger.self_attn.v_proj", # minicpmv4_6
),

MODEL_TENSOR.V_MERGER_ATTN_O: (
"model.vision_tower.vit_merger.self_attn.out_proj", # minicpmv4_6
),

MODEL_TENSOR.V_MERGER_DS_LN: (
"model.vision_tower.vit_merger.pre_norm", # minicpmv4_6
),

MODEL_TENSOR.V_MERGER_DS_UP: (
"model.vision_tower.vit_merger.linear_1", # minicpmv4_6
),

MODEL_TENSOR.V_MERGER_DS_DOWN: (
"model.vision_tower.vit_merger.linear_2", # minicpmv4_6
),

MODEL_TENSOR.V_SAM_POS_EMBD: (
"model.sam_model.pos_embed",
),
@@ -1822,11 +1855,13 @@ class TensorNameMap:
MODEL_TENSOR.V_MM_UP: (
"model.vision.linear_proj.dense_h_to_4h", # cogvlm
"visual.merger.up_proj", # glm4v
"model.merger.mlp.0.linear_1", # minicpmv4_6
),

MODEL_TENSOR.V_MM_DOWN: (
"model.vision.linear_proj.dense_4h_to_h", # cogvlm
"visual.merger.down_proj", # glm4v
"model.merger.mlp.0.linear_2", # minicpmv4_6
),

MODEL_TENSOR.V_MM_GATE: (
4 changes: 4 additions & 0 deletions tools/mtmd/README.md
@@ -49,6 +49,7 @@ For the following models, you can use `convert_hf_to_gguf.py` with `--mmproj` flag
- Qwen 2 VL and Qwen 2.5 VL (from [Qwen](https://huggingface.co/Qwen))
- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)
- InternVL 2.5 and InternVL 3 from [OpenGVLab](https://huggingface.co/OpenGVLab) (note: we don't support conversion of `InternVL3-*-hf` model, only non-HF version is supported ; `InternLM2Model` **text** model is not supported)
- [MiniCPM-V 4.6](https://huggingface.co/openbmb/MiniCPM-V-4_6); see the guide [here](../../docs/multimodal/minicpmv4.6.md) (requires the standard `transformers` v5.7.0+ checkpoint)

For older models, please refer to the relevant guide for instructions on how to obtain or create them:

@@ -60,4 +61,7 @@ NOTE: conversion scripts are located under `tools/mtmd/legacy-models`
- [MiniCPM-V 2.5](../../docs/multimodal/minicpmv2.5.md)
- [MiniCPM-V 2.6](../../docs/multimodal/minicpmv2.6.md)
- [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md)
- [MiniCPM-V 4.0](../../docs/multimodal/minicpmv4.0.md)
- [MiniCPM-o 4.0](../../docs/multimodal/minicpmo4.0.md)
- [MiniCPM-V 4.5](../../docs/multimodal/minicpmv4.5.md)
- [IBM Granite Vision](../../docs/multimodal/granitevision.md)
15 changes: 15 additions & 0 deletions tools/mtmd/clip-impl.h
@@ -126,6 +126,19 @@
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
#define TN_MINICPMV_LN "resampler.ln_%s.%s"

// MiniCPM-V 4.6 ViT merger (window attention + MLP downsample),
// matching the upstream `vit_merger` module name in transformers.
#define TN_VIT_MERGER_LN1 "v.vit_merger.ln1.%s"
#define TN_VIT_MERGER_ATTN_Q "v.vit_merger.attn_q.%s"
#define TN_VIT_MERGER_ATTN_K "v.vit_merger.attn_k.%s"
#define TN_VIT_MERGER_ATTN_V "v.vit_merger.attn_v.%s"
#define TN_VIT_MERGER_ATTN_O "v.vit_merger.attn_out.%s"
#define TN_VIT_MERGER_DS_LN "v.vit_merger.ds_ln.%s"
#define TN_VIT_MERGER_DS_UP "v.vit_merger.ds_ffn_up.%s"
#define TN_VIT_MERGER_DS_DOWN "v.vit_merger.ds_ffn_down.%s"

#define KEY_INSERT_LAYER_ID "clip.vision.insert_layer_id"

#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
@@ -304,6 +317,7 @@ enum projector_type {
PROJECTOR_TYPE_NEMOTRON_V2_VL,
PROJECTOR_TYPE_HUNYUANOCR,
PROJECTOR_TYPE_HUNYUANVL,
PROJECTOR_TYPE_MINICPMV4_6,
PROJECTOR_TYPE_UNKNOWN,
};

@@ -351,6 +365,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
{ PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
{ PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"},
{ PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {
19 changes: 19 additions & 0 deletions tools/mtmd/clip-model.h
@@ -104,6 +104,7 @@ struct clip_hparams {
bool has_llava_projector = false;
int minicpmv_version = 0;
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
int32_t insert_layer_id = 0; // MiniCPM-V 4.6 ViT merger insertion layer

// custom value provided by user, can be undefined if not set
int32_t custom_image_min_tokens = -1;
@@ -403,6 +404,24 @@ struct clip_model {
ggml_tensor * mm_model_ln_post_w = nullptr;
ggml_tensor * mm_model_ln_post_b = nullptr;

// MiniCPM-V 4.6 ViT merger (window self-attention + ViT MLP downsample)
ggml_tensor * vit_merger_ln1_w = nullptr;
ggml_tensor * vit_merger_ln1_b = nullptr;
ggml_tensor * vit_merger_attn_q_w = nullptr;
ggml_tensor * vit_merger_attn_q_b = nullptr;
ggml_tensor * vit_merger_attn_k_w = nullptr;
ggml_tensor * vit_merger_attn_k_b = nullptr;
ggml_tensor * vit_merger_attn_v_w = nullptr;
ggml_tensor * vit_merger_attn_v_b = nullptr;
ggml_tensor * vit_merger_attn_o_w = nullptr;
ggml_tensor * vit_merger_attn_o_b = nullptr;
ggml_tensor * vit_merger_ds_ln_w = nullptr;
ggml_tensor * vit_merger_ds_ln_b = nullptr;
ggml_tensor * vit_merger_ds_up_w = nullptr;
ggml_tensor * vit_merger_ds_up_b = nullptr;
ggml_tensor * vit_merger_ds_down_w = nullptr;
ggml_tensor * vit_merger_ds_down_b = nullptr;

// gemma3
ggml_tensor * mm_input_proj_w = nullptr;
ggml_tensor * mm_soft_emb_norm_w = nullptr;