# [Model] Support MiniCPM-V 4.6 #22529
**`convert_hf_to_gguf.py`**
```diff
@@ -1331,6 +1331,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
             # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
             res = "qwen2"
+        if chkhsh == "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f":
+            # ref: https://huggingface.co/openbmb/MiniCPM-V-4_6
+            res = "qwen35"
         if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
             # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
             res = "grok-2"
```
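For context on the hashes above: `get_vocab_base_pre` identifies a checkpoint's pre-tokenizer by hashing the token IDs produced for a fixed probe string. A simplified sketch of the mechanism (the real probe text in `convert_hf_to_gguf.py` is a much longer mixed-script string than anything shown here):

```python
# Simplified sketch of how convert_hf_to_gguf.py fingerprints a tokenizer:
# hash the token-id sequence for a fixed probe text, so any two checkpoints
# that share a pre-tokenizer produce the same chkhsh digest.
import hashlib

def tokenizer_fingerprint(tokenizer, chktxt: str) -> str:
    # str() of the id list gives a stable textual form to hash
    return hashlib.sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()
```

Because the digest depends only on the pre-tokenizer's behavior, MiniCPM-V 4.6 hashing to the same value as a Qwen3.5 checkpoint is exactly what lets it reuse `res = "qwen35"` below.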
```diff
@@ -1526,7 +1529,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f":
             # ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B
             res = "exaone-moe"
-        if chkhsh == "d30d75d9059f1aa2c19359de71047b3ae408c70875e8a3ccf8c5fba56c9d8af4":
+        if chkhsh == "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f":
             # ref: https://huggingface.co/Qwen/Qwen3.5-9B-Instruct
             res = "qwen35"
         if chkhsh == "b4b8ca1f9769494fbd956ebc4c249de6131fb277a4a3345a7a92c7dd7a55808d":
```
```diff
@@ -5432,16 +5435,107 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+class _Qwen35MRopeMixin:
+    # Qwen3.5 always applies interleaved MRoPE (see Qwen3_5RotaryEmbedding in transformers);
+    # the upstream default mrope_section is [11, 11, 10] and llama.cpp's QWEN35 / QWEN35MOE
+    # loaders treat qwen35.rope.dimension_sections as required, so make sure it is always
+    # written even when a particular checkpoint omits the field in `rope_parameters`.
+    _QWEN35_DEFAULT_MROPE_SECTION = [11, 11, 10, 0]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()  # type: ignore[misc]
+        if "mrope_section" not in self.rope_parameters:  # type: ignore[attr-defined]
+            self.gguf_writer.add_rope_dimension_sections(self._QWEN35_DEFAULT_MROPE_SECTION)  # type: ignore[attr-defined]
+
+
 @ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM")
-class Qwen3_5TextModel(_LinearAttentionVReorderBase):
+class Qwen3_5TextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase):
     model_arch = gguf.MODEL_ARCH.QWEN35
 
 
 @ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM")
-class Qwen3_5MoeTextModel(_LinearAttentionVReorderBase):
+class Qwen3_5MoeTextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase):
     model_arch = gguf.MODEL_ARCH.QWEN35MOE
 
 
+# MiniCPM-V 4.6: text tower is Qwen3.5 (linear+full hybrid attention) wrapped under
+# `model.language_model.*`; vision tower is SigLIP + a window-attention ViT merger
+# + a final DownsampleMLP merger. The same HF arch is registered twice below: once as
+# the LM (text mode) and once as the mmproj (vision mode), mirroring the Qwen3-VL setup.
+@ModelBase.register("MiniCPMV4_6ForConditionalGeneration")
+class MiniCPMV4_6TextModel(Qwen3_5TextModel):
+    model_arch = gguf.MODEL_ARCH.QWEN35
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # drop vision tower / multimodal merger tensors -- they belong to the mmproj file
+        if name.startswith(("model.vision_tower.", "model.merger.")):
+            return
+        # MTP tensors are not used at inference yet; align with Qwen3Next behaviour
+        if name.startswith("mtp"):
+            return
+        # strip the language-model wrapper so the underlying Qwen3.5 tensor mapping matches
+        if name.startswith("model.language_model."):
+            name = "model." + name[len("model.language_model."):]
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("MiniCPMV4_6ForConditionalGeneration")
+class MiniCPMV4_6VisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.hparams_vision is not None:
+            # In MiniCPM-V 4.6 `vision_config.image_size` (980) describes the SigLIP
+            # positional embedding bucket grid (70 x 70), while the per-slice processing
+            # resolution is the preprocessor's `scale_resolution` (typically 448).
+            # The CLIP loader in tools/mtmd/clip.cpp consumes `clip.vision.image_size`
+            # as the slice size and warmup resolution, so report `scale_resolution` there
+            # to match the upstream MiniCPMV4_6ImageProcessorPil slicing rules.
+            scale_resolution = self.preprocessor_config.get("scale_resolution")
+            if scale_resolution is not None:
+                self.hparams_vision["image_size"] = int(scale_resolution)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+
+        # projector type string is consumed by clip_projector_type_from_string() in clip.cpp
+        # (mapped to PROJECTOR_TYPE_MINICPMV4_6).
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MINICPMV4_6)
+
+        # legacy version tag, used by mtmd.cpp to pick the slice template (MINICPMV_2_6).
+        # The clip loader reads this field via gguf_get_val_i32, so it must be written as int32.
+        self.gguf_writer.add_int32("clip.minicpmv_version", 46)
```
> **Contributor:** this is unused now
```diff
+        # fixed merger output token count per slice for the default 16x downsample mode.
+        self.gguf_writer.add_uint32("clip.minicpmv_query_num", 64)
```
> **Contributor:** also unused, right? otherwise you should create a new constant in …
```diff
+        # ViT layer index after which the window-attention merger is applied
+        insert_layer_id = int(self.global_config.get(
+            "insert_layer_id", self.hparams_vision.get("insert_layer_id", 6)))
+        self.gguf_writer.add_uint32("clip.vision.insert_layer_id", insert_layer_id)
```
> **Contributor:** use …
```diff
+        # SigLIP vision body uses gelu_pytorch_tanh, which matches ggml_gelu (tanh approx).
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_attention_layernorm_eps(
+            self.hparams_vision.get("layer_norm_eps", 1e-6))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # text tower / lm_head / MTP -> belong to the LM file
+        if name.startswith(("model.language_model.", "lm_head.")) or name.startswith("mtp"):
+            return
+
+        # final merger and ViT merger
+        if name.startswith(("model.merger.", "model.vision_tower.vit_merger.")):
+            yield from super().modify_tensors(data_torch, name, bid)
+            return
+
+        # SigLIP vision body
+        if name.startswith("model.vision_tower."):
+            name = "vision_tower.vision_model." + name[len("model.vision_tower."):]
```
> **Contributor:** use …
```diff
+            yield from super().modify_tensors(data_torch, name, bid)
+            return
 
 
 @ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GPT2
```
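The `image_size` override in `MiniCPMV4_6VisionModel.__init__` above is easier to follow with the numbers written out. A small worked example (the SigLIP patch size of 14 is an assumption inferred from 980 mapping to the "70 x 70" bucket grid mentioned in the comment, not something stated in this diff):

```python
# Worked example for the image_size override (patch_size = 14 is assumed,
# inferred from 980 / 14 = 70 matching the 70 x 70 bucket-grid comment).
patch_size = 14
pos_embed_image_size = 980   # vision_config.image_size
scale_resolution = 448       # preprocessor's scale_resolution

bucket_grid = pos_embed_image_size // patch_size  # 70 -> 70 x 70 position buckets
slice_patches = scale_resolution // patch_size    # 32 -> 32 x 32 patches per slice
print(bucket_grid, slice_patches)                 # 70 32
```

Writing `scale_resolution` into `clip.vision.image_size` therefore makes the CLIP loader size its slices at 448 px rather than at the 980 px positional-embedding extent.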
**New documentation file** (49 lines added):
## MiniCPM-V 4.6

### Prepare models and code

Download the [MiniCPM-V-4_6](https://huggingface.co/openbmb/MiniCPM-V-4_6) PyTorch model from Hugging Face into a `MiniCPM-V-4_6` folder.

The model must be the standard `transformers` v5.7.0+ checkpoint (no `trust_remote_code`); the architecture in `config.json` is `MiniCPMV4_6ForConditionalGeneration`, with a `qwen3_5_text` text model and a SigLIP-based vision tower plus a window-attention `vit_merger`.
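As a quick sanity check that the checkpoint is the `transformers`-native layout described above, you can inspect `config.json` before converting (an illustrative snippet; the relative path is an assumption):

```python
# Verify the checkpoint is the transformers-native MiniCPM-V 4.6 layout
# before running the converter.
import json

with open("MiniCPM-V-4_6/config.json") as f:
    cfg = json.load(f)

assert cfg["architectures"] == ["MiniCPMV4_6ForConditionalGeneration"]
# a trust_remote_code checkpoint would carry an auto_map entry instead
assert "auto_map" not in cfg
```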
### Build llama.cpp

If your platform or toolchain differs, refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md).

Clone llama.cpp:
```bash
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```

Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-V 4.6

Unlike older MiniCPM-V variants, MiniCPM-V 4.6 is converted directly through `convert_hf_to_gguf.py`. The same script is invoked twice on the original Hugging Face directory: once to produce the language-model GGUF and once with `--mmproj` to produce the multimodal projector GGUF.

```bash
# language model
python ./convert_hf_to_gguf.py ../MiniCPM-V-4_6 --outfile ../MiniCPM-V-4_6/ggml-model-f16.gguf

# multimodal projector (vision tower + window-attention vit_merger + DownsampleMLP merger)
python ./convert_hf_to_gguf.py ../MiniCPM-V-4_6 --mmproj --outfile ../MiniCPM-V-4_6/mmproj-model-f16.gguf

# optional: quantize to Q4_K_M
./build/bin/llama-quantize ../MiniCPM-V-4_6/ggml-model-f16.gguf ../MiniCPM-V-4_6/ggml-model-Q4_K_M.gguf Q4_K_M
```
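Before running inference, it can be worth confirming the projector metadata landed in the mmproj file. A quick check with the `GGUFReader` from llama.cpp's `gguf` Python package (an illustrative sketch; install with `pip install gguf` or use the in-tree `gguf-py`):

```python
# List the clip.* metadata keys written by the converter's vision path.
from gguf import GGUFReader

reader = GGUFReader("../MiniCPM-V-4_6/mmproj-model-f16.gguf")
for key in reader.fields:
    if key.startswith("clip."):
        # expect clip.projector_type, clip.minicpmv_version,
        # clip.minicpmv_query_num, clip.vision.insert_layer_id, ...
        print(key)
```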
Inference on Linux or macOS:
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_6/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_6/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4_6/mmproj-model-f16.gguf
```