diff --git a/README.md b/README.md
index fb8e9343e..f3d40bb5b 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@
## Latest News
+* 05/13/2026 7.1.0-dev `main`: ✨ Added `MiniCPM-V 4.6` model support
* 05/07/2026 7.1.0-dev `main`: ✨ Added `GLM-4.5V`, `GLM-4.6V`, `Zamba` and `Zamba2` model support
* 04/29/2026 7.1.0-dev `main`: ✨ Added PoolSideAI `Laguna` model support for fused Laguna MoE checkpoints. Added `ERNIE 4.5 VL MoE`, `Ling-2.6-flash` and NVIDIA `Nemotron 3 Nano Omni` model support.
* 04/28/2026 [7.0.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v7.0.0): 🚀 Added Huawei Ascend NPU support through native torch kernels for GPTQ, AWQ, ParoQuant, GGUF, QQQ, and EXL3. Added `internvl_chat`, `gemma3n`, `GLM-OCR`, `GLM-ASR`, and `falcon_mamba` model support.
@@ -243,23 +244,23 @@ Selected public references where teams or companies explicitly mention GPT-QMode
## Model Support
-| Model | | | | | | | | | |
-|--------------------------|---|---------------------------------|---|------------------|---|-------------------|---|-------------------------|---|
-| Apertus | ✅ | EXAONE 3/4 | ✅ | Dots1 | ✅ | Mistral3 | ✅ | Qwen 2/3/3.5 (Next/MoE) | ✅ |
-| Baichuan | ✅ | Falcon (H1 / Mamba) | ✅ | InternLM 1/2/2.5 | ✅ | Mixtral | ✅ | Qwen 2/2.5/3 VL | ✅ |
-| Bloom | ✅ | FastVLM | ✅ | Kimi K2 | ✅ | MobileLLM | ✅ | Qwen 2.5/3 Omni | ✅ |
-| ChatGLM | ✅ | Gemma 1-4 / 3n | ✅ | Klear | ✅ | MOSS | ✅ | RefinedWeb | ✅ |
-| CodeGen | ✅ | GPTBigCode | ✅ | LING/RING | ✅ | MPT | ✅ | StableLM | ✅ |
-| Cohere 1-2 | ✅ | GPT-Neo / NeoX | ✅ | Llama 1-3.3 | ✅ | Nemotron H / Omni | ✅ | StarCoder2 | ✅ |
-| DBRX Converted | ✅ | GPT-2 | ✅ | Llama 3.2 VL | ✅ | Nemotron Ultra | ✅ | TeleChat2 | ✅ |
-| Deci | ✅ | GPT-J | ✅ | Llama 4 | ✅ | OPT | ✅ | Trinity | ✅ |
-| DeepSeek-V2/V3/R1 | ✅ | GPT-OSS | ✅ | LongCat Flash | ✅ | OLMo2 / LLaDA2 | ✅ | Yi | ✅ |
-| DeepSeek-V2-Lite | ✅ | Granite / Granite MoE | ✅ | LongLLaMA | ✅ | Ovis 1.6/2 | ✅ | Seed-OSS | ✅ |
-| Dream | ✅ | GRIN-MoE | ✅ | Instella | ✅ | Phi 1-4 | ✅ | Voxtral | ✅ |
-| ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V | ✅ | PanGu-α | ✅ |
-| XVERSE | ✅ | Brumby | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3/3.5 | ✅ |
-| MiniMax M2 | ✅ | AfMoE | ✅ | Bailing-MoE | ✅ | LFM2-MoE | ✅ | Marin | ✅ |
-| InternVL Chat | ✅ | Laguna | ✅ | Zamba / Zamba2 | ✅ | | | | |
+| Model | | | | | | | | | |
+|--------------------------|---|---------------------------------|---|------------------|---|---------------------|---|-------------------------|---|
+| Apertus | ✅ | EXAONE 3/4 | ✅ | Dots1 | ✅ | Mistral3 | ✅ | Qwen 2/3/3.5 (Next/MoE) | ✅ |
+| Baichuan | ✅ | Falcon (H1 / Mamba) | ✅ | InternLM 1/2/2.5 | ✅ | Mixtral | ✅ | Qwen 2/2.5/3 VL | ✅ |
+| Bloom | ✅ | FastVLM | ✅ | Kimi K2 | ✅ | MobileLLM | ✅ | Qwen 2.5/3 Omni | ✅ |
+| ChatGLM | ✅ | Gemma 1-4 / 3n | ✅ | Klear | ✅ | MOSS | ✅ | RefinedWeb | ✅ |
+| CodeGen | ✅ | GPTBigCode | ✅ | LING/RING | ✅ | MPT | ✅ | StableLM | ✅ |
+| Cohere 1-2 | ✅ | GPT-Neo / NeoX | ✅ | Llama 1-3.3 | ✅ | Nemotron H / Omni | ✅ | StarCoder2 | ✅ |
+| DBRX Converted | ✅ | GPT-2 | ✅ | Llama 3.2 VL | ✅ | Nemotron Ultra | ✅ | TeleChat2 | ✅ |
+| Deci | ✅ | GPT-J | ✅ | Llama 4 | ✅ | OPT | ✅ | Trinity | ✅ |
+| DeepSeek-V2/V3/R1 | ✅ | GPT-OSS | ✅ | LongCat Flash | ✅ | OLMo2 / LLaDA2 | ✅ | Yi | ✅ |
+| DeepSeek-V2-Lite | ✅ | Granite / Granite MoE | ✅ | LongLLaMA | ✅ | Ovis 1.6/2 | ✅ | Seed-OSS | ✅ |
+| Dream | ✅ | GRIN-MoE | ✅ | Instella | ✅ | Phi 1-4 | ✅ | Voxtral | ✅ |
+| ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V/V 4.6 | ✅ | PanGu-α | ✅ |
+| XVERSE | ✅ | Brumby | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3/3.5 | ✅ |
+| MiniMax M2 | ✅ | AfMoE | ✅ | Bailing-MoE | ✅ | LFM2-MoE | ✅ | Marin | ✅ |
+| InternVL Chat | ✅ | Laguna | ✅ | Zamba / Zamba2 | ✅ | | | | |
Prism Bonsai GGUF checkpoints are supported for inference only through GPT-QModel's native GGUF path and internal GGUF runtime. Bonsai checkpoints load through the normal model path or repo argument and do not require the external `gguf` package. Prism model quantization is not included.
diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
index 0459a48f8..f6c590378 100644
--- a/gptqmodel/models/auto.py
+++ b/gptqmodel/models/auto.py
@@ -128,7 +128,8 @@
from .definitions.minicpm import MiniCPMGPTQ # noqa: E402
from .definitions.minicpm3 import MiniCpm3QModel # noqa: E402
from .definitions.minicpm_o import MiniCPMOQModel # noqa: E402
-from .definitions.minicpm_v import MiniCPMVQModel # noqa: E402
+from .definitions.minicpmv import MiniCPMVQModel # noqa: E402
+from .definitions.minicpmv_4_6 import MiniCPMV4_6QModel # noqa: E402
from .definitions.minimax_m2 import MiniMaxM2GPTQ # noqa: E402
from .definitions.mistral3 import Mistral3GPTQ
from .definitions.mixtral import MixtralQModel # noqa: E402
@@ -246,6 +247,7 @@
"minicpm3": MiniCpm3QModel,
"minicpmo": MiniCPMOQModel,
"minicpmv": MiniCPMVQModel,
+ "minicpmv4_6": MiniCPMV4_6QModel,
"minimax": MiniMaxM2GPTQ,
"minimax_m2": MiniMaxM2GPTQ,
"qwen2_moe": Qwen2MoeQModel,
diff --git a/gptqmodel/models/definitions/__init__.py b/gptqmodel/models/definitions/__init__.py
index 7b460535e..ebb3896bd 100644
--- a/gptqmodel/models/definitions/__init__.py
+++ b/gptqmodel/models/definitions/__init__.py
@@ -48,7 +48,8 @@
from .mimo import MimoQModel
from .minicpm3 import MiniCpm3QModel
from .minicpm_o import MiniCPMOQModel
-from .minicpm_v import MiniCPMVQModel
+from .minicpmv import MiniCPMVQModel
+from .minicpmv_4_6 import MiniCPMV4_6QModel
from .minimax_m2 import MiniMaxM2GPTQ
from .mixtral import MixtralQModel
from .mllama import MLlamaQModel
diff --git a/gptqmodel/models/definitions/minicpm_v.py b/gptqmodel/models/definitions/minicpmv.py
similarity index 100%
rename from gptqmodel/models/definitions/minicpm_v.py
rename to gptqmodel/models/definitions/minicpmv.py
diff --git a/gptqmodel/models/definitions/minicpmv_4_6.py b/gptqmodel/models/definitions/minicpmv_4_6.py
new file mode 100644
index 000000000..3b0228d46
--- /dev/null
+++ b/gptqmodel/models/definitions/minicpmv_4_6.py
@@ -0,0 +1,134 @@
+# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
+# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+
+from typing import Dict
+
+from transformers import AutoModelForImageTextToText, AutoProcessor, ProcessorMixin
+
+from ...utils.calibration import batched
+from ...utils.model import MODALITY, move_to, nested_move_to
+from ...utils.offload import offload_to_disk
+from .._const import CPU
+from ..base import BaseQModel
+
+
+class MiniCPMV4_6QModel(BaseQModel):
+ loader = AutoModelForImageTextToText
+
+ pre_lm_head_norm_module = "model.language_model.norm"
+ rotary_embedding = "model.language_model.rotary_emb"
+
+ module_tree = [
+ "model",
+ "language_model",
+ "layers",
+ "#",
+ {
+ "input_layernorm": ("input_layernorm:!",),
+ "self_attn": ("q_norm:!", "q_proj:0", "k_norm:!", "k_proj:0", "v_proj:0", "o_proj:1"),
+ "linear_attn": (
+ "norm:!",
+ "conv1d:!",
+ "in_proj_qkv:0",
+ "in_proj_z:1",
+ "in_proj_b:!:1",
+ "in_proj_a:!:1",
+ "out_proj:2",
+ ),
+ "post_attention_layernorm": ("post_attention_layernorm:!",),
+ "mlp": ("gate_proj:0", "up_proj:0", "down_proj:1"),
+ }
+ ]
+
+ modality = [MODALITY.TEXT, MODALITY.IMAGE_TO_TEXT]
+ require_load_processor = True
+ require_trust_remote_code = False
+ layer_modules_strict = False
+
+ def pre_quantize_generate_hook_start(self):
+ language_model = self.model.model.language_model
+ self.shell_module_materialize(language_model.embed_tokens, self.quantize_config.device)
+ self.shell_module_materialize(language_model.rotary_emb, self.quantize_config.device)
+ self.shell_module_materialize(self.model.model.vision_tower, self.quantize_config.device)
+ self.shell_module_materialize(self.model.model.merger, self.quantize_config.device)
+
+ def pre_quantize_generate_hook_end(self):
+ language_model = self.model.model.language_model
+ if self.quantize_config.offload_to_disk:
+ offload_to_disk(
+ model=language_model,
+ module=language_model.embed_tokens,
+ disk_path=self.quantize_config.offload_to_disk_path,
+ )
+ offload_to_disk(
+ model=language_model,
+ module=language_model.rotary_emb,
+ disk_path=self.quantize_config.offload_to_disk_path,
+ )
+ offload_to_disk(
+ model=self.model,
+ module=self.model.model.vision_tower,
+ disk_path=self.quantize_config.offload_to_disk_path,
+ )
+ offload_to_disk(
+ model=self.model,
+ module=self.model.model.merger,
+ disk_path=self.quantize_config.offload_to_disk_path,
+ )
+ return
+
+ language_model.embed_tokens = move_to(language_model.embed_tokens, device=CPU)
+ language_model.rotary_emb = move_to(language_model.rotary_emb, device=CPU)
+ self.model.model.vision_tower = move_to(self.model.model.vision_tower, device=CPU)
+ self.model.model.merger = move_to(self.model.model.merger, device=CPU)
+
+ def preprocess_dataset(self, sample: Dict) -> Dict:
+ return sample
+
+ def load_processor(self) -> ProcessorMixin:
+ return AutoProcessor.from_pretrained(self.model_local_path, trust_remote_code=False)
+
+ @classmethod
+ def prepare_inputs_for_conversations(
+ cls,
+ processor: ProcessorMixin,
+ conversations: list[dict] | list[list[dict]],
+ ):
+ if conversations and isinstance(conversations[0], dict):
+ conversations = [conversations]
+
+ downsample_mode = "16x" # use downsample_mode="4x" for finer detail
+
+ inputs = processor.apply_chat_template(
+ conversations, tokenize=True, add_generation_prompt=True,
+ return_dict=True, return_tensors="pt",
+ downsample_mode=downsample_mode,
+ max_slice_nums=36,
+ )
+ return inputs
+
+ def prepare_dataset(self, calibration_dataset, batch_size: int = 1, **kwargs):
+ processor = self.load_processor()
+ calib_data = []
+ for batch in batched(calibration_dataset, batch_size, process_func=self.preprocess_dataset):
+ calib_data.append(
+ self.prepare_inputs_for_conversations(
+ processor,
+ batch,
+ )
+ )
+ del processor
+ return calib_data
+
+ def move_input_capture_example(self, example, data_device):
+ for key, value in example.items():
+ example[key] = nested_move_to(value, device=data_device)
+
+ return self.finalize_input_capture_example(example)
+
+ def run_input_capture(self, example, use_cache: bool, data_device):
+ return self.model.generate(
+ **example,
+ )
diff --git a/gptqmodel/utils/hf.py b/gptqmodel/utils/hf.py
index db2a60ed3..9f4d40546 100644
--- a/gptqmodel/utils/hf.py
+++ b/gptqmodel/utils/hf.py
@@ -1213,7 +1213,7 @@ def encoder_init_compat(self, encoder_config):
encoder_cls.__init__ = encoder_init_compat
encoder_cls._gptqmodel_meta_dpr_patch = True
- if config.model_type == "minicpmv" or config.model_type == "minicpmo":
+ if config.model_type in {"minicpmv", "minicpmv4_6", "minicpmo"}:
vision_model_cls = getattr(
remote_module,
"SiglipVisionTransformer",
diff --git a/tests/models/ovis/image_to_test_dataset.py b/tests/models/ovis/image_to_test_dataset.py
index a118035dc..7728ad1fa 100644
--- a/tests/models/ovis/image_to_test_dataset.py
+++ b/tests/models/ovis/image_to_test_dataset.py
@@ -7,7 +7,8 @@
from gptqmodel.models.definitions.ernie4_5_vl_moe import Ernie4_5_VLMoeQModel
from gptqmodel.models.definitions.internvl_chat import InternVLChatQModel
from gptqmodel.models.definitions.minicpm_o import MiniCPMOQModel
-from gptqmodel.models.definitions.minicpm_v import MiniCPMVQModel
+from gptqmodel.models.definitions.minicpmv import MiniCPMVQModel
+from gptqmodel.models.definitions.minicpmv_4_6 import MiniCPMV4_6QModel
from gptqmodel.models.definitions.ovis import OvisQModel
from gptqmodel.models.definitions.ovis2 import Ovis2QModel
from gptqmodel.models.definitions.qwen3_vl import Qwen3_VLQModel
@@ -98,6 +99,7 @@ def get_calib_dataset(model):
or isinstance(model, Qwen3_VLQModel)
or isinstance(model, MiniCPMOQModel)
or isinstance(model, MiniCPMVQModel)
+ or isinstance(model, MiniCPMV4_6QModel)
or isinstance(model, InternVLChatQModel)
or isinstance(model, Ernie4_5_VLMoeQModel)
):
diff --git a/tests/models/test_minicpm_v_4_5.py b/tests/models/test_minicpmv_4_5.py
similarity index 97%
rename from tests/models/test_minicpm_v_4_5.py
rename to tests/models/test_minicpmv_4_5.py
index 743db3242..396ecf1db 100644
--- a/tests/models/test_minicpm_v_4_5.py
+++ b/tests/models/test_minicpmv_4_5.py
@@ -14,7 +14,7 @@ class TestMiniCPMV4_5(ModelTest):
TRUST_REMOTE_CODE = True
EVAL_BATCH_SIZE = 1
- def test_minicpm_v_4_5(self):
+ def test_minicpmv_4_5(self):
# Evalution does not support minicpmv, and will throw an error during execution:
# E TypeError: MiniCPMV.forward() missing 1 required positional argument: 'data
with self.model_compat_test_context():
diff --git a/tests/models/test_minicpmv_4_6.py b/tests/models/test_minicpmv_4_6.py
new file mode 100644
index 000000000..22962db40
--- /dev/null
+++ b/tests/models/test_minicpmv_4_6.py
@@ -0,0 +1,60 @@
+# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
+# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+
+import os.path
+
+from model_test import ModelTest
+
+
+class TestMiniCPMV4_6(ModelTest):
+ NATIVE_MODEL_ID = "openbmb/MiniCPM-V-4.6" # model id passed to quantModel()
+ TRUST_REMOTE_CODE = True
+ EVAL_BATCH_SIZE = 1
+
+ def test_minicpmv_4_6(self):
+ # Evaluation does not support MiniCPM-V and raises an error during execution:
+ # E TypeError: MiniCPMV.forward() missing 1 required positional argument: 'data'
+ with self.model_compat_test_context():
+ model, tokenizer, processor = self.quantModel(
+ self.NATIVE_MODEL_ID,
+ trust_remote_code=self.TRUST_REMOTE_CODE,
+ dtype=self.TORCH_DTYPE,
+ batch_size=1,
+ call_perform_post_quant_validation=False,
+ )
+
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "image": os.path.join(os.path.dirname(os.path.abspath(__file__)), "ovis/10016.jpg"),
+ },
+ {"type": "text", "text": "Describe this image."},
+ ],
+ }
+ ]
+
+ downsample_mode = "16x" # use downsample_mode="4x" for finer detail
+
+ inputs = processor.apply_chat_template(
+ messages, tokenize=True, add_generation_prompt=True,
+ return_dict=True, return_tensors="pt",
+ downsample_mode=downsample_mode,
+ max_slice_nums=36,
+ ).to(model.device)
+
+ generated_ids = model.generate(**inputs, downsample_mode=downsample_mode, max_new_tokens=512)
+
+ generated_ids_trimmed = [
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+ ]
+ output_text = processor.batch_decode(
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ )[0]
+ print(f'Output:\n{output_text}')
+
+ self.assertIn("snow", output_text.lower())