diff --git a/README.md b/README.md index fb8e9343e..f3d40bb5b 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ ## Latest News +* 05/13/2026 7.1.0-dev `main`: ✨ Added `minicpmv_4_6` model support * 05/07/2026 7.1.0-dev `main`: ✨ Added `GLM-4.5V`, `GLM-4.6V`, `Zamba` and `Zamba2` model support * 04/29/2026 7.1.0-dev `main`: ✨ Added PoolSideAI `Laguna` model support for fused Laguna MoE checkpoints. Added `ERNIE 4.5 VL MoE`, `Ling-2.6-flash` and NVIDIA `Nemotron 3 Nano Omni` model support. * 04/28/2026 [7.0.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v7.0.0): 🚀 Added Huawei Ascend NPU support through native torch kernels for GPTQ, AWQ, ParoQuant, GGUF, QQQ, and EXL3. Added `internvl_chat`, `gemma3n`, `GLM-OCR`, `GLM-ASR`, and `falcon_mamba` model support. @@ -243,23 +244,23 @@ Selected public references where teams or companies explicitly mention GPT-QMode ## Model Support -| Model | | | | | | | | | | -|--------------------------|---|---------------------------------|---|------------------|---|-------------------|---|-------------------------|---| -| Apertus | ✅ | EXAONE 3/4 | ✅ | Dots1 | ✅ | Mistral3 | ✅ | Qwen 2/3/3.5 (Next/MoE) | ✅ | -| Baichuan | ✅ | Falcon (H1 / Mamba) | ✅ | InternLM 1/2/2.5 | ✅ | Mixtral | ✅ | Qwen 2/2.5/3 VL | ✅ | -| Bloom | ✅ | FastVLM | ✅ | Kimi K2 | ✅ | MobileLLM | ✅ | Qwen 2.5/3 Omni | ✅ | -| ChatGLM | ✅ | Gemma 1-4 / 3n | ✅ | Klear | ✅ | MOSS | ✅ | RefinedWeb | ✅ | -| CodeGen | ✅ | GPTBigCode | ✅ | LING/RING | ✅ | MPT | ✅ | StableLM | ✅ | -| Cohere 1-2 | ✅ | GPT-Neo / NeoX | ✅ | Llama 1-3.3 | ✅ | Nemotron H / Omni | ✅ | StarCoder2 | ✅ | -| DBRX Converted | ✅ | GPT-2 | ✅ | Llama 3.2 VL | ✅ | Nemotron Ultra | ✅ | TeleChat2 | ✅ | -| Deci | ✅ | GPT-J | ✅ | Llama 4 | ✅ | OPT | ✅ | Trinity | ✅ | -| DeepSeek-V2/V3/R1 | ✅ | GPT-OSS | ✅ | LongCat Flash | ✅ | OLMo2 / LLaDA2 | ✅ | Yi | ✅ | -| DeepSeek-V2-Lite | ✅ | Granite / Granite MoE | ✅ | LongLLaMA | ✅ | Ovis 1.6/2 | ✅ | Seed-OSS | ✅ | -| Dream | ✅ | GRIN-MoE | ✅ | Instella | 
✅ | Phi 1-4 | ✅ | Voxtral | ✅ | -| ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V | ✅ | PanGu-α | ✅ | -| XVERSE | ✅ | Brumby | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3/3.5 | ✅ | -| MiniMax M2 | ✅ | AfMoE | ✅ | Bailing-MoE | ✅ | LFM2-MoE | ✅ | Marin | ✅ | -| InternVL Chat | ✅ | Laguna | ✅ | Zamba / Zamba2 | ✅ | | | | | +| Model | | | | | | | | | | +|--------------------------|---|---------------------------------|---|------------------|---|---------------------|---|-------------------------|---| +| Apertus | ✅ | EXAONE 3/4 | ✅ | Dots1 | ✅ | Mistral3 | ✅ | Qwen 2/3/3.5 (Next/MoE) | ✅ | +| Baichuan | ✅ | Falcon (H1 / Mamba) | ✅ | InternLM 1/2/2.5 | ✅ | Mixtral | ✅ | Qwen 2/2.5/3 VL | ✅ | +| Bloom | ✅ | FastVLM | ✅ | Kimi K2 | ✅ | MobileLLM | ✅ | Qwen 2.5/3 Omni | ✅ | +| ChatGLM | ✅ | Gemma 1-4 / 3n | ✅ | Klear | ✅ | MOSS | ✅ | RefinedWeb | ✅ | +| CodeGen | ✅ | GPTBigCode | ✅ | LING/RING | ✅ | MPT | ✅ | StableLM | ✅ | +| Cohere 1-2 | ✅ | GPT-Neo / NeoX | ✅ | Llama 1-3.3 | ✅ | Nemotron H / Omni | ✅ | StarCoder2 | ✅ | +| DBRX Converted | ✅ | GPT-2 | ✅ | Llama 3.2 VL | ✅ | Nemotron Ultra | ✅ | TeleChat2 | ✅ | +| Deci | ✅ | GPT-J | ✅ | Llama 4 | ✅ | OPT | ✅ | Trinity | ✅ | +| DeepSeek-V2/V3/R1 | ✅ | GPT-OSS | ✅ | LongCat Flash | ✅ | OLMo2 / LLaDA2 | ✅ | Yi | ✅ | +| DeepSeek-V2-Lite | ✅ | Granite / Granite MoE | ✅ | LongLLaMA | ✅ | Ovis 1.6/2 | ✅ | Seed-OSS | ✅ | +| Dream | ✅ | GRIN-MoE | ✅ | Instella | ✅ | Phi 1-4 | ✅ | Voxtral | ✅ | +| ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V/V 4_6 | ✅ | PanGu-α | ✅ | +| XVERSE | ✅ | Brumby | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3/3.5 | ✅ | +| MiniMax M2 | ✅ | AfMoE | ✅ | Bailing-MoE | ✅ | LFM2-MoE | ✅ | Marin | ✅ | +| InternVL Chat | ✅ | Laguna | ✅ | Zamba / Zamba2 | ✅ | | | | | Prism Bonsai GGUF checkpoints are supported for inference only through GPT-QModel's native GGUF path and internal 
GGUF runtime. Bonsai checkpoints load through the normal model path or repo argument and do not require the external `gguf` package. Prism model quantization is not included. diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py index 0459a48f8..f6c590378 100644 --- a/gptqmodel/models/auto.py +++ b/gptqmodel/models/auto.py @@ -128,7 +128,8 @@ from .definitions.minicpm import MiniCPMGPTQ # noqa: E402 from .definitions.minicpm3 import MiniCpm3QModel # noqa: E402 from .definitions.minicpm_o import MiniCPMOQModel # noqa: E402 -from .definitions.minicpm_v import MiniCPMVQModel # noqa: E402 +from .definitions.minicpmv import MiniCPMVQModel # noqa: E402 +from .definitions.minicpmv_4_6 import MiniCPMV4_6QModel # noqa: E402 from .definitions.minimax_m2 import MiniMaxM2GPTQ # noqa: E402 from .definitions.mistral3 import Mistral3GPTQ from .definitions.mixtral import MixtralQModel # noqa: E402 @@ -246,6 +247,7 @@ "minicpm3": MiniCpm3QModel, "minicpmo": MiniCPMOQModel, "minicpmv": MiniCPMVQModel, + "minicpmv4_6": MiniCPMV4_6QModel, "minimax": MiniMaxM2GPTQ, "minimax_m2": MiniMaxM2GPTQ, "qwen2_moe": Qwen2MoeQModel, diff --git a/gptqmodel/models/definitions/__init__.py b/gptqmodel/models/definitions/__init__.py index 7b460535e..ebb3896bd 100644 --- a/gptqmodel/models/definitions/__init__.py +++ b/gptqmodel/models/definitions/__init__.py @@ -48,7 +48,8 @@ from .mimo import MimoQModel from .minicpm3 import MiniCpm3QModel from .minicpm_o import MiniCPMOQModel -from .minicpm_v import MiniCPMVQModel +from .minicpmv import MiniCPMVQModel +from .minicpmv_4_6 import MiniCPMV4_6QModel from .minimax_m2 import MiniMaxM2GPTQ from .mixtral import MixtralQModel from .mllama import MLlamaQModel diff --git a/gptqmodel/models/definitions/minicpm_v.py b/gptqmodel/models/definitions/minicpmv.py similarity index 100% rename from gptqmodel/models/definitions/minicpm_v.py rename to gptqmodel/models/definitions/minicpmv.py diff --git a/gptqmodel/models/definitions/minicpmv_4_6.py 
b/gptqmodel/models/definitions/minicpmv_4_6.py new file mode 100644 index 000000000..3b0228d46 --- /dev/null +++ b/gptqmodel/models/definitions/minicpmv_4_6.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai +# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai +# SPDX-License-Identifier: Apache-2.0 +# Contact: qubitium@modelcloud.ai, x.com/qubitium + +from typing import Dict + +from transformers import AutoModelForImageTextToText, AutoProcessor, ProcessorMixin + +from ...utils.calibration import batched +from ...utils.model import MODALITY, move_to, nested_move_to +from ...utils.offload import offload_to_disk +from .._const import CPU +from ..base import BaseQModel + + +class MiniCPMV4_6QModel(BaseQModel): + loader = AutoModelForImageTextToText + + pre_lm_head_norm_module = "model.language_model.norm" + rotary_embedding = "model.language_model.rotary_emb" + + module_tree = [ + "model", + "language_model", + "layers", + "#", + { + "input_layernorm": ("input_layernorm:!",), + "self_attn": ("q_norm:!", "q_proj:0", "k_norm:!", "k_proj:0", "v_proj:0", "o_proj:1"), + "linear_attn": ( + "norm:!", + "conv1d:!", + "in_proj_qkv:0", + "in_proj_z:1", + "in_proj_b:!:1", + "in_proj_a:!:1", + "out_proj:2", + ), + "post_attention_layernorm": ("post_attention_layernorm:!",), + "mlp": ("gate_proj:0", "up_proj:0", "down_proj:1"), + } + ] + + modality = [MODALITY.TEXT, MODALITY.IMAGE_TO_TEXT] + require_load_processor = True + require_trust_remote_code = False + layer_modules_strict = False + + def pre_quantize_generate_hook_start(self): + language_model = self.model.model.language_model + self.shell_module_materialize(language_model.embed_tokens, self.quantize_config.device) + self.shell_module_materialize(language_model.rotary_emb, self.quantize_config.device) + self.shell_module_materialize(self.model.model.vision_tower, self.quantize_config.device) + self.shell_module_materialize(self.model.model.merger, self.quantize_config.device) + + def 
pre_quantize_generate_hook_end(self): + language_model = self.model.model.language_model + if self.quantize_config.offload_to_disk: + offload_to_disk( + model=language_model, + module=language_model.embed_tokens, + disk_path=self.quantize_config.offload_to_disk_path, + ) + offload_to_disk( + model=language_model, + module=language_model.rotary_emb, + disk_path=self.quantize_config.offload_to_disk_path, + ) + offload_to_disk( + model=self.model, + module=self.model.model.vision_tower, + disk_path=self.quantize_config.offload_to_disk_path, + ) + offload_to_disk( + model=self.model, + module=self.model.model.merger, + disk_path=self.quantize_config.offload_to_disk_path, + ) + return + + language_model.embed_tokens = move_to(language_model.embed_tokens, device=CPU) + language_model.rotary_emb = move_to(language_model.rotary_emb, device=CPU) + self.model.model.vision_tower = move_to(self.model.model.vision_tower, device=CPU) + self.model.model.merger = move_to(self.model.model.merger, device=CPU) + + def preprocess_dataset(self, sample: Dict) -> Dict: + return sample + + def load_processor(self) -> ProcessorMixin: + return AutoProcessor.from_pretrained(self.model_local_path, trust_remote_code=False) + + @classmethod + def prepare_inputs_for_conversations( + cls, + processor: ProcessorMixin, + conversations: list[dict] | list[list[dict]], + ): + if conversations and isinstance(conversations[0], dict): + conversations = [conversations] + + downsample_mode = "16x" # Using `downsample_mode="4x"` for Finer Detail + + inputs = processor.apply_chat_template( + conversations, tokenize=True, add_generation_prompt=True, + return_dict=True, return_tensors="pt", + downsample_mode=downsample_mode, + max_slice_nums=36, + ) + return inputs + + def prepare_dataset(self, calibration_dataset, batch_size: int = 1, **kwargs): + processor = self.load_processor() + calib_data = [] + for batch in batched(calibration_dataset, batch_size, process_func=self.preprocess_dataset): + 
calib_data.append( + self.prepare_inputs_for_conversations( + processor, + batch, + ) + ) + del processor + return calib_data + + def move_input_capture_example(self, example, data_device): + for key, value in example.items(): + example[key] = nested_move_to(value, device=data_device) + + return self.finalize_input_capture_example(example) + + def run_input_capture(self, example, use_cache: bool, data_device): + return self.model.generate( + **example, + ) diff --git a/gptqmodel/utils/hf.py b/gptqmodel/utils/hf.py index db2a60ed3..9f4d40546 100644 --- a/gptqmodel/utils/hf.py +++ b/gptqmodel/utils/hf.py @@ -1213,7 +1213,7 @@ def encoder_init_compat(self, encoder_config): encoder_cls.__init__ = encoder_init_compat encoder_cls._gptqmodel_meta_dpr_patch = True - if config.model_type == "minicpmv" or config.model_type == "minicpmo": + if config.model_type in {"minicpmv", "minicpmv4_6", "minicpmo"}: vision_model_cls = getattr( remote_module, "SiglipVisionTransformer", diff --git a/tests/models/ovis/image_to_test_dataset.py b/tests/models/ovis/image_to_test_dataset.py index a118035dc..7728ad1fa 100644 --- a/tests/models/ovis/image_to_test_dataset.py +++ b/tests/models/ovis/image_to_test_dataset.py @@ -7,7 +7,8 @@ from gptqmodel.models.definitions.ernie4_5_vl_moe import Ernie4_5_VLMoeQModel from gptqmodel.models.definitions.internvl_chat import InternVLChatQModel from gptqmodel.models.definitions.minicpm_o import MiniCPMOQModel -from gptqmodel.models.definitions.minicpm_v import MiniCPMVQModel +from gptqmodel.models.definitions.minicpmv import MiniCPMVQModel +from gptqmodel.models.definitions.minicpmv_4_6 import MiniCPMV4_6QModel from gptqmodel.models.definitions.ovis import OvisQModel from gptqmodel.models.definitions.ovis2 import Ovis2QModel from gptqmodel.models.definitions.qwen3_vl import Qwen3_VLQModel @@ -98,6 +99,7 @@ def get_calib_dataset(model): or isinstance(model, Qwen3_VLQModel) or isinstance(model, MiniCPMOQModel) or isinstance(model, MiniCPMVQModel) + or 
isinstance(model, MiniCPMV4_6QModel) or isinstance(model, InternVLChatQModel) or isinstance(model, Ernie4_5_VLMoeQModel) ): diff --git a/tests/models/test_minicpm_v_4_5.py b/tests/models/test_minicpmv_4_5.py similarity index 97% rename from tests/models/test_minicpm_v_4_5.py rename to tests/models/test_minicpmv_4_5.py index 743db3242..396ecf1db 100644 --- a/tests/models/test_minicpm_v_4_5.py +++ b/tests/models/test_minicpmv_4_5.py @@ -14,7 +14,7 @@ class TestMiniCPMV4_5(ModelTest): TRUST_REMOTE_CODE = True EVAL_BATCH_SIZE = 1 - def test_minicpm_v_4_5(self): + def test_minicpmv_4_5(self): # Evalution does not support minicpmv, and will throw an error during execution: # E TypeError: MiniCPMV.forward() missing 1 required positional argument: 'data with self.model_compat_test_context(): diff --git a/tests/models/test_minicpmv_4_6.py b/tests/models/test_minicpmv_4_6.py new file mode 100644 index 000000000..22962db40 --- /dev/null +++ b/tests/models/test_minicpmv_4_6.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai +# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai +# SPDX-License-Identifier: Apache-2.0 +# Contact: qubitium@modelcloud.ai, x.com/qubitium + +import os.path + +from model_test import ModelTest + + +class TestMiniCPMV4_6(ModelTest): + NATIVE_MODEL_ID = "openbmb/MiniCPM-V-4.6" + TRUST_REMOTE_CODE = True + EVAL_BATCH_SIZE = 1 + + def test_minicpmv_4_6(self): + # Evaluation does not support minicpmv, and will throw an error during execution: + # E TypeError: MiniCPMV.forward() missing 1 required positional argument: 'data' + with self.model_compat_test_context(): + model, tokenizer, processor = self.quantModel( + self.NATIVE_MODEL_ID, + trust_remote_code=self.TRUST_REMOTE_CODE, + dtype=self.TORCH_DTYPE, + batch_size=1, + call_perform_post_quant_validation=False, + ) + + messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "image": os.path.join(os.path.dirname(os.path.abspath(__file__)), 
"ovis/10016.jpg"), + }, + {"type": "text", "text": "Describe this image."}, + ], + } + ] + + downsample_mode = "16x" # Using `downsample_mode="4x"` for Finer Detail + + inputs = processor.apply_chat_template( + messages, tokenize=True, add_generation_prompt=True, + return_dict=True, return_tensors="pt", + downsample_mode=downsample_mode, + max_slice_nums=36, + ).to(model.device) + + generated_ids = model.generate(**inputs, downsample_mode=downsample_mode, max_new_tokens=512) + + generated_ids_trimmed = [ + out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + ] + output_text = processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + )[0] + print(f'Output:\n{output_text}') + + self.assertIn("snow", output_text.lower())