diff --git a/README.md b/README.md index ab1062952..4f073596e 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ ## Latest News +* 05/15/2026 7.1.0-dev `main`: ✨ Added `mimo_v2` model support * 05/13/2026 7.1.0-dev `main`: ✨ Added `minicpmv_4_6` and `DeepSeek V4` model support * 05/07/2026 7.1.0-dev `main`: ✨ Added `GLM-4.5V`, `GLM-4.6V`, `Zamba` and `Zamba2` model support * 04/29/2026 7.1.0-dev `main`: ✨ Added PoolSideAI `Laguna` model support for fused Laguna MoE checkpoints. Added `ERNIE 4.5 VL MoE`, `Ling-2.6-flash` and NVIDIA `Nemotron 3 Nano Omni` model support. @@ -260,7 +261,7 @@ Selected public references where teams or companies explicitly mention GPT-QMode | ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V/V 4_6 | ✅ | PanGu-α | ✅ | | XVERSE | ✅ | Brumby | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3/3.5 | ✅ | | MiniMax M2 | ✅ | AfMoE | ✅ | Bailing-MoE | ✅ | LFM2-MoE | ✅ | Marin | ✅ | -| InternVL Chat | ✅ | Laguna | ✅ | Zamba / Zamba2 | ✅ | | | | | +| InternVL Chat | ✅ | Laguna | ✅ | Mimo / Mimo V2 | ✅ | Zamba / Zamba2 | ✅ | | | Prism Bonsai GGUF checkpoints are supported for inference only through GPT-QModel's native GGUF path and internal GGUF runtime. Bonsai checkpoints load through the normal model path or repo argument and do not require the external `gguf` package. Prism model quantization is not included. 
class MimoV2QModel(BaseQModel):
    """Model definition registered for the ``mimo_v2`` model type.

    Besides the declarative quantization layout (``module_tree`` and the
    class-level switches), this class post-processes a freshly loaded model so
    that parameters the checkpoint does not store are removed from the module
    shell, and it rebuilds the rotary embedding modules before the
    pre-quantization generate step.
    """

    # MiMo V2 uses repository-defined configuration/modeling classes.
    require_trust_remote_code = True

    # Name of the config attribute holding the routed expert count.
    dynamic_expert_index = "n_routed_experts"

    pre_lm_head_norm_module = "model.norm"
    rotary_embedding = "model.rotary_emb"

    awq_scale_optimize_shape_dependent_modules = ["self_attn.o_proj"]

    moe_lifecycle_hooks = GateUpDownMoELifecycleHooks()

    # MiMo V2 supports both split q/k/v and fused qkv checkpoints, and individual
    # layers can be dense MLP or routed MoE according to config.moe_layer_freq.
    layer_modules_strict = False

    module_tree = [
        "model",
        "layers",
        "#",
        {
            "input_layernorm": ("input_layernorm:!",),
            "self_attn": ("qkv_proj:0", "q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"),
            "post_attention_layernorm": ("post_attention_layernorm:!",),
            "mlp:moe:?": {
                "": ("gate_proj:0", "up_proj:0", "down_proj:1"),
                "gate": ("gate:!",),
                "experts": {
                    "#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
                },
            },
        },
    ]

    @staticmethod
    def _checkpoint_tensor_names(model_local_path: str) -> "frozenset | None":
        """Collect the tensor names stored in a local safetensors checkpoint.

        The sharded index (``model.safetensors.index.json``) is preferred; a
        single ``model.safetensors`` file is used as the fallback.

        Args:
            model_local_path: Directory holding the checkpoint files.

        Returns:
            The set of stored tensor names, or ``None`` when the contents
            cannot be determined (empty path, or neither file exists).
        """
        if not model_local_path:
            return None

        index_path = os.path.join(model_local_path, "model.safetensors.index.json")
        if os.path.isfile(index_path):
            with open(index_path, encoding="utf-8") as fp:
                return frozenset(json.load(fp).get("weight_map", {}))

        tensor_file = os.path.join(model_local_path, "model.safetensors")
        if os.path.isfile(tensor_file):
            with safe_open(tensor_file, framework="pt", device="cpu") as handler:
                return frozenset(handler.keys())

        return None

    @staticmethod
    def _checkpoint_has_tensor(model_local_path: str, tensor_name: str) -> bool:
        """Tell whether the checkpoint stores ``tensor_name``.

        Returns ``True`` when the checkpoint contents are unknown, so callers
        never drop a parameter on missing information.
        """
        names = MimoV2QModel._checkpoint_tensor_names(model_local_path)
        return names is None or tensor_name in names

    @staticmethod
    def _drop_visual_merger_biases_if_checkpoint_omits_them(model, model_local_path: str) -> None:
        """Unregister ``visual.merger`` biases that the checkpoint does not store.

        A bias is dropped only when the checkpoint stores the matching weight
        but not the bias. Nothing happens when ``model.visual.merger`` is not an
        ``nn.Module`` or when the checkpoint contents are unknown.

        Args:
            model: Loaded model shell; ``visual`` / ``merger`` may be absent.
            model_local_path: Directory holding the checkpoint files.
        """
        visual = getattr(model, "visual", None)
        merger = getattr(visual, "merger", None)
        if not isinstance(merger, nn.Module):
            return

        # Only registered parameters can be unregistered: calling
        # register_parameter("bias", None) on a module whose `bias` is a plain
        # attribute or buffer raises KeyError. This matches the `_parameters`
        # guard used by `_drop_parameter_if_checkpoint_omits_it`.
        candidates = [
            (module_name, module)
            for module_name, module in merger.named_modules()
            if module._parameters.get("bias") is not None
        ]
        if not candidates:
            return

        # Read the checkpoint listing once instead of once per lookup.
        stored_names = MimoV2QModel._checkpoint_tensor_names(model_local_path)
        if stored_names is None:
            return

        for module_name, module in candidates:
            prefix = f"visual.merger.{module_name}" if module_name else "visual.merger"
            if f"{prefix}.bias" in stored_names:
                continue
            if f"{prefix}.weight" not in stored_names:
                continue

            # MiMo V2.5 Base visual merger checkpoints include weights but omit
            # default biases; align the shell so offload-backed save skips them.
            module.register_parameter("bias", None)

    @staticmethod
    def _drop_parameter_if_checkpoint_omits_it(model, model_local_path: str, tensor_name: str) -> None:
        """Unregister the parameter ``tensor_name`` when the checkpoint lacks it.

        The dotted ``tensor_name`` is resolved attribute by attribute from
        ``model``. The call is a no-op when the checkpoint stores the tensor,
        when any path component is missing, or when the leaf is not a
        registered parameter of the resolved module.
        """
        if MimoV2QModel._checkpoint_has_tensor(model_local_path, tensor_name):
            return

        module_path, _, leaf = tensor_name.rpartition(".")
        module = model
        for part in module_path.split("."):
            module = getattr(module, part, None)
            if module is None:
                return

        if not isinstance(module, nn.Module) or leaf not in module._parameters:
            return

        module.register_parameter(leaf, None)

    @staticmethod
    def _drop_checkpoint_omitted_audio_tensors(model, model_local_path: str) -> None:
        """Unregister audio-encoder parameters that the checkpoint omits."""
        # Remote MiMo marks this input embedding as load-missing-ignored and
        # feeds the local transformer via inputs_embeds, so no trained weight exists.
        MimoV2QModel._drop_parameter_if_checkpoint_omits_it(
            model,
            model_local_path,
            "audio_encoder.input_local_transformer.embed_tokens.weight",
        )

    def after_model_load(self, model, load_quantized_model=False):
        """Run the base post-load step, then prune parameters the checkpoint omits.

        Returns:
            The model returned by the parent hook, after pruning.
        """
        model = super().after_model_load(model, load_quantized_model=load_quantized_model)
        self._drop_visual_merger_biases_if_checkpoint_omits_them(model, self.model_local_path)
        self._drop_checkpoint_omitted_audio_tensors(model, self.model_local_path)
        return model

    def pre_quantize_generate_hook_start(self):
        """Rebuild ``rotary_emb`` and ``swa_rotary_emb`` on CPU.

        Raises:
            AssertionError: If the existing rotary module's class name does not
                contain ``MiMoV2RotaryEmbedding``.
        """
        model = self.model.model
        rotary_emb_cls = type(model.rotary_emb)
        # Explicit raise instead of `assert` so the check survives `python -O`;
        # the exception type is unchanged.
        if "MiMoV2RotaryEmbedding" not in rotary_emb_cls.__name__:
            raise AssertionError(
                f"Expected a MiMoV2RotaryEmbedding rotary module, got `{rotary_emb_cls.__name__}`."
            )
        config = model.rotary_emb.config
        # MiMoV2RotaryEmbedding cannot be correctly reconstructed via `_build_nonpersistent_buffer_template()`.
        # Since it takes three arguments, `_build_nonpersistent_buffer_template()` is unable to infer the `is_swa` parameter.
        # Therefore, MiMoV2RotaryEmbedding is manually reconstructed here.
        model.rotary_emb = rotary_emb_cls(config=config, is_swa=False, device=CPU)
        model.swa_rotary_emb = rotary_emb_cls(config=config, is_swa=True, device=CPU)
class _FakeVisualMerger(nn.Module):
    """Tiny stand-in for a visual merger used by the bias-dropping tests.

    Exposes a ``ln_q`` LayerNorm and an ``mlp`` made of Linear, GELU, Linear,
    so the sub-modules that carry a bias are ``ln_q``, ``mlp.0`` and ``mlp.2``.
    """

    def __init__(self):
        super().__init__()
        width, projected = 8, 4
        self.ln_q = nn.LayerNorm(width)
        stages = [
            nn.Linear(width, width),
            nn.GELU(),
            nn.Linear(width, projected),
        ]
        self.mlp = nn.Sequential(*stages)
def test_mimo_v2_drops_visual_merger_biases_when_checkpoint_omits_them(tmp_path):
    """Biases are removed when the index lists only the merger weights."""
    merger = _FakeVisualMerger()
    model = SimpleNamespace(visual=SimpleNamespace(merger=merger))

    # The checkpoint index stores every merger weight but none of the biases.
    weight_only_names = (
        "visual.merger.ln_q.weight",
        "visual.merger.mlp.0.weight",
        "visual.merger.mlp.2.weight",
    )
    index = {
        "metadata": {},
        "weight_map": {name: "model.safetensors" for name in weight_only_names},
    }
    index_file = tmp_path / "model.safetensors.index.json"
    index_file.write_text(json.dumps(index), encoding="utf-8")

    MimoV2QModel._drop_visual_merger_biases_if_checkpoint_omits_them(model, str(tmp_path))

    assert model.visual.merger.ln_q.bias is None
    assert model.visual.merger.mlp[0].bias is None
    assert model.visual.merger.mlp[2].bias is None
def test_mimo_v2_drops_audio_input_embedding_when_checkpoint_omits_it(tmp_path):
    """The audio input embedding weight is removed when the index does not list it."""
    audio_encoder = _FakeAudioEncoder()
    model = SimpleNamespace(audio_encoder=audio_encoder)

    # The index lists only an unrelated audio tensor, so the embedding is absent.
    unrelated_tensor = "audio_encoder.input_local_transformer.layers.0.input_layernorm.weight"
    payload = json.dumps(
        {
            "metadata": {},
            "weight_map": {unrelated_tensor: "model.safetensors"},
        }
    )
    (tmp_path / "model.safetensors.index.json").write_text(payload, encoding="utf-8")

    MimoV2QModel._drop_checkpoint_omitted_audio_tensors(model, str(tmp_path))

    embed_tokens = model.audio_encoder.input_local_transformer.embed_tokens
    assert embed_tokens.weight is None