Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

## Latest News

* 05/15/2026 7.1.0-dev `main`: ✨ Added `mimo_v2` model support
* 05/13/2026 7.1.0-dev `main`: ✨ Added `minicpmv_4_6` and `DeepSeek V4` model support
* 05/07/2026 7.1.0-dev `main`: ✨ Added `GLM-4.5V`, `GLM-4.6V`, `Zamba` and `Zamba2` model support
* 04/29/2026 7.1.0-dev `main`: ✨ Added PoolSideAI `Laguna` model support for fused Laguna MoE checkpoints. Added `ERNIE 4.5 VL MoE`, `Ling-2.6-flash` and NVIDIA `Nemotron 3 Nano Omni` model support.
Expand Down Expand Up @@ -260,7 +261,7 @@ Selected public references where teams or companies explicitly mention GPT-QMode
| ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V/V 4_6 | ✅ | PanGu-α | ✅ |
| XVERSE | ✅ | Brumby | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3/3.5 | ✅ |
| MiniMax M2 | ✅ | AfMoE | ✅ | Bailing-MoE | ✅ | LFM2-MoE | ✅ | Marin | ✅ |
| InternVL Chat | ✅ | Laguna | ✅ | Zamba / Zamba2 | ✅ | | | | |
| InternVL Chat | ✅ | Laguna | ✅ | MiMo / MiMo V2 | ✅ | Zamba / Zamba2 | ✅ | | |

Prism Bonsai GGUF checkpoints are supported for inference only through GPT-QModel's native GGUF path and internal GGUF runtime. Bonsai checkpoints load through the normal model path or repo argument and do not require the external `gguf` package. Prism model quantization is not included.

Expand Down
2 changes: 2 additions & 0 deletions gptqmodel/models/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@
from .definitions.llava_qwen2 import LlavaQwen2QModel # noqa: E402
from .definitions.longcat_flash import LongCatFlashQModel # noqa: E402
from .definitions.mimo import MimoQModel # noqa: E402
from .definitions.mimo_v2 import MimoV2QModel # noqa: E402
from .definitions.minicpm import MiniCPMGPTQ # noqa: E402
from .definitions.minicpm3 import MiniCpm3QModel # noqa: E402
from .definitions.minicpm_o import MiniCPMOQModel # noqa: E402
Expand Down Expand Up @@ -285,6 +286,7 @@
"telechat": TeleChat2QModel,
"instella": InstellaQModel,
"mimo": MimoQModel,
"mimo_v2": MimoV2QModel,
"falcon_h1": FalconH1QModel,
"zamba": ZambaQModel,
"zamba2": Zamba2QModel,
Expand Down
1 change: 1 addition & 0 deletions gptqmodel/models/definitions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
from .minicpmv import MiniCPMVQModel
from .minicpmv_4_6 import MiniCPMV4_6QModel
from .minimax_m2 import MiniMaxM2GPTQ
from .mimo_v2 import MimoV2QModel
from .mixtral import MixtralQModel
from .mllama import MLlamaQModel
from .mobilellm import MobileLLMQModel
Expand Down
136 changes: 136 additions & 0 deletions gptqmodel/models/definitions/mimo_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# SPDX-FileCopyrightText: 2026 ModelCloud.ai
# SPDX-License-Identifier: Apache-2.0

import json
import os

from safetensors import safe_open
from torch import nn

from gptqmodel.models.moe_lifecycle import GateUpDownMoELifecycleHooks

from ..base import BaseQModel
from ...utils.torch import CPU


class MimoV2QModel(BaseQModel):
    """Model definition for MiMo V2 checkpoints.

    Besides the declarative settings below, the class does two things:

    * ``after_model_load`` clears parameters that the instantiated model
      declares but the local checkpoint does not store (visual merger biases
      and the audio input embedding).
    * ``pre_quantize_generate_hook_start`` rebuilds the two rotary embedding
      modules by hand on CPU.
    """

    # MiMo V2 uses repository-defined configuration/modeling classes.
    require_trust_remote_code = True

    # Name of the model-config attribute that carries the routed-expert count.
    dynamic_expert_index = "n_routed_experts"

    pre_lm_head_norm_module = "model.norm"
    rotary_embedding = "model.rotary_emb"

    awq_scale_optimize_shape_dependent_modules = ["self_attn.o_proj"]

    moe_lifecycle_hooks = GateUpDownMoELifecycleHooks()

    # MiMo V2 supports both split q/k/v and fused qkv checkpoints, and individual
    # layers can be dense MLP or routed MoE according to config.moe_layer_freq.
    layer_modules_strict = False

    # NOTE(review): the ":0" / ":1" / ":!" / ":moe:?" suffixes are interpreted
    # by BaseQModel, which is not visible here -- confirm their exact meaning
    # there before editing this tree.
    module_tree = [
        "model",
        "layers",
        "#",
        {
            "input_layernorm": ("input_layernorm:!",),
            "self_attn": ("qkv_proj:0", "q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"),
            "post_attention_layernorm": ("post_attention_layernorm:!",),
            "mlp:moe:?": {
                "": ("gate_proj:0", "up_proj:0", "down_proj:1"),
                "gate": ("gate:!",),
                "experts": {
                    "#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
                },
            },
        },
    ]

    @staticmethod
    def _checkpoint_has_tensor(model_local_path: str, tensor_name: str) -> bool:
        """Return whether the local checkpoint lists ``tensor_name``.

        The sharded index (``model.safetensors.index.json``) is consulted
        first, then a single-file ``model.safetensors``. When neither file
        exists, or no path is given, the answer is ``True`` so that callers
        never drop a parameter on missing evidence.

        Args:
            model_local_path: Directory holding the checkpoint; may be empty.
            tensor_name: Fully qualified tensor name to look up.

        Returns:
            ``False`` only when a checkpoint listing was found and it does not
            contain ``tensor_name``; ``True`` otherwise.
        """
        if not model_local_path:
            return True

        index_path = os.path.join(model_local_path, "model.safetensors.index.json")
        if os.path.isfile(index_path):
            with open(index_path, encoding="utf-8") as fp:
                weight_map = json.load(fp).get("weight_map", {})
            return tensor_name in weight_map

        tensor_file = os.path.join(model_local_path, "model.safetensors")
        if os.path.isfile(tensor_file):
            with safe_open(tensor_file, framework="pt", device="cpu") as handler:
                return tensor_name in handler.keys()

        return True

    @staticmethod
    def _drop_visual_merger_biases_if_checkpoint_omits_them(model, model_local_path: str) -> None:
        """Clear ``visual.merger`` biases that the checkpoint does not store.

        A bias is cleared only when the checkpoint lists the sibling weight but
        not the bias itself. Nothing happens when ``model.visual.merger`` is
        absent or is not an ``nn.Module``.

        Args:
            model: Loaded model object; inspected via ``model.visual.merger``.
            model_local_path: Directory holding the checkpoint.
        """
        visual = getattr(model, "visual", None)
        merger = getattr(visual, "merger", None)
        if not isinstance(merger, nn.Module):
            return

        for module_name, module in merger.named_modules():
            if getattr(module, "bias", None) is None:
                continue

            # named_modules() yields "" for the merger itself.
            prefix = "visual.merger"
            if module_name:
                prefix = f"{prefix}.{module_name}"
            weight_name = f"{prefix}.weight"
            bias_name = f"{prefix}.bias"
            if MimoV2QModel._checkpoint_has_tensor(model_local_path, bias_name):
                continue
            # Without the weight in the listing there is no evidence that this
            # naming scheme matches the checkpoint, so leave the module alone.
            if not MimoV2QModel._checkpoint_has_tensor(model_local_path, weight_name):
                continue

            # MiMo V2.5 Base visual merger checkpoints include weights but omit
            # default biases; align the shell so offload-backed save skips them.
            module.register_parameter("bias", None)

    @staticmethod
    def _drop_parameter_if_checkpoint_omits_it(model, model_local_path: str, tensor_name: str) -> None:
        """Set the parameter named ``tensor_name`` to ``None`` if the checkpoint lacks it.

        The call is a no-op when the checkpoint lists the tensor, when any
        attribute along the dotted path is missing, when the resolved owner is
        not an ``nn.Module``, or when the leaf is not one of its registered
        parameters.

        Args:
            model: Root object from which the dotted path is resolved.
            model_local_path: Directory holding the checkpoint.
            tensor_name: Dotted path to the parameter, e.g. ``"a.b.weight"``.
        """
        if MimoV2QModel._checkpoint_has_tensor(model_local_path, tensor_name):
            return

        module_path, _, leaf = tensor_name.rpartition(".")
        module = model
        for part in module_path.split("."):
            module = getattr(module, part, None)
            if module is None:
                return

        if not isinstance(module, nn.Module) or leaf not in module._parameters:
            return

        module.register_parameter(leaf, None)

    @staticmethod
    def _drop_checkpoint_omitted_audio_tensors(model, model_local_path: str) -> None:
        """Clear the audio input embedding weight when the checkpoint omits it."""
        # Remote MiMo marks this input embedding as load-missing-ignored and
        # feeds the local transformer via inputs_embeds, so no trained weight exists.
        MimoV2QModel._drop_parameter_if_checkpoint_omits_it(
            model,
            model_local_path,
            "audio_encoder.input_local_transformer.embed_tokens.weight",
        )

    def after_model_load(self, model, load_quantized_model=False):
        """Run the base post-load step, then drop parameters the checkpoint omits.

        Args:
            model: The freshly loaded model.
            load_quantized_model: Forwarded unchanged to the base implementation.

        Returns:
            The model returned by the base implementation, after the visual
            merger biases and audio embedding have been reconciled against
            ``self.model_local_path``.
        """
        model = super().after_model_load(model, load_quantized_model=load_quantized_model)
        self._drop_visual_merger_biases_if_checkpoint_omits_them(model, self.model_local_path)
        self._drop_checkpoint_omitted_audio_tensors(model, self.model_local_path)
        return model

    def pre_quantize_generate_hook_start(self):
        """Rebuild ``rotary_emb`` and ``swa_rotary_emb`` on CPU before quantization.

        Raises:
            TypeError: If ``self.model.model.rotary_emb`` is not an instance of
                a class whose name contains ``MiMoV2RotaryEmbedding``.
        """
        model = self.model.model
        rotary_emb_cls = type(model.rotary_emb)
        # Explicit check instead of `assert`: asserts are removed under
        # `python -O`, which would let an unrelated class be constructed below.
        if "MiMoV2RotaryEmbedding" not in rotary_emb_cls.__name__:
            raise TypeError(
                "MimoV2QModel expected model.rotary_emb to be a MiMoV2RotaryEmbedding, "
                f"got {rotary_emb_cls.__name__}"
            )
        config = model.rotary_emb.config
        # MiMoV2RotaryEmbedding cannot be correctly reconstructed via `_build_nonpersistent_buffer_template()`.
        # Since it takes three arguments, `_build_nonpersistent_buffer_template()` is unable to infer the `is_swa` parameter.
        # Therefore, MiMoV2RotaryEmbedding is manually reconstructed here.
        model.rotary_emb = rotary_emb_cls(config=config, is_swa=False, device=CPU)
        model.swa_rotary_emb = rotary_emb_cls(config=config, is_swa=True, device=CPU)
26 changes: 26 additions & 0 deletions tests/models/test_mimo_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
# SPDX-License-Identifier: Apache-2.0
# Contact: qubitium@modelcloud.ai, x.com/qubitium
from gptqmodel.quantization.config import MoEConfig, ExpertsRoutingOverride
from model_test import ModelTest


class TestMimo(ModelTest):
    """Quantize the MiMo V2.5 Base BF16 checkpoint and evaluate the result."""

    # --- checkpoint and loader settings ---
    NATIVE_MODEL_ID = "/monster/data/model/MiMo-V2.5-Base-BF16"
    TRUST_REMOTE_CODE = True
    USE_FLASH_ATTN = False

    # --- quantization settings ---
    # NOTE(review): presumably forces every expert to be routed during
    # calibration -- confirm against ExpertsRoutingOverride.
    MOE_CONFIG = MoEConfig(
        routing=ExpertsRoutingOverride(num_experts_per_tok="all"),
    )
    MODEL_COMPAT_FAST_LAYER_POSITION = "first"

    # --- evaluation settings ---
    EVAL_BATCH_SIZE = 6
    EVAL_TASKS_SLOW = {
        "arc_challenge": {
            "chat_template": True,
            "acc": {"value": 0.2739, "floor_pct": 0.2},
            "acc_norm": {"value": 0.3055, "floor_pct": 0.2},
        },
    }
    # The fast task set is derived from the slow one by the shared test base.
    EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)

    def test_mimo(self):
        # The whole quantize/evaluate flow lives in ModelTest.
        self.quantize_and_evaluate()
169 changes: 169 additions & 0 deletions tests/test_mimo_v2_support.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
import json
from types import SimpleNamespace

from torch import nn

from gptqmodel.models import auto
from gptqmodel.models.definitions.mimo_v2 import MimoV2QModel


# Recorded configuration signature of the local MiMo V2.5 Base checkpoint.
# The snapshot test in this module compares it against a second literal copy.
_LOCAL_MIMO_V2_5_BASE_MODELING_SIGNATURE = {
    # identity
    "architectures": ["MiMoV2ForCausalLM"],
    "attention_projection_layout": "fused_qkv",
    # sizes
    "hidden_size": 4096,
    "intermediate_size": 16384,
    "model_type": "mimo_v2",
    "moe_intermediate_size": 2048,
    # expert / head / layer counts
    "n_routed_experts": 256,
    "num_attention_heads": 64,
    "num_experts_per_tok": 8,
    "num_hidden_layers": 48,
    "num_key_value_heads": 4,
}


class _FakeVisualMerger(nn.Module):
    """Tiny stand-in for a visual merger: one LayerNorm followed by a small MLP.

    Every parametrised sub-module (``ln_q``, ``mlp[0]``, ``mlp[2]``) is created
    with its default bias, which is what the bias-dropping tests rely on.
    """

    def __init__(self):
        super().__init__()
        self.ln_q = nn.LayerNorm(8)
        # Linear -> GELU -> Linear, registered under indices 0, 1 and 2.
        projection_layers = [nn.Linear(8, 8), nn.GELU(), nn.Linear(8, 4)]
        self.mlp = nn.Sequential(*projection_layers)


class _FakeAudioEncoder(nn.Module):
    """Tiny stand-in exposing ``input_local_transformer.embed_tokens`` only."""

    def __init__(self):
        super().__init__()
        # A bare nn.Module is enough: the tests only walk the attribute path
        # down to the embedding and inspect its ``weight`` parameter.
        local_transformer = nn.Module()
        local_transformer.embed_tokens = nn.Embedding(16, 8)
        self.input_local_transformer = local_transformer


def test_mimo_v2_model_type_selects_definition(monkeypatch):
    """A config whose ``model_type`` is ``mimo_v2`` resolves to ``MimoV2QModel``."""
    stub_config = SimpleNamespace(model_type="mimo_v2")

    def _echo_trust_remote_code(path, trust_remote_code=False):
        # Hand back the caller's flag untouched.
        return trust_remote_code

    def _load_stub_config(*args, **kwargs):
        # Whatever is requested, answer with the stub config.
        return stub_config

    monkeypatch.setattr(auto, "resolve_trust_remote_code", _echo_trust_remote_code)
    monkeypatch.setattr(auto.AutoConfig, "from_pretrained", _load_stub_config)

    selected = auto.check_and_get_model_definition("/monster/data/model/MiMo-V2.5-Base")
    assert selected is MimoV2QModel


def test_mimo_v2_5_base_local_modeling_signature_snapshot():
    """The recorded signature constant must equal this second, literal copy."""
    expected_signature = {
        "architectures": ["MiMoV2ForCausalLM"],
        "attention_projection_layout": "fused_qkv",
        "hidden_size": 4096,
        "intermediate_size": 16384,
        "model_type": "mimo_v2",
        "moe_intermediate_size": 2048,
        "n_routed_experts": 256,
        "num_attention_heads": 64,
        "num_experts_per_tok": 8,
        "num_hidden_layers": 48,
        "num_key_value_heads": 4,
    }

    assert _LOCAL_MIMO_V2_5_BASE_MODELING_SIGNATURE == expected_signature


def test_mimo_v2_module_tree_expands_fused_attention_dense_mlp_and_moe_paths():
    """The expanded module list covers fused/split attention, dense MLP and experts."""
    expanded_blocks = MimoV2QModel.simple_layer_modules(
        model_config=SimpleNamespace(n_routed_experts=4),
        quantize_config=SimpleNamespace(dynamic=None),
    )
    flat_modules = set()
    for block in expanded_blocks:
        flat_modules.update(block)

    # Declarative class settings.
    assert MimoV2QModel.require_trust_remote_code is True
    assert MimoV2QModel.layer_modules_strict is False
    assert MimoV2QModel.pre_lm_head_norm_module == "model.norm"
    assert MimoV2QModel.rotary_embedding == "model.rotary_emb"

    # Every attention, dense-MLP and first-expert projection must be listed.
    expected_present = (
        "self_attn.qkv_proj",
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.up_proj",
        "mlp.down_proj",
        "mlp.experts.0.gate_proj",
        "mlp.experts.0.up_proj",
        "mlp.experts.0.down_proj",
    )
    for module_name in expected_present:
        assert module_name in flat_modules

    # The router gate must not appear in the expanded list.
    assert "mlp.gate" not in flat_modules


def test_mimo_v2_drops_visual_merger_biases_when_checkpoint_omits_them(tmp_path):
    """Biases are cleared when the index lists only the merger weights."""
    merger = _FakeVisualMerger()
    model = SimpleNamespace(visual=SimpleNamespace(merger=merger))

    # The index lists each weight but none of the biases.
    weight_map = {
        "visual.merger.ln_q.weight": "model.safetensors",
        "visual.merger.mlp.0.weight": "model.safetensors",
        "visual.merger.mlp.2.weight": "model.safetensors",
    }
    index_file = tmp_path / "model.safetensors.index.json"
    with index_file.open("w", encoding="utf-8") as fp:
        json.dump({"metadata": {}, "weight_map": weight_map}, fp)

    MimoV2QModel._drop_visual_merger_biases_if_checkpoint_omits_them(model, str(tmp_path))

    for module in (model.visual.merger.ln_q, model.visual.merger.mlp[0], model.visual.merger.mlp[2]):
        assert module.bias is None


def test_mimo_v2_keeps_visual_merger_biases_when_checkpoint_has_them(tmp_path):
    """Biases stay in place when the index lists them next to the weights."""
    merger = _FakeVisualMerger()
    model = SimpleNamespace(visual=SimpleNamespace(merger=merger))

    # The index lists a weight and a bias for every merger sub-module.
    weight_map = {
        "visual.merger.ln_q.weight": "model.safetensors",
        "visual.merger.ln_q.bias": "model.safetensors",
        "visual.merger.mlp.0.weight": "model.safetensors",
        "visual.merger.mlp.0.bias": "model.safetensors",
        "visual.merger.mlp.2.weight": "model.safetensors",
        "visual.merger.mlp.2.bias": "model.safetensors",
    }
    index_file = tmp_path / "model.safetensors.index.json"
    with index_file.open("w", encoding="utf-8") as fp:
        json.dump({"metadata": {}, "weight_map": weight_map}, fp)

    MimoV2QModel._drop_visual_merger_biases_if_checkpoint_omits_them(model, str(tmp_path))

    for module in (model.visual.merger.ln_q, model.visual.merger.mlp[0], model.visual.merger.mlp[2]):
        assert module.bias is not None


def test_mimo_v2_drops_audio_input_embedding_when_checkpoint_omits_it(tmp_path):
    """The audio input embedding is cleared when the index does not list it."""
    audio_encoder = _FakeAudioEncoder()
    model = SimpleNamespace(audio_encoder=audio_encoder)

    # The index holds an unrelated audio tensor, but not the embedding weight.
    weight_map = {
        "audio_encoder.input_local_transformer.layers.0.input_layernorm.weight": "model.safetensors",
    }
    index_file = tmp_path / "model.safetensors.index.json"
    with index_file.open("w", encoding="utf-8") as fp:
        json.dump({"metadata": {}, "weight_map": weight_map}, fp)

    MimoV2QModel._drop_checkpoint_omitted_audio_tensors(model, str(tmp_path))

    embed_tokens = model.audio_encoder.input_local_transformer.embed_tokens
    assert embed_tokens.weight is None


def test_mimo_v2_keeps_audio_input_embedding_when_checkpoint_has_it(tmp_path):
    """The audio input embedding is kept when the index lists its weight."""
    audio_encoder = _FakeAudioEncoder()
    model = SimpleNamespace(audio_encoder=audio_encoder)

    # The index explicitly lists the embedding weight.
    weight_map = {
        "audio_encoder.input_local_transformer.embed_tokens.weight": "model.safetensors",
    }
    index_file = tmp_path / "model.safetensors.index.json"
    with index_file.open("w", encoding="utf-8") as fp:
        json.dump({"metadata": {}, "weight_map": weight_map}, fp)

    MimoV2QModel._drop_checkpoint_omitted_audio_tensors(model, str(tmp_path))

    embed_tokens = model.audio_encoder.input_local_transformer.embed_tokens
    assert embed_tokens.weight is not None