Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

## Latest News

* 05/19/2026 7.1.0-dev `main`: ✨ Added `ovis2_6_moe` model support
* 05/18/2026 7.1.0-dev `main`: ✨ Added `ovis2_5` model support
* 05/15/2026 7.1.0-dev `main`: ✨ Added `mimo_v2` model support
* 05/13/2026 7.1.0-dev `main`: ✨ Added `minicpmv_4_6` and `DeepSeek V4` model support
Expand Down Expand Up @@ -257,7 +258,7 @@ Selected public references where teams or companies explicitly mention GPT-QMode
| DBRX Converted | ✅ | GPT-2 | ✅ | Llama 3.2 VL | ✅ | Nemotron Ultra | ✅ | TeleChat2 | ✅ |
| Deci | ✅ | GPT-J | ✅ | Llama 4 | ✅ | OPT | ✅ | Trinity | ✅ |
| DeepSeek-V2/V3/V4/R1 | ✅ | GPT-OSS | ✅ | LongCat Flash | ✅ | OLMo2 / LLaDA2 | ✅ | Yi | ✅ |
| DeepSeek-V2-Lite | ✅ | Granite / Granite MoE | ✅ | LongLLaMA | ✅ | Ovis 1.6/2/2.5 | ✅ | Seed-OSS | ✅ |
| DeepSeek-V2-Lite | ✅ | Granite / Granite MoE | ✅ | LongLLaMA | ✅ | Ovis 1.6/2/2.5/2.6 MoE | ✅ | Seed-OSS | ✅ |
| Dream | ✅ | GRIN-MoE | ✅ | Instella | ✅ | Phi 1-4 | ✅ | Voxtral | ✅ |
| ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V/V 4_6 | ✅ | PanGu-α | ✅ |
| XVERSE | ✅ | Brumby | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3/3.5 | ✅ |
Expand Down
8 changes: 4 additions & 4 deletions gptqmodel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,12 +175,12 @@ def _build_device_thread_pool():
"cpu": WarmupTask(run_torch_linalg_warmup, scope=WarmUpCtx.THREAD_AND_DEVICE),
},
workers={
"cuda:per": 4,
"cuda:per": 1,
"xpu:per": 1,
"npu:per": 1,
"mps": 8,
"cpu": min(12, max(1, (os.cpu_count() or 1) + 1 // 2)), # count + 1, fixed pool size > 1 check when count=3
"model_loader:cpu": 2,
"mps": 1,
"cpu": 1, # count + 1, fixed pool size > 1 check when count=3
"model_loader:cpu": 1,
},
empty_cache_every_n=512,
)
Expand Down
2 changes: 2 additions & 0 deletions gptqmodel/models/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@
from .definitions.ovis import OvisQModel # noqa: E402
from .definitions.ovis2 import Ovis2QModel # noqa: E402
from .definitions.ovis2_5 import Ovis2_5QModel # noqa: E402
from .definitions.ovis2_6_moe import Ovis2_6_MoeQModel # noqa: E402
from .definitions.pangu_alpha import PanguAlphaQModel # noqa: E402
from .definitions.phi import PhiQModel # noqa: E402
from .definitions.phi3 import Phi3QModel, PhiMoEGPTQForCausalLM # noqa: E402
Expand Down Expand Up @@ -285,6 +286,7 @@
"ovis": OvisQModel,
"ovis2": Ovis2QModel,
"ovis2_5": Ovis2_5QModel,
"ovis2_6_moe": Ovis2_6_MoeQModel,
"telechat": TeleChat2QModel,
"instella": InstellaQModel,
"mimo": MimoQModel,
Expand Down
5 changes: 5 additions & 0 deletions gptqmodel/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,11 @@ class BaseQModel(nn.Module):

INPUT_EMBEDDING_EXTRA_ARGS = None

# Some models (e.g. ovis2_6_moe) do not contain MoE layers directly.
# The actual experts live inside submodules (e.g. Qwen3MoeModel.mlp.experts),
# so `defuser_module_paths` is used to explicitly locate and defuse them.
defuser_module_paths = None

def __init__(
self,
model: PreTrainedModel,
Expand Down
1 change: 1 addition & 0 deletions gptqmodel/models/definitions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
from .opt import OptQModel
from .ovis import OvisQModel
from .ovis2_5 import Ovis2_5QModel
from .ovis2_6_moe import Ovis2_6_MoeQModel
from .phi import PhiQModel
from .phi3 import Phi3QModel
from .qwen import QwenQModel
Expand Down
75 changes: 75 additions & 0 deletions gptqmodel/models/definitions/ovis2_6_moe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
# SPDX-License-Identifier: Apache-2.0
# Contact: qubitium@modelcloud.ai, x.com/qubitium

import torch
from torch import nn

from ..moe_lifecycle import GateUpDownMoELifecycleHooks
from .ovis2_5 import Ovis2_5QModel


class Ovis2_6_MoeQModel(Ovis2_5QModel):
    """Quantization model definition for Ovis 2.6 MoE checkpoints.

    Builds on the Ovis 2.5 definition with a module tree rooted at
    ``llm.model.layers`` that includes per-expert MoE projections, and a
    pre-quantization hook that gives the vision tower's ``post_layernorm``
    real (non-meta) default parameters.
    """

    dynamic_expert_index = "num_experts"

    pre_lm_head_norm_module = "llm.model.norm"
    rotary_embedding = "llm.model.rotary_emb"

    awq_scale_optimize_shape_dependent_modules = ["self_attn.o_proj"]

    # Submodule paths on which the defuser is run explicitly, because the
    # expert layers live inside the wrapped language model.
    defuser_module_paths = ("llm",)

    moe_lifecycle_hooks = GateUpDownMoELifecycleHooks()

    module_tree = [
        "llm",
        "model",
        "layers",
        "#",
        {
            "input_layernorm": ("input_layernorm:!",),
            "self_attn": ("q_norm:!", "k_norm:!", "q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"),
            "post_attention_layernorm": ("post_attention_layernorm:!",),
            "mlp:moe:?": {
                "gate": ("gate:!",),
                "experts": {
                    "#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
                },
            },
        },
    ]

    @staticmethod
    def _materialize_layernorm_defaults(layernorm: nn.LayerNorm, device: torch.device) -> None:
        """Swap meta-device LayerNorm parameters for default-initialized ones.

        A ``weight`` that is still on the meta device is replaced by ones and
        a ``bias`` that is still on the meta device is replaced by zeros. The
        replacement is allocated on ``device`` and keeps the original dtype
        and ``requires_grad`` flag. Parameters that are ``None`` or already
        materialized are left untouched.

        Args:
            layernorm: LayerNorm whose parameters may be on the meta device.
            device: Device on which replacement tensors are allocated.
        """
        defaults = (("weight", torch.ones), ("bias", torch.zeros))
        for attr_name, make_default in defaults:
            current = getattr(layernorm, attr_name)
            if current is None:
                continue

            on_meta = getattr(current, "is_meta", False) or current.device.type == "meta"
            if not on_meta:
                continue

            filled = make_default(layernorm.normalized_shape, device=device, dtype=current.dtype)
            setattr(layernorm, attr_name, nn.Parameter(filled, requires_grad=current.requires_grad))

    def _materialize_missing_vision_post_layernorm(self, device: torch.device) -> None:
        """Materialize ``visual_tokenizer.vit.vision_model.post_layernorm`` if present.

        Does nothing when any link after ``visual_tokenizer`` is missing or
        when the resolved module is not an ``nn.LayerNorm``.

        Args:
            device: Device on which replacement parameters are allocated.
        """
        vit = getattr(self.model.visual_tokenizer, "vit", None)
        vision_model = getattr(vit, "vision_model", None)
        candidate = getattr(vision_model, "post_layernorm", None)
        if not isinstance(candidate, nn.LayerNorm):
            return
        self._materialize_layernorm_defaults(candidate, device)

    def pre_quantize_generate_hook_start(self):
        """Fix up the vision post-layernorm, then run the parent hook."""
        # Ovis 2.6 checkpoints ship without the SigLIP2 post_layernorm tensors,
        # yet the remote code still builds that LayerNorm. Fall back to its
        # default initialization rather than looking up checkpoint keys that
        # do not exist.
        target_device = torch.device(self.quantize_config.device)
        self._materialize_missing_vision_post_layernorm(target_device)
        super().pre_quantize_generate_hook_start()
23 changes: 19 additions & 4 deletions gptqmodel/models/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,21 @@ def _maybe_print_module_tree(model) -> None:
print_module_tree(model=model)


def _convert_model_with_defuser(cls, model, cleanup_original: bool) -> bool:
    """Run the defuser on ``model`` and on any extra submodules ``cls`` declares.

    Args:
        cls: Model definition class. Its optional ``defuser_module_paths``
            attribute lists submodule paths to convert in addition to
            ``model`` itself.
        model: The loaded model to convert.
        cleanup_original: Forwarded unchanged to every
            ``defuser.convert_model`` call.

    Returns:
        The most recent truthy result among the ``defuser.convert_model``
        calls, or the top-level call's result when no submodule call
        returned a truthy value.
    """
    changed = defuser.convert_model(model, cleanup_original=cleanup_original)

    # A missing attribute, ``None`` and an empty sequence all mean "no extra paths".
    extra_paths = getattr(cls, "defuser_module_paths", ()) or ()
    for path in extra_paths:
        submodule, _ = get_module_by_name_prefix(model, path)
        if submodule is not None:
            changed = defuser.convert_model(submodule, cleanup_original=cleanup_original) or changed
        else:
            # A declared path that does not resolve is reported but not fatal.
            log.warn("Loader: defuser module path `%s` was not found.", path)

    return changed


def _supports_flash_attn_2(config: PretrainedConfig) -> bool:
"""Detect whether the resolved HF architecture exposes FA2 kernels."""

Expand Down Expand Up @@ -727,12 +742,12 @@ def skip(*args, **kwargs):
)
if getattr(model, "config", None) is config:
model.config = copy.deepcopy(config)
defuser.convert_model(model, cleanup_original=False)
_convert_model_with_defuser(cls, model, cleanup_original=False)
model._model_init_kwargs = fallback_init_kwargs
_maybe_print_module_tree(model=model)
turtle_model = None
else:
defuser.convert_model(model, cleanup_original=False)
_convert_model_with_defuser(cls, model, cleanup_original=False)
shell_model_init_kwargs = dict(model_init_kwargs_without_internal)
shell_model_init_kwargs.update(hf_gguf_load_kwargs)
model._model_init_kwargs = shell_model_init_kwargs
Expand Down Expand Up @@ -768,7 +783,7 @@ def skip(*args, **kwargs):
)
if getattr(model, "config", None) is config:
model.config = copy.deepcopy(config)
defuser.convert_model(model, cleanup_original=False)
_convert_model_with_defuser(cls, model, cleanup_original=False)
direct_model_init_kwargs = dict(model_init_kwargs_without_internal)
direct_model_init_kwargs.update(hf_gguf_load_kwargs)
model._model_init_kwargs = direct_model_init_kwargs
Expand Down Expand Up @@ -1188,7 +1203,7 @@ def from_quantized(
)
else:
raise
defuser.convert_model(model, cleanup_original=True)
_convert_model_with_defuser(cls, model, cleanup_original=True)
model.checkpoint_file_name = model_save_name
if native_gguf_qspec is not None:
gguf_tensor_key_mapping = _build_gguf_tensor_key_mapping(model, config)
Expand Down
16 changes: 12 additions & 4 deletions gptqmodel/utils/hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1317,6 +1317,15 @@ def encoder_init_compat(self, encoder_config):
if vision_model_cls:
try_patch_legacy_flash_attn_flag(vision_model_cls)

if config.model_type == "ovis2_6_moe":
vision_model_cls = getattr(
remote_module,
"Siglip2NavitModel",
None,
)
if vision_model_cls:
try_patch_legacy_flash_attn_flag(vision_model_cls)

if (
outer_model_cls is not None
and hasattr(outer_model_cls, "tie_weights")
Expand Down Expand Up @@ -1359,7 +1368,7 @@ def tie_weights_compat(self, *args, **kwargs):
formatter_cls.support_tokenizer_types = support_tokenizer_types
formatter_cls._gptqmodel_tokenizer_backend_patch = True

if getattr(config, "model_type", None) == "ovis2_5":
if getattr(config, "model_type", None) in {"ovis2_5", "ovis2_6", "ovis2_6_moe"}:
register_runtime_automodel_config(config, remote_module, "vit_config", "Siglip2NavitModel")

if getattr(config, "model_type", None) == "hymba" and remote_module is not None:
Expand Down Expand Up @@ -1520,9 +1529,8 @@ def try_patch_legacy_flash_attn_flag(model_cls):
return

# The remote modeling code for some models (for example, Ovis) still relies on `_supports_flash_attn_2`.
if hasattr(model_cls, "_supports_flash_attn"):
if not hasattr(model_cls, "_supports_flash_attn_2"):
setattr(model_cls, "_supports_flash_attn_2", bool(model_cls._supports_flash_attn))
if hasattr(model_cls, "_supports_flash_attn") and not hasattr(model_cls, "_supports_flash_attn_2"):
setattr(model_cls, "_supports_flash_attn_2", bool(model_cls._supports_flash_attn))
return

# Find the most specific class that explicitly declares the newer
Expand Down
1 change: 0 additions & 1 deletion gptqmodel/utils/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2072,7 +2072,6 @@ def _copy_checkpoint_tensors_into_submodule(
grouped_names: Dict[str, list[tuple[str, str, str, Optional[int], Optional[int], Optional[int]]]] = {}
for rel_name in t_params:
full_name, expert_index, split_index, split_dim = self._resolve_checkpoint_tensor_source(module_path, rel_name)
print("full_name", full_name, rel_name)
if full_name is None:
continue
shard = self._weight_map.get(full_name)
Expand Down
4 changes: 4 additions & 0 deletions tests/models/ovis/image_to_test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from gptqmodel.models.definitions.ovis import OvisQModel
from gptqmodel.models.definitions.ovis2 import Ovis2QModel
from gptqmodel.models.definitions.ovis2_5 import Ovis2_5QModel
from gptqmodel.models.definitions.ovis2_6_moe import Ovis2_6_MoeQModel
from gptqmodel.models.definitions.qwen3_vl import Qwen3_VLQModel


Expand Down Expand Up @@ -98,6 +99,9 @@ def get_calib_dataset(model):
if isinstance(model, Ovis2_5QModel):
return prepare_dataset(format_ovis2_dataset, n_sample=20)

if isinstance(model, Ovis2_6_MoeQModel):
return prepare_dataset(format_ovis2_dataset, n_sample=20)

if (
isinstance(model, BaseQwen2VLGPTQ)
or isinstance(model, Qwen3_VLQModel)
Expand Down
5 changes: 3 additions & 2 deletions tests/models/test_ovis2.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ class Test(ModelTest):
EVAL_BATCH_SIZE = 1

def test_ovis(self):
model, tokenizer, processor = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE,
dtype=self.TORCH_DTYPE, batch_size=1)
with self.model_compat_test_context():
model, tokenizer, processor = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE,
dtype=self.TORCH_DTYPE, batch_size=1)

messages = [
{
Expand Down
15 changes: 8 additions & 7 deletions tests/models/test_ovis2_5.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@ class Test(ModelTest):
MODEL_COMPAT_FAST_LAYER_POSITION = "first"

def test_ovis(self):
model, _tokenizer, _processor = self.quantModel(
self.NATIVE_MODEL_ID,
trust_remote_code=self.TRUST_REMOTE_CODE,
dtype=self.TORCH_DTYPE,
batch_size=1,
call_perform_post_quant_validation=False
)
with self.model_compat_test_context():
model, _tokenizer, _processor = self.quantModel(
self.NATIVE_MODEL_ID,
trust_remote_code=self.TRUST_REMOTE_CODE,
dtype=self.TORCH_DTYPE,
batch_size=1,
call_perform_post_quant_validation=False
)

text_tokenizer = model.text_tokenizer

Expand Down
65 changes: 65 additions & 0 deletions tests/models/test_ovis2_6_moe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
# SPDX-License-Identifier: Apache-2.0
# Contact: qubitium@modelcloud.ai, x.com/qubitium

import os.path

import torch
from PIL import Image

from gptqmodel.quantization.config import MoEConfig, ExpertsRoutingOverride, MOE_ALL_EXPERTS
from model_test import ModelTest


class Test(ModelTest):
    """Quantize an Ovis 2.6 MoE checkpoint and run one image-grounded generation."""

    NATIVE_MODEL_ID = "/monster/data/model/Ovis2.6-30B-A3B"

    TRUST_REMOTE_CODE = True
    EVAL_BATCH_SIZE = 1
    # Overrides the experts-per-token routing with MOE_ALL_EXPERTS.
    MOE_CONFIG = MoEConfig(ExpertsRoutingOverride(num_experts_per_tok=MOE_ALL_EXPERTS))
    MODEL_COMPAT_FAST_LAYER_POSITION = "first"

    def test_ovis2_6_moe(self):
        """Quantize, describe the bundled test image, and expect "snow" in the answer."""
        with self.model_compat_test_context():
            model, _tokenizer, _processor = self.quantModel(
                self.NATIVE_MODEL_ID,
                trust_remote_code=self.TRUST_REMOTE_CODE,
                dtype=self.TORCH_DTYPE,
                batch_size=1,
                call_perform_post_quant_validation=False,
            )

        text_tokenizer = model.text_tokenizer

        # The sample image lives next to this test file.
        tests_dir = os.path.dirname(os.path.abspath(__file__))
        image = Image.open(os.path.join(tests_dir, "ovis/10016.jpg"))

        user_turn = {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "What does this picture show?"},
            ],
        }
        messages = [user_turn]

        input_ids, pixel_values, grid_thws = model.preprocess_inputs(
            messages=messages,
            add_generation_prompt=True,
        )

        # Move every produced tensor to the model device; the visual inputs
        # may be None, in which case they are passed through as-is.
        input_ids = input_ids.to(model.device)
        if pixel_values is not None:
            pixel_values = pixel_values.to(
                dtype=model.visual_tokenizer.vit.dtype,
                device=model.device,
            )
        if grid_thws is not None:
            grid_thws = grid_thws.to(model.device)

        with torch.inference_mode():
            output_ids = model.generate(
                inputs=input_ids,
                pixel_values=pixel_values,
                grid_thws=grid_thws,
            )
            output = text_tokenizer.decode(output_ids[0], skip_special_tokens=True)
            print(f"Output:\n{output}")

        self.assertIn("snow", output.lower())
10 changes: 6 additions & 4 deletions tests/models/test_ovis_1_6_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@ class TestOvis1_6_Llama(ModelTest):
USE_FLASH_ATTN = False

def test_ovis_1_6(self):
# the evaluation harness does not support Ovis, and will throw an error during execution:
# TypeError: Ovis.forward() missing 3 required positional arguments: 'attention_mask', 'labels', and 'pixel_values'
model, tokenizer, _ = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE,
dtype=self.TORCH_DTYPE, multimodal_max_length=8192, batch_size=1, call_perform_post_quant_validation=False)
with self.model_compat_test_context():
# the evaluation harness does not support Ovis, and will throw an error during execution:
# TypeError: Ovis.forward() missing 3 required positional arguments: 'attention_mask', 'labels', and 'pixel_values'
model, tokenizer, _ = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE,
dtype=self.TORCH_DTYPE, multimodal_max_length=8192, batch_size=1,
call_perform_post_quant_validation=False)

text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()
Expand Down
Loading