diff --git a/README.md b/README.md index 399e2c044..76c173fa8 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ ## Latest News +* 05/19/2026 7.1.0-dev `main`: ✨ Added `ovis2_6_moe` model support * 05/18/2026 7.1.0-dev `main`: ✨ Added `ovis2_5` model support * 05/15/2026 7.1.0-dev `main`: ✨ Added `mimo_v2` model support * 05/13/2026 7.1.0-dev `main`: ✨ Added `minicpmv_4_6` and `DeepSeek V4` model support @@ -257,7 +258,7 @@ Selected public references where teams or companies explicitly mention GPT-QMode | DBRX Converted | ✅ | GPT-2 | ✅ | Llama 3.2 VL | ✅ | Nemotron Ultra | ✅ | TeleChat2 | ✅ | | Deci | ✅ | GPT-J | ✅ | Llama 4 | ✅ | OPT | ✅ | Trinity | ✅ | | DeepSeek-V2/V3/V4/R1 | ✅ | GPT-OSS | ✅ | LongCat Flash | ✅ | OLMo2 / LLaDA2 | ✅ | Yi | ✅ | -| DeepSeek-V2-Lite | ✅ | Granite / Granite MoE | ✅ | LongLLaMA | ✅ | Ovis 1.6/2/2.5 | ✅ | Seed-OSS | ✅ | +| DeepSeek-V2-Lite | ✅ | Granite / Granite MoE | ✅ | LongLLaMA | ✅ | Ovis 1.6/2/2.5/2.6 MoE | ✅ | Seed-OSS | ✅ | | Dream | ✅ | GRIN-MoE | ✅ | Instella | ✅ | Phi 1-4 | ✅ | Voxtral | ✅ | | ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V/V 4_6 | ✅ | PanGu-α | ✅ | | XVERSE | ✅ | Brumby | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3/3.5 | ✅ | diff --git a/gptqmodel/__init__.py b/gptqmodel/__init__.py index 11a8b93e4..8a82eab62 100644 --- a/gptqmodel/__init__.py +++ b/gptqmodel/__init__.py @@ -175,12 +175,12 @@ def _build_device_thread_pool(): "cpu": WarmupTask(run_torch_linalg_warmup, scope=WarmUpCtx.THREAD_AND_DEVICE), }, workers={ - "cuda:per": 4, + "cuda:per": 1, "xpu:per": 1, "npu:per": 1, - "mps": 8, - "cpu": min(12, max(1, (os.cpu_count() or 1) + 1 // 2)), # count + 1, fixed pool size > 1 check when count=3 - "model_loader:cpu": 2, + "mps": 1, + "cpu": 1, # count + 1, fixed pool size > 1 check when count=3 + "model_loader:cpu": 1, }, empty_cache_every_n=512, ) diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py index f2d1bec75..b332cf722 100644 --- a/gptqmodel/models/auto.py +++ b/gptqmodel/models/auto.py @@ -147,6 +147,7 @@ from .definitions.ovis import OvisQModel # noqa: E402 from .definitions.ovis2 import Ovis2QModel # noqa: E402 from .definitions.ovis2_5 import Ovis2_5QModel # noqa: E402 +from .definitions.ovis2_6_moe import Ovis2_6_MoeQModel # noqa: E402 from .definitions.pangu_alpha import PanguAlphaQModel # noqa: E402 from .definitions.phi import PhiQModel # noqa: E402 from .definitions.phi3 import Phi3QModel, PhiMoEGPTQForCausalLM # noqa: E402 @@ -285,6 +286,7 @@ "ovis": OvisQModel, "ovis2": Ovis2QModel, "ovis2_5": Ovis2_5QModel, + "ovis2_6_moe": Ovis2_6_MoeQModel, "telechat": TeleChat2QModel, "instella": InstellaQModel, "mimo": MimoQModel, diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index de3b6f11d..67715ebc0 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -266,6 +266,11 @@ class BaseQModel(nn.Module): INPUT_EMBEDDING_EXTRA_ARGS = None + # Some models (e.g. ovis2_6_moe) do not contain MoE layers directly. + # The actual experts live inside submodules (e.g. Qwen3MoeModel.mlp.experts), + # so `defuser_module_paths` is used to explicitly locate and defuse them. + defuser_module_paths = None + def __init__( self, model: PreTrainedModel, diff --git a/gptqmodel/models/definitions/__init__.py b/gptqmodel/models/definitions/__init__.py index 858323619..d80b1df46 100644 --- a/gptqmodel/models/definitions/__init__.py +++ b/gptqmodel/models/definitions/__init__.py @@ -61,6 +61,7 @@ from .opt import OptQModel from .ovis import OvisQModel from .ovis2_5 import Ovis2_5QModel +from .ovis2_6_moe import Ovis2_6_MoeQModel from .phi import PhiQModel from .phi3 import Phi3QModel from .qwen import QwenQModel diff --git a/gptqmodel/models/definitions/ovis2_6_moe.py b/gptqmodel/models/definitions/ovis2_6_moe.py new file mode 100644 index 000000000..59702b6c8 --- /dev/null +++ b/gptqmodel/models/definitions/ovis2_6_moe.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai +# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai +# SPDX-License-Identifier: Apache-2.0 +# Contact: qubitium@modelcloud.ai, x.com/qubitium + +import torch +from torch import nn + +from ..moe_lifecycle import GateUpDownMoELifecycleHooks +from .ovis2_5 import Ovis2_5QModel + + +class Ovis2_6_MoeQModel(Ovis2_5QModel): + dynamic_expert_index = "num_experts" + + pre_lm_head_norm_module = "llm.model.norm" + rotary_embedding = "llm.model.rotary_emb" + + awq_scale_optimize_shape_dependent_modules = ["self_attn.o_proj"] + + defuser_module_paths = ("llm",) + + moe_lifecycle_hooks = GateUpDownMoELifecycleHooks() + + module_tree = [ + "llm", + "model", + "layers", + "#", + { + "input_layernorm": ("input_layernorm:!",), + "self_attn": ("q_norm:!", "k_norm:!", "q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"), + "post_attention_layernorm": ("post_attention_layernorm:!",), + "mlp:moe:?": { + "gate": ("gate:!",), + "experts": { + "#": ("gate_proj:0", "up_proj:0", "down_proj:1"), + }, + }, + } + ] + + @staticmethod + def _materialize_layernorm_defaults(layernorm: nn.LayerNorm, device: torch.device) -> None: + if layernorm.weight is not None and ( + getattr(layernorm.weight, "is_meta", False) or layernorm.weight.device.type == "meta" + ): + layernorm.weight = nn.Parameter( + torch.ones(layernorm.normalized_shape, device=device, dtype=layernorm.weight.dtype), + requires_grad=layernorm.weight.requires_grad, + ) + + if layernorm.bias is not None and ( + getattr(layernorm.bias, "is_meta", False) or layernorm.bias.device.type == "meta" + ): + layernorm.bias = nn.Parameter( + torch.zeros(layernorm.normalized_shape, device=device, dtype=layernorm.bias.dtype), + requires_grad=layernorm.bias.requires_grad, + ) + + def _materialize_missing_vision_post_layernorm(self, device: torch.device) -> None: + post_layernorm = getattr( + getattr(getattr(self.model.visual_tokenizer, "vit", None), "vision_model", None), + "post_layernorm", + None, + ) + if isinstance(post_layernorm, nn.LayerNorm): + self._materialize_layernorm_defaults(post_layernorm, device) + + def pre_quantize_generate_hook_start(self): + # Ovis 2.6 checkpoints omit SigLIP2 post_layernorm tensors even though + # the remote code constructs the LayerNorm. Keep its default init instead of + # resolving nonexistent checkpoint keys. + self._materialize_missing_vision_post_layernorm(torch.device(self.quantize_config.device)) + super().pre_quantize_generate_hook_start() diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index 1a6c1c3bf..c784840cf 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -99,6 +99,21 @@ def _maybe_print_module_tree(model) -> None: print_module_tree(model=model) +def _convert_model_with_defuser(cls, model, cleanup_original: bool) -> bool: + converted = defuser.convert_model(model, cleanup_original=cleanup_original) + + defuser_module_paths = getattr(cls, "defuser_module_paths", ()) + if defuser_module_paths: + for module_path in defuser_module_paths: + module, _ = get_module_by_name_prefix(model, module_path) + if module is None: + log.warn("Loader: defuser module path `%s` was not found.", module_path) + continue + converted = defuser.convert_model(module, cleanup_original=cleanup_original) or converted + + return converted + + def _supports_flash_attn_2(config: PretrainedConfig) -> bool: """Detect whether the resolved HF architecture exposes FA2 kernels.""" @@ -727,12 +742,12 @@ def skip(*args, **kwargs): ) if getattr(model, "config", None) is config: model.config = copy.deepcopy(config) - defuser.convert_model(model, cleanup_original=False) + _convert_model_with_defuser(cls, model, cleanup_original=False) model._model_init_kwargs = fallback_init_kwargs _maybe_print_module_tree(model=model) turtle_model = None else: - defuser.convert_model(model, cleanup_original=False) + _convert_model_with_defuser(cls, model, cleanup_original=False) shell_model_init_kwargs = dict(model_init_kwargs_without_internal) shell_model_init_kwargs.update(hf_gguf_load_kwargs) model._model_init_kwargs = shell_model_init_kwargs @@ -768,7 +783,7 @@ def skip(*args, **kwargs): ) if getattr(model, "config", None) is config: model.config = copy.deepcopy(config) - defuser.convert_model(model, cleanup_original=False) + _convert_model_with_defuser(cls, model, cleanup_original=False) direct_model_init_kwargs = dict(model_init_kwargs_without_internal) direct_model_init_kwargs.update(hf_gguf_load_kwargs) model._model_init_kwargs = direct_model_init_kwargs @@ -1188,7 +1203,7 @@ def from_quantized( ) else: raise - defuser.convert_model(model, cleanup_original=True) + _convert_model_with_defuser(cls, model, cleanup_original=True) model.checkpoint_file_name = model_save_name if native_gguf_qspec is not None: gguf_tensor_key_mapping = _build_gguf_tensor_key_mapping(model, config) diff --git a/gptqmodel/utils/hf.py b/gptqmodel/utils/hf.py index 15e0a3ef0..37c9efdd2 100644 --- a/gptqmodel/utils/hf.py +++ b/gptqmodel/utils/hf.py @@ -1317,6 +1317,15 @@ def encoder_init_compat(self, encoder_config): if vision_model_cls: try_patch_legacy_flash_attn_flag(vision_model_cls) + if config.model_type == "ovis2_6_moe": + vision_model_cls = getattr( + remote_module, + "Siglip2NavitModel", + None, + ) + if vision_model_cls: + try_patch_legacy_flash_attn_flag(vision_model_cls) + if ( outer_model_cls is not None and hasattr(outer_model_cls, "tie_weights") @@ -1359,7 +1368,7 @@ def tie_weights_compat(self, *args, **kwargs): formatter_cls.support_tokenizer_types = support_tokenizer_types formatter_cls._gptqmodel_tokenizer_backend_patch = True - if getattr(config, "model_type", None) == "ovis2_5": + if getattr(config, "model_type", None) in {"ovis2_5", "ovis2_6", "ovis2_6_moe"}: register_runtime_automodel_config(config, remote_module, "vit_config", "Siglip2NavitModel") if getattr(config, "model_type", None) == "hymba" and remote_module is not None: @@ -1520,9 +1529,8 @@ def try_patch_legacy_flash_attn_flag(model_cls): return # The remote modeling code for some models(For example, ovis.) still relies on `_supports_flash_attn_2` - if hasattr(model_cls, "_supports_flash_attn"): - if not hasattr(model_cls, "_supports_flash_attn_2"): - setattr(model_cls, "_supports_flash_attn_2", bool(model_cls._supports_flash_attn)) + if hasattr(model_cls, "_supports_flash_attn") and not hasattr(model_cls, "_supports_flash_attn_2"): + setattr(model_cls, "_supports_flash_attn_2", bool(model_cls._supports_flash_attn)) return # Find the most specific class that explicitly declares the newer diff --git a/gptqmodel/utils/structure.py b/gptqmodel/utils/structure.py index 130bb84e9..d398e680e 100644 --- a/gptqmodel/utils/structure.py +++ b/gptqmodel/utils/structure.py @@ -2072,7 +2072,6 @@ def _copy_checkpoint_tensors_into_submodule( grouped_names: Dict[str, list[tuple[str, str, str, Optional[int], Optional[int], Optional[int]]]] = {} for rel_name in t_params: full_name, expert_index, split_index, split_dim = self._resolve_checkpoint_tensor_source(module_path, rel_name) - print("full_name", full_name, rel_name) if full_name is None: continue shard = self._weight_map.get(full_name) diff --git a/tests/models/ovis/image_to_test_dataset.py b/tests/models/ovis/image_to_test_dataset.py index 7388f1ff4..61343dca7 100644 --- a/tests/models/ovis/image_to_test_dataset.py +++ b/tests/models/ovis/image_to_test_dataset.py @@ -12,6 +12,7 @@ from gptqmodel.models.definitions.ovis import OvisQModel from gptqmodel.models.definitions.ovis2 import Ovis2QModel from gptqmodel.models.definitions.ovis2_5 import Ovis2_5QModel +from gptqmodel.models.definitions.ovis2_6_moe import Ovis2_6_MoeQModel from gptqmodel.models.definitions.qwen3_vl import Qwen3_VLQModel @@ -98,6 +99,9 @@ def get_calib_dataset(model): if isinstance(model, Ovis2_5QModel): return prepare_dataset(format_ovis2_dataset, n_sample=20) + if isinstance(model, Ovis2_6_MoeQModel): + return prepare_dataset(format_ovis2_dataset, n_sample=20) + if ( isinstance(model, BaseQwen2VLGPTQ) or isinstance(model, Qwen3_VLQModel) diff --git a/tests/models/test_ovis2.py b/tests/models/test_ovis2.py index 6fb808fa0..1fac91b2f 100644 --- a/tests/models/test_ovis2.py +++ b/tests/models/test_ovis2.py @@ -17,8 +17,9 @@ class Test(ModelTest): EVAL_BATCH_SIZE = 1 def test_ovis(self): - model, tokenizer, processor = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE, - dtype=self.TORCH_DTYPE, batch_size=1) + with self.model_compat_test_context(): + model, tokenizer, processor = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE, + dtype=self.TORCH_DTYPE, batch_size=1) messages = [ { diff --git a/tests/models/test_ovis2_5.py b/tests/models/test_ovis2_5.py index a757e2c96..990e02d66 100644 --- a/tests/models/test_ovis2_5.py +++ b/tests/models/test_ovis2_5.py @@ -19,13 +19,14 @@ class Test(ModelTest): MODEL_COMPAT_FAST_LAYER_POSITION = "first" def test_ovis(self): - model, _tokenizer, _processor = self.quantModel( - self.NATIVE_MODEL_ID, - trust_remote_code=self.TRUST_REMOTE_CODE, - dtype=self.TORCH_DTYPE, - batch_size=1, - call_perform_post_quant_validation=False - ) + with self.model_compat_test_context(): + model, _tokenizer, _processor = self.quantModel( + self.NATIVE_MODEL_ID, + trust_remote_code=self.TRUST_REMOTE_CODE, + dtype=self.TORCH_DTYPE, + batch_size=1, + call_perform_post_quant_validation=False + ) text_tokenizer = model.text_tokenizer diff --git a/tests/models/test_ovis2_6_moe.py b/tests/models/test_ovis2_6_moe.py new file mode 100644 index 000000000..19b4b1c5c --- /dev/null +++ b/tests/models/test_ovis2_6_moe.py @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai +# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai +# SPDX-License-Identifier: Apache-2.0 +# Contact: qubitium@modelcloud.ai, x.com/qubitium + +import os.path + +import torch +from PIL import Image + +from gptqmodel.quantization.config import MoEConfig, ExpertsRoutingOverride, MOE_ALL_EXPERTS +from model_test import ModelTest + + +class Test(ModelTest): + NATIVE_MODEL_ID = "/monster/data/model/Ovis2.6-30B-A3B" + + TRUST_REMOTE_CODE = True + EVAL_BATCH_SIZE = 1 + MOE_CONFIG = MoEConfig(ExpertsRoutingOverride(num_experts_per_tok=MOE_ALL_EXPERTS)) + MODEL_COMPAT_FAST_LAYER_POSITION = "first" + + def test_ovis2_6_moe(self): + with self.model_compat_test_context(): + model, _tokenizer, _processor = self.quantModel( + self.NATIVE_MODEL_ID, + trust_remote_code=self.TRUST_REMOTE_CODE, + dtype=self.TORCH_DTYPE, + batch_size=1, + call_perform_post_quant_validation=False, + ) + + text_tokenizer = model.text_tokenizer + + image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ovis/10016.jpg") + image = Image.open(image_path) + messages = [{ + "role": "user", + "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": "What does this picture show?"}, + ], + }] + + input_ids, pixel_values, grid_thws = model.preprocess_inputs( + messages=messages, + add_generation_prompt=True, + ) + input_ids = input_ids.to(model.device) + pixel_values = pixel_values.to( + dtype=model.visual_tokenizer.vit.dtype, + device=model.device, + ) if pixel_values is not None else None + grid_thws = grid_thws.to(model.device) if grid_thws is not None else None + + with torch.inference_mode(): + output_ids = model.generate( + inputs=input_ids, + pixel_values=pixel_values, + grid_thws=grid_thws, + ) + output = text_tokenizer.decode(output_ids[0], skip_special_tokens=True) + print(f"Output:\n{output}") + + self.assertIn("snow", output.lower()) diff --git a/tests/models/test_ovis_1_6_llama.py b/tests/models/test_ovis_1_6_llama.py index e0f820da1..833d5eebe 100644 --- a/tests/models/test_ovis_1_6_llama.py +++ b/tests/models/test_ovis_1_6_llama.py @@ -18,10 +18,12 @@ class TestOvis1_6_Llama(ModelTest): USE_FLASH_ATTN = False def test_ovis_1_6(self): - # the evaluation harness does not support Ovis, and will throw an error during execution: - # TypeError: Ovis.forward() missing 3 required positional arguments: 'attention_mask', 'labels', and 'pixel_values' - model, tokenizer, _ = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE, - dtype=self.TORCH_DTYPE, multimodal_max_length=8192, batch_size=1, call_perform_post_quant_validation=False) + with self.model_compat_test_context(): + # the evaluation harness does not support Ovis, and will throw an error during execution: + # TypeError: Ovis.forward() missing 3 required positional arguments: 'attention_mask', 'labels', and 'pixel_values' + model, tokenizer, _ = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE, + dtype=self.TORCH_DTYPE, multimodal_max_length=8192, batch_size=1, + call_perform_post_quant_validation=False) text_tokenizer = model.get_text_tokenizer() visual_tokenizer = model.get_visual_tokenizer() diff --git a/tests/test_ovis2_6_moe_support.py b/tests/test_ovis2_6_moe_support.py new file mode 100644 index 000000000..bec8db0cc --- /dev/null +++ b/tests/test_ovis2_6_moe_support.py @@ -0,0 +1,50 @@ +# SPDX-FileCopyrightText: 2026 ModelCloud.ai +# SPDX-License-Identifier: Apache-2.0 + +from types import SimpleNamespace + +import torch +from torch import nn + +from gptqmodel.models import auto +from gptqmodel.models.definitions.ovis2_6_moe import Ovis2_6_MoeQModel + + +def test_ovis2_6_moe_model_type_selects_definition(monkeypatch): + fake_config = SimpleNamespace(model_type="ovis2_6_moe") + + monkeypatch.setattr(auto, "resolve_trust_remote_code", lambda path, trust_remote_code=False: trust_remote_code) + monkeypatch.setattr(auto.AutoConfig, "from_pretrained", lambda *args, **kwargs: fake_config) + + assert auto.check_and_get_model_definition("/tmp/ovis2_6_moe") is Ovis2_6_MoeQModel + assert Ovis2_6_MoeQModel.extract_layers_node() == ["llm.model.layers"] + + +def test_ovis2_6_moe_module_tree_expands_qwen3_moe_paths(): + layer_modules = Ovis2_6_MoeQModel.simple_layer_modules( + model_config=SimpleNamespace(num_experts=3), + quantize_config=SimpleNamespace(dynamic=None), + ) + flat_modules = {name for block in layer_modules for name in block} + + assert "self_attn.q_proj" in flat_modules + assert "self_attn.q_norm" not in flat_modules + assert "self_attn.k_norm" not in flat_modules + assert "mlp.gate" not in flat_modules + assert "mlp.experts.0.gate_proj" in flat_modules + assert "mlp.experts.1.up_proj" in flat_modules + assert "mlp.experts.2.down_proj" in flat_modules + assert Ovis2_6_MoeQModel.defuser_module_paths == ("llm",) + + +def test_ovis2_6_moe_materializes_missing_vision_post_layernorm_defaults(): + layernorm = nn.LayerNorm(4, device="meta", dtype=torch.bfloat16) + + Ovis2_6_MoeQModel._materialize_layernorm_defaults(layernorm, torch.device("cpu")) + + assert layernorm.weight.device.type == "cpu" + assert layernorm.bias.device.type == "cpu" + assert layernorm.weight.dtype == torch.bfloat16 + assert layernorm.bias.dtype == torch.bfloat16 + torch.testing.assert_close(layernorm.weight, torch.ones(4, dtype=torch.bfloat16)) + torch.testing.assert_close(layernorm.bias, torch.zeros(4, dtype=torch.bfloat16))