ModelCloud · Qubitium · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/README.md b/README.md
@@ -21,6 +21,7 @@
 
 ## Latest News
 
+* 05/19/2026 7.1.0-dev `main`: ✨ Added `ovis2_6_moe` model support
 * 05/18/2026 7.1.0-dev `main`: ✨ Added `ovis2_5` model support
 * 05/15/2026 7.1.0-dev `main`: ✨ Added `mimo_v2` model support
 * 05/13/2026 7.1.0-dev `main`: ✨ Added `minicpmv_4_6` and `DeepSeek V4` model support
@@ -257,7 +258,7 @@ Selected public references where teams or companies explicitly mention GPT-QMode
 | DBRX Converted           | ✅ | GPT-2                           | ✅ | Llama 3.2 VL     | ✅ | Nemotron Ultra      | ✅ | TeleChat2               | ✅ |
 | Deci                     | ✅ | GPT-J                           | ✅ | Llama 4          | ✅ | OPT                 | ✅ | Trinity                 | ✅ |
 | DeepSeek-V2/V3/V4/R1        | ✅ | GPT-OSS                         | ✅ | LongCat Flash    | ✅ | OLMo2 / LLaDA2      | ✅ | Yi                      | ✅ |
-| DeepSeek-V2-Lite         | ✅ | Granite / Granite MoE           | ✅ | LongLLaMA        | ✅ | Ovis 1.6/2/2.5          | ✅ | Seed-OSS                | ✅ |
+| DeepSeek-V2-Lite         | ✅ | Granite / Granite MoE           | ✅ | LongLLaMA        | ✅ | Ovis 1.6/2/2.5/2.6 MoE | ✅ | Seed-OSS                | ✅ |
 | Dream                    | ✅ | GRIN-MoE                        | ✅ | Instella         | ✅ | Phi 1-4             | ✅ | Voxtral                 | ✅ |
 | ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V/V 4_6 | ✅ | PanGu-α                 | ✅ |
 | XVERSE                   | ✅ | Brumby                          | ✅ | Hymba            | ✅ | Mistral             | ✅ | Qwen 1/2/3/3.5          | ✅ |

diff --git a/gptqmodel/__init__.py b/gptqmodel/__init__.py
@@ -175,12 +175,12 @@ def _build_device_thread_pool():
             "cpu": WarmupTask(run_torch_linalg_warmup, scope=WarmUpCtx.THREAD_AND_DEVICE),
         },
         workers={
-            "cuda:per": 4,
+            "cuda:per": 1,
             "xpu:per": 1,
             "npu:per": 1,
-            "mps": 8,
-            "cpu": min(12, max(1, (os.cpu_count() or 1) + 1 // 2)),  # count + 1, fixed pool size > 1 check when count=3
-            "model_loader:cpu": 2,
+            "mps": 1,
+            "cpu": 1,  # count + 1, fixed pool size > 1 check when count=3
+            "model_loader:cpu": 1,
         },
         empty_cache_every_n=512,
     )

diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
@@ -147,6 +147,7 @@
 from .definitions.ovis import OvisQModel  # noqa: E402
 from .definitions.ovis2 import Ovis2QModel  # noqa: E402
 from .definitions.ovis2_5 import Ovis2_5QModel  # noqa: E402
+from .definitions.ovis2_6_moe import Ovis2_6_MoeQModel  # noqa: E402
 from .definitions.pangu_alpha import PanguAlphaQModel  # noqa: E402
 from .definitions.phi import PhiQModel  # noqa: E402
 from .definitions.phi3 import Phi3QModel, PhiMoEGPTQForCausalLM  # noqa: E402
@@ -285,6 +286,7 @@
     "ovis": OvisQModel,
     "ovis2": Ovis2QModel,
     "ovis2_5": Ovis2_5QModel,
+    "ovis2_6_moe": Ovis2_6_MoeQModel,
     "telechat": TeleChat2QModel,
     "instella": InstellaQModel,
     "mimo": MimoQModel,

diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
@@ -266,6 +266,11 @@ class BaseQModel(nn.Module):
 
     INPUT_EMBEDDING_EXTRA_ARGS = None
 
+    # Some models (e.g. ovis2_6_moe) do not hold their MoE layers at the top level of the model;
+    # the actual experts live inside a wrapped submodule (e.g. Qwen3MoeModel.mlp.experts).
+    # Set `defuser_module_paths` to an iterable of module paths the loader should also defuse (None: no extra paths).
+    defuser_module_paths = None
+
     def __init__(
         self,
         model: PreTrainedModel,

diff --git a/gptqmodel/models/definitions/__init__.py b/gptqmodel/models/definitions/__init__.py
@@ -61,6 +61,7 @@
 from .opt import OptQModel
 from .ovis import OvisQModel
 from .ovis2_5 import Ovis2_5QModel
+from .ovis2_6_moe import Ovis2_6_MoeQModel
 from .phi import PhiQModel
 from .phi3 import Phi3QModel
 from .qwen import QwenQModel

diff --git a/gptqmodel/models/definitions/ovis2_6_moe.py b/gptqmodel/models/definitions/ovis2_6_moe.py
@@ -0,0 +1,75 @@
+# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
+# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+
+import torch
+from torch import nn
+
+from ..moe_lifecycle import GateUpDownMoELifecycleHooks
+from .ovis2_5 import Ovis2_5QModel
+
+
+class Ovis2_6_MoeQModel(Ovis2_5QModel):
+    """Ovis 2.6 MoE definition built on `Ovis2_5QModel`; decoder layers are at `llm.model.layers.#`."""
+
+    dynamic_expert_index = "num_experts"
+
+    pre_lm_head_norm_module = "llm.model.norm"
+    rotary_embedding = "llm.model.rotary_emb"
+
+    awq_scale_optimize_shape_dependent_modules = ["self_attn.o_proj"]
+
+    # Extra submodule path that the loader also hands to `defuser.convert_model`.
+    defuser_module_paths = ("llm",)
+
+    moe_lifecycle_hooks = GateUpDownMoELifecycleHooks()
+
+    module_tree = [
+        "llm",
+        "model",
+        "layers",
+        "#",
+        {
+            "input_layernorm": ("input_layernorm:!",),
+            "self_attn": ("q_norm:!", "k_norm:!", "q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"),
+            "post_attention_layernorm": ("post_attention_layernorm:!",),
+            "mlp:moe:?": {
+                "gate": ("gate:!",),
+                "experts": {
+                    "#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
+                },
+            },
+        }
+    ]
+
+    @staticmethod
+    def _materialize_layernorm_defaults(layernorm: nn.LayerNorm, device: torch.device) -> None:
+        """Swap meta-device `weight`/`bias` of `layernorm` for ones/zeros allocated on `device`."""
+        for attr_name, make_default in (("weight", torch.ones), ("bias", torch.zeros)):
+            param = getattr(layernorm, attr_name)
+            if param is None:
+                # Nothing to materialize when the parameter is absent.
+                continue
+            if not (getattr(param, "is_meta", False) or param.device.type == "meta"):
+                # Not on the meta device: leave the existing values untouched.
+                continue
+            default = make_default(layernorm.normalized_shape, device=device, dtype=param.dtype)
+            setattr(layernorm, attr_name, nn.Parameter(default, requires_grad=param.requires_grad))
+
+    def _materialize_missing_vision_post_layernorm(self, device: torch.device) -> None:
+        """Materialize `visual_tokenizer.vit.vision_model.post_layernorm` if it is a LayerNorm."""
+        vit = getattr(self.model.visual_tokenizer, "vit", None)
+        vision_model = getattr(vit, "vision_model", None)
+        post_layernorm = getattr(vision_model, "post_layernorm", None)
+        if isinstance(post_layernorm, nn.LayerNorm):
+            self._materialize_layernorm_defaults(post_layernorm, device)
+
+    def pre_quantize_generate_hook_start(self):
+        """Materialize the vision post_layernorm defaults, then run the parent hook."""
+        # Ovis 2.6 checkpoints omit SigLIP2 post_layernorm tensors even though
+        # the remote code constructs the LayerNorm. Keep its default init instead of
+        # resolving nonexistent checkpoint keys.
+        target_device = torch.device(self.quantize_config.device)
+        self._materialize_missing_vision_post_layernorm(target_device)
+        super().pre_quantize_generate_hook_start()
diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py
@@ -99,6 +99,21 @@ def _maybe_print_module_tree(model) -> None:
         print_module_tree(model=model)
 
 
+def _convert_model_with_defuser(cls, model, cleanup_original: bool) -> bool:
+    """Defuse `model`, then each submodule in `cls.defuser_module_paths`; truthy if any call converted."""
+    converted = defuser.convert_model(model, cleanup_original=cleanup_original)
+
+    # A missing attribute and a None/empty value both collapse to an empty loop.
+    for module_path in getattr(cls, "defuser_module_paths", None) or ():
+        module, _ = get_module_by_name_prefix(model, module_path)
+        if module is None:
+            log.warn("Loader: defuser module path `%s` was not found.", module_path)
+            continue
+        converted = defuser.convert_model(module, cleanup_original=cleanup_original) or converted
+
+    return converted
+
+
 def _supports_flash_attn_2(config: PretrainedConfig) -> bool:
     """Detect whether the resolved HF architecture exposes FA2 kernels."""
 
@@ -727,12 +742,12 @@ def skip(*args, **kwargs):
                 )
                 if getattr(model, "config", None) is config:
                     model.config = copy.deepcopy(config)
-                defuser.convert_model(model, cleanup_original=False)
+                _convert_model_with_defuser(cls, model, cleanup_original=False)
                 model._model_init_kwargs = fallback_init_kwargs
                 _maybe_print_module_tree(model=model)
                 turtle_model = None
             else:
-                defuser.convert_model(model, cleanup_original=False)
+                _convert_model_with_defuser(cls, model, cleanup_original=False)
                 shell_model_init_kwargs = dict(model_init_kwargs_without_internal)
                 shell_model_init_kwargs.update(hf_gguf_load_kwargs)
                 model._model_init_kwargs = shell_model_init_kwargs
@@ -768,7 +783,7 @@ def skip(*args, **kwargs):
             )
             if getattr(model, "config", None) is config:
                 model.config = copy.deepcopy(config)
-            defuser.convert_model(model, cleanup_original=False)
+            _convert_model_with_defuser(cls, model, cleanup_original=False)
             direct_model_init_kwargs = dict(model_init_kwargs_without_internal)
             direct_model_init_kwargs.update(hf_gguf_load_kwargs)
             model._model_init_kwargs = direct_model_init_kwargs
@@ -1188,7 +1203,7 @@ def from_quantized(
                     )
                 else:
                     raise
-            defuser.convert_model(model, cleanup_original=True)
+            _convert_model_with_defuser(cls, model, cleanup_original=True)
             model.checkpoint_file_name = model_save_name
             if native_gguf_qspec is not None:
                 gguf_tensor_key_mapping = _build_gguf_tensor_key_mapping(model, config)

diff --git a/gptqmodel/utils/hf.py b/gptqmodel/utils/hf.py
@@ -1317,6 +1317,15 @@ def encoder_init_compat(self, encoder_config):
             if vision_model_cls:
                 try_patch_legacy_flash_attn_flag(vision_model_cls)
 
+        if config.model_type == "ovis2_6_moe":
+            vision_model_cls = getattr(
+                remote_module,
+                "Siglip2NavitModel",
+                None,
+            )
+            if vision_model_cls:
+                try_patch_legacy_flash_attn_flag(vision_model_cls)
+
         if (
             outer_model_cls is not None
             and hasattr(outer_model_cls, "tie_weights")
@@ -1359,7 +1368,7 @@ def tie_weights_compat(self, *args, **kwargs):
                     formatter_cls.support_tokenizer_types = support_tokenizer_types
                 formatter_cls._gptqmodel_tokenizer_backend_patch = True
 
-        if getattr(config, "model_type", None) == "ovis2_5":
+        if getattr(config, "model_type", None) in {"ovis2_5", "ovis2_6", "ovis2_6_moe"}:
             register_runtime_automodel_config(config, remote_module, "vit_config", "Siglip2NavitModel")
 
         if getattr(config, "model_type", None) == "hymba" and remote_module is not None:
@@ -1520,9 +1529,8 @@ def try_patch_legacy_flash_attn_flag(model_cls):
             return
 
         # The remote modeling code for some models(For example, ovis.) still relies on `_supports_flash_attn_2`
-        if hasattr(model_cls, "_supports_flash_attn"):
-            if not hasattr(model_cls, "_supports_flash_attn_2"):
-                setattr(model_cls, "_supports_flash_attn_2", bool(model_cls._supports_flash_attn))
+        if hasattr(model_cls, "_supports_flash_attn") and not hasattr(model_cls, "_supports_flash_attn_2"):
+            setattr(model_cls, "_supports_flash_attn_2", bool(model_cls._supports_flash_attn))
             return
 
         # Find the most specific class that explicitly declares the newer

diff --git a/gptqmodel/utils/structure.py b/gptqmodel/utils/structure.py
@@ -2072,7 +2072,6 @@ def _copy_checkpoint_tensors_into_submodule(
         grouped_names: Dict[str, list[tuple[str, str, str, Optional[int], Optional[int], Optional[int]]]] = {}
         for rel_name in t_params:
             full_name, expert_index, split_index, split_dim = self._resolve_checkpoint_tensor_source(module_path, rel_name)
-            print("full_name", full_name, rel_name)
             if full_name is None:
                 continue
             shard = self._weight_map.get(full_name)

diff --git a/tests/models/ovis/image_to_test_dataset.py b/tests/models/ovis/image_to_test_dataset.py
@@ -12,6 +12,7 @@
 from gptqmodel.models.definitions.ovis import OvisQModel
 from gptqmodel.models.definitions.ovis2 import Ovis2QModel
 from gptqmodel.models.definitions.ovis2_5 import Ovis2_5QModel
+from gptqmodel.models.definitions.ovis2_6_moe import Ovis2_6_MoeQModel
 from gptqmodel.models.definitions.qwen3_vl import Qwen3_VLQModel
 
 
@@ -98,6 +99,9 @@ def get_calib_dataset(model):
     if isinstance(model, Ovis2_5QModel):
         return prepare_dataset(format_ovis2_dataset, n_sample=20)
 
+    if isinstance(model, Ovis2_6_MoeQModel):
+        return prepare_dataset(format_ovis2_dataset, n_sample=20)
+
     if (
         isinstance(model, BaseQwen2VLGPTQ)
         or isinstance(model, Qwen3_VLQModel)

diff --git a/tests/models/test_ovis2.py b/tests/models/test_ovis2.py
@@ -17,8 +17,9 @@ class Test(ModelTest):
     EVAL_BATCH_SIZE = 1
 
     def test_ovis(self):
-        model, tokenizer, processor = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE,
-                                           dtype=self.TORCH_DTYPE, batch_size=1)
+        with self.model_compat_test_context():
+            model, tokenizer, processor = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE,
+                                               dtype=self.TORCH_DTYPE, batch_size=1)
 
         messages = [
             {

diff --git a/tests/models/test_ovis2_5.py b/tests/models/test_ovis2_5.py
@@ -19,13 +19,14 @@ class Test(ModelTest):
     MODEL_COMPAT_FAST_LAYER_POSITION = "first"
 
     def test_ovis(self):
-        model, _tokenizer, _processor = self.quantModel(
-            self.NATIVE_MODEL_ID,
-            trust_remote_code=self.TRUST_REMOTE_CODE,
-            dtype=self.TORCH_DTYPE,
-            batch_size=1,
-            call_perform_post_quant_validation=False
-        )
+        with self.model_compat_test_context():
+            model, _tokenizer, _processor = self.quantModel(
+                self.NATIVE_MODEL_ID,
+                trust_remote_code=self.TRUST_REMOTE_CODE,
+                dtype=self.TORCH_DTYPE,
+                batch_size=1,
+                call_perform_post_quant_validation=False
+            )
 
         text_tokenizer = model.text_tokenizer
 

diff --git a/tests/models/test_ovis2_6_moe.py b/tests/models/test_ovis2_6_moe.py
@@ -0,0 +1,65 @@
+# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
+# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+
+import os.path
+
+import torch
+from PIL import Image
+
+from gptqmodel.quantization.config import MoEConfig, ExpertsRoutingOverride, MOE_ALL_EXPERTS
+from model_test import ModelTest
+
+
+class Test(ModelTest):
+    # Exercises quantModel + generate on the Ovis 2.6 MoE checkpoint at NATIVE_MODEL_ID.
+    NATIVE_MODEL_ID = "/monster/data/model/Ovis2.6-30B-A3B"
+
+    TRUST_REMOTE_CODE = True
+    EVAL_BATCH_SIZE = 1
+    MOE_CONFIG = MoEConfig(ExpertsRoutingOverride(num_experts_per_tok=MOE_ALL_EXPERTS))
+    MODEL_COMPAT_FAST_LAYER_POSITION = "first"
+
+    def test_ovis2_6_moe(self):
+        # Post-quant validation is switched off; the generate() check below is this test's assertion.
+        with self.model_compat_test_context():
+            model, _tokenizer, _processor = self.quantModel(
+                self.NATIVE_MODEL_ID,
+                trust_remote_code=self.TRUST_REMOTE_CODE,
+                dtype=self.TORCH_DTYPE,
+                batch_size=1,
+                call_perform_post_quant_validation=False,
+            )
+
+        text_tokenizer = model.text_tokenizer
+
+        # The sample image is resolved relative to this test file, not the working directory.
+        test_dir = os.path.dirname(os.path.abspath(__file__))
+        image_path = os.path.join(test_dir, "ovis/10016.jpg")
+        image = Image.open(image_path)
+        user_content = [
+            {"type": "image", "image": image},
+            {"type": "text", "text": "What does this picture show?"},
+        ]
+        messages = [{"role": "user", "content": user_content}]
+
+        input_ids, pixel_values, grid_thws = model.preprocess_inputs(
+            messages=messages,
+            add_generation_prompt=True,
+        )
+        # Move the produced inputs to the model device; the vision inputs may be None.
+        input_ids = input_ids.to(model.device)
+        if pixel_values is not None:
+            pixel_values = pixel_values.to(dtype=model.visual_tokenizer.vit.dtype, device=model.device)
+        if grid_thws is not None:
+            grid_thws = grid_thws.to(model.device)
+
+        generate_kwargs = {"inputs": input_ids, "pixel_values": pixel_values, "grid_thws": grid_thws}
+        with torch.inference_mode():
+            output_ids = model.generate(**generate_kwargs)
+            output = text_tokenizer.decode(output_ids[0], skip_special_tokens=True)
+            print(f"Output:\n{output}")
+
+            # Pass criterion: the decoded text must mention "snow".
+            self.assertIn("snow", output.lower())
diff --git a/tests/models/test_ovis_1_6_llama.py b/tests/models/test_ovis_1_6_llama.py
@@ -18,10 +18,12 @@ class TestOvis1_6_Llama(ModelTest):
     USE_FLASH_ATTN = False
 
     def test_ovis_1_6(self):
-        # the evaluation harness does not support Ovis, and will throw an error during execution:
-        # TypeError: Ovis.forward() missing 3 required positional arguments: 'attention_mask', 'labels', and 'pixel_values'
-        model, tokenizer, _ = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE,
-                                              dtype=self.TORCH_DTYPE, multimodal_max_length=8192, batch_size=1, call_perform_post_quant_validation=False)
+        with self.model_compat_test_context():
+            # the evaluation harness does not support Ovis, and will throw an error during execution:
+            # TypeError: Ovis.forward() missing 3 required positional arguments: 'attention_mask', 'labels', and 'pixel_values'
+            model, tokenizer, _ = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE,
+                                                  dtype=self.TORCH_DTYPE, multimodal_max_length=8192, batch_size=1,
+                                                  call_perform_post_quant_validation=False)
 
         text_tokenizer = model.get_text_tokenizer()
         visual_tokenizer = model.get_visual_tokenizer()