diff --git a/README.md b/README.md index 160503e0e..5a5ecc65d 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ ## Latest News +* 05/21/2026 7.1.0-dev `main`: ✨ Added `nemotron_labs_diffusion` model support * 05/20/2026 7.1.0-dev `main`: ✨ Added `interns1`, `ovis2_5`, `ovis2_6_moe` and `ovis2_6_next` model support * 05/15/2026 7.1.0-dev `main`: ✨ Added `mimo_v2` model support * 05/13/2026 7.1.0-dev `main`: ✨ Added `minicpmv_4_6` and `DeepSeek V4` model support @@ -247,23 +248,23 @@ Selected public references where teams or companies explicitly mention GPT-QMode ## Model Support -| Model | | | | | | | | | | -|--------------------------|---|---------------------------------|---|------------------|---|---------------------|---|-------------------------|---| -| Apertus | ✅ | EXAONE 3/4 | ✅ | Dots1 | ✅ | Mistral3 | ✅ | Qwen 2/3/3.5 (Next/MoE) | ✅ | -| Baichuan | ✅ | Falcon (H1 / Mamba) | ✅ | InternLM 1/2/2.5 | ✅ | Mixtral | ✅ | Qwen 2/2.5/3 VL | ✅ | -| Bloom | ✅ | FastVLM | ✅ | Kimi K2 | ✅ | MobileLLM | ✅ | Qwen 2.5/3 Omni | ✅ | -| ChatGLM | ✅ | Gemma 1-4 / 3n | ✅ | Klear | ✅ | MOSS | ✅ | RefinedWeb | ✅ | -| CodeGen | ✅ | GPTBigCode | ✅ | LING/RING | ✅ | MPT | ✅ | StableLM | ✅ | -| Cohere 1-2 | ✅ | GPT-Neo / NeoX | ✅ | Llama 1-3.3 | ✅ | Nemotron H / Omni | ✅ | StarCoder2 | ✅ | -| DBRX Converted | ✅ | GPT-2 | ✅ | Llama 3.2 VL | ✅ | Nemotron Ultra | ✅ | TeleChat2 | ✅ | -| Deci | ✅ | GPT-J | ✅ | Llama 4 | ✅ | OPT | ✅ | Trinity | ✅ | -| DeepSeek-V2/V3/V4/R1 | ✅ | GPT-OSS | ✅ | LongCat Flash | ✅ | OLMo2 / LLaDA2 | ✅ | Yi | ✅ | +| Model | | | | | | | | | | +|--------------------------|---|---------------------------------|---|------------------|---|---------------------------------|---|-------------------------|---| +| Apertus | ✅ | EXAONE 3/4 | ✅ | Dots1 | ✅ | Mistral3 | ✅ | Qwen 2/3/3.5 (Next/MoE) | ✅ | +| Baichuan | ✅ | Falcon (H1 / Mamba) | ✅ | InternLM 1/2/2.5 | ✅ | Mixtral | ✅ | Qwen 2/2.5/3 VL | ✅ | +| Bloom | ✅ | FastVLM | ✅ | Kimi K2 | ✅ | MobileLLM | ✅ | Qwen 
2.5/3 Omni | ✅ | +| ChatGLM | ✅ | Gemma 1-4 / 3n | ✅ | Klear | ✅ | MOSS | ✅ | RefinedWeb | ✅ | +| CodeGen | ✅ | GPTBigCode | ✅ | LING/RING | ✅ | MPT | ✅ | StableLM | ✅ | +| Cohere 1-2 | ✅ | GPT-Neo / NeoX | ✅ | Llama 1-3.3 | ✅ | Nemotron H / Omni | ✅ | StarCoder2 | ✅ | +| DBRX Converted | ✅ | GPT-2 | ✅ | Llama 3.2 VL | ✅ | Nemotron Ultra / Labs-Diffusion | ✅ | TeleChat2 | ✅ | +| Deci | ✅ | GPT-J | ✅ | Llama 4 | ✅ | OPT | ✅ | Trinity | ✅ | +| DeepSeek-V2/V3/V4/R1 | ✅ | GPT-OSS | ✅ | LongCat Flash | ✅ | OLMo2 / LLaDA2 | ✅ | Yi | ✅ | | DeepSeek-V2-Lite | ✅ | Granite / Granite MoE | ✅ | LongLLaMA | ✅ | Ovis 1.6/2/2.5/2.6 MoE/2.6 Next | ✅ | Seed-OSS | ✅ | -| Dream | ✅ | GRIN-MoE | ✅ | Instella | ✅ | Phi 1-4 | ✅ | Voxtral | ✅ | -| ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V/V 4_6 | ✅ | PanGu-α | ✅ | -| XVERSE | ✅ | Brumby | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3/3.5 | ✅ | -| MiniMax M2 | ✅ | AfMoE | ✅ | Bailing-MoE | ✅ | LFM2-MoE | ✅ | Marin | ✅ | -| InternVL Chat | ✅ | Laguna | ✅ | Mimo / Mimo V2 | ✅ | Zamba / Zamba2 | ✅ | Intern S1 | ✅ | +| Dream | ✅ | GRIN-MoE | ✅ | Instella | ✅ | Phi 1-4 | ✅ | Voxtral | ✅ | +| ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V/V 4_6 | ✅ | PanGu-α | ✅ | +| XVERSE | ✅ | Brumby | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3/3.5 | ✅ | +| MiniMax M2 | ✅ | AfMoE | ✅ | Bailing-MoE | ✅ | LFM2-MoE | ✅ | Marin | ✅ | +| InternVL Chat | ✅ | Laguna | ✅ | Mimo / Mimo V2 | ✅ | Zamba / Zamba2 | ✅ | Intern S1 | ✅ | Prism Bonsai GGUF checkpoints are supported for inference only through GPT-QModel's native GGUF path and internal GGUF runtime. Bonsai checkpoints load through the normal model path or repo argument and do not require the external `gguf` package. Prism model quantization is not included. 
diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py index 0a903a05f..13be417f3 100644 --- a/gptqmodel/models/auto.py +++ b/gptqmodel/models/auto.py @@ -143,6 +143,7 @@ from .definitions.mobilellm import MobileLLMQModel # noqa: E402 from .definitions.moss import MossQModel # noqa: E402 from .definitions.mpt import MptQModel # noqa: E402 +from .definitions.nemotron_labs_diffusion import NemotronLabsDiffusionQModel # noqa: E402 from .definitions.nemotron_h import NemotronHQModel # noqa: E402 from .definitions.nemotron_omni import NemotronOmniQModel # noqa: E402 from .definitions.opt import OptQModel # noqa: E402 @@ -310,6 +311,7 @@ "longcat_flash": LongCatFlashQModel, "llava_qwen2": LlavaQwen2QModel, "nemotron_h": NemotronHQModel, + "nemotron_labs_diffusion": NemotronLabsDiffusionQModel, "nemotronh_nano_omni_reasoning_v3": NemotronOmniQModel, "bailing_moe": BailingMoeQModel, "bailing_hybrid": BailingMoeQModel, diff --git a/gptqmodel/models/definitions/__init__.py b/gptqmodel/models/definitions/__init__.py index e60f02585..cb4afa012 100644 --- a/gptqmodel/models/definitions/__init__.py +++ b/gptqmodel/models/definitions/__init__.py @@ -60,6 +60,7 @@ from .mobilellm import MobileLLMQModel from .moss import MossQModel from .mpt import MptQModel +from .nemotron_labs_diffusion import NemotronLabsDiffusionQModel from .opt import OptQModel from .ovis import OvisQModel from .ovis2_5 import Ovis2_5QModel diff --git a/gptqmodel/models/definitions/nemotron_labs_diffusion.py b/gptqmodel/models/definitions/nemotron_labs_diffusion.py new file mode 100644 index 000000000..b7c697674 --- /dev/null +++ b/gptqmodel/models/definitions/nemotron_labs_diffusion.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2026 ModelCloud.ai +# SPDX-License-Identifier: Apache-2.0 + +from transformers import AutoModel + +from ..base import BaseQModel + + +class NemotronLabsDiffusionQModel(BaseQModel): + require_trust_remote_code = True + loader = AutoModel + + lm_head = "diffusion_head" + 
pre_lm_head_norm_module = "encoder.norm" + + awq_scale_optimize_shape_dependent_modules = ["self_attn.o_proj"] + + # Nemotron Labs Diffusion uses a custom AutoModel with an internal + # Ministral-style decoder stack under encoder.layers. + module_tree = [ + "encoder", + "layers", + "#", + { + "input_layernorm": ("input_layernorm:!",), + "self_attn": ("q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"), + "post_attention_layernorm": ("post_attention_layernorm:!",), + "mlp": ("gate_proj:0", "up_proj:0", "down_proj:1"), + }, + ] diff --git a/tests/models/test_nemotron_labs_diffusion.py b/tests/models/test_nemotron_labs_diffusion.py new file mode 100644 index 000000000..0f426969c --- /dev/null +++ b/tests/models/test_nemotron_labs_diffusion.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai +# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai +# SPDX-License-Identifier: Apache-2.0 +# Contact: qubitium@modelcloud.ai, x.com/qubitium + +from model_test import ModelTest + + +class TestNemotronUltra(ModelTest): + NATIVE_MODEL_ID = "/monster/data/model/Nemotron-Labs-Diffusion-3B" # nvidia/Nemotron-Labs-Diffusion-3B + # FIXME Evaluation appears to be incompatible with NemoStation/Marlin-2B support; the original model's scores are also quite low. + # original model score: {'arc_challenge': {'accuracy,loglikelihood': 0.19795221843003413, 'accuracy,loglikelihood_norm': 0.20819112627986347}} + EVAL_TASKS_SLOW = { + "arc_challenge": { + "chat_template": True, + "acc": {"value": 0.197098976109215, "floor_pct": 0.36}, + "acc_norm": {"value": 0.2235494880546075, "floor_pct": 0.36}, + }, + } + EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW) + TRUST_REMOTE_CODE = True + SAVE_PATH = "./temp/Nemotron-Labs-Diffusion" + + def test_nemotron_ultra(self): + self.quantize_and_evaluate()