Skip to content

Commit 1f0a204

Browse files
committed
support ovis2_5
Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>
1 parent 1a50635 commit 1f0a204

5 files changed

Lines changed: 224 additions & 1 deletion

File tree

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
## Latest News
2323

24+
* 05/18/2026 7.1.0-dev `main`: ✨ Added `ovis2_5` model support
2425
* 05/13/2026 7.1.0-dev `main`: ✨ Added `minicpmv_4_6` model support
2526
* 05/07/2026 7.1.0-dev `main`: ✨ Added `GLM-4.5V`, `GLM-4.6V`, `Zamba` and `Zamba2` model support
2627
* 04/29/2026 7.1.0-dev `main`: ✨ Added PoolSideAI `Laguna` model support for fused Laguna MoE checkpoints. Added `ERNIE 4.5 VL MoE`, `Ling-2.6-flash` and NVIDIA `Nemotron 3 Nano Omni` model support.
@@ -255,7 +256,7 @@ Selected public references where teams or companies explicitly mention GPT-QMode
255256
| DBRX Converted || GPT-2 || Llama 3.2 VL || Nemotron Ultra || TeleChat2 ||
256257
| Deci || GPT-J || Llama 4 || OPT || Trinity ||
257258
| DeepSeek-V2/V3/R1 || GPT-OSS || LongCat Flash || OLMo2 / LLaDA2 || Yi ||
258-
| DeepSeek-V2-Lite || Granite / Granite MoE || LongLLaMA || Ovis 1.6/2 || Seed-OSS ||
259+
| DeepSeek-V2-Lite || Granite / Granite MoE || LongLLaMA || Ovis 1.6/2/2.5 || Seed-OSS ||
259260
| Dream || GRIN-MoE || Instella || Phi 1-4 || Voxtral ||
260261
| ERNIE 4.5 / MoE / VL MoE || GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR || GLM4 MoE / Lite / 4.5V MoE || MiniCPM 3/O/V/V 4_6 || PanGu-α ||
261262
| XVERSE || Brumby || Hymba || Mistral || Qwen 1/2/3/3.5 ||

gptqmodel/models/auto.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@
142142
from .definitions.opt import OptQModel # noqa: E402
143143
from .definitions.ovis import OvisQModel # noqa: E402
144144
from .definitions.ovis2 import Ovis2QModel # noqa: E402
145+
from .definitions.ovis2_5 import Ovis2_5QModel # noqa: E402
145146
from .definitions.pangu_alpha import PanguAlphaQModel # noqa: E402
146147
from .definitions.phi import PhiQModel # noqa: E402
147148
from .definitions.phi3 import Phi3QModel, PhiMoEGPTQForCausalLM # noqa: E402
@@ -277,6 +278,7 @@
277278
"olmo2": LlamaQModel, # 100% llama clone
278279
"ovis": OvisQModel,
279280
"ovis2": Ovis2QModel,
281+
"ovis2_5": Ovis2_5QModel,
280282
"telechat": TeleChat2QModel,
281283
"instella": InstellaQModel,
282284
"mimo": MimoQModel,
gptqmodel/models/definitions/ovis2_5.py

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
2+
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
3+
# SPDX-License-Identifier: Apache-2.0
4+
# Contact: qubitium@modelcloud.ai, x.com/qubitium
5+
from types import SimpleNamespace
6+
from typing import Dict, Optional
7+
8+
import requests
9+
from PIL import Image
10+
import torch
11+
from transformers import AutoModelForCausalLM, AutoProcessor, ProcessorMixin
12+
13+
from ...utils.calibration import batched
14+
from ...utils.image import extract_vision_info, fetch_image
15+
from ...utils.model import MODALITY, move_to
16+
from ...utils.offload import offload_to_disk
17+
from .._const import CPU
18+
from ..base import BaseQModel
19+
20+
class Ovis2_5QModel(BaseQModel):
    """Quantization definition for Ovis 2.5 image-to-text checkpoints.

    The class declares where the decoder layers live inside the loaded model
    (``module_tree``), stages the non-quantized modules needed for calibration
    forward passes, and converts chat-style calibration samples into the
    tensors returned by the model's own ``preprocess_inputs``.
    """

    loader = AutoModelForCausalLM

    # NOTE(review): `module_tree` and the hooks below reach the decoder through
    # `llm.model.*`, while this path contains one more `model` segment
    # (`llm.model.model.norm`) — confirm it resolves on a loaded checkpoint.
    pre_lm_head_norm_module = "llm.model.model.norm"

    # NOTE(review): disabled mapping kept for reference — confirm whether it is
    # still required before removing it.
    # HF_CONVERSION_MAP_REVERSED = (
    #     # Ovis 2.5 builds the SigLIP visual backbone via `AutoModel`, whose
    #     # runtime shell exposes `visual_tokenizer.vit.*` directly, while
    #     # checkpoint tensors still live under `visual_tokenizer.vit.vision_model.*`.
    #     SimpleNamespace(
    #         source_patterns=[r"^visual_tokenizer\.vit\.(?!vision_model\.)(.+)$"],
    #         target_patterns=[r"^visual_tokenizer.vit.vision_model.\1"],
    #         operations=[],
    #     ),
    # )

    # Path from the model root to the repeated decoder layers ("#" stands for
    # the layer index) followed by the per-layer sub-modules.
    module_tree = [
        "llm",
        "model",
        "layers",
        "#",
        {
            "input_layernorm": ("input_layernorm:!",),
            "self_attn": ("q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"),
            "post_attention_layernorm": ("post_attention_layernorm:!",),
            "mlp": ("gate_proj:0", "up_proj:0", "down_proj:1"),
        },
    ]

    modality = [MODALITY.IMAGE_TO_TEXT]

    require_load_processor = True

    def pre_quantize_generate_hook_start(self):
        """Stage the modules used around the decoder layers on the quantization device.

        Calls ``shell_module_materialize`` for the token embedding, the text
        rotary embedding, the visual tokenizer and ``vte``, then rebuilds the
        vision rotary embedding by hand.
        """
        self.shell_module_materialize(self.model.llm.model.embed_tokens, self.quantize_config.device)
        self.shell_module_materialize(self.model.llm.model.rotary_emb, self.quantize_config.device)
        self.shell_module_materialize(self.model.visual_tokenizer, self.quantize_config.device)
        self.shell_module_materialize(self.model.vte, self.quantize_config.device)

        # VisionRotaryEmbedding cannot be correctly reconstructed via `_build_nonpersistent_buffer_template()`.
        # Therefore, VisionRotaryEmbedding is manually reconstructed here.
        rotary_pos_emb_cls = type(self.model.visual_tokenizer.vit.vision_model.encoder.rotary_pos_emb)
        config = self.model.config.vit_config
        assert "VisionRotaryEmbedding" in rotary_pos_emb_cls.__name__, (
            f"unexpected vision rotary embedding class: {rotary_pos_emb_cls.__name__}"
        )
        # The constructor argument is half of the per-head dimension.
        rotary_pos_emb = rotary_pos_emb_cls(
            config.hidden_size // config.num_attention_heads // 2
        ).to(self.quantize_config.device)
        self.model.visual_tokenizer.vit.vision_model.encoder.rotary_pos_emb = rotary_pos_emb

    def pre_quantize_generate_hook_end(self):
        """Release the modules staged by ``pre_quantize_generate_hook_start``.

        When ``quantize_config.offload_to_disk`` is set, each module is passed
        to ``offload_to_disk``; otherwise each module is moved to ``CPU``.
        """
        if self.quantize_config.offload_to_disk:
            offload_to_disk(
                model=self.model.llm,
                module=self.model.llm.model.embed_tokens,
                disk_path=self.quantize_config.offload_to_disk_path,
            )
            offload_to_disk(
                model=self.model.llm,
                module=self.model.llm.model.rotary_emb,
                disk_path=self.quantize_config.offload_to_disk_path,
            )
            offload_to_disk(
                model=self.model,
                module=self.model.visual_tokenizer,
                disk_path=self.quantize_config.offload_to_disk_path,
            )
            offload_to_disk(
                model=self.model,
                module=self.model.vte,
                disk_path=self.quantize_config.offload_to_disk_path,
            )
            return

        self.model.llm.model.embed_tokens = move_to(self.model.llm.model.embed_tokens, device=CPU)
        self.model.llm.model.rotary_emb = move_to(self.model.llm.model.rotary_emb, device=CPU)
        self.model.visual_tokenizer = move_to(self.model.visual_tokenizer, device=CPU)
        self.model.vte = move_to(self.model.vte, device=CPU)

    def preprocess_dataset(self, sample: Dict) -> Dict:
        """Return ``sample`` unchanged; conversion happens in ``prepare_dataset``."""
        return sample

    def load_processor(self) -> ProcessorMixin:
        """Load the processor stored alongside the checkpoint at ``model_local_path``."""
        return AutoProcessor.from_pretrained(self.model_local_path)

    @staticmethod
    def process_vision_info(
        conversations: list[dict] | list[list[dict]],
    ) -> Optional[list[Image.Image]]:
        """Collect the images referenced by ``conversations``.

        Returns:
            The fetched images in the order ``extract_vision_info`` reports
            them, or ``None`` when there are none.

        Raises:
            ValueError: if a vision entry has neither an ``image`` nor an
                ``image_url`` key.
        """
        image_inputs = []
        for vision_info in extract_vision_info(conversations):
            if "image" not in vision_info and "image_url" not in vision_info:
                raise ValueError("image, image_url should in content.")
            image_inputs.append(fetch_image(vision_info))
        return image_inputs or None

    @staticmethod
    def replace_image_with_pil(sample):
        """Replace image URLs in a chat sample with opened ``PIL.Image`` objects.

        ``sample`` is a list of message dicts. Every ``{"type": "image"}``
        entry inside a message's ``content`` list has its ``image`` value
        downloaded and opened in place. Messages without a list-like
        ``content`` and images that are already ``PIL.Image`` instances are
        left untouched.

        Returns:
            The same ``sample`` object, mutated in place.

        Raises:
            requests.RequestException: if a download fails, times out, or
                returns an HTTP error status.
        """
        fetch_timeout_s = 30  # never let one unreachable URL stall calibration forever

        for msg in sample:
            content = msg.get("content")
            # The original guard indexed msg["content"] precisely when the key
            # was missing, which raised KeyError instead of skipping.
            if not isinstance(content, (list, tuple)):
                continue

            for item in content:
                if not (isinstance(item, dict) and item.get("type") == "image"):
                    continue

                source = item["image"]
                if isinstance(source, Image.Image):
                    # Already decoded; nothing to download.
                    continue

                response = requests.get(source, stream=True, timeout=fetch_timeout_s)
                # Fail on 4xx/5xx here rather than letting PIL choke on an error page.
                response.raise_for_status()
                item["image"] = Image.open(response.raw)

        return sample

    def prepare_dataset(self, calibration_dataset, batch_size: int = 1, **kwargs):
        """Turn chat-style calibration samples into per-sample model inputs.

        Each sample has its image URLs resolved and is then passed to the
        model's ``preprocess_inputs``. The attention mask marks every position
        whose token id differs from the text tokenizer's pad token id.

        Args:
            calibration_dataset: iterable of chat samples (lists of messages).
            batch_size: chunk size used while iterating the dataset; the result
                still holds one entry per sample.
            **kwargs: accepted for interface compatibility and not used here.

        Returns:
            A list of dicts with ``input_ids``, ``attention_mask``,
            ``pixel_values`` and ``grid_thws``.
        """
        calib_data = []
        for batch in batched(calibration_dataset, batch_size, process_func=self.preprocess_dataset):
            for sample in batch:
                sample = self.replace_image_with_pil(sample)
                input_ids, pixel_values, grid_thws = self.model.preprocess_inputs(
                    messages=sample,
                    add_generation_prompt=True,
                )
                attention_mask = torch.ne(input_ids, self.model.text_tokenizer.pad_token_id)

                if pixel_values is not None:
                    # Match the dtype of the vision tower that will consume the pixels.
                    pixel_values = pixel_values.to(dtype=self.model.visual_tokenizer.vit.dtype)

                calib_data.append(
                    {
                        "input_ids": input_ids,
                        "attention_mask": attention_mask,
                        "pixel_values": pixel_values,
                        "grid_thws": grid_thws,
                    }
                )
        return calib_data

tests/models/ovis/image_to_test_dataset.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from gptqmodel.models.definitions.minicpmv_4_6 import MiniCPMV4_6QModel
1212
from gptqmodel.models.definitions.ovis import OvisQModel
1313
from gptqmodel.models.definitions.ovis2 import Ovis2QModel
14+
from gptqmodel.models.definitions.ovis2_5 import Ovis2_5QModel
1415
from gptqmodel.models.definitions.qwen3_vl import Qwen3_VLQModel
1516

1617

@@ -94,6 +95,9 @@ def get_calib_dataset(model):
9495
if isinstance(model, Ovis2QModel):
9596
return prepare_dataset(format_ovis2_dataset, n_sample=20)
9697

98+
if isinstance(model, Ovis2_5QModel):
99+
return prepare_dataset(format_ovis2_dataset, n_sample=20)
100+
97101
if (
98102
isinstance(model, BaseQwen2VLGPTQ)
99103
or isinstance(model, Qwen3_VLQModel)

tests/models/test_ovis2_5.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
2+
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
3+
# SPDX-License-Identifier: Apache-2.0
4+
# Contact: qubitium@modelcloud.ai, x.com/qubitium
5+
6+
import os.path
7+
8+
import torch
9+
10+
from model_test import ModelTest
11+
from PIL import Image
12+
13+
14+
class Test(ModelTest):
    """Quantization smoke test for the Ovis2.5 image-to-text model."""

    NATIVE_MODEL_ID = "/monster/data/model/Ovis2.5-2B"  # AIDC-AI/Ovis2.5-2B

    TRUST_REMOTE_CODE = True
    EVAL_BATCH_SIZE = 1
    MODEL_COMPAT_FAST_LAYER_POSITION = "first"

    def test_ovis(self):
        """Quantize the model, describe a local test image, and expect "snow" in the answer."""
        model, _tokenizer, _processor = self.quantModel(
            self.NATIVE_MODEL_ID,
            trust_remote_code=self.TRUST_REMOTE_CODE,
            dtype=self.TORCH_DTYPE,
            batch_size=1,
            call_perform_post_quant_validation=False,
        )

        text_tokenizer = model.text_tokenizer

        # The sample picture is stored next to this test file.
        here = os.path.dirname(os.path.abspath(__file__))
        picture = Image.open(os.path.join(here, "ovis/10016.jpg"))

        user_turn = {
            "role": "user",
            "content": [
                {"type": "image", "image": picture},
                {"type": "text", "text": "What does this picture show?"},
            ],
        }

        input_ids, pixel_values, grid_thws = model.preprocess_inputs(
            messages=[user_turn],
            add_generation_prompt=True,
        )

        # Move every tensor the model returned onto the model's device.
        input_ids = input_ids.to(model.device)
        if pixel_values is not None:
            pixel_values = pixel_values.to(
                dtype=model.visual_tokenizer.vit.dtype,
                device=model.device,
            )
        if grid_thws is not None:
            grid_thws = grid_thws.to(model.device)

        with torch.inference_mode():
            output_ids = model.generate(
                inputs=input_ids,
                pixel_values=pixel_values,
                grid_thws=grid_thws,
            )
            output = text_tokenizer.decode(output_ids[0], skip_special_tokens=True)
            print(f'Output:\n{output}')

        self.assertIn("snow", output.lower())

0 commit comments

Comments
 (0)