Skip to content

Commit a1e68b0

Browse files
committed
add qwen3-vl
1 parent 4828dac commit a1e68b0

13 files changed

+376
-8
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ pip install -e .
127127
#### ✅ (Recommended) Install Optional High-Performance Dependencies
128128
For the best performance, especially during inference, we highly recommend installing vllm.
129129
```bash
130-
pip install vllm
130+
pip install -U vllm
131131
```
132132

133133
---

editscore/__init__.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,27 @@ def __init__(
5959
lora_path=lora_path,
6060
cache_dir=cache_dir,
6161
)
62+
elif self.backbone == "qwen3vl":
63+
from .mllm_tools.qwen3vl import Qwen3VL
64+
self.model = Qwen3VL(
65+
vlm_model=model_name_or_path,
66+
temperature=temperature,
67+
seed=seed,
68+
lora_path=lora_path,
69+
)
70+
elif self.backbone == "qwen3vl_vllm":
71+
from .mllm_tools.qwen3vl_vllm import Qwen3VL
72+
self.model = Qwen3VL(
73+
vlm_model=model_name_or_path,
74+
tensor_parallel_size=tensor_parallel_size,
75+
max_model_len=max_model_len,
76+
max_num_seqs=max_num_seqs,
77+
max_num_batched_tokens=max_num_batched_tokens,
78+
temperature=temperature,
79+
seed=seed,
80+
lora_path=lora_path,
81+
cache_dir=cache_dir,
82+
)
6283
elif self.backbone == "internvl3_5":
6384
from .mllm_tools.internvl35_lmdeploy import InternVL35
6485
self.model = InternVL35(model=model_name_or_path, tensor_parallel_size=tensor_parallel_size)

editscore/mllm_tools/qwen3vl.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
from typing import Optional
2+
import random
3+
import numpy as np
4+
import torch
5+
6+
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
7+
from peft import PeftModel
8+
9+
10+
def set_seed(seed: int):
    """Seed `random`, `numpy`, and `torch` (including all CUDA devices) for reproducibility.

    Args:
        seed (`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # No-op when CUDA is unavailable, so this is safe on CPU-only hosts.
    torch.cuda.manual_seed_all(seed)
20+
21+
22+
def apply_chat_template(prompt, num_images: int = 2):
    """Build a raw Qwen chat prompt with numbered image placeholders.

    Kept as a manual template because of a transformers bug that does not
    support vision ids:
    https://github.com/QwenLM/Qwen2.5-VL/issues/716#issuecomment-2723316100
    """
    parts = ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n"]
    for idx in range(1, num_images + 1):
        parts.append(f"<img{idx}>: <|vision_start|><|image_pad|><|vision_end|>")
    parts.append(f"{prompt}<|im_end|>\n<|im_start|>assistant\n")
    return "".join(parts)
30+
31+
32+
class Qwen3VL():
    """Qwen3-VL wrapper (transformers backend) for image+text scoring prompts."""

    def __init__(
        self,
        vlm_model,
        temperature: float = 0.7,
        seed: Optional[int] = None,
        lora_path: Optional[str] = None,
    ) -> None:
        """
        Args:
            vlm_model: HF model id or local path of the Qwen3-VL checkpoint.
            temperature: Sampling temperature used at generation time.
            seed: Optional default seed applied before each `inference` call.
            lora_path: Optional LoRA adapter; merged into the base weights.
        """
        self.model = Qwen3VLForConditionalGeneration.from_pretrained(
            vlm_model, torch_dtype=torch.bfloat16, device_map="auto"
        )
        if lora_path:
            # Merge the adapter so generation runs on a single fused model.
            self.model = PeftModel.from_pretrained(self.model, lora_path)
            self.model = self.model.merge_and_unload()

        self.processor = AutoProcessor.from_pretrained(vlm_model)
        self.temperature = temperature
        self.seed = seed

    def prepare_input(self, images, text_prompt: str = ""):
        """Tokenize images + prompt into model-ready tensors on the model's device."""
        if not isinstance(images, list):
            images = [images]

        messages = [
            {
                "role": "user",
                "content": [{"type": "image", "image": image} for image in images]
                + [{"type": "text", "text": text_prompt}],
            }
        ]

        inputs = self.processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )

        # Follow the model's placement instead of hard-coding "cuda", so the
        # wrapper also works on CPU-only hosts and under device_map="auto".
        inputs = inputs.to(self.model.device)

        return inputs

    def inference(self, inputs, seed: Optional[int] = None):
        """Generate one sampled completion for prepared inputs; returns the decoded string."""
        seed = self.seed if seed is None else seed
        # BUGFIX: only seed when a seed was actually provided —
        # torch.manual_seed(None) raises TypeError.
        if seed is not None:
            set_seed(seed)
        generated_ids = self.model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=self.temperature,
            top_p=0.9,
            top_k=20,
        )
        # Strip the prompt tokens so only newly generated text is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        outputs = self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        outputs = [output.strip() for output in outputs]
        return outputs[0]
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
from typing import Optional
2+
3+
import os
4+
import hashlib
5+
import random
6+
import time
7+
import numpy as np
8+
import torch
9+
10+
from vllm import LLM
11+
from vllm.sampling_params import SamplingParams
12+
13+
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
14+
from peft import PeftModel
15+
16+
from qwen_vl_utils import process_vision_info
17+
18+
19+
def set_seed(seed: int):
    """Seed `random`, `numpy`, and `torch` (including all CUDA devices) for reproducibility.

    Args:
        seed (`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # No-op when CUDA is unavailable, so this is safe on CPU-only hosts.
    torch.cuda.manual_seed_all(seed)
29+
30+
31+
def apply_chat_template(prompt, num_images: int = 2):
    """Build a raw Qwen chat prompt with numbered image placeholders.

    Kept as a manual template because of a transformers bug that does not
    support vision ids:
    https://github.com/QwenLM/Qwen2.5-VL/issues/716#issuecomment-2723316100
    """
    image_tags = "".join(
        f"<img{idx}>: <|vision_start|><|image_pad|><|vision_end|>"
        for idx in range(1, num_images + 1)
    )
    return (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
        + image_tags
        + f"{prompt}<|im_end|>\n<|im_start|>assistant\n"
    )
39+
40+
41+
class Qwen3VL():
    """Qwen3-VL wrapper served through vLLM, with optional on-disk LoRA merging."""

    def __init__(
        self,
        vlm_model,
        max_model_len: int = 1536,
        tensor_parallel_size=1,
        max_num_seqs=32,
        max_num_batched_tokens=1536,
        temperature: float = 0.7,
        seed: Optional[int] = None,
        lora_path: Optional[str] = None,
        cache_dir: Optional[str] = None,
    ) -> None:
        """
        Args:
            vlm_model: HF model id or local path of the Qwen3-VL checkpoint.
            max_model_len / tensor_parallel_size / max_num_seqs /
            max_num_batched_tokens: forwarded to the vLLM engine.
            temperature: Sampling temperature for generation.
            seed: Optional default sampling seed.
            lora_path: Optional LoRA adapter; merged once and cached on disk,
                since vLLM loads a fused checkpoint.
            cache_dir: Where to store the merged checkpoint; derived from the
                model/adapter names when omitted.
        """
        if lora_path:
            if cache_dir is None:
                root_dir = torch.hub.get_dir()  # default: ~/.cache/torch/hub

                lora_filename = os.path.splitext(os.path.basename(lora_path))[0]
                # Hash the full adapter path so different adapters sharing a
                # basename do not collide in the cache.
                lora_hash = hashlib.md5(lora_path.encode()).hexdigest()[:8]
                lora_identifier = f"{lora_filename}_{lora_hash}"

                cache_dir = os.path.join(root_dir, "EditScore", f"{os.path.basename(vlm_model)}_merged_lora_{lora_identifier}")

            if not os.path.exists(cache_dir):
                print(f"Merging LORA to {vlm_model} and saving to {cache_dir}", flush=True)
                start_time = time.time()
                # Merge on CPU so the merge never competes with vLLM for GPU memory.
                model = Qwen3VLForConditionalGeneration.from_pretrained(
                    vlm_model, torch_dtype=torch.bfloat16, device_map="cpu"
                )
                model = PeftModel.from_pretrained(model, lora_path)
                model = model.merge_and_unload()
                model.save_pretrained(cache_dir)

                processor = AutoProcessor.from_pretrained(vlm_model)
                processor.save_pretrained(cache_dir)

                print(f"Merging LORA to {vlm_model} and saving to {cache_dir} took {time.time() - start_time} seconds", flush=True)
            else:
                print(f"Skipping merging LORA, as merged model already exists in {cache_dir}", flush=True)

            # From here on, serve the merged checkpoint instead of the base model.
            vlm_model = cache_dir

        self.model = LLM(
            model=vlm_model,
            max_model_len=max_model_len,
            tensor_parallel_size=tensor_parallel_size,
            max_num_seqs=max_num_seqs,
            max_num_batched_tokens=max_num_batched_tokens,
            limit_mm_per_prompt={"image": 2},
            enable_prefix_caching=True,
        )

        self.processor = AutoProcessor.from_pretrained(vlm_model)
        self.temperature = temperature
        self.seed = seed

    def prepare_input(self, images, text_prompt: str = ""):
        """Build a vLLM request dict ({"prompt", "multi_modal_data"}) for images + prompt."""
        if not isinstance(images, list):
            images = [images]

        messages = [
            {
                "role": "user",
                "content": [{"type": "image", "image": image} for image in images]
                + [{"type": "text", "text": text_prompt}],
            }
        ]
        # text = apply_chat_template(text_prompt, num_images=len(images))
        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, _ = process_vision_info(messages)

        messages = {
            "prompt": text,
            "multi_modal_data": {"image": image_inputs},
        }
        return messages

    def inference(self, messages, seed: Optional[int] = None):
        """Generate for a single prepared request and return its decoded string.

        Delegates to `batch_inference` so sampling configuration lives in one
        place (the two methods were previously duplicated).
        """
        return self.batch_inference(messages, seed=seed)[0]

    def batch_inference(self, messages, seed: Optional[int] = None):
        """Generate for one or many prepared requests; returns a list of strings."""
        seed = self.seed if seed is None else seed
        sampling_params = SamplingParams(max_tokens=512, temperature=self.temperature, top_p=0.9, top_k=20, seed=seed)
        outputs = self.model.generate(messages, sampling_params, use_tqdm=False)

        responses = []
        for output in outputs:
            instruction = output.outputs[0].text.strip()
            responses.append(instruction)

        return responses

evaluate.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ python evaluation.py \
77
--result_dir results/EditScore-7B \
88
--backbone qwen25vl \
99
--model_name_or_path Qwen/Qwen2.5-VL-7B-Instruct \
10-
--enable_lora \
1110
--lora_path EditScore/EditScore-7B \
1211
--score_range 25 \
1312
--max_workers 1 \

evaluate_72B_vllm.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ python evaluation.py \
77
--result_dir results/EditScore-72B \
88
--backbone qwen25vl_vllm \
99
--model_name_or_path Qwen/Qwen2.5-VL-72B-Instruct \
10-
--enable_lora \
1110
--lora_path EditScore/EditScore-72B \
1211
--score_range 25 \
1312
--max_workers 1 \

evaluate_qwen3_vl_4B.sh

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
# Evaluate the Qwen3-VL-4B backbone (transformers backend) on EditReward-Bench.
# BUGFIX: shebang was "# !/bin/bash" (a plain comment), so the script ran
# under whatever shell invoked it rather than bash.
SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
cd "$SHELL_FOLDER"

# Activate the project conda environment.
source "$(dirname $(which conda))/../etc/profile.d/conda.sh"
conda activate editscore

python evaluation.py \
    --benchmark_dir EditScore/EditReward-Bench \
    --result_dir results/EditScore-Qwen3-VL-4B \
    --backbone qwen3vl \
    --model_name_or_path /share/project/shared_models/Qwen3-VL-4B-Instruct \
    --lora_path /share/project/jiahao/LLaMA-Factory3/output/editscore_qwen3_4B_ins \
    --score_range 25 \
    --max_workers 1 \
    --max_model_len 4096 \
    --max_num_seqs 1 \
    --max_num_batched_tokens 4096 \
    --tensor_parallel_size 1 \
    --num_pass 1

python calculate_statistics.py \
    --result_dir results/EditScore-Qwen3-VL-4B/qwen3vl

evaluate_qwen3_vl_4B_avg4.sh

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
# Evaluate Qwen3-VL-4B with 4 scoring passes averaged (num_pass 4).
# BUGFIX: shebang was "# !/bin/bash" (a plain comment), so the script ran
# under whatever shell invoked it rather than bash.
SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
cd "$SHELL_FOLDER"

# Activate the project conda environment.
source "$(dirname $(which conda))/../etc/profile.d/conda.sh"
conda activate editscore

python evaluation.py \
    --benchmark_dir EditScore/EditReward-Bench \
    --result_dir results/EditScore-Qwen3-VL-4B-avg4 \
    --backbone qwen3vl \
    --model_name_or_path /share/project/shared_models/Qwen3-VL-4B-Instruct \
    --lora_path /share/project/jiahao/LLaMA-Factory3/output/editscore_qwen3_4B_ins \
    --score_range 25 \
    --max_workers 1 \
    --max_model_len 4096 \
    --max_num_seqs 1 \
    --max_num_batched_tokens 4096 \
    --tensor_parallel_size 1 \
    --num_pass 4

python calculate_statistics.py \
    --result_dir results/EditScore-Qwen3-VL-4B-avg4/qwen3vl

evaluate_qwen3_vl_4B_vllm.sh

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
# Evaluate Qwen3-VL-4B through the vLLM backend (qwen3vl_vllm).
# BUGFIX: shebang was "# !/bin/bash" (a plain comment), so the script ran
# under whatever shell invoked it rather than bash.
SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
cd "$SHELL_FOLDER"

# Activate the project conda environment.
source "$(dirname $(which conda))/../etc/profile.d/conda.sh"
conda activate editscore

python evaluation.py \
    --benchmark_dir EditScore/EditReward-Bench \
    --result_dir results/EditScore-Qwen3-VL-4B \
    --backbone qwen3vl_vllm \
    --model_name_or_path /share/project/shared_models/Qwen3-VL-4B-Instruct \
    --lora_path /share/project/jiahao/LLaMA-Factory3/output/editscore_qwen3_4B_ins \
    --score_range 25 \
    --max_workers 1 \
    --max_model_len 4096 \
    --max_num_seqs 1 \
    --max_num_batched_tokens 4096 \
    --tensor_parallel_size 1 \
    --num_pass 1

python calculate_statistics.py \
    --result_dir results/EditScore-Qwen3-VL-4B/qwen3vl_vllm

evaluate_qwen3_vl_8B.sh

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
# Evaluate the Qwen3-VL-8B backbone (transformers backend) on EditReward-Bench.
# BUGFIX: shebang was "# !/bin/bash" (a plain comment), so the script ran
# under whatever shell invoked it rather than bash.
SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
cd "$SHELL_FOLDER"

# Activate the project conda environment.
source "$(dirname $(which conda))/../etc/profile.d/conda.sh"
conda activate editscore

python evaluation.py \
    --benchmark_dir EditScore/EditReward-Bench \
    --result_dir results/EditScore-Qwen3-VL-8B \
    --backbone qwen3vl \
    --model_name_or_path /share/project/jiahao/models/Qwen3-VL-8B-Instruct \
    --lora_path /share/project/jiahao/LLaMA-Factory3/output/editscore_qwen3_8B_ins \
    --score_range 25 \
    --max_workers 1 \
    --max_model_len 4096 \
    --max_num_seqs 1 \
    --max_num_batched_tokens 4096 \
    --tensor_parallel_size 1 \
    --num_pass 1

python calculate_statistics.py \
    --result_dir results/EditScore-Qwen3-VL-8B/qwen3vl

0 commit comments

Comments
 (0)