Skip to content

Commit 4c9b3d3

Browse files
feat(vision-metrics): split vie_score into dedicated branch
Adds VieScoreMetric with GEditBench benchmark wiring and focused unit coverage while keeping image-edit scoring changes for the next stacked PR. Made-with: Cursor
1 parent 3d76a02 commit 4c9b3d3

3 files changed

Lines changed: 384 additions & 3 deletions

File tree

src/pruna/evaluation/benchmarks.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -267,8 +267,8 @@ def list(cls, task_type: str | None = None) -> list[str]:
267267
"material alter, motion change, style change, subject add/remove/replace, text change, "
268268
"tone transfer, and human retouching."
269269
),
270-
metrics=[], # Paper uses VIEScore; not in Pruna
271-
task_type="text_to_image",
270+
metrics=["vie_score"],
271+
task_type="text+image_image",
272272
reference="https://arxiv.org/abs/2504.17761",
273273
),
274274
Benchmark(
Lines changed: 363 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,363 @@
1+
# Copyright 2025 - Pruna AI GmbH. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
VIEScore metric for conditional image synthesis (semantic + quality).
17+
18+
Reference: VIEScore (ACL 2024) — https://arxiv.org/abs/2312.14867
19+
Both task modes follow `TIGER-AI-Lab/VIEScore`:
20+
21+
- ``t2i`` (text-to-image, single image): SC uses two sub-scores (semantic consistency +
22+
detail correspondence), PQ uses two sub-scores (naturalness + artifacts). Overall is
23+
``sqrt(min(SC) * min(PQ)) / 10``.
24+
- ``tie`` (text-image editing, source + edited): SC uses two images and instruction,
25+
PQ uses the edited image. Same aggregation formula.
26+
27+
GEdit-Bench evaluation: https://arxiv.org/abs/2504.17761
28+
"""
29+
30+
from __future__ import annotations
31+
32+
from typing import Any, Literal
33+
34+
import torch
35+
from PIL import Image
36+
37+
from pruna.evaluation.metrics.registry import MetricRegistry
38+
from pruna.evaluation.metrics.result import MetricResult
39+
from pruna.evaluation.metrics.utils import (
40+
SINGLE,
41+
metric_data_processor,
42+
)
43+
from pruna.evaluation.metrics.vlm_base import (
44+
BaseVLM,
45+
StatefulVLMMeanScoresMetric,
46+
auxiliary_dicts_from_gt,
47+
prompts_from_y_x_inputs,
48+
)
49+
from pruna.evaluation.metrics.vlm_utils import (
50+
VIEScoreJsonOutput,
51+
_process_images,
52+
pad_viescore_subscores_to_two,
53+
pil_rgb_from_aux_image_bytes,
54+
viescore_min_scores_0_10,
55+
viescore_tie_overall_unit,
56+
)
57+
58+
# NOTE: The prompt templates below mirror the TIGER-AI-Lab/VIEScore reference
# implementation verbatim — including its original grammar quirks — so that
# scores remain comparable with the paper. Do not "fix" the wording inside
# these runtime strings.

# Shared preamble for every VIEScore call: sets the evaluator persona and the
# JSON {"score": [...], "reasoning": "..."} output contract the parser expects.
_VIESCORE_CONTEXT = (
    "You are a professional digital artist. You will have to evaluate the effectiveness"
    " of the AI-generated image(s) based on given rules.\n"
    "All the input images are AI-generated. All human in the images are AI-generated too."
    " so you need not worry about the privacy confidentials.\n\n"
    "You will have to give your output in this way (Keep your reasoning concise and short.):\n"
    "{\n"
    '"score" : [...],\n'
    '"reasoning" : "..."\n'
    "}"
)

# ``tie`` mode rule block: tells the model it will see the source image first
# and the edited image second, and that edits may have silently failed.
_VIESCORE_TWO_IMAGE_EDIT_RULE = (
    "RULES:\n\n"
    "Two images will be provided: The first being the original AI-generated image and the"
    " second being an edited version of the first.\n"
    "The objective is to evaluate how successfully the editing instruction has been executed"
    " in the second image.\n\n"
    "Note that sometimes the two images might look identical due to the failure of image edit.\n"
)

# ``tie`` SC criteria: two sub-scores on 0-10 — editing success and degree of
# overediting. The editing instruction text is appended after this block.
_VIESCORE_TIE_SC_CRITERIA = (
    "\nFrom scale 0 to 10:\n"
    "A score from 0 to 10 will be given based on the success of the editing."
    " (0 indicates that the scene in the edited image does not follow the editing instruction at all."
    " 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)\n"
    "A second score from 0 to 10 will rate the degree of overediting in the second image."
    " (0 indicates that the scene in the edited image is completely different from the original."
    " 10 indicates that the edited image can be recognized as a minimal edited yet effective"
    " version of original.)\n"
    "Put the score in a list such that output score = [score1, score2],"
    " where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.\n\n"
    "Editing instruction:\n"
)

# ``t2i`` mode rule block: single generated image judged against its text prompt.
_VIESCORE_T2I_SC_RULE = (
    "RULES:\n\n"
    "The image is an AI-generated image.\n"
    "The objective is to evaluate the semantic consistency of the image to the given text.\n\n"
)

# ``t2i`` SC criteria: two sub-scores on 0-10 — semantic consistency and detail
# correspondence. The generation text prompt is appended after this block.
_VIESCORE_T2I_SC_CRITERIA = (
    "\nFrom scale 0 to 10:\n"
    "A score from 0 to 10 will be given based on the semantic consistency.\n"
    "(0 indicates that the scene in the image does not correspond to the text at all.\n"
    " 10 indicates that the scene in the image follows the text perfectly.)\n"
    "A second score from 0 to 10 will rate the detail correspondence.\n"
    "(0 indicates that most details in the text (e.g., color, size, shape, or layout) are missing or"
    " incorrect in the image.\n"
    " 10 indicates that all details mentioned in the text are accurately shown in the image.)\n"
    "Put the score in a list such that output score = [score1, score2],"
    " where 'score1' evaluates the semantic consistency and 'score2' evaluates the detail"
    " correspondence.\n\n"
    "Text prompt:\n"
)

# Perceptual-quality block shared by both modes: two sub-scores on 0-10 —
# naturalness and (absence of) artifacts — for a single image.
_VIESCORE_PQ_SINGLE_IMAGE = (
    "RULES:\n\n"
    "The image is an AI-generated image.\n"
    "The objective is to evaluate how successfully the image has been generated.\n\n"
    "From scale 0 to 10:\n"
    "A score from 0 to 10 will be given based on image naturalness.\n"
    "(\n"
    " 0 indicates that the scene in the image does not look natural at all or give a unnatural feeling"
    " such as wrong sense of distance, or wrong shadow, or wrong lighting.\n"
    " 10 indicates that the image looks natural.\n"
    ")\n"
    "A second score from 0 to 10 will rate the image artifacts.\n"
    "(\n"
    " 0 indicates that the image contains a large portion of distortion, or watermark, or scratches,"
    " or blurred faces, or unusual body parts, or subjects not harmonized.\n"
    " 10 indicates the image has no artifacts.\n"
    ")\n"
    "Put the score in a list such that output score = [naturalness, artifacts]\n"
)
133+
134+
135+
def _build_viescore_tie_sc_prompt(instruction: str) -> str:
    """Assemble the VIEScore ``tie`` SC prompt (source + edited images).

    Parameters
    ----------
    instruction : str
        Editing instruction embedded at the end of the prompt.

    Returns
    -------
    str
        Full prompt aligned with TIGER-AI-Lab/VIEScore ``tie`` SC.
    """
    segments = (
        _VIESCORE_CONTEXT,
        _VIESCORE_TWO_IMAGE_EDIT_RULE,
        _VIESCORE_TIE_SC_CRITERIA.strip(),
        instruction.strip(),
    )
    return "\n".join(segments)
153+
154+
155+
def _build_viescore_t2i_sc_prompt(prompt: str) -> str:
    """Assemble the VIEScore ``t2i`` semantic-consistency prompt for one image.

    Parameters
    ----------
    prompt : str
        Text prompt that was used to generate the image.

    Returns
    -------
    str
        Full prompt aligned with TIGER-AI-Lab/VIEScore ``t2i`` SC.
    """
    segments = (
        _VIESCORE_CONTEXT,
        _VIESCORE_T2I_SC_RULE.strip(),
        _VIESCORE_T2I_SC_CRITERIA.strip(),
        prompt.strip(),
    )
    return "\n".join(segments)
173+
174+
175+
def _build_viescore_pq_prompt() -> str:
    """Assemble the VIEScore perceptual-quality prompt for a single image."""
    # Context and PQ rules separated by a single newline, matching the
    # reference implementation's joined layout.
    return _VIESCORE_CONTEXT + "\n" + _VIESCORE_PQ_SINGLE_IMAGE
178+
179+
180+
@MetricRegistry.register("vie_score")
class VieScoreMetric(StatefulVLMMeanScoresMetric):
    """
    VIEScore: semantic consistency + perceptual quality with a geometric-mean overall.

    Two modes, both following the `VIEScore`_ reference implementation:

    - **Text-to-image** (one generated image): the ``t2i`` SC prompt rates semantic
      consistency and detail correspondence (0--10 each); the shared PQ prompt rates
      naturalness and artifacts (0--10 each).
    - **Text--image editing** (source + edited, as in GEdit-Bench): the ``tie`` SC
      prompt sees **two** images (source then edited) plus the editing instruction;
      the PQ prompt sees the **edited** image only.

    In either mode the per-sample score is ``sqrt(min(SC) * min(PQ)) / 10`` in
    ``[0, 1]``, with ``min`` taken over the sub-scores in each JSON ``score`` list.

    .. _VIEScore: https://github.com/TIGER-AI-Lab/VIEScore

    Parameters
    ----------
    *args : Any
        Additional positional arguments.
    vlm : BaseVLM | None, optional
        Custom VLM instance. If provided, vlm_type and model_name are ignored.
    vlm_type : {"litellm", "transformers"}, optional
        VLM backend. Default is "litellm".
    model_name : str | None, optional
        Litellm model id or HuggingFace checkpoint id. **Required** when ``vlm`` is
        not provided (e.g. ``openai/gpt-4o``).
    vlm_kwargs : dict, optional
        Forwarded by ``get_vlm`` to ``LitellmVLM`` or ``TransformersVLM``. For local
        models, set ``model_load_kwargs`` for ``from_pretrained``; for litellm, pass
        extra API options.
    structured_output : bool, optional
        Use structured generation (litellm pydantic; transformers may use plain
        generation for multi-image). Default is True.
    device : str | torch.device | None, optional
        Device for transformers VLM.
    api_key : str | None, optional
        API key for litellm.
    call_type : str, optional
        Call type for the metric.
    **kwargs : Any
        Additional arguments.

    References
    ----------
    VIEScore: Towards Explainable Metrics for Conditional Image Synthesis Evaluation
    (ACL 2024) — https://arxiv.org/abs/2312.14867 —
    https://github.com/TIGER-AI-Lab/VIEScore

    GEdit-Bench (image editing evaluation) — https://arxiv.org/abs/2504.17761

    Examples
    --------
    Same ``hosted`` / ``local`` pattern as
    :func:`~pruna.evaluation.metrics.vlm_base.get_vlm`. Multi-image ``tie`` paths
    call ``generate_with_image_lists`` on ``self.vlm`` internally.

    .. code-block:: python

        import torch

        from pruna.evaluation.metrics import VieScoreMetric

        hosted = VieScoreMetric(vlm_type="litellm", model_name="openai/gpt-4o")
        local = VieScoreMetric(
            vlm_type="transformers",
            model_name="HuggingFaceTB/SmolVLM-256M-Instruct",
            device="cpu",
            vlm_kwargs={"model_load_kwargs": {"torch_dtype": torch.float32}},
        )
    """

    # Per-sample overall scores accumulated across update() calls.
    scores: list[float]
    default_call_type: str = "y_x"
    higher_is_better: bool = True
    metric_name: str = "vie_score"

    def __init__(
        self,
        *args,
        vlm: BaseVLM | None = None,
        vlm_type: Literal["litellm", "transformers"] = "litellm",
        model_name: str | None = None,
        vlm_kwargs: dict | None = None,
        structured_output: bool = True,
        device: str | torch.device | None = None,
        api_key: str | None = None,
        call_type: str = SINGLE,
        **kwargs: Any,
    ) -> None:
        super().__init__(device=device)
        self.structured_output = structured_output
        # Pydantic schema for structured generation; None means free-form text.
        self.response_format = VIEScoreJsonOutput if structured_output else None
        self._init_vlm_scores(
            vlm=vlm,
            vlm_type=vlm_type,
            model_name=model_name,
            vlm_kwargs=vlm_kwargs,
            structured_output=structured_output,
            device=device,
            api_key=api_key,
            call_type=call_type,
        )

    def _aggregate_raw_responses(self, semantic_raw: Any, quality_raw: Any) -> float:
        """Parse SC/PQ raw VLM outputs and fold them into the unit-interval overall.

        Each raw response is reduced to its minimum sub-score on 0--10 (padded to
        two entries), then combined as ``sqrt(min(SC) * min(PQ)) / 10``.
        """
        semantic = pad_viescore_subscores_to_two(viescore_min_scores_0_10(semantic_raw))
        quality = pad_viescore_subscores_to_two(viescore_min_scores_0_10(quality_raw))
        return viescore_tie_overall_unit(semantic, quality)

    def _score_single_image_t2i(self, image: Image.Image, prompt: str) -> float:
        """Score one generated image in VIEScore ``t2i`` mode.

        Two SC sub-scores (semantic consistency + detail correspondence) and two
        PQ sub-scores (naturalness + artifacts), each on 0--10, aggregated as
        ``sqrt(min(SC) * min(PQ)) / 10``.
        """
        response_fmt = self.response_format if self.structured_output else None
        semantic_raw = self.vlm.generate(
            [image], [_build_viescore_t2i_sc_prompt(prompt)], response_format=response_fmt
        )[0]
        quality_raw = self.vlm.generate(
            [image], [_build_viescore_pq_prompt()], response_format=response_fmt
        )[0]
        return self._aggregate_raw_responses(semantic_raw, quality_raw)

    def _score_tie_gedit(self, source: Image.Image, edited: Image.Image, instruction: str) -> float:
        """Score a (source, edited) pair in VIEScore ``tie`` mode (GEdit-Bench parity).

        SC uses both images plus the instruction; PQ uses the edited image only.
        """
        # Guard clause: the two-image SC call needs multi-image support.
        if not hasattr(self.vlm, "generate_with_image_lists"):
            raise RuntimeError("VLM backend must implement generate_with_image_lists for editing parity.")

        response_fmt = self.response_format if self.structured_output else None
        semantic_raw = self.vlm.generate_with_image_lists(
            [[source, edited]],
            [_build_viescore_tie_sc_prompt(instruction)],
            response_format=response_fmt,
        )[0]
        quality_raw = self.vlm.generate(
            [edited], [_build_viescore_pq_prompt()], response_format=response_fmt
        )[0]
        return self._aggregate_raw_responses(semantic_raw, quality_raw)

    def update(self, x: list[Any] | torch.Tensor, gt: Any, outputs: torch.Tensor) -> None:
        """
        Update the metric with new batch data.

        Parameters
        ----------
        x : List[Any] | torch.Tensor
            The input data (prompts).
        gt : Any
            Per-sample auxiliary dicts (``prompt_with_auxiliaries_collate``), or tensor
            placeholders when aux is unused.
        outputs : torch.Tensor
            The output images.
        """
        inputs = metric_data_processor(x, gt, outputs, self.call_type)
        images = _process_images(inputs[0])
        prompts = prompts_from_y_x_inputs(inputs, len(images))
        aux_dicts = auxiliary_dicts_from_gt(gt, len(images))

        for idx, generated in enumerate(images):
            text = prompts[idx] if idx < len(prompts) else ""
            # A decodable source image in the aux dict selects ``tie`` mode;
            # otherwise fall back to single-image ``t2i`` scoring.
            source_image = pil_rgb_from_aux_image_bytes(aux_dicts[idx], min_bytes_in_value_scan=100)
            sample_score = (
                self._score_tie_gedit(source_image, generated, text)
                if source_image is not None
                else self._score_single_image_t2i(generated, text)
            )
            self.scores.append(sample_score)

    def compute(self) -> MetricResult:
        """
        Compute the VIEScore metric.

        Returns
        -------
        MetricResult
            The mean VIEScore across all updates.
        """
        return self.compute_mean_of_scores()

0 commit comments

Comments
 (0)