|
| 1 | +# Copyright 2025 - Pruna AI GmbH. All rights reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +""" |
| 16 | +VIEScore metric for conditional image synthesis (semantic + quality). |
| 17 | +
|
| 18 | +Reference: VIEScore (ACL 2024) — https://arxiv.org/abs/2312.14867 |
| 19 | +Both task modes follow `TIGER-AI-Lab/VIEScore`: |
| 20 | +
|
| 21 | +- ``t2i`` (text-to-image, single image): SC uses two sub-scores (semantic consistency + |
| 22 | + detail correspondence), PQ uses two sub-scores (naturalness + artifacts). Overall is |
| 23 | + ``sqrt(min(SC) * min(PQ)) / 10``. |
| 24 | +- ``tie`` (text-image editing, source + edited): SC uses two images and instruction, |
| 25 | + PQ uses the edited image. Same aggregation formula. |
| 26 | +
|
| 27 | +GEdit-Bench evaluation: https://arxiv.org/abs/2504.17761 |
| 28 | +""" |
| 29 | + |
| 30 | +from __future__ import annotations |
| 31 | + |
| 32 | +from typing import Any, Literal |
| 33 | + |
| 34 | +import torch |
| 35 | +from PIL import Image |
| 36 | + |
| 37 | +from pruna.evaluation.metrics.registry import MetricRegistry |
| 38 | +from pruna.evaluation.metrics.result import MetricResult |
| 39 | +from pruna.evaluation.metrics.utils import ( |
| 40 | + SINGLE, |
| 41 | + metric_data_processor, |
| 42 | +) |
| 43 | +from pruna.evaluation.metrics.vlm_base import ( |
| 44 | + BaseVLM, |
| 45 | + StatefulVLMMeanScoresMetric, |
| 46 | + auxiliary_dicts_from_gt, |
| 47 | + prompts_from_y_x_inputs, |
| 48 | +) |
| 49 | +from pruna.evaluation.metrics.vlm_utils import ( |
| 50 | + VIEScoreJsonOutput, |
| 51 | + _process_images, |
| 52 | + pad_viescore_subscores_to_two, |
| 53 | + pil_rgb_from_aux_image_bytes, |
| 54 | + viescore_min_scores_0_10, |
| 55 | + viescore_tie_overall_unit, |
| 56 | +) |
| 57 | + |
# Prompt fragments below are assembled by the _build_viescore_* helpers. Per the
# module docstring they follow TIGER-AI-Lab/VIEScore; the wording (including its
# grammar quirks, e.g. "All human", "privacy confidentials") is therefore kept
# verbatim — do not "fix" these strings, as score parity depends on the exact text.

# Shared system context: role assignment plus the JSON output schema
# ({"score": [...], "reasoning": "..."}) that the score parsers expect.
_VIESCORE_CONTEXT = (
    "You are a professional digital artist. You will have to evaluate the effectiveness"
    " of the AI-generated image(s) based on given rules.\n"
    "All the input images are AI-generated. All human in the images are AI-generated too."
    " so you need not worry about the privacy confidentials.\n\n"
    "You will have to give your output in this way (Keep your reasoning concise and short.):\n"
    "{\n"
    '"score" : [...],\n'
    '"reasoning" : "..."\n'
    "}"
)

# ``tie`` rule preamble: tells the model image 1 is the source and image 2 the edit.
_VIESCORE_TWO_IMAGE_EDIT_RULE = (
    "RULES:\n\n"
    "Two images will be provided: The first being the original AI-generated image and the"
    " second being an edited version of the first.\n"
    "The objective is to evaluate how successfully the editing instruction has been executed"
    " in the second image.\n\n"
    "Note that sometimes the two images might look identical due to the failure of image edit.\n"
)

# ``tie`` SC criteria: two 0-10 sub-scores (editing success, degree of overediting);
# the editing instruction text is appended after this fragment.
_VIESCORE_TIE_SC_CRITERIA = (
    "\nFrom scale 0 to 10:\n"
    "A score from 0 to 10 will be given based on the success of the editing."
    " (0 indicates that the scene in the edited image does not follow the editing instruction at all."
    " 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)\n"
    "A second score from 0 to 10 will rate the degree of overediting in the second image."
    " (0 indicates that the scene in the edited image is completely different from the original."
    " 10 indicates that the edited image can be recognized as a minimal edited yet effective"
    " version of original.)\n"
    "Put the score in a list such that output score = [score1, score2],"
    " where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.\n\n"
    "Editing instruction:\n"
)

# ``t2i`` rule preamble: single generated image scored against its text prompt.
_VIESCORE_T2I_SC_RULE = (
    "RULES:\n\n"
    "The image is an AI-generated image.\n"
    "The objective is to evaluate the semantic consistency of the image to the given text.\n\n"
)

# ``t2i`` SC criteria: two 0-10 sub-scores (semantic consistency, detail
# correspondence); the text prompt is appended after this fragment.
_VIESCORE_T2I_SC_CRITERIA = (
    "\nFrom scale 0 to 10:\n"
    "A score from 0 to 10 will be given based on the semantic consistency.\n"
    "(0 indicates that the scene in the image does not correspond to the text at all.\n"
    " 10 indicates that the scene in the image follows the text perfectly.)\n"
    "A second score from 0 to 10 will rate the detail correspondence.\n"
    "(0 indicates that most details in the text (e.g., color, size, shape, or layout) are missing or"
    " incorrect in the image.\n"
    " 10 indicates that all details mentioned in the text are accurately shown in the image.)\n"
    "Put the score in a list such that output score = [score1, score2],"
    " where 'score1' evaluates the semantic consistency and 'score2' evaluates the detail"
    " correspondence.\n\n"
    "Text prompt:\n"
)

# Perceptual-quality criteria, shared by both modes: two 0-10 sub-scores
# (naturalness, artifacts) judged on a single image.
_VIESCORE_PQ_SINGLE_IMAGE = (
    "RULES:\n\n"
    "The image is an AI-generated image.\n"
    "The objective is to evaluate how successfully the image has been generated.\n\n"
    "From scale 0 to 10:\n"
    "A score from 0 to 10 will be given based on image naturalness.\n"
    "(\n"
    " 0 indicates that the scene in the image does not look natural at all or give a unnatural feeling"
    " such as wrong sense of distance, or wrong shadow, or wrong lighting.\n"
    " 10 indicates that the image looks natural.\n"
    ")\n"
    "A second score from 0 to 10 will rate the image artifacts.\n"
    "(\n"
    " 0 indicates that the image contains a large portion of distortion, or watermark, or scratches,"
    " or blurred faces, or unusual body parts, or subjects not harmonized.\n"
    " 10 indicates the image has no artifacts.\n"
    ")\n"
    "Put the score in a list such that output score = [naturalness, artifacts]\n"
)
| 133 | + |
| 134 | + |
def _build_viescore_tie_sc_prompt(instruction: str) -> str:
    """Build the VIEScore ``tie`` semantic-criteria prompt (source + edited images).

    Concatenates the shared context, the two-image editing rule, the ``tie``
    semantic criteria, and the stripped editing instruction, one per line.

    Parameters
    ----------
    instruction : str
        Editing instruction embedded at the end of the prompt.

    Returns
    -------
    str
        Full prompt aligned with TIGER-AI-Lab/VIEScore ``tie`` SC.
    """
    # NumPy-style docstring above (was a mixed Google/NumPy hybrid), matching
    # the convention used by VieScoreMetric in this module.
    return "\n".join(
        [
            _VIESCORE_CONTEXT,
            _VIESCORE_TWO_IMAGE_EDIT_RULE,
            _VIESCORE_TIE_SC_CRITERIA.strip(),
            instruction.strip(),
        ]
    )
| 153 | + |
| 154 | + |
def _build_viescore_t2i_sc_prompt(prompt: str) -> str:
    """Build the VIEScore ``t2i`` semantic-consistency prompt for one generated image.

    Concatenates the shared context, the stripped single-image rule, the ``t2i``
    semantic criteria, and the stripped text prompt, one per line.

    Parameters
    ----------
    prompt : str
        Text prompt used to generate the image; embedded at the end of the prompt.

    Returns
    -------
    str
        Full prompt aligned with TIGER-AI-Lab/VIEScore ``t2i`` SC.
    """
    # NumPy-style docstring above (was a mixed Google/NumPy hybrid), matching
    # the convention used by VieScoreMetric in this module.
    return "\n".join(
        [
            _VIESCORE_CONTEXT,
            _VIESCORE_T2I_SC_RULE.strip(),
            _VIESCORE_T2I_SC_CRITERIA.strip(),
            prompt.strip(),
        ]
    )
| 173 | + |
| 174 | + |
def _build_viescore_pq_prompt() -> str:
    """Build the VIEScore perceptual-quality prompt for one image (SC or edited).

    Returns
    -------
    str
        Shared context followed by the single-image PQ criteria, newline-separated.
    """
    return f"{_VIESCORE_CONTEXT}\n{_VIESCORE_PQ_SINGLE_IMAGE}"
| 178 | + |
| 179 | + |
@MetricRegistry.register("vie_score")
class VieScoreMetric(StatefulVLMMeanScoresMetric):
    """
    VIEScore metric: semantic consistency and perceptual quality, combined geometrically.

    **Text-to-image** (one generated image): the VIEScore ``t2i`` SC prompt yields two
    0--10 sub-scores (semantic consistency, detail correspondence) and the shared PQ
    prompt yields two more (naturalness, artifacts). The per-sample score is
    ``sqrt(min(SC) * min(PQ)) / 10``, lying in ``[0, 1]``.

    **Text--image editing** (a source image is found in the auxiliaries): follows the
    VIEScore ``tie`` protocol used by GEdit-Bench — semantic criteria see both images
    (source first, then edited) together with the editing instruction, while perceptual
    criteria see only the edited image. Aggregation is identical, with ``min`` taken
    over the sub-scores of each JSON ``score`` list, consistent with `VIEScore`_.

    .. _VIEScore: https://github.com/TIGER-AI-Lab/VIEScore

    Parameters
    ----------
    *args : Any
        Additional positional arguments.
    vlm : BaseVLM | None, optional
        Custom VLM instance. If provided, vlm_type and model_name are ignored.
    vlm_type : {"litellm", "transformers"}, optional
        VLM backend. Default is "litellm".
    model_name : str | None, optional
        Litellm model id or HuggingFace checkpoint id. **Required** when ``vlm`` is not
        provided (e.g. ``openai/gpt-4o``).
    vlm_kwargs : dict, optional
        Forwarded by ``get_vlm`` to ``LitellmVLM`` or ``TransformersVLM``. For local
        models, set ``model_load_kwargs`` for ``from_pretrained``; for litellm, pass
        extra API options.
    structured_output : bool, optional
        Use structured generation (litellm pydantic; transformers may use plain
        generation for multi-image). Default is True.
    device : str | torch.device | None, optional
        Device for transformers VLM.
    api_key : str | None, optional
        API key for litellm.
    call_type : str, optional
        Call type for the metric.
    **kwargs : Any
        Additional arguments.

    References
    ----------
    VIEScore: Towards Explainable Metrics for Conditional Image Synthesis Evaluation (ACL 2024)
    https://arxiv.org/abs/2312.14867
    https://github.com/TIGER-AI-Lab/VIEScore

    GEdit-Bench (image editing evaluation)
    https://arxiv.org/abs/2504.17761

    Examples
    --------
    Same ``hosted`` / ``local`` pattern as :func:`~pruna.evaluation.metrics.vlm_base.get_vlm`.
    Multi-image ``tie`` paths call ``generate_with_image_lists`` on ``self.vlm`` internally.

    .. code-block:: python

        import torch

        from pruna.evaluation.metrics import VieScoreMetric

        hosted = VieScoreMetric(vlm_type="litellm", model_name="openai/gpt-4o")
        local = VieScoreMetric(
            vlm_type="transformers",
            model_name="HuggingFaceTB/SmolVLM-256M-Instruct",
            device="cpu",
            vlm_kwargs={"model_load_kwargs": {"torch_dtype": torch.float32}},
        )
    """

    # Per-sample overall scores in [0, 1]; averaged by compute().
    scores: list[float]
    default_call_type: str = "y_x"
    higher_is_better: bool = True
    metric_name: str = "vie_score"

    def __init__(
        self,
        *args,
        vlm: BaseVLM | None = None,
        vlm_type: Literal["litellm", "transformers"] = "litellm",
        model_name: str | None = None,
        vlm_kwargs: dict | None = None,
        structured_output: bool = True,
        device: str | torch.device | None = None,
        api_key: str | None = None,
        call_type: str = SINGLE,
        **kwargs: Any,
    ) -> None:
        super().__init__(device=device)
        self.structured_output = structured_output
        # Structured decoding targets the VIEScore JSON schema; plain generation
        # passes no response format at all.
        self.response_format = VIEScoreJsonOutput if structured_output else None

        self._init_vlm_scores(
            vlm=vlm,
            vlm_type=vlm_type,
            model_name=model_name,
            vlm_kwargs=vlm_kwargs,
            structured_output=structured_output,
            device=device,
            api_key=api_key,
            call_type=call_type,
        )

    @staticmethod
    def _parse_subscores(raw: Any) -> list[float]:
        """Extract 0--10 sub-scores from a raw VLM response, padded to length two."""
        return pad_viescore_subscores_to_two(viescore_min_scores_0_10(raw))

    def _score_single_image_t2i(self, image: Image.Image, prompt: str) -> float:
        """VIEScore ``t2i``: single-image SC (semantic + detail) and PQ (naturalness + artifacts).

        Matches the VIEScore paper's t2i evaluation: two SC sub-scores on 0--10 and two PQ
        sub-scores on 0--10, aggregated as ``sqrt(min(SC) * min(PQ)) / 10``.
        """
        fmt = self.response_format if self.structured_output else None

        raw_sc = self.vlm.generate([image], [_build_viescore_t2i_sc_prompt(prompt)], response_format=fmt)[0]
        raw_pq = self.vlm.generate([image], [_build_viescore_pq_prompt()], response_format=fmt)[0]

        return viescore_tie_overall_unit(self._parse_subscores(raw_sc), self._parse_subscores(raw_pq))

    def _score_tie_gedit(self, source: Image.Image, edited: Image.Image, instruction: str) -> float:
        """VIEScore ``tie``: two-image SC, single-image PQ, overall geometric mean on 0--10 mins."""
        # Guard clause: the two-image SC call needs multi-image support on the backend.
        if not hasattr(self.vlm, "generate_with_image_lists"):
            raise RuntimeError("VLM backend must implement generate_with_image_lists for editing parity.")

        fmt = self.response_format if self.structured_output else None

        # SC sees source then edited, per the tie prompt's image-order convention.
        raw_sc = self.vlm.generate_with_image_lists(
            [[source, edited]],
            [_build_viescore_tie_sc_prompt(instruction)],
            response_format=fmt,
        )[0]
        # PQ is judged on the edited image alone.
        raw_pq = self.vlm.generate([edited], [_build_viescore_pq_prompt()], response_format=fmt)[0]

        return viescore_tie_overall_unit(self._parse_subscores(raw_sc), self._parse_subscores(raw_pq))

    def update(self, x: list[Any] | torch.Tensor, gt: Any, outputs: torch.Tensor) -> None:
        """
        Update the metric with new batch data.

        Parameters
        ----------
        x : List[Any] | torch.Tensor
            The input data (prompts).
        gt : Any
            Per-sample auxiliary dicts (``prompt_with_auxiliaries_collate``), or tensor
            placeholders when aux is unused.
        outputs : torch.Tensor
            The output images.
        """
        batch = metric_data_processor(x, gt, outputs, self.call_type)
        images = _process_images(batch[0])
        prompts = prompts_from_y_x_inputs(batch, len(images))
        aux_dicts = auxiliary_dicts_from_gt(gt, len(images))

        for idx, img in enumerate(images):
            text = prompts[idx] if idx < len(prompts) else ""
            # A decodable source image in the auxiliaries selects the editing (tie) path.
            src = pil_rgb_from_aux_image_bytes(aux_dicts[idx], min_bytes_in_value_scan=100)
            score = (
                self._score_tie_gedit(src, img, text)
                if src is not None
                else self._score_single_image_t2i(img, text)
            )
            self.scores.append(score)

    def compute(self) -> MetricResult:
        """
        Compute the VIEScore metric.

        Returns
        -------
        MetricResult
            The mean VIEScore across all updates.
        """
        return self.compute_mean_of_scores()
0 commit comments