diff --git a/src/pruna/evaluation/benchmarks.py b/src/pruna/evaluation/benchmarks.py
index 4f1b9625..40ed999c 100644
--- a/src/pruna/evaluation/benchmarks.py
+++ b/src/pruna/evaluation/benchmarks.py
@@ -267,8 +267,8 @@ def list(cls, task_type: str | None = None) -> list[str]:
                 "material alter, motion change, style change, subject add/remove/replace, text change, "
                 "tone transfer, and human retouching."
             ),
-            metrics=[],  # Paper uses VIEScore; not in Pruna
-            task_type="text_to_image",
+            metrics=["vie_score"],
+            task_type="text+image_image",
             reference="https://arxiv.org/abs/2504.17761",
         ),
         Benchmark(
diff --git a/src/pruna/evaluation/metrics/metric_vie_score.py b/src/pruna/evaluation/metrics/metric_vie_score.py
new file mode 100644
index 00000000..836c5884
--- /dev/null
+++ b/src/pruna/evaluation/metrics/metric_vie_score.py
@@ -0,0 +1,363 @@
+# Copyright 2025 - Pruna AI GmbH. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+VIEScore metric for conditional image synthesis (semantic + quality).
+
+Reference: VIEScore (ACL 2024) — https://arxiv.org/abs/2312.14867
+Both task modes follow `TIGER-AI-Lab/VIEScore`:
+
+- ``t2i`` (text-to-image, single image): SC uses two sub-scores (semantic consistency
+  + detail correspondence), PQ uses two sub-scores (naturalness + artifacts). Overall is
+  ``sqrt(min(SC) * min(PQ)) / 10``.
+- ``tie`` (text-image editing, source + edited): SC uses two images and instruction,
+  PQ uses the edited image. Same aggregation formula.
+
+GEdit-Bench evaluation: https://arxiv.org/abs/2504.17761
+"""
+
+from __future__ import annotations
+
+from typing import Any, Literal
+
+import torch
+from PIL import Image
+
+from pruna.evaluation.metrics.registry import MetricRegistry
+from pruna.evaluation.metrics.result import MetricResult
+from pruna.evaluation.metrics.utils import (
+    SINGLE,
+    metric_data_processor,
+)
+from pruna.evaluation.metrics.vlm_base import (
+    BaseVLM,
+    StatefulVLMMeanScoresMetric,
+    auxiliary_dicts_from_gt,
+    prompts_from_y_x_inputs,
+)
+from pruna.evaluation.metrics.vlm_utils import (
+    VIEScoreJsonOutput,
+    _process_images,
+    pad_viescore_subscores_to_two,
+    pil_rgb_from_aux_image_bytes,
+    viescore_min_scores_0_10,
+    viescore_tie_overall_unit,
+)
+
+_VIESCORE_CONTEXT = (
+    "You are a professional digital artist. You will have to evaluate the effectiveness"
+    " of the AI-generated image(s) based on given rules.\n"
+    "All the input images are AI-generated. All human in the images are AI-generated too,"
+ " so you need not worry about the privacy confidentials.\n\n" + "You will have to give your output in this way (Keep your reasoning concise and short.):\n" + "{\n" + '"score" : [...],\n' + '"reasoning" : "..."\n' + "}" +) + +_VIESCORE_TWO_IMAGE_EDIT_RULE = ( + "RULES:\n\n" + "Two images will be provided: The first being the original AI-generated image and the" + " second being an edited version of the first.\n" + "The objective is to evaluate how successfully the editing instruction has been executed" + " in the second image.\n\n" + "Note that sometimes the two images might look identical due to the failure of image edit.\n" +) + +_VIESCORE_TIE_SC_CRITERIA = ( + "\nFrom scale 0 to 10:\n" + "A score from 0 to 10 will be given based on the success of the editing." + " (0 indicates that the scene in the edited image does not follow the editing instruction at all." + " 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)\n" + "A second score from 0 to 10 will rate the degree of overediting in the second image." + " (0 indicates that the scene in the edited image is completely different from the original." + " 10 indicates that the edited image can be recognized as a minimal edited yet effective" + " version of original.)\n" + "Put the score in a list such that output score = [score1, score2]," + " where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.\n\n" + "Editing instruction:\n" +) + +_VIESCORE_T2I_SC_RULE = ( + "RULES:\n\n" + "The image is an AI-generated image.\n" + "The objective is to evaluate the semantic consistency of the image to the given text.\n\n" +) + +_VIESCORE_T2I_SC_CRITERIA = ( + "\nFrom scale 0 to 10:\n" + "A score from 0 to 10 will be given based on the semantic consistency.\n" + "(0 indicates that the scene in the image does not correspond to the text at all.\n" + " 10 indicates that the scene in the image follows the text perfectly.)\n" + "A second score from 0 to 10 will rate the detail correspondence.\n" + "(0 indicates that most details in the text (e.g., color, size, shape, or layout) are missing or" + " incorrect in the image.\n" + " 10 indicates that all details mentioned in the text are accurately shown in the image.)\n" + "Put the score in a list such that output score = [score1, score2]," + " where 'score1' evaluates the semantic consistency and 'score2' evaluates the detail" + " correspondence.\n\n" + "Text prompt:\n" +) + +_VIESCORE_PQ_SINGLE_IMAGE = ( + "RULES:\n\n" + "The image is an AI-generated image.\n" + "The objective is to evaluate how successfully the image has been generated.\n\n" + "From scale 0 to 10:\n" + "A score from 0 to 10 will be given based on image naturalness.\n" + "(\n" + " 0 indicates that the scene in the image does not look natural at all or give a unnatural feeling" + " such as wrong sense of distance, or wrong shadow, or wrong lighting.\n" + " 10 indicates that the image looks natural.\n" + ")\n" + "A second score from 0 to 10 will rate the image artifacts.\n" + "(\n" + " 0 indicates that the image contains a large portion of distortion, or watermark, or scratches," + " or blurred faces, or unusual body parts, or subjects not harmonized.\n" + " 10 indicates the image has no artifacts.\n" + ")\n" + "Put the score in a list such that output score = [naturalness, artifacts]\n" +) + + +def _build_viescore_tie_sc_prompt(instruction: str) -> str: + """Build the VIEScore ``tie`` semantic-criteria prompt (source + edited images). 
+
+    Parameters
+    ----------
+    instruction : str
+        Editing instruction embedded in the prompt.
+
+    Returns
+    -------
+    str
+        Full prompt aligned with TIGER-AI-Lab/VIEScore ``tie`` SC.
+    """
+    return "\n".join(
+        [
+            _VIESCORE_CONTEXT,
+            _VIESCORE_TWO_IMAGE_EDIT_RULE,
+            _VIESCORE_TIE_SC_CRITERIA.strip(),
+            instruction.strip(),
+        ]
+    )
+
+
+def _build_viescore_t2i_sc_prompt(prompt: str) -> str:
+    """Build the VIEScore ``t2i`` semantic-consistency prompt for one generated image.
+
+    Parameters
+    ----------
+    prompt : str
+        Text prompt used to generate the image.
+
+    Returns
+    -------
+    str
+        Full prompt aligned with TIGER-AI-Lab/VIEScore ``t2i`` SC.
+    """
+    return "\n".join(
+        [
+            _VIESCORE_CONTEXT,
+            _VIESCORE_T2I_SC_RULE.strip(),
+            _VIESCORE_T2I_SC_CRITERIA.strip(),
+            prompt.strip(),
+        ]
+    )
+
+
+def _build_viescore_pq_prompt() -> str:
+    """Build the VIEScore perceptual-quality prompt for one image (SC or edited)."""
+    return "\n".join([_VIESCORE_CONTEXT, _VIESCORE_PQ_SINGLE_IMAGE])
+
+
+@MetricRegistry.register("vie_score")
+class VieScoreMetric(StatefulVLMMeanScoresMetric):
+    """
+    VIEScore: semantic + perceptual quality with geometric-mean overall.
+
+    **Text-to-image (one generated image):** uses the VIEScore ``t2i`` SC prompt (semantic
+    consistency + detail correspondence, 0--10 each) and the shared PQ prompt (naturalness +
+    artifacts, 0--10 each). Overall is ``sqrt(min(SC) * min(PQ)) / 10`` in ``[0, 1]``.
+
+    **Text--image editing (source + edited available):** matches the VIEScore ``tie`` setup
+    used in GEdit-Bench: semantic criteria use **two** images (source then edited) and the
+    editing instruction; perceptual criteria use the **edited** image only. Overall is
+    ``sqrt(min(SC) * min(PQ)) / 10`` in ``[0, 1]``, with ``min`` taken over the sub-scores in
+    each JSON ``score`` list, consistent with `VIEScore`_.
+
+    .. _VIEScore: https://github.com/TIGER-AI-Lab/VIEScore
+
+    Parameters
+    ----------
+    *args : Any
+        Additional positional arguments.
+    vlm : BaseVLM | None, optional
+        Custom VLM instance. If provided, ``vlm_type`` and ``model_name`` are ignored.
+    vlm_type : {"litellm", "transformers"}, optional
+        VLM backend. Default is ``"litellm"``.
+    model_name : str | None, optional
+        Litellm model id or HuggingFace checkpoint id (e.g. ``openai/gpt-4o``). **Required**
+        when ``vlm`` is not provided.
+    vlm_kwargs : dict, optional
+        Forwarded by ``get_vlm`` to ``LitellmVLM`` or ``TransformersVLM``. For local models,
+        set ``model_load_kwargs`` for ``from_pretrained``; for litellm, pass extra API options.
+    structured_output : bool, optional
+        Use structured generation (litellm pydantic; transformers may use plain generation for
+        multi-image). Default is True.
+    device : str | torch.device | None, optional
+        Device for the transformers VLM.
+    api_key : str | None, optional
+        API key for litellm.
+    call_type : str, optional
+        Call type for the metric.
+    **kwargs : Any
+        Additional arguments.
+
+    References
+    ----------
+    VIEScore: Towards Explainable Metrics for Conditional Image Synthesis Evaluation (ACL 2024)
+    https://arxiv.org/abs/2312.14867
+    https://github.com/TIGER-AI-Lab/VIEScore
+
+    GEdit-Bench (image editing evaluation)
+    https://arxiv.org/abs/2504.17761
+
+    Examples
+    --------
+    Same ``hosted`` / ``local`` pattern as :func:`~pruna.evaluation.metrics.vlm_base.get_vlm`.
+    Multi-image ``tie`` paths call ``generate_with_image_lists`` on ``self.vlm`` internally.
+
+    .. code-block:: python
+
+        import torch
+
+        from pruna.evaluation.metrics import VieScoreMetric
+
+        hosted = VieScoreMetric(vlm_type="litellm", model_name="openai/gpt-4o")
+        local = VieScoreMetric(
+            vlm_type="transformers",
+            model_name="HuggingFaceTB/SmolVLM-256M-Instruct",
+            device="cpu",
+            vlm_kwargs={"model_load_kwargs": {"torch_dtype": torch.float32}},
+        )
+    """
+
+    scores: list[float]
+    default_call_type: str = "y_x"
+    higher_is_better: bool = True
+    metric_name: str = "vie_score"
+
+    def __init__(
+        self,
+        *args,
+        vlm: BaseVLM | None = None,
+        vlm_type: Literal["litellm", "transformers"] = "litellm",
+        model_name: str | None = None,
+        vlm_kwargs: dict | None = None,
+        structured_output: bool = True,
+        device: str | torch.device | None = None,
+        api_key: str | None = None,
+        call_type: str = SINGLE,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(device=device)
+        self.structured_output = structured_output
+        self.response_format = VIEScoreJsonOutput if structured_output else None
+
+        self._init_vlm_scores(
+            vlm=vlm,
+            vlm_type=vlm_type,
+            model_name=model_name,
+            vlm_kwargs=vlm_kwargs,
+            structured_output=structured_output,
+            device=device,
+            api_key=api_key,
+            call_type=call_type,
+        )
+
+    def _score_single_image_t2i(self, image: Image.Image, prompt: str) -> float:
+        """VIEScore ``t2i``: single-image SC (semantic + detail) and PQ (naturalness + artifacts).
+
+        Matches the VIEScore paper's t2i evaluation: two SC sub-scores on 0--10 and two PQ
+        sub-scores on 0--10, aggregated as ``sqrt(min(SC) * min(PQ)) / 10``.
+        """
+        sc_prompt = _build_viescore_t2i_sc_prompt(prompt)
+        pq_prompt = _build_viescore_pq_prompt()
+
+        rf = self.response_format if self.structured_output else None
+
+        sc_raw = self.vlm.generate([image], [sc_prompt], response_format=rf)[0]
+        pq_raw = self.vlm.generate([image], [pq_prompt], response_format=rf)[0]
+
+        sc_list = pad_viescore_subscores_to_two(viescore_min_scores_0_10(sc_raw))
+        pq_list = pad_viescore_subscores_to_two(viescore_min_scores_0_10(pq_raw))
+        return viescore_tie_overall_unit(sc_list, pq_list)
+
+    def _score_tie_gedit(self, source: Image.Image, edited: Image.Image, instruction: str) -> float:
+        """VIEScore ``tie``: two-image SC, single-image PQ, overall geometric mean on 0--10 mins."""
+        sc_prompt = _build_viescore_tie_sc_prompt(instruction)
+        pq_prompt = _build_viescore_pq_prompt()
+
+        rf = self.response_format if self.structured_output else None
+
+        if hasattr(self.vlm, "generate_with_image_lists"):
+            sc_raw = self.vlm.generate_with_image_lists(
+                [[source, edited]],
+                [sc_prompt],
+                response_format=rf,
+            )[0]
+        else:
+            raise RuntimeError("VLM backend must implement generate_with_image_lists for editing parity.")
+
+        pq_raw = self.vlm.generate([edited], [pq_prompt], response_format=rf)[0]
+
+        sc_list = pad_viescore_subscores_to_two(viescore_min_scores_0_10(sc_raw))
+        pq_list = pad_viescore_subscores_to_two(viescore_min_scores_0_10(pq_raw))
+        return viescore_tie_overall_unit(sc_list, pq_list)
+
+    def update(self, x: list[Any] | torch.Tensor, gt: Any, outputs: torch.Tensor) -> None:
+        """
+        Update the metric with new batch data.
+
+        Parameters
+        ----------
+        x : list[Any] | torch.Tensor
+            The input data (prompts).
+        gt : Any
+            Per-sample auxiliary dicts (``prompt_with_auxiliaries_collate``), or tensor placeholders
+            when aux is unused.
+        outputs : torch.Tensor
+            The output images.
+        """
+        inputs = metric_data_processor(x, gt, outputs, self.call_type)
+        images = _process_images(inputs[0])
+        prompts = prompts_from_y_x_inputs(inputs, len(images))
+        aux_list = auxiliary_dicts_from_gt(gt, len(images))
+
+        for i, image in enumerate(images):
+            prompt = prompts[i] if i < len(prompts) else ""
+            aux = aux_list[i]
+            source = pil_rgb_from_aux_image_bytes(aux, min_bytes_in_value_scan=100)
+
+            if source is not None:
+                self.scores.append(self._score_tie_gedit(source, image, prompt))
+            else:
+                self.scores.append(self._score_single_image_t2i(image, prompt))
+
+    def compute(self) -> MetricResult:
+        """
+        Compute the VIEScore metric.
+
+        Returns
+        -------
+        MetricResult
+            The mean VIEScore across all updates.
+        """
+        return self.compute_mean_of_scores()
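The aggregation used throughout the new metric can be checked by hand. A minimal sketch of the formula stated in the docstrings, ``sqrt(min(SC) * min(PQ)) / 10``; the standalone helper and the sample sub-scores below are illustrative only and are not part of the diff (the actual computation lives in ``viescore_tie_overall_unit``):

    import math

    def viescore_overall(sc_scores: list[float], pq_scores: list[float]) -> float:
        # Take the minimum sub-score of each group (0-10), geometric-mean them, rescale to [0, 1].
        return math.sqrt(min(sc_scores) * min(pq_scores)) / 10.0

    # Hypothetical sub-scores: SC = [8, 7] (success, overediting), PQ = [9, 6] (naturalness, artifacts).
    print(round(viescore_overall([8.0, 7.0], [9.0, 6.0]), 3))  # sqrt(7 * 6) / 10 -> 0.648

With both minima equal to 8, as in the mocked responses of the new unit test below, the overall is sqrt(64) / 10 = 0.8, the value the test asserts.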
+ """ + inputs = metric_data_processor(x, gt, outputs, self.call_type) + images = _process_images(inputs[0]) + prompts = prompts_from_y_x_inputs(inputs, len(images)) + aux_list = auxiliary_dicts_from_gt(gt, len(images)) + + for i, image in enumerate(images): + prompt = prompts[i] if i < len(prompts) else "" + aux = aux_list[i] + source = pil_rgb_from_aux_image_bytes(aux, min_bytes_in_value_scan=100) + + if source is not None: + self.scores.append(self._score_tie_gedit(source, image, prompt)) + else: + self.scores.append(self._score_single_image_t2i(image, prompt)) + + def compute(self) -> MetricResult: + """ + Compute the VIEScore metric. + + Returns + ------- + MetricResult + The mean VIEScore across all updates. + """ + return self.compute_mean_of_scores() diff --git a/tests/evaluation/test_vision_metrics.py b/tests/evaluation/test_vision_metrics.py index bba4b227..e7284eee 100644 --- a/tests/evaluation/test_vision_metrics.py +++ b/tests/evaluation/test_vision_metrics.py @@ -5,10 +5,15 @@ import pytest import torch +from pruna.evaluation.metrics.metric_vie_score import VieScoreMetric from pruna.evaluation.metrics.metric_vqa import VQAMetric from pruna.evaluation.metrics.vlm_base import BaseVLM +def _dummy_image(batch: int = 1, size: int = 64) -> torch.Tensor: + return torch.rand(batch, 3, size, size) + + @pytest.mark.cpu def test_vqa_uses_prompt_question_and_scores_yes_probability() -> None: """VQA asks prompt-grounded yes/no question and stores returned score.""" @@ -16,7 +21,7 @@ def test_vqa_uses_prompt_question_and_scores_yes_probability() -> None: mock_vlm.score.return_value = [0.7] metric = VQAMetric(vlm=mock_vlm, vlm_type="litellm", device="cpu", use_probability=True) - images = torch.rand(1, 3, 64, 64) + images = _dummy_image() metric.update(["a cat"], images, images) result = metric.compute() @@ -24,3 +29,16 @@ def test_vqa_uses_prompt_question_and_scores_yes_probability() -> None: assert result.result == 0.7 call = mock_vlm.score.call_args assert call[0][1] == ['Does this image show "a cat"?'] + + +@pytest.mark.cpu +def test_vie_score_uses_json_score_lists() -> None: + """VieScoreMetric parses JSON score lists and returns normalized value.""" + mock_vlm = MagicMock(spec=BaseVLM) + mock_vlm.generate.return_value = ['{"score": [8.0, 8.0], "reasoning": ""}'] + + metric = VieScoreMetric(vlm=mock_vlm, device="cpu", structured_output=True) + metric.update(["a cat on a sofa"], _dummy_image(), _dummy_image()) + result = metric.compute() + + assert abs(result.result - 0.8) < 0.01