PrunaAI · davidberenstein1957 · Apr 28, 2026 · May 8, 2026 · Jun 2, 2026 · Jun 4, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -67,41 +67,33 @@ name = "pruna_internal"
 url = "https://prunaai.pythonanywhere.com/simple/"
 explicit = true
 
-[[tool.uv.index]]
-name = "intel-pytorch-extension"
-url = "https://pytorch-extension.intel.com/release-whl/stable/cpu/cn/"
-explicit = true
-
 [tool.uv]
 index-strategy = "first-index"
+exclude-newer = "1 week"  # protection against compromised dependencies
+# trusted dev wheels that are missing an upload date
+exclude-newer-package = { gptqmodel = false, "stable-fast-pruna" = false }
 
 conflicts = [
     [{ extra = "awq" }, { extra = "vbench" }],
     [{ extra = "vllm" }, { extra = "vbench" }],
-    [{ extra = "intel" }, { extra = "awq" }],
     [{ extra = "gptq" }, { extra = "awq" }],
-    # intel is incompatible with all stable-fast variants and vllm
-    [{ extra = "intel" }, { extra = "stable-fast" }, { extra = "stable-fast-extraindex" }],
-    [{ extra = "intel" }, { extra = "full" }, { extra = "stable-fast-extraindex" }],
-    [{ extra = "intel" }, { extra = "vllm" }],
     [{ extra = "kvpress" }, { extra = "vbench" }],
 ]
 
 [tool.uv.sources]
 gptqmodel = { index = "pruna_internal", marker = "sys_platform != 'darwin' or platform_machine != 'arm64'" }
-intel-extension-for-pytorch = { index = "intel-pytorch-extension" }
 stable-fast-pruna = { index = "pruna_internal", extra = "stable-fast-extraindex" }
 
 [project]
 name = "pruna"
-version = "0.3.2"
+version = "0.3.3"
 description = "Smash your AI models"
 authors = [
     {name = "Pruna AI", email = "hello@pruna.ai"}
 ]
 license = {file = "LICENSE"}
 readme = "README.md"
-requires-python = ">=3.10,<3.13"
+requires-python = ">=3.10,<3.14"
 keywords = ["AI", "machine learning", "model optimization", "pruning"]
 classifiers = [
     "Development Status :: 4 - Beta",
@@ -246,12 +238,6 @@ lmharness = [
     "lm-eval>=0.4.0"
 ]
 
-# Intel extension is tightly coupled with the torch version
-intel = [
-    "intel-extension-for-pytorch>=2.7.0",
-    "torch>=2.7.0,<2.9.0",
-    "torchvision>=0.22.0,<0.24.0",
-]
 kvpress = [
     "kvpress>=0.5.2",
 ]

diff --git a/src/pruna/evaluation/benchmarks.py b/src/pruna/evaluation/benchmarks.py
@@ -226,7 +226,7 @@ def list(cls, task_type: str | None = None) -> list[str]:
             "counting, colors, position, color attributes. Evaluates fine-grained alignment "
             "between prompts and generated images via VQA-style questions."
         ),
-        metrics=["clip_score"],  # §3.2: Mask2Former; not in Pruna
+        metrics=["qa_accuracy", "clip_score"],  # strict QA + CLIP score
         task_type="text_to_image",
         reference="https://arxiv.org/abs/2310.11513",
     ),

diff --git a/src/pruna/evaluation/metrics/__init__.py b/src/pruna/evaluation/metrics/__init__.py
@@ -23,6 +23,7 @@
 from pruna.evaluation.metrics.metric_memory import DiskMemoryMetric, InferenceMemoryMetric, TrainingMemoryMetric
 from pruna.evaluation.metrics.metric_model_architecture import TotalMACsMetric, TotalParamsMetric
 from pruna.evaluation.metrics.metric_pairwise_clip import PairwiseClipScore
+from pruna.evaluation.metrics.metric_qa_accuracy import QAAccuracyMetric
 from pruna.evaluation.metrics.metric_rapiddata import RapidataMetric as RapidataMetric
 from pruna.evaluation.metrics.metric_sharpness import SharpnessMetric
 from pruna.evaluation.metrics.metric_torch import TorchMetricWrapper
@@ -53,6 +54,7 @@
     "SharpnessMetric",
     "AestheticLAION",
     "LMEvalMetric",
+    "QAAccuracyMetric",
     "RapidataMetric",
     "BaseVLM",
     "LitellmVLM",

diff --git a/src/pruna/evaluation/metrics/metric_qa_accuracy.py b/src/pruna/evaluation/metrics/metric_qa_accuracy.py
@@ -0,0 +1,204 @@
+# Copyright 2025 - Pruna AI GmbH. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""QA Accuracy metric using VLM for image understanding evaluation."""
+
+from __future__ import annotations
+
+from typing import Any, Literal
+
+import numpy as np
+import torch
+
+from pruna.evaluation.metrics.registry import MetricRegistry
+from pruna.evaluation.metrics.result import MetricResult
+from pruna.evaluation.metrics.utils import (
+    SINGLE,
+    metric_data_processor,
+)
+from pruna.evaluation.metrics.vlm_base import BaseVLM, StatefulVLMMeanScoresMetric
+from pruna.evaluation.metrics.vlm_utils import VQAnswer, _process_images
+
+
+@MetricRegistry.register("qa_accuracy")
+class QAAccuracyMetric(StatefulVLMMeanScoresMetric):
+    """
+    QA Accuracy metric.
+
+    Uses a VLM to score yes/no alignment between each question and the generated image.
+    Higher scores indicate better image understanding.
+
+    **Multiple questions** come from each auxiliary dict's ``questions`` mapping (e.g. GenEval
+    atomic probes, OneIG items). Each question is scored independently via :meth:`BaseVLM.score`
+    with expected answer ``"Yes"``.
+
+    **Aggregation** (``aggregation`` kwarg):
+
+    - ``mean`` (default): per image, average VLM scores over all questions; the metric's
+      :meth:`compute` returns the mean of those per-image values across ``update`` calls.
+    - ``all_or_nothing``: per image, ``1.0`` only if **every** question scores strictly above
+      ``0.5`` (scores equal to ``0.5`` count as failure). This matches strict GenEval-style
+      reporting (all atomic checks must pass per sample; see `GenEval
+      <https://arxiv.org/abs/2310.11513>`_). :class:`~pruna.evaluation.task.Task` wires this for
+      the GenEval benchmark.
+
+    Parameters
+    ----------
+    vlm : BaseVLM | None, optional
+        Custom VLM instance. If provided, ``vlm_type`` and ``model_name`` are ignored.
+    vlm_type : {"litellm", "transformers"}, optional
+        VLM backend. Default is "litellm".
+    model_name : str | None, optional
+        Litellm model id or HuggingFace checkpoint id. **Required** when ``vlm`` is not
+        provided (e.g. ``openai/gpt-4o``).
+    vlm_kwargs : dict, optional
+        Forwarded by ``get_vlm`` to ``LitellmVLM`` or ``TransformersVLM``. For local models,
+        set ``model_load_kwargs`` for ``from_pretrained``; for litellm, pass extra API options.
+    structured_output : bool, optional
+        Use structured generation (litellm pydantic; transformers outlines when applicable).
+        Default is True.
+    device : str | torch.device | None, optional
+        Device for transformers VLM.
+    api_key : str | None, optional
+        API key for litellm.
+    call_type : str | None, optional
+        Call type for the metric. When ``None``, ``SINGLE`` is passed to ``_init_vlm_scores``.
+    aggregation : {"mean", "all_or_nothing"}, optional
+        Per-image score aggregation (keyword-only). Default is ``"mean"``.
+    **kwargs : Any
+        Accepted for call-site compatibility; this constructor does not read or forward them.
+
+    Raises
+    ------
+    ValueError
+        If ``aggregation`` is not ``"mean"`` or ``"all_or_nothing"``.
+
+    Examples
+    --------
+    Same ``hosted`` / ``local`` pattern as :func:`~pruna.evaluation.metrics.vlm_base.get_vlm`:
+
+    .. code-block:: python
+
+        import torch
+
+        from pruna.evaluation.metrics import QAAccuracyMetric
+
+        hosted = QAAccuracyMetric(vlm_type="litellm", model_name="openai/gpt-4o")
+        local = QAAccuracyMetric(
+            vlm_type="transformers",
+            model_name="HuggingFaceTB/SmolVLM-256M-Instruct",
+            device="cpu",
+            vlm_kwargs={"model_load_kwargs": {"torch_dtype": torch.float32}},
+        )
+    """
+
+    scores: list[float]
+    default_call_type: str = "y_gt"
+    higher_is_better: bool = True
+    metric_units: str = "accuracy"
+    metric_name: str = "qa_accuracy"
+
+    def __init__(
+        self,
+        vlm: BaseVLM | None = None,
+        vlm_type: Literal["litellm", "transformers"] = "litellm",
+        model_name: str | None = None,
+        vlm_kwargs: dict | None = None,
+        structured_output: bool = True,
+        device: str | torch.device | None = None,
+        api_key: str | None = None,
+        call_type: str | None = None,
+        *,
+        aggregation: str = "mean",
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(device=device)
+        self.response_format = VQAnswer if structured_output else None
+        self.aggregation = aggregation
+        if self.aggregation not in {"mean", "all_or_nothing"}:
+            raise ValueError(
+                f"qa_accuracy aggregation must be one of {{'mean', 'all_or_nothing'}}. Got: {self.aggregation!r}."
+            )
+        self._init_vlm_scores(
+            vlm=vlm,
+            vlm_type=vlm_type,
+            model_name=model_name,
+            vlm_kwargs=vlm_kwargs,
+            structured_output=structured_output,
+            device=device,
+            api_key=api_key,
+            call_type=call_type if call_type is not None else SINGLE,
+        )
+
+    def _extract_questions(self, gt: Any, n: int) -> list[list[str]]:
+        if isinstance(gt, (list, tuple)) and len(gt) >= n:
+            out = []
+            for i in range(n):
+                v = gt[i]
+                if isinstance(v, dict) and "questions" in v:  # only dict entries can carry questions
+                    qs = v["questions"]
+                    out.append(list(qs.values()) if isinstance(qs, dict) else list(qs))
+                else:
+                    out.append([])  # keeps the result aligned with the image index
+            return out
+        return [[] for _ in range(n)]  # gt is not a long-enough list/tuple: n empty lists
+
+    def update(self, x: list[Any] | torch.Tensor, gt: torch.Tensor, outputs: torch.Tensor) -> None:
+        """
+        Score each image in the batch against its questions and append one value per image.
+
+        Parameters
+        ----------
+        x : list[Any] | torch.Tensor
+            The input data.
+        gt : torch.Tensor
+            The ground truth (questions per image).
+        outputs : torch.Tensor
+            The output images.
+        """
+        inputs = metric_data_processor(x, gt, outputs, self.call_type)
+        images = _process_images(inputs[0])
+        auxiliaries = inputs[1] if len(inputs) > 1 else []
+        questions_per_image = self._extract_questions(auxiliaries, len(images))
+        for i, image in enumerate(images):
+            questions = questions_per_image[i] if i < len(questions_per_image) else []
+            if not questions:  # an image without questions cannot be scored: raise instead of skipping
+                aux = auxiliaries[i] if i < len(auxiliaries) else {}
+                raise ValueError(
+                    "qa_accuracy requires 'questions' in auxiliaries. "
+                    "Use a benchmark that provides it (e.g. GenEval, DPG, OneIG). "
+                    f"Got aux keys: {list(aux.keys()) if isinstance(aux, dict) else 'not a dict'}."
+                )
+            scores = self.vlm.score(
+                [image] * len(questions),
+                questions,
+                ["Yes"] * len(questions),
+                response_format=self.response_format,
+            )
+            if self.aggregation == "all_or_nothing":
+                score = 1.0 if all(s > 0.5 for s in scores) else 0.0  # strict: exactly 0.5 fails
+            else:
+                score = float(np.mean(scores))
+            self.scores.append(score)
+
+    def compute(self) -> MetricResult:
+        """
+        Compute the QA accuracy score.
+
+        Returns
+        -------
+        MetricResult
+            The mean QA accuracy across all updates.
+        """
+        return self.compute_mean_of_scores()
diff --git a/src/pruna/evaluation/metrics/metric_torch.py b/src/pruna/evaluation/metrics/metric_torch.py
@@ -50,6 +50,26 @@
 )
 from pruna.logging.logger import pruna_logger
 
+_PRUNA_TASK_ROUTING_KWARGS: tuple[str, ...] = (  # keys removed from kwargs by _strip_task_routing_kwargs
+    "vlm_type",
+    "model_name",
+    "structured_output",
+    "vlm_kwargs",
+    "api_key",
+)
+
+
+def _strip_task_routing_kwargs(kwargs: dict[str, Any]) -> None:
+    """
+    Remove, in place, the routing keys :class:`~pruna.evaluation.task.Task` adds for mixed metric lists.
+
+    Many torchmetrics constructors take ``**kwargs``, so these keys would be accepted silently
+    and only fail in a lower layer. Removing them here lets :class:`TorchMetricWrapper` act as
+    the one boundary between Pruna routing and torchmetrics constructors.
+    """
+    for routing_key in kwargs.keys() & set(_PRUNA_TASK_ROUTING_KWARGS):
+        del kwargs[routing_key]
+
 
 def default_update(metric: Metric, *args, **kwargs) -> None:
     """
@@ -124,9 +144,7 @@ def arniqa_update(metric: ARNIQA, preds: Any) -> None:
 
 
 def ssim_update(
-        metric: StructuralSimilarityIndexMeasure | MultiScaleStructuralSimilarityIndexMeasure,
-        preds: Any,
-        target: Any
+    metric: StructuralSimilarityIndexMeasure | MultiScaleStructuralSimilarityIndexMeasure, preds: Any, target: Any
 ) -> None:
     """
     Update handler for SSIM or MS-SSIM metric.
@@ -152,29 +170,22 @@ class TorchMetrics(Enum):
     """
     Enumeration of torchmetrics metrics for evaluation.
 
-    This enum provides a tuple per member (metric_factory, update_fn, call_type):
-    metric_factory builds the metric (typically a torchmetrics class, or
-    functools.partial when some constructor arguments are fixed); update_fn is
-    an optional custom update handler; call_type describes how inputs are paired
-    for the metric.
+    Each member value is a ``(metric_factory, update_fn, call_type)`` tuple.
 
     Parameters
     ----------
     value : tuple
-        Tuple holding metric_factory, update_fn, and call_type as described above.
+        ``(metric_factory, update_fn, call_type)`` for this enum member.
     names : str
-        The name of the enum member.
+        Enum member name.
     module : str
-        The module where the enum is defined.
+        Defining module name.
     qualname : str
-        The qualified name of the enum.
+        Qualified name of the enum class.
     type : type
-        The type of the enum.
+        Enum metaclass type.
     start : int
-        The start index for auto-numbering enum values.
-    boundary : enum.FlagBoundary or None
-        Boundary handling mode used by the Enum functional API for Flag and
-        IntFlag enums.
+        Auto-numbering start index for functional API enums.
     """
 
     fid = (FrechetInceptionDistance, fid_update, "gt_y")
@@ -246,6 +257,7 @@ def __new__(cls, metric_name: str, call_type: str = "", **kwargs) -> StatefulMet
         if metric_name == "clip_score" and call_type.startswith(PAIRWISE):
             from pruna.evaluation.metrics.metric_pairwise_clip import PairwiseClipScore
 
+            _strip_task_routing_kwargs(kwargs)
             return PairwiseClipScore(**kwargs)
         return super().__new__(cls)
 
@@ -259,6 +271,7 @@ def __init__(self, metric_name: str, call_type: str = "", **kwargs) -> None:
             If the metric name is not supported.
         """
         self.metric_name = metric_name
+        _strip_task_routing_kwargs(kwargs)
         super().__init__(kwargs.pop("device", None))
         try:
             self.metric = TorchMetrics[metric_name](**kwargs)

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,7 +1,14 @@
+import os
 from typing import Any
 
 import pytest
 
+if os.environ.get("PRUNA_CI_CPU_ONLY") == "1":  # opt-in: only patch when the variable is exactly "1"
+    import torch
+
+    if hasattr(torch.backends, "mps"):  # skip torch builds that do not expose an MPS backend module
+        torch.backends.mps.is_available = lambda: False  # type: ignore[method-assign]
+
 # import all fixtures to make them avaliable for pytest
 from .fixtures import *  # noqa: F403, F401