4 changes: 2 additions & 2 deletions src/pruna/evaluation/benchmarks.py
@@ -267,8 +267,8 @@ def list(cls, task_type: str | None = None) -> list[str]:
"material alter, motion change, style change, subject add/remove/replace, text change, "
"tone transfer, and human retouching."
),
metrics=[], # Paper uses VIEScore; not in Pruna
task_type="text_to_image",
metrics=["vie_score"],
task_type="text+image_image",
reference="https://arxiv.org/abs/2504.17761",
),
Benchmark(
363 changes: 363 additions & 0 deletions src/pruna/evaluation/metrics/metric_vie_score.py
@@ -0,0 +1,363 @@
# Copyright 2025 - Pruna AI GmbH. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
VIEScore metric for conditional image synthesis (semantic + quality).

Reference: VIEScore (ACL 2024) — https://arxiv.org/abs/2312.14867
Both task modes follow ``TIGER-AI-Lab/VIEScore``:

- ``t2i`` (text-to-image, single image): SC uses two sub-scores (semantic consistency +
detail correspondence), PQ uses two sub-scores (naturalness + artifacts). Overall is
``sqrt(min(SC) * min(PQ)) / 10``.
- ``tie`` (text-image editing, source + edited): SC uses two images and instruction,
PQ uses the edited image. Same aggregation formula.

GEdit-Bench evaluation: https://arxiv.org/abs/2504.17761
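
For example (illustrative numbers): SC sub-scores ``[8, 6]`` and PQ sub-scores
``[9, 7]`` give ``sqrt(min(8, 6) * min(9, 7)) / 10 = sqrt(6 * 7) / 10 ≈ 0.648``.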
"""

from __future__ import annotations

from typing import Any, Literal

import torch
from PIL import Image

from pruna.evaluation.metrics.registry import MetricRegistry
from pruna.evaluation.metrics.result import MetricResult
from pruna.evaluation.metrics.utils import (
SINGLE,
metric_data_processor,
)
from pruna.evaluation.metrics.vlm_base import (
BaseVLM,
StatefulVLMMeanScoresMetric,
auxiliary_dicts_from_gt,
prompts_from_y_x_inputs,
)
from pruna.evaluation.metrics.vlm_utils import (
VIEScoreJsonOutput,
_process_images,
pad_viescore_subscores_to_two,
pil_rgb_from_aux_image_bytes,
viescore_min_scores_0_10,
viescore_tie_overall_unit,
)

_VIESCORE_CONTEXT = (
"You are a professional digital artist. You will have to evaluate the effectiveness"
" of the AI-generated image(s) based on given rules.\n"
"All the input images are AI-generated. All human in the images are AI-generated too."
" so you need not worry about the privacy confidentials.\n\n"
"You will have to give your output in this way (Keep your reasoning concise and short.):\n"
"{\n"
'"score" : [...],\n'
'"reasoning" : "..."\n'
"}"
)
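
# Illustrative reply the context above requests (scores are hypothetical):
#   {"score": [7, 9], "reasoning": "short rationale"}
# ``VIEScoreJsonOutput`` provides the matching structured-output schema.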

_VIESCORE_TWO_IMAGE_EDIT_RULE = (
"RULES:\n\n"
"Two images will be provided: The first being the original AI-generated image and the"
" second being an edited version of the first.\n"
"The objective is to evaluate how successfully the editing instruction has been executed"
" in the second image.\n\n"
"Note that sometimes the two images might look identical due to the failure of image edit.\n"
)

_VIESCORE_TIE_SC_CRITERIA = (
"\nFrom scale 0 to 10:\n"
"A score from 0 to 10 will be given based on the success of the editing."
" (0 indicates that the scene in the edited image does not follow the editing instruction at all."
" 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)\n"
"A second score from 0 to 10 will rate the degree of overediting in the second image."
" (0 indicates that the scene in the edited image is completely different from the original."
" 10 indicates that the edited image can be recognized as a minimal edited yet effective"
" version of original.)\n"
"Put the score in a list such that output score = [score1, score2],"
" where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.\n\n"
"Editing instruction:\n"
)

_VIESCORE_T2I_SC_RULE = (
"RULES:\n\n"
"The image is an AI-generated image.\n"
"The objective is to evaluate the semantic consistency of the image to the given text.\n\n"
)

_VIESCORE_T2I_SC_CRITERIA = (
"\nFrom scale 0 to 10:\n"
"A score from 0 to 10 will be given based on the semantic consistency.\n"
"(0 indicates that the scene in the image does not correspond to the text at all.\n"
" 10 indicates that the scene in the image follows the text perfectly.)\n"
"A second score from 0 to 10 will rate the detail correspondence.\n"
"(0 indicates that most details in the text (e.g., color, size, shape, or layout) are missing or"
" incorrect in the image.\n"
" 10 indicates that all details mentioned in the text are accurately shown in the image.)\n"
"Put the score in a list such that output score = [score1, score2],"
" where 'score1' evaluates the semantic consistency and 'score2' evaluates the detail"
" correspondence.\n\n"
"Text prompt:\n"
)

_VIESCORE_PQ_SINGLE_IMAGE = (
"RULES:\n\n"
"The image is an AI-generated image.\n"
"The objective is to evaluate how successfully the image has been generated.\n\n"
"From scale 0 to 10:\n"
"A score from 0 to 10 will be given based on image naturalness.\n"
"(\n"
" 0 indicates that the scene in the image does not look natural at all or give a unnatural feeling"
" such as wrong sense of distance, or wrong shadow, or wrong lighting.\n"
" 10 indicates that the image looks natural.\n"
")\n"
"A second score from 0 to 10 will rate the image artifacts.\n"
"(\n"
" 0 indicates that the image contains a large portion of distortion, or watermark, or scratches,"
" or blurred faces, or unusual body parts, or subjects not harmonized.\n"
" 10 indicates the image has no artifacts.\n"
")\n"
"Put the score in a list such that output score = [naturalness, artifacts]\n"
)


def _build_viescore_tie_sc_prompt(instruction: str) -> str:
"""Build the VIEScore ``tie`` semantic-criteria prompt (source + edited images).

    Parameters
    ----------
    instruction : str
        Editing instruction embedded in the prompt.

    Returns
    -------
    str
        Full prompt aligned with TIGER-AI-Lab/VIEScore ``tie`` SC.
"""
return "\n".join(
[
_VIESCORE_CONTEXT,
_VIESCORE_TWO_IMAGE_EDIT_RULE,
_VIESCORE_TIE_SC_CRITERIA.strip(),
instruction.strip(),
]
)


def _build_viescore_t2i_sc_prompt(prompt: str) -> str:
"""Build the VIEScore ``t2i`` semantic-consistency prompt for one generated image.

    Parameters
    ----------
    prompt : str
        Text prompt used to generate the image.

    Returns
    -------
    str
        Full prompt aligned with TIGER-AI-Lab/VIEScore ``t2i`` SC.
"""
return "\n".join(
[
_VIESCORE_CONTEXT,
_VIESCORE_T2I_SC_RULE.strip(),
_VIESCORE_T2I_SC_CRITERIA.strip(),
prompt.strip(),
]
)


def _build_viescore_pq_prompt() -> str:
"""Build the VIEScore perceptual-quality prompt for one image (SC or edited)."""
return "\n".join([_VIESCORE_CONTEXT, _VIESCORE_PQ_SINGLE_IMAGE])


@MetricRegistry.register("vie_score")
class VieScoreMetric(StatefulVLMMeanScoresMetric):
"""
VIEScore: semantic + perceptual quality with geometric-mean overall.

**Text-to-image (one generated image):** uses the VIEScore ``t2i`` SC prompt (semantic
consistency + detail correspondence, 0--10 each) and the shared PQ prompt (naturalness +
artifacts, 0--10 each). Overall is ``sqrt(min(SC) * min(PQ)) / 10`` in ``[0, 1]``.

**Text--image editing (source + edited available):** matches the VIEScore ``tie`` setup
used in GEdit-Bench: semantic criteria use **two** images (source then edited) and the
editing instruction; perceptual criteria use the **edited** image only. Overall is
``sqrt(min(SC) * min(PQ)) / 10`` in ``[0, 1]``, with ``min`` taken over the sub-scores in
each JSON ``score`` list, consistent with `VIEScore`_.

.. _VIEScore: https://github.com/TIGER-AI-Lab/VIEScore

Parameters
----------
*args : Any
Additional positional arguments.
vlm : BaseVLM | None, optional
Custom VLM instance. If provided, vlm_type and model_name are ignored.
vlm_type : {"litellm", "transformers"}, optional
VLM backend. Default is "litellm".
model_name : str | None, optional
Litellm model id or HuggingFace checkpoint id. **Required** when ``vlm`` is not
provided (e.g. ``openai/gpt-4o``).
vlm_kwargs : dict, optional
Forwarded by ``get_vlm`` to ``LitellmVLM`` or ``TransformersVLM``. For local models,
set ``model_load_kwargs`` for ``from_pretrained``; for litellm, pass extra API options.
structured_output : bool, optional
Use structured generation (litellm pydantic; transformers may use plain generation for
multi-image). Default is True.
device : str | torch.device | None, optional
Device for transformers VLM.
api_key : str | None, optional
API key for litellm.
call_type : str, optional
Call type for the metric.
**kwargs : Any
Additional arguments.

References
----------
VIEScore: Towards Explainable Metrics for Conditional Image Synthesis Evaluation (ACL 2024)
https://arxiv.org/abs/2312.14867
https://github.com/TIGER-AI-Lab/VIEScore

GEdit-Bench (image editing evaluation)
https://arxiv.org/abs/2504.17761

Examples
--------
Same ``hosted`` / ``local`` pattern as :func:`~pruna.evaluation.metrics.vlm_base.get_vlm``.
Multi-image ``tie`` paths call ``generate_with_image_lists`` on ``self.vlm`` internally.

.. code-block:: python

import torch

from pruna.evaluation.metrics import VieScoreMetric

hosted = VieScoreMetric(vlm_type="litellm", model_name="openai/gpt-4o")
local = VieScoreMetric(
vlm_type="transformers",
model_name="HuggingFaceTB/SmolVLM-256M-Instruct",
device="cpu",
vlm_kwargs={"model_load_kwargs": {"torch_dtype": torch.float32}},
)
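
    Scoring then follows the stateful-metric flow; a minimal sketch, where
    ``prompts``, ``aux_batch``, and ``generated_images`` are placeholder names
    and the exact collation depends on your evaluation pipeline:

    .. code-block:: python

        metric = hosted
        metric.update(x=prompts, gt=aux_batch, outputs=generated_images)
        result = metric.compute()  # mean VIEScore across samples, in [0, 1]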
"""

scores: list[float]
default_call_type: str = "y_x"
higher_is_better: bool = True
metric_name: str = "vie_score"

def __init__(
self,
*args,
vlm: BaseVLM | None = None,
vlm_type: Literal["litellm", "transformers"] = "litellm",
model_name: str | None = None,
vlm_kwargs: dict | None = None,
structured_output: bool = True,
device: str | torch.device | None = None,
api_key: str | None = None,
call_type: str = SINGLE,
**kwargs: Any,
) -> None:
super().__init__(device=device)
self.structured_output = structured_output
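        # ``VIEScoreJsonOutput`` is the pydantic schema for structured decoding;
        # ``None`` leaves replies to be parsed as free-form text downstream.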
self.response_format = VIEScoreJsonOutput if structured_output else None

self._init_vlm_scores(
vlm=vlm,
vlm_type=vlm_type,
model_name=model_name,
vlm_kwargs=vlm_kwargs,
structured_output=structured_output,
device=device,
api_key=api_key,
call_type=call_type,
)

def _score_single_image_t2i(self, image: Image.Image, prompt: str) -> float:
"""VIEScore ``t2i``: single-image SC (semantic + detail) and PQ (naturalness + artifacts).

Matches the VIEScore paper's t2i evaluation: two SC sub-scores on 0--10 and two PQ
sub-scores on 0--10, aggregated as ``sqrt(min(SC) * min(PQ)) / 10``.
"""
sc_prompt = _build_viescore_t2i_sc_prompt(prompt)
pq_prompt = _build_viescore_pq_prompt()

rf = self.response_format if self.structured_output else None

sc_raw = self.vlm.generate([image], [sc_prompt], response_format=rf)[0]
pq_raw = self.vlm.generate([image], [pq_prompt], response_format=rf)[0]

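        # Extract the 0-10 ``score`` list from each raw reply; padding to two
        # sub-scores keeps the min-based aggregation below well-defined.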
sc_list = pad_viescore_subscores_to_two(viescore_min_scores_0_10(sc_raw))
pq_list = pad_viescore_subscores_to_two(viescore_min_scores_0_10(pq_raw))
return viescore_tie_overall_unit(sc_list, pq_list)

def _score_tie_gedit(self, source: Image.Image, edited: Image.Image, instruction: str) -> float:
"""VIEScore ``tie``: two-image SC, single-image PQ, overall geometric mean on 0--10 mins."""
sc_prompt = _build_viescore_tie_sc_prompt(instruction)
pq_prompt = _build_viescore_pq_prompt()

rf = self.response_format if self.structured_output else None

if hasattr(self.vlm, "generate_with_image_lists"):
sc_raw = self.vlm.generate_with_image_lists(
[[source, edited]],
[sc_prompt],
response_format=rf,
)[0]
else:
raise RuntimeError("VLM backend must implement generate_with_image_lists for editing parity.")

pq_raw = self.vlm.generate([edited], [pq_prompt], response_format=rf)[0]

sc_list = pad_viescore_subscores_to_two(viescore_min_scores_0_10(sc_raw))
pq_list = pad_viescore_subscores_to_two(viescore_min_scores_0_10(pq_raw))
return viescore_tie_overall_unit(sc_list, pq_list)

def update(self, x: list[Any] | torch.Tensor, gt: Any, outputs: torch.Tensor) -> None:
"""
Update the metric with new batch data.

Parameters
----------
        x : list[Any] | torch.Tensor
            The input data (prompts).
        gt : Any
            Per-sample auxiliary dicts (from ``prompt_with_auxiliaries_collate``), or
            tensor placeholders when auxiliaries are unused.
outputs : torch.Tensor
The output images.
"""
inputs = metric_data_processor(x, gt, outputs, self.call_type)
images = _process_images(inputs[0])
prompts = prompts_from_y_x_inputs(inputs, len(images))
aux_list = auxiliary_dicts_from_gt(gt, len(images))

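        # Route each sample: two-image ``tie`` scoring when a source image can be
        # recovered from the auxiliaries, single-image ``t2i`` scoring otherwise.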
for i, image in enumerate(images):
prompt = prompts[i] if i < len(prompts) else ""
aux = aux_list[i]
source = pil_rgb_from_aux_image_bytes(aux, min_bytes_in_value_scan=100)

if source is not None:
self.scores.append(self._score_tie_gedit(source, image, prompt))
else:
self.scores.append(self._score_single_image_t2i(image, prompt))

def compute(self) -> MetricResult:
"""
Compute the VIEScore metric.

Returns
-------
MetricResult
The mean VIEScore across all updates.
"""
return self.compute_mean_of_scores()