PrunaAI · davidberenstein1957 · Apr 25, 2026 · May 8, 2026 · Jun 2, 2026 · Jun 4, 2026
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -11,6 +11,17 @@ on:
   pull_request:
     branches:
       - main
+      # TEMP: remove feat/vlm-pr-* entries before merging the VLM stack to main
+      - feat/vlm-pr-1-vendor
+      - feat/vlm-pr-2-infrastructure
+      - feat/vlm-pr-3a-qa-accuracy
+      - feat/vlm-pr-3b-oneig-alignment
+      - feat/vlm-pr-3c-text-score-pair
+      - feat/vlm-pr-3d-oneig-reasoning
+      - feat/vlm-pr-4a-vqa
+      - feat/vlm-pr-4b-vie-score
+      - feat/vlm-pr-4c-img-edit-score
+      - feat/vlm-pr-5-e2e-tests
 
 concurrency:
   group: ci-${{ github.repository }}-tests-${{ github.ref }}

diff --git a/pyproject.toml b/pyproject.toml
@@ -36,9 +36,6 @@ possibly-missing-attribute = "ignore"
 missing-argument = "ignore"
 unused-type-ignore-comment = "ignore"
 
-[tool.bandit]
-exclude_dirs = ["tests", "docs"]
-
 
 [tool.coverage.run]
 source = ["src/pruna"]
@@ -70,29 +67,21 @@ name = "pruna_internal"
 url = "https://prunaai.pythonanywhere.com/simple/"
 explicit = true
 
-[[tool.uv.index]]
-name = "intel-pytorch-extension"
-url = "https://pytorch-extension.intel.com/release-whl/stable/cpu/cn/"
-explicit = true
-
 [tool.uv]
 index-strategy = "first-index"
+exclude-newer = "1 week"  # protection against compromised dependencies
+# trusted dev wheels that are missing an upload date
+exclude-newer-package = { gptqmodel = false, "stable-fast-pruna" = false }
 
 conflicts = [
     [{ extra = "awq" }, { extra = "vbench" }],
     [{ extra = "vllm" }, { extra = "vbench" }],
-    [{ extra = "intel" }, { extra = "awq" }],
     [{ extra = "gptq" }, { extra = "awq" }],
-    # intel is incompatible with all stable-fast variants and vllm
-    [{ extra = "intel" }, { extra = "stable-fast" }, { extra = "stable-fast-extraindex" }],
-    [{ extra = "intel" }, { extra = "full" }, { extra = "stable-fast-extraindex" }],
-    [{ extra = "intel" }, { extra = "vllm" }],
     [{ extra = "kvpress" }, { extra = "vbench" }],
 ]
 
 [tool.uv.sources]
 gptqmodel = { index = "pruna_internal", marker = "sys_platform != 'darwin' or platform_machine != 'arm64'" }
-intel-extension-for-pytorch = { index = "intel-pytorch-extension" }
 stable-fast-pruna = { index = "pruna_internal", extra = "stable-fast-extraindex" }
 
 [project]
@@ -171,6 +160,21 @@ vllm = [
     "vllm>=0.16.0",
     "ray",
 ]
+rapidata = [
+    "rapidata>=3.0.0",
+]
+upscale = [
+    "realesrgan",
+]
+evaluation = [
+    "pruna[rapidata]",
+    "pruna[lmharness]",
+    "outlines>1.2.0,<2.0.0",
+    "litellm>=1.0.0",
+]
+oneig-reasoning = [
+    "hf_transfer>=0.1.9",
+]
 stable-fast = [
     "xformers>=0.0.30",
     "stable-fast-pruna>=1.0.8,<1.0.9",
@@ -195,18 +199,12 @@ awq = [
     "llmcompressor>=0.9",
     "torch>=2.9.0"
 ]
-upscale = [
-    "realesrgan",
-]
 full = [
     "pruna[stable-fast]",
 ]
 vbench = [
     "vbench-pruna; sys_platform != 'darwin'",
 ]
-rapidata = [
-    "rapidata>=3.0.0"
-]
 dev = [
     "wget",
     "python-dotenv",
@@ -233,22 +231,13 @@ dev = [
     "types-PyYAML",
     "logbar",
     "pytest-xdist>=3.8.0",
+    "pruna[evaluation]",
 ]
 cpu = []
 lmharness = [
     "lm-eval>=0.4.0"
 ]
-evaluation = [
-    "pruna[rapidata]",
-    "pruna[lmharness]"
-]
 
-# Intel extension is tightly coupled with the torch version
-intel = [
-    "intel-extension-for-pytorch>=2.7.0",
-    "torch>=2.7.0,<2.9.0",
-    "torchvision>=0.22.0,<0.24.0",
-]
 kvpress = [
     "kvpress>=0.5.2",
 ]

diff --git a/src/pruna/evaluation/metrics/__init__.py b/src/pruna/evaluation/metrics/__init__.py
@@ -26,6 +26,13 @@
 from pruna.evaluation.metrics.metric_rapiddata import RapidataMetric as RapidataMetric
 from pruna.evaluation.metrics.metric_sharpness import SharpnessMetric
 from pruna.evaluation.metrics.metric_torch import TorchMetricWrapper
+from pruna.evaluation.metrics.vlm_base import (
+    BaseVLM,
+    LitellmVLM,
+    StatefulVLMMeanScoresMetric,
+    TransformersVLM,
+    get_vlm,
+)
 
 __all__ = [
     "MetricRegistry",
@@ -47,4 +54,9 @@
     "AestheticLAION",
     "LMEvalMetric",
     "RapidataMetric",
+    "BaseVLM",
+    "LitellmVLM",
+    "StatefulVLMMeanScoresMetric",
+    "TransformersVLM",
+    "get_vlm",
 ]
diff --git a/src/pruna/evaluation/metrics/metric_torch.py b/src/pruna/evaluation/metrics/metric_torch.py
@@ -50,6 +50,26 @@
 )
 from pruna.logging.logger import pruna_logger
 
+_PRUNA_TASK_ROUTING_KWARGS: tuple[str, ...] = (
+    "vlm_type",
+    "model_name",
+    "structured_output",
+    "vlm_kwargs",
+    "api_key",
+)
+
+
+def _strip_task_routing_kwargs(kwargs: dict[str, Any]) -> None:
+    """
+    Remove, in place, the kwargs :class:`~pruna.evaluation.task.Task` passes when building mixed metric lists.
+
+    Keys that are absent are ignored. Torchmetrics classes often end with ``**kwargs`` and would
+    otherwise accept bogus keys until a lower layer raises. Stripping here keeps
+    :class:`TorchMetricWrapper` the single choke point between Pruna routing and torchmetrics constructors.
+    """
+    for key in _PRUNA_TASK_ROUTING_KWARGS:
+        kwargs.pop(key, None)
+
 
 def default_update(metric: Metric, *args, **kwargs) -> None:
     """
@@ -124,9 +144,7 @@ def arniqa_update(metric: ARNIQA, preds: Any) -> None:
 
 
 def ssim_update(
-        metric: StructuralSimilarityIndexMeasure | MultiScaleStructuralSimilarityIndexMeasure,
-        preds: Any,
-        target: Any
+    metric: StructuralSimilarityIndexMeasure | MultiScaleStructuralSimilarityIndexMeasure, preds: Any, target: Any
 ) -> None:
     """
     Update handler for SSIM or MS-SSIM metric.
@@ -152,29 +170,22 @@ class TorchMetrics(Enum):
     """
     Enumeration of torchmetrics metrics for evaluation.
 
-    This enum provides a tuple per member (metric_factory, update_fn, call_type):
-    metric_factory builds the metric (typically a torchmetrics class, or
-    functools.partial when some constructor arguments are fixed); update_fn is
-    an optional custom update handler; call_type describes how inputs are paired
-    for the metric.
+    Each member value is a ``(metric_factory, update_fn, call_type)`` tuple.
 
     Parameters
     ----------
     value : tuple
-        Tuple holding metric_factory, update_fn, and call_type as described above.
+        ``(metric_factory, update_fn, call_type)`` for this enum member.
     names : str
-        The name of the enum member.
+        Enum member name.
     module : str
-        The module where the enum is defined.
+        Defining module name.
     qualname : str
-        The qualified name of the enum.
+        Qualified name of the enum class.
     type : type
-        The type of the enum.
+        Enum metaclass type.
     start : int
-        The start index for auto-numbering enum values.
-    boundary : enum.FlagBoundary or None
-        Boundary handling mode used by the Enum functional API for Flag and
-        IntFlag enums.
+        Auto-numbering start index for functional API enums.
     """
 
     fid = (FrechetInceptionDistance, fid_update, "gt_y")
@@ -246,6 +257,7 @@ def __new__(cls, metric_name: str, call_type: str = "", **kwargs) -> StatefulMet
         if metric_name == "clip_score" and call_type.startswith(PAIRWISE):
             from pruna.evaluation.metrics.metric_pairwise_clip import PairwiseClipScore
 
+            _strip_task_routing_kwargs(kwargs)
             return PairwiseClipScore(**kwargs)
         return super().__new__(cls)
 
@@ -259,6 +271,7 @@ def __init__(self, metric_name: str, call_type: str = "", **kwargs) -> None:
             If the metric name is not supported.
         """
         self.metric_name = metric_name
+        _strip_task_routing_kwargs(kwargs)
         super().__init__(kwargs.pop("device", None))
         try:
             self.metric = TorchMetrics[metric_name](**kwargs)

diff --git a/src/pruna/evaluation/metrics/utils.py b/src/pruna/evaluation/metrics/utils.py
@@ -56,13 +56,17 @@ def metric_data_processor(
     This function determines the order and selection of inputs to be passed to various metrics.
 
     The function supports different input arrangements through the 'call_type' configuration:
-    - 'x_y': Uses input data (x) and model outputs
-    - 'gt_y': Uses ground truth (gt) and model outputs
-    - 'y_x': Uses model outputs and input data (x)
-    - 'y_gt': Uses model outputs and ground truth (gt)
-    - 'pairwise_gt_y': Uses cached base model outputs (gt) and smashed model outputs (y).
-    - 'pairwise_y_gt': Uses smashed model outputs (y) and cached base model outputs (gt).
-    The evaluation agent is expected to pass the cached base model outputs as gt.
+
+    - 'y_gt': Model's output first, then ground truth. Returns [outputs, gt].
+    - 'gt_y': Ground truth first, then model's output. Returns [gt, outputs].
+    - 'y_x': Model's output first, then input data. Returns [outputs, x].
+      Used by CLIPScore, VQA, ImageEditScore, VIEScore.
+    - 'x_y': Input data first, then model's output. Returns [x, outputs].
+    - 'x_gt': Input data first, then ground truth. Returns [x, gt].
+    - 'gt_x': Ground truth first, then input data. Returns [gt, x].
+    - 'pairwise_y_gt': Subsequent model's output first, then base model's output. Returns [outputs, gt].
+    - 'pairwise_gt_y': Base model's output first, then subsequent model's output. Returns [gt, outputs].
+    - 'y': Only the output is used; the metric has an internal dataset. Returns [outputs].
 
     Parameters
     ----------
@@ -85,7 +89,8 @@ def metric_data_processor(
     Raises
     ------
     ValueError
-        If the specified call_type is not one of: 'x_y', 'gt_y', 'y_x', 'y_gt', 'pairwise'.
+        If the specified call_type is not one of: 'y_gt', 'gt_y', 'y_x', 'x_y',
+        'x_gt', 'gt_x', 'pairwise_y_gt', 'pairwise_gt_y', 'y'.
 
     Examples
     --------
@@ -106,11 +111,15 @@ def metric_data_processor(
         return [outputs, x]
     elif call_type == "y_gt":
         return [outputs, gt]
+    elif call_type == "x_gt":
+        return [x, gt]
+    elif call_type == "gt_x":
+        return [gt, x]
     elif call_type == "pairwise_gt_y":
         return [gt, outputs]
     elif call_type == "pairwise_y_gt":
         return [outputs, gt]
-    elif call_type == "y":  # IQA metrics that have an internal dataset
+    elif call_type == "y":
         return [outputs]
     else:
         raise ValueError(f"Invalid call type: {call_type}")