Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@ on:
pull_request:
branches:
- main
# TEMP: remove feat/vlm-pr-* entries before merging the VLM stack to main
- feat/vlm-pr-1-vendor
- feat/vlm-pr-2-infrastructure
- feat/vlm-pr-3a-qa-accuracy
- feat/vlm-pr-3b-oneig-alignment
- feat/vlm-pr-3c-text-score-pair
- feat/vlm-pr-3d-oneig-reasoning
- feat/vlm-pr-4a-vqa
- feat/vlm-pr-4b-vie-score
- feat/vlm-pr-4c-img-edit-score
- feat/vlm-pr-5-e2e-tests

concurrency:
group: ci-${{ github.repository }}-tests-${{ github.ref }}
Expand Down
49 changes: 19 additions & 30 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,6 @@ possibly-missing-attribute = "ignore"
missing-argument = "ignore"
unused-type-ignore-comment = "ignore"

[tool.bandit]
exclude_dirs = ["tests", "docs"]


[tool.coverage.run]
source = ["src/pruna"]
Expand Down Expand Up @@ -70,29 +67,21 @@ name = "pruna_internal"
url = "https://prunaai.pythonanywhere.com/simple/"
explicit = true

[[tool.uv.index]]
name = "intel-pytorch-extension"
url = "https://pytorch-extension.intel.com/release-whl/stable/cpu/cn/"
explicit = true

[tool.uv]
index-strategy = "first-index"
exclude-newer = "1 week" # protection against compromised dependencies
# trusted dev wheels that are missing an upload date
exclude-newer-package = { gptqmodel = false, "stable-fast-pruna" = false }

conflicts = [
[{ extra = "awq" }, { extra = "vbench" }],
[{ extra = "vllm" }, { extra = "vbench" }],
[{ extra = "intel" }, { extra = "awq" }],
[{ extra = "gptq" }, { extra = "awq" }],
# intel is incompatible with all stable-fast variants and vllm
[{ extra = "intel" }, { extra = "stable-fast" }, { extra = "stable-fast-extraindex" }],
[{ extra = "intel" }, { extra = "full" }, { extra = "stable-fast-extraindex" }],
[{ extra = "intel" }, { extra = "vllm" }],
[{ extra = "kvpress" }, { extra = "vbench" }],
]

[tool.uv.sources]
gptqmodel = { index = "pruna_internal", marker = "sys_platform != 'darwin' or platform_machine != 'arm64'" }
intel-extension-for-pytorch = { index = "intel-pytorch-extension" }
stable-fast-pruna = { index = "pruna_internal", extra = "stable-fast-extraindex" }

[project]
Expand Down Expand Up @@ -171,6 +160,21 @@ vllm = [
"vllm>=0.16.0",
"ray",
]
rapidata = [
"rapidata>=3.0.0",
]
upscale = [
"realesrgan",
]
evaluation = [
"pruna[rapidata]",
"pruna[lmharness]",
"outlines>1.2.0,<2.0.0",
"litellm>=1.0.0",
]
oneig-reasoning = [
"hf_transfer>=0.1.9",
]
stable-fast = [
"xformers>=0.0.30",
"stable-fast-pruna>=1.0.8,<1.0.9",
Expand All @@ -195,18 +199,12 @@ awq = [
"llmcompressor>=0.9",
"torch>=2.9.0"
]
upscale = [
"realesrgan",
]
full = [
"pruna[stable-fast]",
]
vbench = [
"vbench-pruna; sys_platform != 'darwin'",
]
rapidata = [
"rapidata>=3.0.0"
]
dev = [
"wget",
"python-dotenv",
Expand All @@ -233,22 +231,13 @@ dev = [
"types-PyYAML",
"logbar",
"pytest-xdist>=3.8.0",
"pruna[evaluation]",
]
cpu = []
lmharness = [
"lm-eval>=0.4.0"
]
evaluation = [
"pruna[rapidata]",
"pruna[lmharness]"
]

# Intel extension is tightly coupled with the torch version
intel = [
"intel-extension-for-pytorch>=2.7.0",
"torch>=2.7.0,<2.9.0",
"torchvision>=0.22.0,<0.24.0",
]
kvpress = [
"kvpress>=0.5.2",
]
Expand Down
12 changes: 12 additions & 0 deletions src/pruna/evaluation/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@
from pruna.evaluation.metrics.metric_rapiddata import RapidataMetric as RapidataMetric
from pruna.evaluation.metrics.metric_sharpness import SharpnessMetric
from pruna.evaluation.metrics.metric_torch import TorchMetricWrapper
from pruna.evaluation.metrics.vlm_base import (
BaseVLM,
LitellmVLM,
StatefulVLMMeanScoresMetric,
TransformersVLM,
get_vlm,
)

__all__ = [
"MetricRegistry",
Expand All @@ -47,4 +54,9 @@
"AestheticLAION",
"LMEvalMetric",
"RapidataMetric",
"BaseVLM",
"LitellmVLM",
"StatefulVLMMeanScoresMetric",
"TransformersVLM",
"get_vlm",
]
47 changes: 30 additions & 17 deletions src/pruna/evaluation/metrics/metric_torch.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,26 @@
)
from pruna.logging.logger import pruna_logger

# Keys that, per the helper below, Task-level routing may attach to the kwargs of every
# metric it builds. They are not meant for torchmetrics constructors and are removed
# before one is instantiated.
_PRUNA_TASK_ROUTING_KWARGS: tuple[str, ...] = (
    "vlm_type",
    "model_name",
    "structured_output",
    "vlm_kwargs",
    "api_key",
)


def _strip_task_routing_kwargs(kwargs: dict[str, Any]) -> None:
    """
    Drop kwargs :class:`~pruna.evaluation.task.Task` passes when building mixed metric lists.

    Torchmetrics classes often end with ``**kwargs`` and would otherwise accept bogus keys
    until a lower layer raises. Stripping here keeps :class:`TorchMetricWrapper` the single
    choke point between Pruna routing and torchmetrics constructors.

    Parameters
    ----------
    kwargs : dict[str, Any]
        Keyword arguments destined for a metric constructor. The dictionary is modified
        in place: every key listed in ``_PRUNA_TASK_ROUTING_KWARGS`` is removed if
        present, and all other keys are left untouched.
    """
    for key in _PRUNA_TASK_ROUTING_KWARGS:
        # pop with a default so that routing keys which were never passed are not an error
        kwargs.pop(key, None)


def default_update(metric: Metric, *args, **kwargs) -> None:
"""
Expand Down Expand Up @@ -124,9 +144,7 @@ def arniqa_update(metric: ARNIQA, preds: Any) -> None:


def ssim_update(
metric: StructuralSimilarityIndexMeasure | MultiScaleStructuralSimilarityIndexMeasure,
preds: Any,
target: Any
metric: StructuralSimilarityIndexMeasure | MultiScaleStructuralSimilarityIndexMeasure, preds: Any, target: Any
) -> None:
"""
Update handler for SSIM or MS-SSIM metric.
Expand All @@ -152,29 +170,22 @@ class TorchMetrics(Enum):
"""
Enumeration of torchmetrics metrics for evaluation.

This enum provides a tuple per member (metric_factory, update_fn, call_type):
metric_factory builds the metric (typically a torchmetrics class, or
functools.partial when some constructor arguments are fixed); update_fn is
an optional custom update handler; call_type describes how inputs are paired
for the metric.
Each member value is a ``(metric_factory, update_fn, call_type)`` tuple.

Parameters
----------
value : tuple
Tuple holding metric_factory, update_fn, and call_type as described above.
``(metric_factory, update_fn, call_type)`` for this enum member.
names : str
The name of the enum member.
Enum member name.
module : str
The module where the enum is defined.
Defining module name.
qualname : str
The qualified name of the enum.
Qualified name of the enum class.
type : type
The type of the enum.
Enum metaclass type.
start : int
The start index for auto-numbering enum values.
boundary : enum.FlagBoundary or None
Boundary handling mode used by the Enum functional API for Flag and
IntFlag enums.
Auto-numbering start index for functional API enums.
"""

fid = (FrechetInceptionDistance, fid_update, "gt_y")
Expand Down Expand Up @@ -246,6 +257,7 @@ def __new__(cls, metric_name: str, call_type: str = "", **kwargs) -> StatefulMet
if metric_name == "clip_score" and call_type.startswith(PAIRWISE):
from pruna.evaluation.metrics.metric_pairwise_clip import PairwiseClipScore

_strip_task_routing_kwargs(kwargs)
return PairwiseClipScore(**kwargs)
return super().__new__(cls)

Expand All @@ -259,6 +271,7 @@ def __init__(self, metric_name: str, call_type: str = "", **kwargs) -> None:
If the metric name is not supported.
"""
self.metric_name = metric_name
_strip_task_routing_kwargs(kwargs)
super().__init__(kwargs.pop("device", None))
try:
self.metric = TorchMetrics[metric_name](**kwargs)
Expand Down
27 changes: 18 additions & 9 deletions src/pruna/evaluation/metrics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,17 @@ def metric_data_processor(
This function determines the order and selection of inputs to be passed to various metrics.

The function supports different input arrangements through the 'call_type' configuration:
- 'x_y': Uses input data (x) and model outputs
- 'gt_y': Uses ground truth (gt) and model outputs
- 'y_x': Uses model outputs and input data (x)
- 'y_gt': Uses model outputs and ground truth (gt)
- 'pairwise_gt_y': Uses cached base model outputs (gt) and smashed model outputs (y).
- 'pairwise_y_gt': Uses smashed model outputs (y) and cached base model outputs (gt).
The evaluation agent is expected to pass the cached base model outputs as gt.

- 'y_gt': Model's output first, then ground truth. Returns [outputs, gt].
- 'gt_y': Ground truth first, then model's output. Returns [gt, outputs].
- 'y_x': Model's output first, then input data. Returns [outputs, x].
Used by CLIPScore, VQA, ImageEditScore, VIEScore.
- 'x_y': Input data first, then model's output. Returns [x, outputs].
- 'x_gt': Input data first, then ground truth. Returns [x, gt].
- 'gt_x': Ground truth first, then input data. Returns [gt, x].
- 'pairwise_y_gt': Subsequent model's output first, then cached base model's output (passed as gt). Returns [outputs, gt].
- 'pairwise_gt_y': Cached base model's output (passed as gt) first, then subsequent model's output. Returns [gt, outputs].
- 'y': Only the output is used; the metric has an internal dataset. Returns [outputs].

Parameters
----------
Expand All @@ -85,7 +89,8 @@ def metric_data_processor(
Raises
------
ValueError
If the specified call_type is not one of: 'x_y', 'gt_y', 'y_x', 'y_gt', 'pairwise'.
If the specified call_type is not one of: 'y_gt', 'gt_y', 'y_x', 'x_y',
'x_gt', 'gt_x', 'pairwise_y_gt', 'pairwise_gt_y', 'y'.

Examples
--------
Expand All @@ -106,11 +111,15 @@ def metric_data_processor(
return [outputs, x]
elif call_type == "y_gt":
return [outputs, gt]
elif call_type == "x_gt":
return [x, gt]
elif call_type == "gt_x":
return [gt, x]
elif call_type == "pairwise_gt_y":
return [gt, outputs]
elif call_type == "pairwise_y_gt":
return [outputs, gt]
elif call_type == "y": # IQA metrics that have an internal dataset
elif call_type == "y":
return [outputs]
else:
raise ValueError(f"Invalid call type: {call_type}")
Expand Down
Loading
Loading