Skip to content

Commit 6b2c731

Browse files
[feat] eval: add audio metrics (#1352)
Co-authored-by: klhhhhh <1412841649@qq.com>
1 parent e6022c2 commit 6b2c731

39 files changed

Lines changed: 5665 additions & 38 deletions

docs/contributing/eval-metrics.md

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,26 +11,33 @@ Use this guide when you are:
1111
- Adding a new metric (native or wrapping a third-party library).
1212
- Porting a benchmark (e.g. VBench, MIND, EvalCrafter) whose Python
1313
code needs to be importable from a pinned upstream.
14-
- Adding a new metric group (audio, vlm, etc.).
14+
- Adding a new metric group (audio, videoscore2, etc.).
1515

1616
## TL;DR
1717

1818
Metrics are auto-discovered from
1919
`fastvideo/eval/metrics/<group>/<name>/metric.py`. Each declares itself
20-
with `@register("<group>.<name>")` and subclasses `BaseMetric`. Three
21-
recipes:
20+
with `@register("<group>.<name>")` and subclasses `BaseMetric`. Five
21+
recipes, depending on how the metric ships and what its licence allows:
2222

2323
1. **Native metric** (pure-PyTorch, no submodule). Drop a file,
2424
declare deps, implement `compute(sample)`.
2525
2. **Library-wrapped metric** (CLIP, torch.hub, transformers, pyiqa).
2626
Same as above, plus route the library's cache through
2727
`get_cache_dir()` if it has a `download_root=` / `cache_dir=`
2828
kwarg.
29-
3. **Upstream-submodule-wrapped metric** (vbench-style). Pin upstream
30-
as a git submodule under `fastvideo/third_party/eval/<bench>/`. The
31-
adapter `__init__.py` does the `sys.path` insert and any runtime
32-
compat shims for modern dep versions. Patches live as Python in
33-
that file rather than as on-disk patches to the submodule.
29+
3. **Submodule-wrapped metric** (vbench-style). Pin upstream as a git
30+
submodule under `fastvideo/third_party/eval/<bench>/`. The adapter
31+
`__init__.py` does the `sys.path` insert and any runtime compat
32+
shims. Best for large research packages with stable layouts.
33+
4. **Vendored upstream** (synchformer / glmasr-style). Copy a small,
34+
surgical piece of upstream into `fastvideo/third_party/eval/<name>/`
35+
with its `LICENSE` alongside. Best for permissive-licensed
36+
(MIT / Apache-2.0) source you need a few files from.
37+
5. **Git-source dep via `[tool.uv.sources]`** (ImageBind-style). For
38+
license-restricted upstream (e.g. CC BY-NC-SA) that cannot be
39+
redistributed in the FastVideo tree. uv pulls the source at install
40+
time pinned to a SHA in `pyproject.toml`.
3441

3542
The full recipes are below.
3643

@@ -43,7 +50,8 @@ fastvideo/eval/metrics/
4350
├── base.py # BaseMetric + lifecycle contract
4451
├── common/ # group: SSIM, PSNR, LPIPS
4552
├── optical_flow/ # group: gt_optical_flow, synthetic_optical_flow
46-
├── vlm/ # group: VideoScore-2
53+
├── audio/ # group: CLAP, AudioBox, KL, FAD, WER, DeSync, ImageBind
54+
├── videoscore2/ # VideoScore-2 (single metric at group level)
4755
├── physics_iq/ # group + sub-metrics
4856
└── vbench/ # group: 16 sub-metrics
4957
├── __init__.py # sys.path bootstrap + runtime compat shims
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""Generate one LTX2 video and score its audio track.
2+
3+
LTX2 produces video + audio and muxes both into the output mp4. The
4+
eval suite reads the audio track straight out of that mp4 (librosa
5+
decodes via ffmpeg under the hood), so we just point the audio
6+
metrics at the same path.
7+
8+
Only the reference-free audio metrics run here — they're the ones
9+
that make sense for a single one-shot generation:
10+
11+
* ``audio.clap_score`` — CLAP text↔audio cosine similarity
12+
* ``audio.audiobox_aesthetics`` — AudioBox 4-axis quality score
13+
14+
The other audio metrics need extra inputs we don't have for a
15+
one-shot run: ``audio.frechet_distance`` and ``audio.kl_divergence``
16+
need a reference audio, ``audio.wer`` needs a ground-truth transcript.
17+
18+
Install: ``uv pip install -e .[eval-audio]`` covers both metrics here
19+
(and the rest of the audio suite).
20+
"""
21+
import torch
22+
23+
from fastvideo import VideoGenerator
24+
from fastvideo.eval import create_evaluator
25+
26+
PROMPT = (
27+
"A warm sunny backyard. The camera starts in a tight cinematic close-up "
28+
"of a woman and a man in their 30s, facing each other with serious "
29+
"expressions. The woman, emotional and dramatic, says softly, \"That's "
30+
"it... Dad's lost it. And we've lost Dad.\" The man exhales, slightly "
31+
"annoyed: \"Stop being so dramatic, Jess.\" A beat. He glances aside, "
32+
"then mutters defensively, \"He's just having fun.\""
33+
)
34+
35+
METRICS = [
36+
"audio.clap_score",
37+
"audio.audiobox_aesthetics",
38+
]
39+
40+
41+
def main() -> None:
    """Generate one LTX2 clip and print its reference-free audio scores.

    The generated mp4 is passed to the evaluator as the ``audio`` input,
    together with the generation prompt as ``text_prompt``. Each metric in
    ``METRICS`` is then printed either as a score (followed by its detail
    entries) or as ``SKIPPED`` with the reason recorded in its details.
    """
    generator = VideoGenerator.from_pretrained(
        "Davids048/LTX2-Base-Diffusers",
        num_gpus=1,
    )

    output_path = "outputs_video/ltx2_audio_eval/output.mp4"
    try:
        generator.generate_video(
            prompt=PROMPT,
            output_path=output_path,
            save_video=True,
            num_frames=121,
            height=1088,
            width=1920,
        )
    finally:
        # Shut the generator down even when generation raises, so a failed
        # run does not leave it alive behind the propagating exception.
        generator.shutdown()
    # Hand cached CUDA blocks back before the eval models are loaded.
    torch.cuda.empty_cache()

    print(f"\n[eval] building evaluator: {METRICS}")
    evaluator = create_evaluator(metrics=METRICS)
    results = evaluator.evaluate(audio=output_path, text_prompt=PROMPT)

    print("\n=== Audio scores ===")
    for name in METRICS:
        r = results[name]
        # Treat missing/empty details uniformly in both branches; the
        # original only guarded the score branch.
        details = r.details or {}
        if r.score is None:
            print(f" {name}: SKIPPED ({details.get('skipped', 'no score')})")
        else:
            print(f" {name}: {r.score:.4f}")
            for k, v in details.items():
                print(f" {k}: {v}")
73+
74+
75+
if __name__ == "__main__":
76+
main()

fastvideo/eval/README.md

Lines changed: 73 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,66 @@
22

33
In-process evaluation suite for video generations. Includes pixel
44
metrics (SSIM, PSNR, LPIPS), optical-flow comparisons, the full VBench
5-
suite, Physics-IQ, and a VLM scorer (VideoScore-2) behind a single
5+
suite, Physics-IQ, audio metrics, and a VLM scorer behind a single
66
registry-driven API.
77

88
## Install
99

1010
| Use case | Install |
1111
|---|---|
12-
| Default (common, optical_flow, vbench-light, physics_iq, videoscore2) | `uv pip install -e .[eval]` |
13-
| Just VBench (12 of 16 sub-metrics) | `uv pip install -e .[eval-vbench]` |
12+
| Default (common, optical_flow, vbench, physics_iq, videoscore2) | `uv pip install -e .[eval]` |
13+
| Just VBench (11 of 16 by default; +4 with detectron2) | `uv pip install -e .[eval-vbench]` |
1414
| Just Physics-IQ (covered by `[eval]`) | `uv pip install -e .[eval-physics-iq]` |
15-
| Plus `vbench.scene` (AVoCaDO) | `uv pip install -e .[eval-full]` |
15+
| Audio metrics (CLAP, FAD, KL, WER, AudioBox, DeSync, ImageBind) | `uv pip install -e .[eval-audio]` |
16+
| Everything: `[eval]` + `[eval-audio]` + `vbench.scene` (AVoCaDO) | `uv pip install -e .[eval-full]` |
17+
18+
`[eval-audio]` covers every `audio.*` metric. ImageBind
19+
(`facebookresearch/ImageBind`, CC BY-NC-SA 4.0) is git-sourced via
20+
`[tool.uv.sources]` rather than vendored. `torchaudio` at the cu128
21+
wheel is pulled transitively by `audiobox_aesthetics`; on cu128 hosts
22+
using raw `pip`, install `torchaudio` from
23+
`https://download.pytorch.org/whl/cu128` first.
24+
25+
`audio.desync` and `audio.wer (glm_asr)` import vendored upstream from
26+
`fastvideo/third_party/eval/synchformer/` (MIT) and
27+
`fastvideo/third_party/eval/glmasr/` (Apache-2.0). Both trees keep
28+
their upstream `LICENSE` files alongside.
29+
30+
### `audio.*` metric input contracts
31+
32+
Every audio metric reads from these sample-dict keys (extra keys are
33+
ignored):
34+
35+
| Metric | Per-sample? | Required keys |
36+
|---|---|---|
37+
| `audio.clap_score` | yes | `audio` (path), `text_prompt` (str) |
38+
| `audio.audiobox_aesthetics` | yes | `audio` (path) |
39+
| `audio.kl_divergence` | yes | `audio` (path), `reference_audio` (path) |
40+
| `audio.frechet_distance` | **set-vs-set** | `audio` (path), `reference_audio` (path) — accumulated across ≥2 samples; `corpus["audio.frechet_distance"]` carries the score |
41+
| `audio.wer` | yes | `audio` (path), `reference_text` (str) or `text_prompt` (str) |
42+
| `audio.desync` | yes | `video` (decoded tensor or path), `audio` (path) |
43+
| `audio.imagebind_score` | yes | `video_path` (str) **and** `audio` (path) — needs the path, not the pool-decoded tensor, because ImageBind's preprocessing decodes its own clips |
44+
45+
`audio.frechet_distance` is the only set-vs-set metric. The kwargs
46+
form (`ev.evaluate(audio=...)`) raises with a clear message because a
47+
single sample cannot produce a corpus result; use
48+
`ev.evaluate(samples=[...])`.
49+
50+
### Reference repos for audio
51+
52+
The audio set ports its math 1:1 from `hkchengrex/av-benchmark` (the
53+
V2A literature's de-facto eval harness — used by MMAudio, FoleyCrafter,
54+
V2A-Mapper). Per-metric upstream:
55+
56+
| Metric | Upstream |
57+
|---|---|
58+
| `audio.frechet_distance` (PaSST-FAD) | `av_bench/metrics/fad.py::compute_fd` over `hear21passt` 768-d embeds |
59+
| `audio.kl_divergence` | `av_bench/metrics/kl.py::compute_kl` over PaSST 527-d logits |
60+
| `audio.clap_score` | HF `transformers.ClapModel` (`laion/clap-htsat-fused` — closest HF mirror of `630k-audioset-fusion-best`) |
61+
| `audio.audiobox_aesthetics` | `facebookresearch/audiobox-aesthetics` (PQ as primary score, CE/CU/PC/PQ in details) |
62+
| `audio.wer` | MagiHuman-style: NFKC + CJK char-level via `jiwer`, GLM-ASR or Whisper backbone |
63+
| `audio.desync` | `av_bench/synchformer/` (vendored under `third_party/eval/synchformer/`); checkpoint from `hkchengrex/MMAudio/releases/v0.1/synchformer_state_dict.pth` |
64+
| `audio.imagebind_score` | `facebookresearch/ImageBind` (`imagebind_huge` pretrained) |
1665
| Plus `vbench.{color, multiple_objects, object_class, spatial_relationship}` (GRiT) | `uv pip install -e .[eval-vbench]` then `uv pip install --no-build-isolation 'git+https://github.com/facebookresearch/detectron2.git'` |
1766

1867
To use VBench, also pull the upstream submodule:
@@ -80,14 +129,18 @@ fastvideo/
80129
│ ├── base.py # BaseMetric + @register contract
81130
│ ├── common/ # SSIM, PSNR, LPIPS
82131
│ ├── optical_flow/ # gt_optical_flow, synthetic_optical_flow
132+
│ ├── audio/ # clap_score, audiobox_aesthetics, kl_divergence,
133+
│ │ # frechet_distance, wer, desync, imagebind_score
83134
│ ├── videoscore2/ # VideoScore-2 (Qwen2.5-VL)
84135
│ ├── physics_iq/ # PhysicsIQ + sub-metrics
85136
│ └── vbench/ # adapter: sys.path bootstrap + shims
86137
│ ├── __init__.py
87138
│ └── <16 sub-metric pkgs>
88139
└── third_party/
89140
└── eval/
90-
└── vbench/ # git submodule (Vchitect/VBench)
141+
├── vbench/ # git submodule (Vchitect/VBench)
142+
├── synchformer/ # vendored (MIT), used by audio.desync
143+
└── glmasr/ # vendored (Apache-2.0), used by audio.wer (glm_asr)
91144
```
92145

93146
### Prompt datasets
@@ -134,31 +187,29 @@ class YourMetric(BaseMetric):
134187
needs_gpu = False
135188
dependencies: list[str] = [] # e.g. ["pyiqa"] if relevant
136189

137-
def compute(self, sample) -> list[MetricResult]:
190+
def compute(self, sample) -> MetricResult:
138191
...
139192
```
140193

141194
The metric is auto-discovered by `fastvideo/eval/metrics/__init__.py`,
142195
which walks all non-underscore subdirectories and imports their
143196
`metric` module.
144197

145-
### Wrapping upstream code via a submodule
146-
147-
See `fastvideo/eval/metrics/vbench/` for a worked example. The
148-
contract is:
198+
### Wrapping upstream code
149199

150-
1. Upstream lives as a git submodule under
151-
`fastvideo/third_party/eval/<bench>/`, pinned to a SHA in repo-root
152-
`.gitmodules`.
153-
2. The metric package's `__init__.py`
154-
(`fastvideo/eval/metrics/<bench>/__init__.py`) inserts that
155-
submodule path on `sys.path` and installs any compat shims for
156-
modern torch/transformers/numpy. Do not modify upstream files on
157-
disk.
158-
3. Per-sub-metric `metric.py` files use `@register("<bench>.<name>")`.
200+
Three patterns coexist depending on how the upstream ships and what
201+
licence it's under. All three keep upstream files on disk unmodified;
202+
behavioural patches live as runtime shims in the consuming code.
159203

160-
Patches live as Python in the metric's `__init__.py` so they are
161-
grep-able and reviewable.
204+
1. **Git submodule** — large research packages pinned to a SHA, accessed
205+
via `sys.path` bootstrap. See `fastvideo/eval/metrics/vbench/` (with
206+
`fastvideo/third_party/eval/vbench/`).
207+
2. **Vendored under `third_party/eval/<name>/`** — small/surgical upstream
208+
trees with permissive licences (MIT, Apache-2.0). See
209+
`fastvideo/third_party/eval/synchformer/` and `.../glmasr/`.
210+
3. **Git-source via `[tool.uv.sources]`** — license-restricted upstream
211+
that cannot be redistributed in the FastVideo source tree. See
212+
ImageBind (CC BY-NC-SA 4.0) in `pyproject.toml`.
162213

163214
## Caches
164215

@@ -167,7 +218,7 @@ Eval cache root: `${FASTVIDEO_CACHE_ROOT}/eval/`, default
167218

168219
```
169220
${FASTVIDEO_CACHE_ROOT}/eval/
170-
├── models/ # URL-fetched checkpoints (LAION head, AMT, GRiT)
221+
├── models/ # URL-fetched checkpoints (LAION head, GRiT, Synchformer, ImageBind)
171222
├── torch/ # redirected TORCH_HOME (DINO via torch.hub, lpips)
172223
├── clip/ # passed as download_root= to clip.load callsites
173224
└── datasets/ # auto-fetched dataset assets, one subdir per benchmark

fastvideo/eval/metrics/audio/__init__.py

Whitespace-only changes.

fastvideo/eval/metrics/audio/audiobox_aesthetics/__init__.py

Whitespace-only changes.
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
"""AudioBox Aesthetics (CE, CU, PC, PQ).
2+
3+
Thin wrapper around Meta's ``audiobox_aesthetics`` predictor. Returns
4+
four per-clip dimensions (CE — Content Enjoyment, CU — Content
5+
Usefulness, PC — Production Complexity, PQ — Production Quality);
6+
``score`` exposes PQ, the dimension V2A papers typically report on.
7+
All four dimensions (PQ included) are surfaced under ``details``.
8+
9+
The earlier Verse-Bench combined score ``(CE + CU + PQ + (11 − PC)) / 4``
10+
is non-standard and is deliberately not used.
11+
"""
12+
13+
from __future__ import annotations
14+
15+
from typing import Any
16+
17+
import torch
18+
19+
from fastvideo.eval.metrics.base import BaseMetric
20+
from fastvideo.eval.registry import register
21+
from fastvideo.eval.types import MetricResult
22+
23+
24+
@register("audio.audiobox_aesthetics")
class AudioBoxAestheticsMetric(BaseMetric):
    """AudioBox Aesthetics: PQ as the primary score, CE/CU/PC/PQ in details.

    The upstream predictor is created lazily (in ``setup`` or on the first
    ``compute`` call) and is re-pinned onto this metric's device. Every
    failure mode in ``compute`` is reported through ``self._skip`` instead
    of being raised.
    """

    name = "audio.audiobox_aesthetics"
    requires_reference = False
    higher_is_better = True
    needs_gpu = True
    is_set_metric = False
    dependencies = ["audiobox_aesthetics"]

    # Axes read from the predictor output; PQ doubles as the headline score.
    _AXES = ("CE", "CU", "PC", "PQ")

    def __init__(self) -> None:
        super().__init__()
        # Upstream predictor object; stays None until setup() has run.
        self._predictor: Any = None

    def _pin_predictor(self, predictor: Any) -> None:
        """Move ``predictor``'s model onto ``self.device`` and record it there."""
        predictor.model.to(self.device)
        predictor.device = self.device

    def to(self, device):
        """Move the metric to ``device``, taking a loaded predictor along.

        Returns ``self`` so calls can be chained.
        """
        super().to(device)
        if self._predictor is not None:
            self._pin_predictor(self._predictor)
        return self

    def setup(self) -> None:
        """Create the upstream predictor once; later calls are no-ops."""
        if self._predictor is not None:
            return
        # Imported lazily so the module can be imported without the
        # optional ``audiobox_aesthetics`` dependency installed.
        from audiobox_aesthetics.infer import initialize_predictor
        predictor = initialize_predictor()
        # Upstream's setup_model() always lands on default CUDA (cuda:0).
        # Re-pin onto this worker's device so multi-GPU eval actually parallelizes.
        self._pin_predictor(predictor)
        self._predictor = predictor

    @torch.no_grad()
    def compute(self, sample: dict) -> MetricResult:
        """Score the audio file referenced by ``sample["audio"]``.

        Args:
            sample: Sample dict; only the ``"audio"`` key (a filesystem
                path) is read here.

        Returns:
            A ``MetricResult`` whose ``score`` is PQ and whose ``details``
            hold CE, CU, PC and PQ as floats, or the result of
            ``self._skip`` when the input is missing, is not a path, does
            not exist, the predictor fails, or its output cannot be read.
        """
        from pathlib import Path

        if self._predictor is None:
            self.setup()

        audio_path = sample.get("audio")
        if audio_path is None:
            return self._skip(sample, "missing 'audio'")
        try:
            found = Path(audio_path).exists()
        except TypeError:
            # Path() rejects anything that is not str / os.PathLike (e.g. an
            # already-decoded tensor); report it as a skip, not a crash.
            return self._skip(
                sample,
                f"'audio' must be a filesystem path, got {type(audio_path).__name__}",
            )
        if not found:
            return self._skip(sample, f"audio file not found: {audio_path}")

        try:
            score = self._predictor.forward([{"path": audio_path}])[0]
        except Exception as exc:  # pragma: no cover — upstream raises vary
            return self._skip(sample, f"audiobox predictor failed: {type(exc).__name__}: {exc}")

        try:
            axes = {axis: float(score[axis]) for axis in self._AXES}
        except (KeyError, TypeError, ValueError) as exc:
            # ValueError covers values float() cannot convert, which the
            # original (KeyError, TypeError) handler let escape.
            return self._skip(sample, f"audiobox returned unexpected shape: {exc}")

        return MetricResult(
            name=self.name,
            score=axes["PQ"],
            details=axes,
        )

fastvideo/eval/metrics/audio/clap_score/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)