hao-ai-lab
diff --git a/‎docs/contributing/eval-metrics.md‎
Lines changed: 17 additions & 9 deletions b/‎docs/contributing/eval-metrics.md‎
Lines changed: 17 additions & 9 deletions
diff --git a/‎examples/inference/eval/basic_ltx2_audio_eval.py‎
Lines changed: 76 additions & 0 deletions b/‎examples/inference/eval/basic_ltx2_audio_eval.py‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎fastvideo/eval/README.md‎
Lines changed: 73 additions & 22 deletions b/‎fastvideo/eval/README.md‎
Lines changed: 73 additions & 22 deletions
diff --git a/‎fastvideo/eval/metrics/audio/__init__.py‎ b/‎fastvideo/eval/metrics/audio/__init__.py‎
diff --git a/‎fastvideo/eval/metrics/audio/audiobox_aesthetics/__init__.py‎ b/‎fastvideo/eval/metrics/audio/audiobox_aesthetics/__init__.py‎
diff --git a/‎fastvideo/eval/metrics/audio/audiobox_aesthetics/metric.py‎
Lines changed: 86 additions & 0 deletions b/‎fastvideo/eval/metrics/audio/audiobox_aesthetics/metric.py‎
Lines changed: 86 additions & 0 deletions
diff --git a/‎fastvideo/eval/metrics/audio/clap_score/__init__.py‎ b/‎fastvideo/eval/metrics/audio/clap_score/__init__.py‎
@@ -11,26 +11,33 @@ Use this guide when you are:
 - Adding a new metric (native or wrapping a third-party library).
 - Porting a benchmark (e.g. VBench, MIND, EvalCrafter) whose Python
   code needs to be importable from a pinned upstream.
-- Adding a new metric group (audio, vlm, etc.).
+- Adding a new metric group (audio, videoscore2, etc.).
 
 ## TL;DR
 
 Metrics are auto-discovered from
 `fastvideo/eval/metrics/<group>/<name>/metric.py`. Each declares itself
-with `@register("<group>.<name>")` and subclasses `BaseMetric`. Three
-recipes:
+with `@register("<group>.<name>")` and subclasses `BaseMetric`. Five
+recipes, depending on how the metric ships and what its licence allows:
 
 1. **Native metric** (pure-PyTorch, no submodule). Drop a file,
    declare deps, implement `compute(sample)`.
 2. **Library-wrapped metric** (CLIP, torch.hub, transformers, pyiqa).
    Same as above, plus route the library's cache through
    `get_cache_dir()` if it has a `download_root=` / `cache_dir=`
    kwarg.
-3. **Upstream-submodule-wrapped metric** (vbench-style). Pin upstream
-   as a git submodule under `fastvideo/third_party/eval/<bench>/`. The
-   adapter `__init__.py` does the `sys.path` insert and any runtime
-   compat shims for modern dep versions. Patches live as Python in
-   that file rather than as on-disk patches to the submodule.
+3. **Submodule-wrapped metric** (vbench-style). Pin upstream as a git
+   submodule under `fastvideo/third_party/eval/<bench>/`. The adapter
+   `__init__.py` does the `sys.path` insert and any runtime compat
+   shims. Best for large research packages with stable layouts.
+4. **Vendored upstream** (synchformer / glmasr-style). Copy a small,
+   surgical piece of upstream into `fastvideo/third_party/eval/<name>/`
+   with its `LICENSE` alongside. Best for permissive-licensed
+   (MIT / Apache-2.0) source you need a few files from.
+5. **Git-source dep via `[tool.uv.sources]`** (ImageBind-style). For
+   license-restricted upstream (e.g. CC BY-NC-SA) that cannot be
+   redistributed in the FastVideo tree. uv pulls the source at install
+   time pinned to a SHA in `pyproject.toml`.
 
 The full recipes are below.
 
@@ -43,7 +50,8 @@ fastvideo/eval/metrics/
 ├── base.py                  # BaseMetric + lifecycle contract
 ├── common/                  # group: SSIM, PSNR, LPIPS
 ├── optical_flow/            # group: gt_optical_flow, synthetic_optical_flow
-├── vlm/                     # group: VideoScore-2
+├── audio/                   # group: CLAP, AudioBox, KL, FAD, WER, DeSync, ImageBind
+├── videoscore2/             # VideoScore-2 (single metric at group level)
 ├── physics_iq/              # group + sub-metrics
 └── vbench/                  # group: 16 sub-metrics
     ├── __init__.py          # sys.path bootstrap + runtime compat shims
 
@@ -0,0 +1,76 @@
+"""Generate one LTX2 video and score its audio track.
+
+LTX2 produces video + audio and muxes both into the output mp4. The
+eval suite reads the audio track straight out of that mp4 (librosa
+decodes via ffmpeg under the hood), so we just point the audio
+metrics at the same path.
+
+Only the reference-free audio metrics run here — they're the ones
+that make sense for a single one-shot generation:
+
+  * ``audio.clap_score``         — CLAP text↔audio cosine similarity
+  * ``audio.audiobox_aesthetics`` — AudioBox 4-axis quality score
+
+The other audio metrics need extra inputs we don't have for a
+one-shot run: ``audio.frechet_distance`` and ``audio.kl_divergence``
+need a reference audio, ``audio.wer`` needs a ground-truth transcript.
+
+Install: ``uv pip install -e .[eval-audio]`` covers both metrics here
+(and the rest of the audio suite).
+"""
+import torch
+
+from fastvideo import VideoGenerator
+from fastvideo.eval import create_evaluator
+
+# Text prompt for the generated clip. main() passes it to the generator as
+# ``prompt`` and to the evaluator as ``text_prompt``.
+PROMPT = (
+    "A warm sunny backyard. The camera starts in a tight cinematic close-up "
+    "of a woman and a man in their 30s, facing each other with serious "
+    "expressions. The woman, emotional and dramatic, says softly, \"That's "
+    "it... Dad's lost it. And we've lost Dad.\" The man exhales, slightly "
+    "annoyed: \"Stop being so dramatic, Jess.\" A beat. He glances aside, "
+    "then mutters defensively, \"He's just having fun.\""
+)
+
+# Registry names of the metrics main() evaluates and reports, in print order.
+METRICS = [
+    "audio.clap_score",
+    "audio.audiobox_aesthetics",
+]
+
+
+def main() -> None:
+    """Generate one LTX2 clip and print audio scores for its muxed track.
+
+    The clip is written to ``outputs_video/ltx2_audio_eval/output.mp4``.
+    The generator is shut down and the CUDA cache emptied before the
+    evaluator is built. Every metric in ``METRICS`` is then printed either
+    as a score followed by its ``details`` entries, or as ``SKIPPED`` with
+    the reason found under ``details['skipped']``.
+    """
+    generator = VideoGenerator.from_pretrained(
+        "Davids048/LTX2-Base-Diffusers",
+        num_gpus=1,
+    )
+
+    output_path = "outputs_video/ltx2_audio_eval/output.mp4"
+    generator.generate_video(
+        prompt=PROMPT,
+        output_path=output_path,
+        save_video=True,
+        num_frames=121,
+        height=1088,
+        width=1920,
+    )
+    generator.shutdown()
+    torch.cuda.empty_cache()
+
+    print(f"\n[eval] building evaluator: {METRICS}")
+    evaluator = create_evaluator(metrics=METRICS)
+    # The same mp4 is passed as ``audio``; the prompt doubles as the caption.
+    results = evaluator.evaluate(audio=output_path, text_prompt=PROMPT)
+
+    print("\n=== Audio scores ===")
+    for name in METRICS:
+        r = results[name]
+        # Normalise once: a result may carry no details at all, and the
+        # SKIPPED branch must not crash on ``None.get``.
+        details = r.details or {}
+        if r.score is None:
+            print(f"  {name}: SKIPPED ({details.get('skipped', 'no score')})")
+            continue
+        print(f"  {name}: {r.score:.4f}")
+        for k, v in details.items():
+            print(f"      {k}: {v}")
+
+if __name__ == "__main__":
+    main()
@@ -2,17 +2,66 @@
 
 In-process evaluation suite for video generations. Includes pixel
 metrics (SSIM, PSNR, LPIPS), optical-flow comparisons, the full VBench
-suite, Physics-IQ, and a VLM scorer (VideoScore-2) behind a single
+suite, Physics-IQ, audio metrics, and a VLM scorer behind a single
 registry-driven API.
 
 ## Install
 
 | Use case | Install |
 |---|---|
-| Default (common, optical_flow, vbench-light, physics_iq, videoscore2) | `uv pip install -e .[eval]` |
-| Just VBench (12 of 16 sub-metrics) | `uv pip install -e .[eval-vbench]` |
+| Default (common, optical_flow, vbench, physics_iq, videoscore2) | `uv pip install -e .[eval]` |
+| Just VBench (11 of 16 by default; +4 with detectron2) | `uv pip install -e .[eval-vbench]` |
 | Just Physics-IQ (covered by `[eval]`) | `uv pip install -e .[eval-physics-iq]` |
-| Plus `vbench.scene` (AVoCaDO) | `uv pip install -e .[eval-full]` |
+| Audio metrics (CLAP, FAD, KL, WER, AudioBox, DeSync, ImageBind) | `uv pip install -e .[eval-audio]` |
+| Everything: `[eval]` + `[eval-audio]` + `vbench.scene` (AVoCaDO) | `uv pip install -e .[eval-full]` |
+
+`[eval-audio]` covers every `audio.*` metric. ImageBind
+(`facebookresearch/ImageBind`, CC BY-NC-SA 4.0) is git-sourced via
+`[tool.uv.sources]` rather than vendored. `torchaudio` at the cu128
+wheel is pulled transitively by `audiobox_aesthetics`; on cu128 hosts
+using raw `pip`, install `torchaudio` from
+`https://download.pytorch.org/whl/cu128` first.
+
+`audio.desync` and `audio.wer (glm_asr)` import vendored upstream from
+`fastvideo/third_party/eval/synchformer/` (MIT) and
+`fastvideo/third_party/eval/glmasr/` (Apache-2.0). Both trees keep
+their upstream `LICENSE` files alongside.
+
+### `audio.*` metric input contracts
+
+Every audio metric reads from these sample-dict keys (extra keys are
+ignored):
+
+| Metric | Per-sample? | Required keys |
+|---|---|---|
+| `audio.clap_score` | yes | `audio` (path), `text_prompt` (str) |
+| `audio.audiobox_aesthetics` | yes | `audio` (path) |
+| `audio.kl_divergence` | yes | `audio` (path), `reference_audio` (path) |
+| `audio.frechet_distance` | **set-vs-set** | `audio` (path), `reference_audio` (path) — accumulated across ≥2 samples; `corpus["audio.frechet_distance"]` carries the score |
+| `audio.wer` | yes | `audio` (path), `reference_text` (str) or `text_prompt` (str) |
+| `audio.desync` | yes | `video` (decoded tensor or path), `audio` (path) |
+| `audio.imagebind_score` | yes | `video_path` (str) **and** `audio` (path) — needs the path, not the pool-decoded tensor, because ImageBind's preprocessing decodes its own clips |
+
+`audio.frechet_distance` is the only set-vs-set metric. The kwargs
+form (`ev.evaluate(audio=...)`) raises with a clear message because a
+single sample cannot produce a corpus result; use
+`ev.evaluate(samples=[...])`.
+
+### Reference repos for audio
+
+The audio set ports its math 1:1 from `hkchengrex/av-benchmark` (the
+V2A literature's de-facto eval harness — used by MMAudio, FoleyCrafter,
+V2A-Mapper). Per-metric upstream:
+
+| Metric | Upstream |
+|---|---|
+| `audio.frechet_distance` (PaSST-FAD) | `av_bench/metrics/fad.py::compute_fd` over `hear21passt` 768-d embeds |
+| `audio.kl_divergence` | `av_bench/metrics/kl.py::compute_kl` over PaSST 527-d logits |
+| `audio.clap_score` | HF `transformers.ClapModel` (`laion/clap-htsat-fused` — closest HF mirror of `630k-audioset-fusion-best`) |
+| `audio.audiobox_aesthetics` | `facebookresearch/audiobox-aesthetics` (PQ as primary score, CE/CU/PC in details) |
+| `audio.wer` | MagiHuman-style: NFKC + CJK char-level via `jiwer`, GLM-ASR or Whisper backbone |
+| `audio.desync` | `av_bench/synchformer/` (vendored under `third_party/eval/synchformer/`); checkpoint from `hkchengrex/MMAudio/releases/v0.1/synchformer_state_dict.pth` |
+| `audio.imagebind_score` | `facebookresearch/ImageBind` (`imagebind_huge` pretrained) |
+
+| Use case | Install |
+|---|---|
 | Plus `vbench.{color, multiple_objects, object_class, spatial_relationship}` (GRiT) | `uv pip install -e .[eval-vbench]` then `uv pip install --no-build-isolation 'git+https://github.com/facebookresearch/detectron2.git'` |
 
 To use VBench, also pull the upstream submodule:
@@ -80,14 +129,18 @@ fastvideo/
 │       ├── base.py                # BaseMetric + @register contract
 │       ├── common/                # SSIM, PSNR, LPIPS
 │       ├── optical_flow/          # gt_optical_flow, synthetic_optical_flow
+│       ├── audio/                 # clap_score, audiobox_aesthetics, kl_divergence,
+│       │                          # frechet_distance, wer, desync, imagebind_score
 │       ├── videoscore2/           # VideoScore-2 (Qwen2.5-VL)
 │       ├── physics_iq/            # PhysicsIQ + sub-metrics
 │       └── vbench/                # adapter: sys.path bootstrap + shims
 │           ├── __init__.py
 │           └── <16 sub-metric pkgs>
 └── third_party/
     └── eval/
-        └── vbench/                # git submodule (Vchitect/VBench)
+        ├── vbench/                # git submodule (Vchitect/VBench)
+        ├── synchformer/           # vendored (MIT), used by audio.desync
+        └── glmasr/                # vendored (Apache-2.0), used by audio.wer (glm_asr)
 ```
 
 ### Prompt datasets
@@ -134,31 +187,29 @@ class YourMetric(BaseMetric):
     needs_gpu = False
     dependencies: list[str] = []  # e.g. ["pyiqa"] if relevant
 
-    def compute(self, sample) -> list[MetricResult]:
+    def compute(self, sample) -> MetricResult:
         ...
 ```
 
 The metric is auto-discovered by `fastvideo/eval/metrics/__init__.py`,
 which walks all non-underscore subdirectories and imports their
 `metric` module.
 
-### Wrapping upstream code via a submodule
-
-See `fastvideo/eval/metrics/vbench/` for a worked example. The
-contract is:
+### Wrapping upstream code
 
-1. Upstream lives as a git submodule under
-   `fastvideo/third_party/eval/<bench>/`, pinned to a SHA in repo-root
-   `.gitmodules`.
-2. The metric package's `__init__.py`
-   (`fastvideo/eval/metrics/<bench>/__init__.py`) inserts that
-   submodule path on `sys.path` and installs any compat shims for
-   modern torch/transformers/numpy. Do not modify upstream files on
-   disk.
-3. Per-sub-metric `metric.py` files use `@register("<bench>.<name>")`.
+Three patterns coexist depending on how the upstream ships and what
+licence it's under. All three keep upstream files on disk unmodified;
+behavioural patches live as runtime shims in the consuming code.
 
-Patches live as Python in the metric's `__init__.py` so they are
-grep-able and reviewable.
+1. **Git submodule** — large research packages pinned to a SHA, accessed
+   via `sys.path` bootstrap. See `fastvideo/eval/metrics/vbench/` (with
+   `fastvideo/third_party/eval/vbench/`).
+2. **Vendored under `third_party/eval/<name>/`** — small/surgical upstream
+   trees with permissive licences (MIT, Apache-2.0). See
+   `fastvideo/third_party/eval/synchformer/` and `.../glmasr/`.
+3. **Git-source via `[tool.uv.sources]`** — license-restricted upstream
+   that cannot be redistributed in the FastVideo source tree. See
+   ImageBind (CC BY-NC-SA 4.0) in `pyproject.toml`.
 
 ## Caches
 
@@ -167,7 +218,7 @@ Eval cache root: `${FASTVIDEO_CACHE_ROOT}/eval/`, default
 
 ```
 ${FASTVIDEO_CACHE_ROOT}/eval/
-├── models/      # URL-fetched checkpoints (LAION head, AMT, GRiT)
+├── models/      # URL-fetched checkpoints (LAION head, GRiT, Synchformer, ImageBind)
 ├── torch/       # redirected TORCH_HOME (DINO via torch.hub, lpips)
 ├── clip/        # passed as download_root= to clip.load callsites
 └── datasets/    # auto-fetched dataset assets, one subdir per benchmark
 
@@ -0,0 +1,86 @@
+"""AudioBox Aesthetics (CE, CU, PC, PQ).
+
+Thin wrapper around Meta's ``audiobox_aesthetics`` predictor. Returns
+four per-clip dimensions (CE — Content Enjoyment, CU — Content
+Usefulness, PC — Production Complexity, PQ — Production Quality);
+``score`` exposes PQ, the dimension V2A papers typically report on.
+All four dimensions (PQ included) are surfaced under ``details``.
+
+The earlier Verse-Bench combined score ``(CE + CU + PQ + (11 − PC)) / 4``
+is non-standard and is deliberately not used.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import torch
+
+from fastvideo.eval.metrics.base import BaseMetric
+from fastvideo.eval.registry import register
+from fastvideo.eval.types import MetricResult
+
+
+@register("audio.audiobox_aesthetics")
+class AudioBoxAestheticsMetric(BaseMetric):
+    """AudioBox Aesthetics: PQ as the primary score, CE/CU/PC/PQ in details.
+
+    The upstream predictor is built lazily: ``setup()`` creates it on
+    first use and ``compute()`` calls ``setup()`` itself when needed.
+    """
+
+    name = "audio.audiobox_aesthetics"
+    requires_reference = False
+    higher_is_better = True
+    needs_gpu = True
+    is_set_metric = False
+    dependencies = ["audiobox_aesthetics"]
+
+    def __init__(self) -> None:
+        super().__init__()
+        # Upstream predictor object; stays None until setup() has run.
+        self._predictor: Any = None
+
+    def to(self, device):
+        """Move the metric to ``device`` and re-pin the predictor if built.
+
+        Returns ``self`` so calls can be chained.
+        """
+        super().to(device)
+        if self._predictor is not None:
+            self._predictor.model.to(self.device)
+            self._predictor.device = self.device
+        return self
+
+    def setup(self) -> None:
+        """Create the upstream predictor once and pin it to ``self.device``.
+
+        Calling this again after the predictor exists is a no-op.
+        """
+        if self._predictor is not None:
+            return
+        from audiobox_aesthetics.infer import initialize_predictor
+        predictor = initialize_predictor()
+        # Upstream's setup_model() always lands on default CUDA (cuda:0).
+        # Re-pin onto this worker's device so multi-GPU eval actually parallelizes.
+        predictor.model.to(self.device)
+        predictor.device = self.device
+        self._predictor = predictor
+
+    @torch.no_grad()
+    def compute(self, sample: dict) -> MetricResult:
+        """Score the audio file named by ``sample["audio"]``.
+
+        Args:
+            sample: Sample dict; only the ``"audio"`` key (a filesystem
+                path) is read here.
+
+        Returns:
+            A ``MetricResult`` whose ``score`` is PQ and whose ``details``
+            hold CE, CU, PC and PQ as floats. When the audio is missing,
+            is not path-like, does not exist, or the predictor fails or
+            returns an unexpected structure, the result of
+            ``self._skip(sample, reason)`` is returned instead.
+        """
+        from pathlib import Path
+
+        # Validate the sample first so an unscorable sample never triggers
+        # the lazy model load below.
+        audio_path = sample.get("audio")
+        if audio_path is None:
+            return self._skip(sample, "missing 'audio'")
+        try:
+            audio_exists = Path(audio_path).exists()
+        except TypeError:
+            # Path() rejects anything that is not str / os.PathLike.
+            return self._skip(sample, f"'audio' must be a path, got {type(audio_path).__name__}")
+        if not audio_exists:
+            return self._skip(sample, f"audio file not found: {audio_path}")
+
+        if self._predictor is None:
+            self.setup()
+
+        try:
+            score = self._predictor.forward([{"path": audio_path}])[0]
+        except Exception as exc:  # pragma: no cover — upstream raises vary
+            return self._skip(sample, f"audiobox predictor failed: {type(exc).__name__}: {exc}")
+        try:
+            # Convert each axis exactly once; PQ is reused as the headline score.
+            axes = {key: float(score[key]) for key in ("CE", "CU", "PC", "PQ")}
+        except (KeyError, TypeError) as exc:
+            return self._skip(sample, f"audiobox returned unexpected shape: {exc}")
+        return MetricResult(
+            name=self.name,
+            score=axes["PQ"],
+            details=axes,
+        )