Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 34 additions & 6 deletions docs/evaluation/speech-audio.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,40 @@ MMAU-Pro (Multimodal Audio Understanding - Pro) is a comprehensive benchmark for

These benchmarks require audio files for meaningful evaluation. **Audio files are downloaded by default** to ensure proper evaluation.

### Audio path convention

Prepared audio manifests should record audio paths under the in-container
audio root, not the host filesystem path. The default in-container root is
`/data`. Pass `--audio-prefix` to override it when a different mount point is
needed; if `--audio-prefix` is omitted, prepare scripts fall back to the
`NEMO_SKILLS_AUDIO_ROOT` environment variable and then to `/data`.

`--audio-prefix` is the global in-container audio root. Do not include the
benchmark name in it; the prepare script appends the benchmark directory.
For example, use `--audio-prefix /data`, not
`--audio-prefix /data/contextasr-bench`.

Preparing data with `--audio-prefix /data` therefore writes manifest paths
like:

```text
/data/asr-leaderboard/...
/data/contextasr-bench/...
```

At evaluation time, mount the host directory that contains the prepared data
at the same in-container root:

```bash
ns eval ... --data_dir=/lustre/.../skills_data --mount-paths=/lustre/.../skills_data:/data
```

### Data Preparation

To prepare the dataset with audio files:

```bash
ns prepare_data asr-leaderboard --data_dir=/path/to/data --cluster=<cluster>
ns prepare_data asr-leaderboard --data_dir=/path/to/data --cluster=<cluster> --audio-prefix=/data
```

Prepare specific datasets only:
Expand Down Expand Up @@ -74,7 +102,7 @@ eval(
model="/workspace/checkpoint",
server_entrypoint="/workspace/megatron-lm/server.py",
server_container="/path/to/container.sqsh",
data_dir="/dataset",
data_dir="/data",
installation_command="pip install -r requirements/audio.txt",
server_args="--inference-max-requests 1 --model-config /workspace/checkpoint/config.yaml",
)
Expand All @@ -98,7 +126,7 @@ eval(benchmarks="asr-leaderboard", split="librispeech_clean", ...)
--model=/workspace/path/to/checkpoint \
--server_entrypoint=/workspace/megatron-lm/server.py \
--server_container=/path/to/container.sqsh \
--data_dir=/dataset \
--data_dir=/data \
--installation_command="pip install -r requirements/audio.txt"
```

Expand All @@ -120,7 +148,7 @@ eval(
model="/workspace/checkpoint",
server_entrypoint="/workspace/megatron-lm/server.py",
server_container="/path/to/container.sqsh",
data_dir="/dataset",
data_dir="/data",
installation_command="pip install sacrebleu",
server_args="--inference-max-requests 1 --model-config /workspace/checkpoint/config.yaml",
)
Expand Down Expand Up @@ -150,7 +178,7 @@ eval(benchmarks="mmau-pro.closed_form", ...)
--model=/workspace/path/to/checkpoint \
--server_entrypoint=/workspace/megatron-lm/server.py \
--server_container=/path/to/container.sqsh \
--data_dir=/dataset \
--data_dir=/data \
--installation_command="pip install sacrebleu"
```

Expand Down Expand Up @@ -534,7 +562,7 @@ used directly. If the file is missing, data is downloaded there automatically.
To use a custom audio path prefix (e.g., for container mount points):

```bash
ns prepare_data contextasr-bench --data_dir=/path/to/ContextASR-Bench --audio-prefix /data/contextasr
ns prepare_data contextasr-bench --data_dir=/path/to/ContextASR-Bench --audio-prefix /data
```

### Running ContextASR-Bench Evaluation
Expand Down
59 changes: 51 additions & 8 deletions nemo_skills/dataset/asr-leaderboard/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
same data source used by the official leaderboard and the offline NeMo eval
pipeline, ensuring apples-to-apples WER comparison.

Audio paths in JSONL: /dataset/asr-leaderboard/data/{dataset}/{sample_id}.flac
Audio paths in JSONL: {audio_prefix}/asr-leaderboard/data/{dataset}/{sample_id}.flac

Usage:
ns prepare_data asr-leaderboard
Expand All @@ -36,6 +36,12 @@
from datasets import load_dataset
from tqdm import tqdm

from nemo_skills.dataset.utils import (
DEFAULT_CONTAINER_AUDIO_ROOT,
build_container_audio_path,
get_container_audio_root,
)

SYSTEM_MESSAGE = "You are a helpful assistant. /no_think"
MIN_AUDIO_DURATION = 0.1 # Skip audio shorter than this (causes mel spectrogram errors)

Expand All @@ -53,7 +59,14 @@


def save_audio_and_format_entry(
entry, dataset_name, audio_dir, sample_idx, text_field="text", id_field="id", with_audio=True
entry,
dataset_name,
audio_dir,
sample_idx,
text_field="text",
id_field="id",
with_audio=True,
audio_root=DEFAULT_CONTAINER_AUDIO_ROOT,
):
"""Format a dataset entry and optionally save audio file."""
text = entry[text_field].strip()
Expand All @@ -77,7 +90,11 @@ def save_audio_and_format_entry(
if with_audio:
sf.write(str(audio_dir / audio_filename), audio_array, sampling_rate)

audio_meta = {"path": f"/dataset/asr-leaderboard/data/{dataset_name}/{audio_filename}"}
audio_meta = {
"path": build_container_audio_path(
"asr-leaderboard", "data", dataset_name, audio_filename, audio_prefix=audio_root
)
}
if duration is not None:
audio_meta["duration"] = float(duration)
user_message["audio"] = audio_meta
Expand All @@ -96,7 +113,7 @@ def save_audio_and_format_entry(
return formatted_entry


def prepare_dataset(dataset_name, output_dir, with_audio=True):
def prepare_dataset(dataset_name, output_dir, with_audio=True, audio_root=DEFAULT_CONTAINER_AUDIO_ROOT):
"""Prepare a single ASR dataset."""
if dataset_name not in DATASET_CONFIGS:
raise ValueError(f"Unknown dataset: {dataset_name}. Available: {list(DATASET_CONFIGS.keys())}")
Expand All @@ -120,7 +137,14 @@ def prepare_dataset(dataset_name, output_dir, with_audio=True):
with open(output_file, "w", encoding="utf-8") as fout:
for idx, entry in enumerate(tqdm(dataset, desc=dataset_name)):
formatted = save_audio_and_format_entry(
entry, dataset_name, audio_dir, idx, text_field=text_field, id_field=id_field, with_audio=with_audio
entry,
dataset_name,
audio_dir,
idx,
text_field=text_field,
id_field=id_field,
with_audio=with_audio,
audio_root=audio_root,
)
if formatted is None:
skipped += 1
Expand All @@ -145,29 +169,48 @@ def main():
choices=list(DATASET_CONFIGS.keys()) + ["all"],
help="Datasets to prepare (default: all)",
)
parser.add_argument(
"--data_dir",
type=str,
default=None,
help=(
"Output directory. If provided, outputs go under <data_dir>/asr-leaderboard. "
"If omitted, writes into this package's dataset directory."
),
)
parser.add_argument(
"--no-audio",
action="store_true",
help="Skip saving audio files (JSONL still includes audio paths)",
)
parser.add_argument(
"--audio-prefix",
type=str,
default=None,
help="In-container audio root written into JSONL paths. Defaults to $NEMO_SKILLS_AUDIO_ROOT or /data.",
)
args = parser.parse_args()

data_dir = Path("/dataset/asr-leaderboard")
output_dir = data_dir if data_dir.exists() else Path(__file__).parent
if args.data_dir:
output_dir = Path(args.data_dir) / "asr-leaderboard"
else:
output_dir = Path(__file__).parent
output_dir.mkdir(parents=True, exist_ok=True)

with_audio = not args.no_audio
audio_root = get_container_audio_root(args.audio_prefix)

if args.no_audio:
print("Running without saving audio files.")
else:
print("Running with audio. Saving to data/{dataset}/")
print(f"Audio paths in JSONL will use: {audio_root}/asr-leaderboard/data/...")

datasets_to_prepare = list(DATASET_CONFIGS.keys()) if "all" in args.datasets else args.datasets

total_samples = 0
for dataset_name in datasets_to_prepare:
total_samples += prepare_dataset(dataset_name, output_dir, with_audio=with_audio)
total_samples += prepare_dataset(dataset_name, output_dir, with_audio=with_audio, audio_root=audio_root)

# Combine all dataset JSONLs into test.jsonl
combined_file = output_dir / "test.jsonl"
Expand Down
24 changes: 21 additions & 3 deletions nemo_skills/dataset/audiobench/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@
import soundfile as sf
from tqdm import tqdm

from nemo_skills.dataset.utils import (
DEFAULT_CONTAINER_AUDIO_ROOT,
build_container_audio_path,
get_container_audio_root,
)

# AudioBench datasets categorized by evaluation type
JUDGE_DATASETS = [
"alpaca_audio_test",
Expand Down Expand Up @@ -140,6 +146,7 @@ def create_manifest_entry(
dataset_name: str,
sample_id: int,
category: str,
audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
) -> Dict:
"""Create a nemo-skills compatible manifest entry.

Expand All @@ -158,9 +165,9 @@ def create_manifest_entry(
reference = sample.get("reference", sample.get("answer", ""))
task_type = sample.get("task_type", "unknown")

# Create absolute audio path with /data/ prefix for cluster deployment
# Format: /data/audiobench/{category}/audio/{dataset_name}/{filename}
audio_rel_path = f"/data/audiobench/{category}/audio/{dataset_name}/{audio_filename}"
audio_rel_path = build_container_audio_path(
"audiobench", category, "audio", dataset_name, audio_filename, audio_prefix=audio_root
)

# Create audio metadata (both singular and plural forms for compatibility)
audio_metadata = {"path": audio_rel_path, "duration": duration}
Expand Down Expand Up @@ -209,6 +216,7 @@ def process_dataset(
save_audio: bool = True,
split: str = "test",
max_samples: int = -1,
audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
) -> tuple[int, List[Dict]]:
"""Process a single AudioBench dataset.

Expand Down Expand Up @@ -459,6 +467,7 @@ def process_dataset(
dataset_name=dataset_name,
sample_id=idx,
category=category,
audio_root=audio_root,
)

manifest_entries.append(entry)
Expand Down Expand Up @@ -519,6 +528,12 @@ def main():
default=-1,
help="Maximum number of samples to process per dataset (-1 for all)",
)
parser.add_argument(
"--audio-prefix",
type=str,
default=None,
help="In-container audio root written into JSONL paths. Defaults to $NEMO_SKILLS_AUDIO_ROOT or /data.",
)
parser.set_defaults(save_audio=True)

args = parser.parse_args()
Expand All @@ -531,6 +546,7 @@ def main():
output_dir = Path(__file__).parent

output_dir.mkdir(parents=True, exist_ok=True)
audio_root = get_container_audio_root(args.audio_prefix)

print("\n" + "=" * 60)
print("AudioBench Dataset Preparation")
Expand All @@ -539,6 +555,7 @@ def main():
print(f"Output directory: {output_dir}")
print(f"Save audio files: {args.save_audio}")
print(f"Split: {args.split}")
print(f"Audio paths in JSONL will use: {audio_root}/audiobench/...")
print("=" * 60 + "\n")

# Determine which datasets to process
Expand Down Expand Up @@ -585,6 +602,7 @@ def main():
save_audio=args.save_audio,
split=args.split,
max_samples=args.max_samples,
audio_root=audio_root,
)
total_samples += num_samples
total_datasets += 1
Expand Down
21 changes: 15 additions & 6 deletions nemo_skills/dataset/contextasr-bench/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@
import tarfile
from pathlib import Path

from nemo_skills.dataset.utils import build_container_audio_path

BENCHMARK_NAME = "contextasr-bench"
HF_REPO_ID = "MrSupW/ContextASR-Bench"
JSONL_FILENAME = "ContextASR-Speech_English.jsonl"
AUDIO_TAR_PREFIX = "audio/ContextASR-Speech/English/ContextASR-Speech_English"
Expand Down Expand Up @@ -142,6 +145,11 @@ def build_messages(prompt_text, audio_path, duration):
]


def resolve_audio_prefix(audio_prefix: str | None = None) -> str:
    """Build the ContextASR-Bench audio path prefix written into JSONL paths.

    The result is whatever ``build_container_audio_path`` produces for
    ``BENCHMARK_NAME`` under the supplied ``audio_prefix``.

    NOTE(review): ``audio_prefix`` is forwarded unchanged and may be ``None``
    when ``--audio-prefix`` is omitted; presumably the helper then falls back
    to ``$NEMO_SKILLS_AUDIO_ROOT`` or ``/data`` -- confirm against
    ``nemo_skills.dataset.utils``.
    """
    benchmark_prefix = build_container_audio_path(BENCHMARK_NAME, audio_prefix=audio_prefix)
    return benchmark_prefix


def format_entry(sample, mode, audio_prefix):
"""Format a single dataset sample into a JSONL record for a given mode."""
audio_path = f"{audio_prefix}/{sample['audio']}"
Expand Down Expand Up @@ -191,8 +199,9 @@ def main():
default=None,
help=(
"Override audio path prefix written into JSONL files. "
"Defaults to the data_dir value. Useful for container mount points "
"(e.g., --audio-prefix /data/contextasr-bench)."
"This is the global in-container audio root; the script appends "
f"{BENCHMARK_NAME}/. Defaults to $NEMO_SKILLS_AUDIO_ROOT or /data "
f"(e.g., --audio-prefix /data writes /data/{BENCHMARK_NAME}/...)."
),
)
parser.add_argument(
Expand Down Expand Up @@ -220,8 +229,7 @@ def main():
print(f"Data not found at {data_dir}. Downloading there...")
download_dataset(data_dir)

audio_prefix = args.audio_prefix if args.audio_prefix else str(data_dir)
audio_prefix = audio_prefix.rstrip("/")
audio_prefix = resolve_audio_prefix(args.audio_prefix)

jsonl_path = data_dir / JSONL_FILENAME

Expand All @@ -236,14 +244,15 @@ def main():
print(f"Loaded {len(samples)} samples")

if not args.no_audio:
sample_audio = Path(audio_prefix) / samples[0]["audio"]
sample_audio = data_dir / samples[0]["audio"]
if not sample_audio.exists():
print(
f"WARNING: Sample audio file not found at {sample_audio}. "
f"Audio paths may need adjustment via --audio-prefix."
f"Audio paths may need adjustment via --data_dir."
)
else:
print(f"Audio files verified (sample check: {sample_audio})")
print(f"Audio paths in JSONL will use: {audio_prefix}/...")

modes = {
"contextless": output_dir / "contextless" / "test.jsonl",
Expand Down
Loading
Loading