Skip to content

Commit f5900a3

Browse files
committed
Use shared audio root defaults in prepare docs
Signed-off-by: Dongji Gao <dongjig@nvidia.com>
1 parent 0aa29b3 commit f5900a3

9 files changed

Lines changed: 69 additions & 23 deletions

File tree

docs/evaluation/speech-audio.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -178,7 +178,7 @@ eval(benchmarks="mmau-pro.closed_form", ...)
178178
--model=/workspace/path/to/checkpoint \
179179
--server_entrypoint=/workspace/megatron-lm/server.py \
180180
--server_container=/path/to/container.sqsh \
181-
--data_dir=/dataset \
181+
--data_dir=/data \
182182
--installation_command="pip install sacrebleu"
183183
```
184184

@@ -562,7 +562,7 @@ used directly. If the file is missing, data is downloaded there automatically.
562562
To use a custom audio path prefix (e.g., for container mount points):
563563

564564
```bash
565-
ns prepare_data contextasr-bench --data_dir=/path/to/ContextASR-Bench --audio-prefix /data/contextasr
565+
ns prepare_data contextasr-bench --data_dir=/path/to/ContextASR-Bench --audio-prefix /data
566566
```
567567

568568
### Running ContextASR-Bench Evaluation

nemo_skills/dataset/asr-leaderboard/prepare.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,11 @@
3636
from datasets import load_dataset
3737
from tqdm import tqdm
3838

39-
from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root
39+
from nemo_skills.dataset.utils import (
40+
DEFAULT_CONTAINER_AUDIO_ROOT,
41+
build_container_audio_path,
42+
get_container_audio_root,
43+
)
4044

4145
SYSTEM_MESSAGE = "You are a helpful assistant. /no_think"
4246
MIN_AUDIO_DURATION = 0.1 # Skip audio shorter than this (causes mel spectrogram errors)
@@ -62,7 +66,7 @@ def save_audio_and_format_entry(
6266
text_field="text",
6367
id_field="id",
6468
with_audio=True,
65-
audio_root="/data",
69+
audio_root=DEFAULT_CONTAINER_AUDIO_ROOT,
6670
):
6771
"""Format a dataset entry and optionally save audio file."""
6872
text = entry[text_field].strip()
@@ -109,7 +113,7 @@ def save_audio_and_format_entry(
109113
return formatted_entry
110114

111115

112-
def prepare_dataset(dataset_name, output_dir, with_audio=True, audio_root="/data"):
116+
def prepare_dataset(dataset_name, output_dir, with_audio=True, audio_root=DEFAULT_CONTAINER_AUDIO_ROOT):
113117
"""Prepare a single ASR dataset."""
114118
if dataset_name not in DATASET_CONFIGS:
115119
raise ValueError(f"Unknown dataset: {dataset_name}. Available: {list(DATASET_CONFIGS.keys())}")

nemo_skills/dataset/audiobench/prepare.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,11 @@
3535
import soundfile as sf
3636
from tqdm import tqdm
3737

38-
from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root
38+
from nemo_skills.dataset.utils import (
39+
DEFAULT_CONTAINER_AUDIO_ROOT,
40+
build_container_audio_path,
41+
get_container_audio_root,
42+
)
3943

4044
# AudioBench datasets categorized by evaluation type
4145
JUDGE_DATASETS = [
@@ -142,7 +146,7 @@ def create_manifest_entry(
142146
dataset_name: str,
143147
sample_id: int,
144148
category: str,
145-
audio_root: str = "/data",
149+
audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
146150
) -> Dict:
147151
"""Create a nemo-skills compatible manifest entry.
148152
@@ -212,7 +216,7 @@ def process_dataset(
212216
save_audio: bool = True,
213217
split: str = "test",
214218
max_samples: int = -1,
215-
audio_root: str = "/data",
219+
audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
216220
) -> tuple[int, List[Dict]]:
217221
"""Process a single AudioBench dataset.
218222

nemo_skills/dataset/covost2/prepare.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,11 @@
2525
import soundfile as sf
2626
from tqdm import tqdm
2727

28-
from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root
28+
from nemo_skills.dataset.utils import (
29+
DEFAULT_CONTAINER_AUDIO_ROOT,
30+
build_container_audio_path,
31+
get_container_audio_root,
32+
)
2933

3034
COVOST_URL_TEMPLATE = "https://dl.fbaipublicfiles.com/covost/covost_v2.{src_lang}_{tgt_lang}.tsv.tar.gz"
3135
SPLITS = ["validation", "test"]
@@ -198,7 +202,7 @@ def prepare_covost2(
198202
cv_data_dir: Path,
199203
validated_tsv: Path,
200204
task_type: str,
201-
audio_root: str = "/data",
205+
audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
202206
) -> None:
203207
if not languages:
204208
raise ValueError("No languages to process")

nemo_skills/dataset/fleurs/prepare.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,11 @@
2525
from huggingface_hub import hf_hub_download
2626
from tqdm import tqdm
2727

28-
from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root
28+
from nemo_skills.dataset.utils import (
29+
DEFAULT_CONTAINER_AUDIO_ROOT,
30+
build_container_audio_path,
31+
get_container_audio_root,
32+
)
2933

3034

3135
def load_fleurs_module():
@@ -175,7 +179,7 @@ def prepare_fleurs(
175179
languages: list[str],
176180
no_audio: bool,
177181
task_type: str,
178-
audio_root: str = "/data",
182+
audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
179183
) -> None:
180184
if not languages:
181185
raise ValueError("No languages to process")

nemo_skills/dataset/librispeech-pc/prepare.py

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,11 @@
3333

3434
from tqdm import tqdm
3535

36-
from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root
36+
from nemo_skills.dataset.utils import (
37+
DEFAULT_CONTAINER_AUDIO_ROOT,
38+
build_container_audio_path,
39+
get_container_audio_root,
40+
)
3741

3842

3943
def download_with_progress(url: str, output_path: Path, desc: str):
@@ -102,7 +106,13 @@ def download_audio(split: str, audio_dir: Path):
102106
os.remove(tar_path)
103107

104108

105-
def process_split(split: str, data_dir: Path, audio_dir: Path, with_audio: bool, audio_root: str = "/data") -> int:
109+
def process_split(
110+
split: str,
111+
data_dir: Path,
112+
audio_dir: Path,
113+
with_audio: bool,
114+
audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
115+
) -> int:
106116
"""Process one LibriSpeech-PC split into nemo-skills format."""
107117

108118
output_file = data_dir / f"{split}.jsonl"

nemo_skills/dataset/mmau-pro/prepare.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,12 @@
2222
from datasets import load_dataset
2323
from tqdm import tqdm
2424

25-
from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root, get_mcq_fields
25+
from nemo_skills.dataset.utils import (
26+
DEFAULT_CONTAINER_AUDIO_ROOT,
27+
build_container_audio_path,
28+
get_container_audio_root,
29+
get_mcq_fields,
30+
)
2631

2732

2833
def download_mmau_data(download_dir, hf_token):
@@ -69,7 +74,7 @@ def _normalize_audio_path(path: str, audio_root: str) -> str:
6974
return build_container_audio_path("mmau-pro", rel_path, audio_prefix=audio_root)
7075

7176

72-
def format_entry(entry, with_audio=False, audio_root="/data"):
77+
def format_entry(entry, with_audio=False, audio_root=DEFAULT_CONTAINER_AUDIO_ROOT):
7378
"""Format entry for nemo-skills with OpenAI messages and audio support."""
7479
choices = entry.get("choices", []) or []
7580

nemo_skills/dataset/musan/prepare.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,11 @@
4040
import soundfile as sf
4141
from tqdm import tqdm
4242

43-
from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root
43+
from nemo_skills.dataset.utils import (
44+
DEFAULT_CONTAINER_AUDIO_ROOT,
45+
build_container_audio_path,
46+
get_container_audio_root,
47+
)
4448

4549
# HuggingFace dataset label mappings
4650
CATEGORY_LABELS = {
@@ -168,7 +172,7 @@ def create_manifest_entry(
168172
category: str,
169173
sample_id: int,
170174
label: str,
171-
audio_root: str = "/data",
175+
audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
172176
) -> Dict:
173177
"""Create nemo-skills manifest entry."""
174178
audio_rel_path = build_container_audio_path("musan", category, "audio", audio_filename, audio_prefix=audio_root)
@@ -209,7 +213,7 @@ def process_category_from_files(
209213
save_audio: bool = True,
210214
split: str = "train",
211215
max_samples: int = -1,
212-
audio_root: str = "/data",
216+
audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
213217
) -> tuple[int, List[Dict]]:
214218
"""Process MUSAN category from WAV files (Kaggle/OpenSLR format)."""
215219
category_path = dataset_path / category
@@ -287,7 +291,7 @@ def process_category(
287291
save_audio: bool = True,
288292
split: str = "train",
289293
max_samples: int = -1,
290-
audio_root: str = "/data",
294+
audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
291295
) -> tuple[int, List[Dict]]:
292296
"""Process a single MUSAN category."""
293297
print(f"\n{'=' * 60}")

nemo_skills/dataset/numb3rs/prepare.py

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,11 @@
4444
from datasets import load_dataset
4545
from tqdm import tqdm
4646

47-
from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root
47+
from nemo_skills.dataset.utils import (
48+
DEFAULT_CONTAINER_AUDIO_ROOT,
49+
build_container_audio_path,
50+
get_container_audio_root,
51+
)
4852

4953
SYSTEM_MESSAGE = "You are a helpful assistant. /no_think"
5054

@@ -74,7 +78,14 @@ def build_messages_with_prompt(audio_metadata, prompt_text):
7478
return [system_message, user_message]
7579

7680

77-
def save_audio_and_format_entry(entry, category, audio_dir, sample_idx, with_audio=True, audio_root="/data"):
81+
def save_audio_and_format_entry(
82+
entry,
83+
category,
84+
audio_dir,
85+
sample_idx,
86+
with_audio=True,
87+
audio_root=DEFAULT_CONTAINER_AUDIO_ROOT,
88+
):
7889
"""Format a dataset entry and optionally save audio file.
7990
8091
Returns a base entry dict with audio metadata. Messages are added separately
@@ -136,7 +147,7 @@ def save_audio_and_format_entry(entry, category, audio_dir, sample_idx, with_aud
136147
return formatted_entry
137148

138149

139-
def prepare_category(category, dataset, output_dir, with_audio=True, audio_root="/data"):
150+
def prepare_category(category, dataset, output_dir, with_audio=True, audio_root=DEFAULT_CONTAINER_AUDIO_ROOT):
140151
"""Prepare a single category from the Numb3rs dataset.
141152
142153
Generates 3 files per category in categories/ subfolder:

0 commit comments

Comments
 (0)