Use shared audio root defaults in prepare docs

DongjiGao · DongjiGao · commit f5900a32a90f · 2026-05-08T15:15:14.000-07:00
Signed-off-by: Dongji Gao <dongjig@nvidia.com>
diff --git a/docs/evaluation/speech-audio.md b/docs/evaluation/speech-audio.md
@@ -178,7 +178,7 @@ eval(benchmarks="mmau-pro.closed_form", ...)
         --model=/workspace/path/to/checkpoint \
         --server_entrypoint=/workspace/megatron-lm/server.py \
         --server_container=/path/to/container.sqsh \
-        --data_dir=/dataset \
+        --data_dir=/data \
         --installation_command="pip install sacrebleu"
     ```
 
@@ -562,7 +562,7 @@ used directly. If the file is missing, data is downloaded there automatically.
 To use a custom audio path prefix (e.g., for container mount points):
 
 ```bash
-ns prepare_data contextasr-bench --data_dir=/path/to/ContextASR-Bench --audio-prefix /data/contextasr
+ns prepare_data contextasr-bench --data_dir=/path/to/ContextASR-Bench --audio-prefix /data
 ```
 
 ### Running ContextASR-Bench Evaluation
diff --git a/nemo_skills/dataset/asr-leaderboard/prepare.py b/nemo_skills/dataset/asr-leaderboard/prepare.py
@@ -36,7 +36,11 @@
 from datasets import load_dataset
 from tqdm import tqdm
 
-from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root
+from nemo_skills.dataset.utils import (
+    DEFAULT_CONTAINER_AUDIO_ROOT,
+    build_container_audio_path,
+    get_container_audio_root,
+)
 
 SYSTEM_MESSAGE = "You are a helpful assistant. /no_think"
 MIN_AUDIO_DURATION = 0.1  # Skip audio shorter than this (causes mel spectrogram errors)
@@ -62,7 +66,7 @@ def save_audio_and_format_entry(
     text_field="text",
     id_field="id",
     with_audio=True,
-    audio_root="/data",
+    audio_root=DEFAULT_CONTAINER_AUDIO_ROOT,
 ):
     """Format a dataset entry and optionally save audio file."""
     text = entry[text_field].strip()
@@ -109,7 +113,7 @@ def save_audio_and_format_entry(
     return formatted_entry
 
 
-def prepare_dataset(dataset_name, output_dir, with_audio=True, audio_root="/data"):
+def prepare_dataset(dataset_name, output_dir, with_audio=True, audio_root=DEFAULT_CONTAINER_AUDIO_ROOT):
     """Prepare a single ASR dataset."""
     if dataset_name not in DATASET_CONFIGS:
         raise ValueError(f"Unknown dataset: {dataset_name}. Available: {list(DATASET_CONFIGS.keys())}")
diff --git a/nemo_skills/dataset/audiobench/prepare.py b/nemo_skills/dataset/audiobench/prepare.py
@@ -35,7 +35,11 @@
 import soundfile as sf
 from tqdm import tqdm
 
-from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root
+from nemo_skills.dataset.utils import (
+    DEFAULT_CONTAINER_AUDIO_ROOT,
+    build_container_audio_path,
+    get_container_audio_root,
+)
 
 # AudioBench datasets categorized by evaluation type
 JUDGE_DATASETS = [
@@ -142,7 +146,7 @@ def create_manifest_entry(
     dataset_name: str,
     sample_id: int,
     category: str,
-    audio_root: str = "/data",
+    audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
 ) -> Dict:
     """Create a nemo-skills compatible manifest entry.
 
@@ -212,7 +216,7 @@ def process_dataset(
     save_audio: bool = True,
     split: str = "test",
     max_samples: int = -1,
-    audio_root: str = "/data",
+    audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
 ) -> tuple[int, List[Dict]]:
     """Process a single AudioBench dataset.
 
diff --git a/nemo_skills/dataset/covost2/prepare.py b/nemo_skills/dataset/covost2/prepare.py
@@ -25,7 +25,11 @@
 import soundfile as sf
 from tqdm import tqdm
 
-from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root
+from nemo_skills.dataset.utils import (
+    DEFAULT_CONTAINER_AUDIO_ROOT,
+    build_container_audio_path,
+    get_container_audio_root,
+)
 
 COVOST_URL_TEMPLATE = "https://dl.fbaipublicfiles.com/covost/covost_v2.{src_lang}_{tgt_lang}.tsv.tar.gz"
 SPLITS = ["validation", "test"]
@@ -198,7 +202,7 @@ def prepare_covost2(
     cv_data_dir: Path,
     validated_tsv: Path,
     task_type: str,
-    audio_root: str = "/data",
+    audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
 ) -> None:
     if not languages:
         raise ValueError("No languages to process")
diff --git a/nemo_skills/dataset/fleurs/prepare.py b/nemo_skills/dataset/fleurs/prepare.py
@@ -25,7 +25,11 @@
 from huggingface_hub import hf_hub_download
 from tqdm import tqdm
 
-from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root
+from nemo_skills.dataset.utils import (
+    DEFAULT_CONTAINER_AUDIO_ROOT,
+    build_container_audio_path,
+    get_container_audio_root,
+)
 
 
 def load_fleurs_module():
@@ -175,7 +179,7 @@ def prepare_fleurs(
     languages: list[str],
     no_audio: bool,
     task_type: str,
-    audio_root: str = "/data",
+    audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
 ) -> None:
     if not languages:
         raise ValueError("No languages to process")
diff --git a/nemo_skills/dataset/librispeech-pc/prepare.py b/nemo_skills/dataset/librispeech-pc/prepare.py
@@ -33,7 +33,11 @@
 
 from tqdm import tqdm
 
-from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root
+from nemo_skills.dataset.utils import (
+    DEFAULT_CONTAINER_AUDIO_ROOT,
+    build_container_audio_path,
+    get_container_audio_root,
+)
 
 
 def download_with_progress(url: str, output_path: Path, desc: str):
@@ -102,7 +106,13 @@ def download_audio(split: str, audio_dir: Path):
     os.remove(tar_path)
 
 
-def process_split(split: str, data_dir: Path, audio_dir: Path, with_audio: bool, audio_root: str = "/data") -> int:
+def process_split(
+    split: str,
+    data_dir: Path,
+    audio_dir: Path,
+    with_audio: bool,
+    audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
+) -> int:
     """Process one LibriSpeech-PC split into nemo-skills format."""
 
     output_file = data_dir / f"{split}.jsonl"
diff --git a/nemo_skills/dataset/mmau-pro/prepare.py b/nemo_skills/dataset/mmau-pro/prepare.py
@@ -22,7 +22,12 @@
 from datasets import load_dataset
 from tqdm import tqdm
 
-from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root, get_mcq_fields
+from nemo_skills.dataset.utils import (
+    DEFAULT_CONTAINER_AUDIO_ROOT,
+    build_container_audio_path,
+    get_container_audio_root,
+    get_mcq_fields,
+)
 
 
 def download_mmau_data(download_dir, hf_token):
@@ -69,7 +74,7 @@ def _normalize_audio_path(path: str, audio_root: str) -> str:
     return build_container_audio_path("mmau-pro", rel_path, audio_prefix=audio_root)
 
 
-def format_entry(entry, with_audio=False, audio_root="/data"):
+def format_entry(entry, with_audio=False, audio_root=DEFAULT_CONTAINER_AUDIO_ROOT):
     """Format entry for nemo-skills with OpenAI messages and audio support."""
     choices = entry.get("choices", []) or []
 
diff --git a/nemo_skills/dataset/musan/prepare.py b/nemo_skills/dataset/musan/prepare.py
@@ -40,7 +40,11 @@
 import soundfile as sf
 from tqdm import tqdm
 
-from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root
+from nemo_skills.dataset.utils import (
+    DEFAULT_CONTAINER_AUDIO_ROOT,
+    build_container_audio_path,
+    get_container_audio_root,
+)
 
 # HuggingFace dataset label mappings
 CATEGORY_LABELS = {
@@ -168,7 +172,7 @@ def create_manifest_entry(
     category: str,
     sample_id: int,
     label: str,
-    audio_root: str = "/data",
+    audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
 ) -> Dict:
     """Create nemo-skills manifest entry."""
     audio_rel_path = build_container_audio_path("musan", category, "audio", audio_filename, audio_prefix=audio_root)
@@ -209,7 +213,7 @@ def process_category_from_files(
     save_audio: bool = True,
     split: str = "train",
     max_samples: int = -1,
-    audio_root: str = "/data",
+    audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
 ) -> tuple[int, List[Dict]]:
     """Process MUSAN category from WAV files (Kaggle/OpenSLR format)."""
     category_path = dataset_path / category
@@ -287,7 +291,7 @@ def process_category(
     save_audio: bool = True,
     split: str = "train",
     max_samples: int = -1,
-    audio_root: str = "/data",
+    audio_root: str = DEFAULT_CONTAINER_AUDIO_ROOT,
 ) -> tuple[int, List[Dict]]:
     """Process a single MUSAN category."""
     print(f"\n{'=' * 60}")
diff --git a/nemo_skills/dataset/numb3rs/prepare.py b/nemo_skills/dataset/numb3rs/prepare.py
@@ -44,7 +44,11 @@
 from datasets import load_dataset
 from tqdm import tqdm
 
-from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root
+from nemo_skills.dataset.utils import (
+    DEFAULT_CONTAINER_AUDIO_ROOT,
+    build_container_audio_path,
+    get_container_audio_root,
+)
 
 SYSTEM_MESSAGE = "You are a helpful assistant. /no_think"
 
@@ -74,7 +78,14 @@ def build_messages_with_prompt(audio_metadata, prompt_text):
     return [system_message, user_message]
 
 
-def save_audio_and_format_entry(entry, category, audio_dir, sample_idx, with_audio=True, audio_root="/data"):
+def save_audio_and_format_entry(
+    entry,
+    category,
+    audio_dir,
+    sample_idx,
+    with_audio=True,
+    audio_root=DEFAULT_CONTAINER_AUDIO_ROOT,
+):
     """Format a dataset entry and optionally save audio file.
 
     Returns a base entry dict with audio metadata. Messages are added separately
@@ -136,7 +147,7 @@ def save_audio_and_format_entry(entry, category, audio_dir, sample_idx, with_aud
     return formatted_entry
 
 
-def prepare_category(category, dataset, output_dir, with_audio=True, audio_root="/data"):
+def prepare_category(category, dataset, output_dir, with_audio=True, audio_root=DEFAULT_CONTAINER_AUDIO_ROOT):
     """Prepare a single category from the Numb3rs dataset.
 
     Generates 3 files per category in categories/ subfolder: