|
44 | 44 | from datasets import load_dataset |
45 | 45 | from tqdm import tqdm |
46 | 46 |
|
47 | | -from nemo_skills.dataset.utils import build_container_audio_path, get_container_audio_root |
| 47 | +from nemo_skills.dataset.utils import ( |
| 48 | + DEFAULT_CONTAINER_AUDIO_ROOT, |
| 49 | + build_container_audio_path, |
| 50 | + get_container_audio_root, |
| 51 | +) |
48 | 52 |
|
49 | 53 | SYSTEM_MESSAGE = "You are a helpful assistant. /no_think" |
50 | 54 |
|
@@ -74,7 +78,14 @@ def build_messages_with_prompt(audio_metadata, prompt_text): |
74 | 78 | return [system_message, user_message] |
75 | 79 |
|
76 | 80 |
|
77 | | -def save_audio_and_format_entry(entry, category, audio_dir, sample_idx, with_audio=True, audio_root="/data"): |
| 81 | +def save_audio_and_format_entry( |
| 82 | + entry, |
| 83 | + category, |
| 84 | + audio_dir, |
| 85 | + sample_idx, |
| 86 | + with_audio=True, |
| 87 | + audio_root=DEFAULT_CONTAINER_AUDIO_ROOT, |
| 88 | +): |
78 | 89 | """Format a dataset entry and optionally save audio file. |
79 | 90 |
|
80 | 91 | Returns a base entry dict with audio metadata. Messages are added separately |
@@ -136,7 +147,7 @@ def save_audio_and_format_entry(entry, category, audio_dir, sample_idx, with_aud |
136 | 147 | return formatted_entry |
137 | 148 |
|
138 | 149 |
|
139 | | -def prepare_category(category, dataset, output_dir, with_audio=True, audio_root="/data"): |
| 150 | +def prepare_category(category, dataset, output_dir, with_audio=True, audio_root=DEFAULT_CONTAINER_AUDIO_ROOT): |
140 | 151 | """Prepare a single category from the Numb3rs dataset. |
141 | 152 |
|
142 | 153 | Generates 3 files per category in categories/ subfolder: |
|
0 commit comments