Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
418 changes: 413 additions & 5 deletions app.py

Large diffs are not rendered by default.

349 changes: 266 additions & 83 deletions app_onnx.py

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,31 @@ def maybe_print_voice_clone_text_chunks(
print()


def _patch_torchaudio_backend() -> None:
"""Patch torchaudio to avoid the SoX backend, which segfaults on some systems."""
try:
import torchaudio
_original_load = torchaudio.load
_original_save = torchaudio.save

def _load_with_soundfile(uri, *args, backend=None, **kwargs):
if backend is None:
backend = "soundfile"
return _original_load(uri, *args, backend=backend, **kwargs)

def _save_with_soundfile(uri, src, sample_rate, *args, backend=None, **kwargs):
if backend is None:
backend = "soundfile"
return _original_save(uri, src, sample_rate, *args, backend=backend, **kwargs)

torchaudio.load = _load_with_soundfile
torchaudio.save = _save_with_soundfile
except ImportError:
pass


def main(argv: Optional[Sequence[str]] = None) -> dict[str, object]:
_patch_torchaudio_backend()
set_logging()
args = parse_args(argv)
if args.debug == 1:
Expand Down
10 changes: 2 additions & 8 deletions moss_tts_nano_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,15 @@

# Built-in preset voices: maps a voice name to a
# (reference-audio filename, human-readable description) pair.
# NOTE(review): filenames appear to be resolved against a prompt-audio
# directory elsewhere (onnx_tts_runtime falls back to this map when a voice
# is missing from its builtin manifest) — confirm the resolution path.
_DEFAULT_VOICE_FILES: dict[str, tuple[str, str]] = {
    "Junhao": ("zh_1.wav", "Chinese male voice A"),
    "Zhiming": ("zh_2.wav", "Chinese male voice B"),
    "Weiguo": ("zh_5.wav", "Chinese male voice C"),
    "Xiaoyu": ("zh_3.wav", "Chinese female voice A"),
    "Yuewen": ("zh_4.wav", "Chinese female voice B"),
    "Lingyu": ("zh_6.wav", "Chinese female voice C"),
    "Trump": ("en_1.wav", "Trump reference voice"),
    "Ava": ("en_2.wav", "English female voice A"),
    "Bella": ("en_3.wav", "English female voice B"),
    "Adam": ("en_4.wav", "English male voice A"),
    "Nathan": ("en_5.wav", "English male voice B"),
    # NOTE(review): jp_1 is the only .mp3 among the presets — confirm the
    # audio loader handles mp3 as well as wav.
    "Sakura": ("jp_1.mp3", "Japanese female voice A"),
    "Yui": ("jp_2.wav", "Japanese female voice B"),
    "Aoi": ("jp_3.wav", "Japanese female voice C"),
    "Hina": ("jp_4.wav", "Japanese female voice D"),
    "Mei": ("jp_5.wav", "Japanese female voice E"),
    "男播音": ("zh_10.wav", "Chinese male broadcaster voice"),
    "杨幂": ("zh_11.wav", "Chinese female voice YangMi style"),
}

# Preset used when the caller does not select a voice; must be a key of
# _DEFAULT_VOICE_FILES.
DEFAULT_VOICE = "Junhao"
Expand Down
23 changes: 20 additions & 3 deletions onnx_tts_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,9 +490,26 @@ def resolve_prompt_audio_codes(
return self.encode_reference_audio(prompt_audio_path)
resolved_voice = str(voice or self.list_builtin_voices()[0]["voice"])
voice_row = next((item for item in self.list_builtin_voices() if item["voice"] == resolved_voice), None)
if voice_row is None:
raise ValueError(f"Built-in voice not found: {resolved_voice}")
return list(voice_row["prompt_audio_codes"])
if voice_row is not None:
return list(voice_row["prompt_audio_codes"])
# Fallback: try to find wav file in preset voices directory
from moss_tts_nano.defaults import DEFAULT_PROMPT_AUDIO_DIR
wav_candidates = [
DEFAULT_PROMPT_AUDIO_DIR / f"{resolved_voice}.wav",
DEFAULT_PROMPT_AUDIO_DIR / f"{resolved_voice}_reference.wav",
]
# Also check the PyTorch preset map for wav filename
try:
from moss_tts_nano_runtime import _DEFAULT_VOICE_FILES
if resolved_voice in _DEFAULT_VOICE_FILES:
wav_candidates.insert(0, DEFAULT_PROMPT_AUDIO_DIR / _DEFAULT_VOICE_FILES[resolved_voice][0])
except ImportError:
pass
for wav_path in wav_candidates:
if wav_path.exists():
logging.info("ONNX voice %r not in builtin manifest, encoding from %s", resolved_voice, wav_path)
return self.encode_reference_audio(wav_path)
raise ValueError(f"Built-in voice not found: {resolved_voice}")

def decode_full_audio_safe(self, generated_frames: list[list[int]]) -> np.ndarray:
try:
Expand Down
Loading