44import argparse
55import time
66import wave
7+ import json
78from pathlib import Path
89
910import riva .client
@@ -21,12 +22,12 @@ def parse_args() -> argparse.Namespace:
2122 help = "A voice name to use. If this parameter is missing, then the server will try a first available model "
2223 "based on parameter `--language-code`." ,
2324 )
24- parser .add_argument ("--text" , type = str , required = True , help = "Text input to synthesize." )
25+ parser .add_argument ("--text" , type = str , required = False , help = "Text input to synthesize." )
2526 parser .add_argument (
2627 "--audio_prompt_file" ,
2728 type = Path ,
2829 help = "An input audio prompt (.wav) file for zero shot model. This is required to do zero shot inferencing." )
29- parser .add_argument ("-o" , "--output" , type = Path , help = "Output file .wav file to write synthesized audio." )
30+ parser .add_argument ("-o" , "--output" , type = Path , default = "output.wav" , help = "Output file .wav file to write synthesized audio." )
3031 parser .add_argument ("--quality" , type = int , help = "Number of times decoder should be run on the output audio. A higher number improves quality of the produced output but introduces latencies." )
3132 parser .add_argument (
3233 "--play-audio" ,
@@ -35,6 +36,7 @@ def parse_args() -> argparse.Namespace:
3536 "then the default output audio device will be used." ,
3637 )
3738 parser .add_argument ("--list-devices" , action = "store_true" , help = "List output audio devices indices." )
39+ parser .add_argument ("--list-voices" , action = "store_true" , help = "List available voices." )
3840 parser .add_argument ("--output-device" , type = int , help = "Output device to use." )
3941 parser .add_argument ("--language-code" , default = 'en-US' , help = "A language of input text." )
4042 parser .add_argument (
@@ -49,11 +51,6 @@ def parse_args() -> argparse.Namespace:
4951 )
5052 parser = add_connection_argparse_parameters (parser )
5153 args = parser .parse_args ()
52- if args .output is None and not args .play_audio and args .output_device is None and not args .list_devices :
53- parser .error (
54- f"You have to provide at least one of arguments: `--play-audio`, `--output-device`, `--output`, "
55- f"`--list-devices`."
56- )
5754 if args .output is not None :
5855 args .output = args .output .expanduser ()
5956 if args .list_devices or args .output_device or args .play_audio :
@@ -65,12 +62,36 @@ def main() -> None:
6562 args = parse_args ()
6663 if args .list_devices :
6764 riva .client .audio_io .list_output_devices ()
68- return
65+
6966 auth = riva .client .Auth (args .ssl_cert , args .use_ssl , args .server , args .metadata )
7067 service = riva .client .SpeechSynthesisService (auth )
7168 nchannels = 1
7269 sampwidth = 2
7370 sound_stream , out_f = None , None
71+
72+ if args .list_voices :
73+ config_response = service .stub .GetRivaSynthesisConfig (
74+ riva .client .proto .riva_tts_pb2 .RivaSynthesisConfigRequest ()
75+ )
76+ tts_models = dict ()
77+ for model_config in config_response .model_config :
78+ language_code = model_config .parameters ['language_code' ]
79+ voice_name = model_config .parameters ['voice_name' ]
80+ subvoices = [voice .split (':' )[0 ] for voice in model_config .parameters ['subvoices' ].split (',' )]
81+ full_voice_names = [voice_name + "." + subvoice for subvoice in subvoices ]
82+
83+ if language_code in tts_models :
84+ tts_models [language_code ]['voices' ].extend (full_voice_names )
85+ else :
86+ tts_models [language_code ] = {"voices" : full_voice_names }
87+
88+ tts_models = dict (sorted (tts_models .items ()))
89+ print (json .dumps (tts_models , indent = 4 ))
90+
91+ if not args .text :
92+ print ("No input text provided" )
93+ return
94+
7495 try :
7596 if args .output_device is not None or args .play_audio :
7697 sound_stream = riva .client .audio_io .SoundCallBack (
0 commit comments