@@ -137,7 +137,7 @@ def run_async():
137137from llama_cpp import Llama
138138
# Set the path to your GGUF model file (update this to the correct path).
# The value is a relative path; replace it with your GGUF file path.
MODEL_PATH = "../voice/models/orpheus-3b-0.1-ft-q4_k_m.gguf"

# Number of layers to offload to GPU (adjust based on your GPU memory, e.g., 30 for 8GB VRAM)
N_GPU_LAYERS = 20
@@ -161,6 +161,23 @@ def run_async():
END_TOKEN_IDS = [128009, 128260, 128261, 128257]
CUSTOM_TOKEN_PREFIX = "<custom_token_"

# Default text to be spoken if no text is provided
DEFAULT_TEXT = "This is a default sentence."

# Sentences synthesized one after another when the script runs in batch mode.
BATCH_SENTENCES = [
    "Good morning Kabeer!",
    "You've got a busy day ahead.",
    "Meetings, presentations and even a night out with the boys! <chuckle>",
    "You ready to crush this?",
]
172+
173+ def create_filename (sentence , max_words = 3 , max_length = 50 ):
174+ words = sentence .split ()[:max_words ]
175+ base = "_" .join (words )
176+ safe_base = "" .join (c for c in base if c .isalnum () or c in ("_" , "-" ))
177+ if len (safe_base ) > max_length :
178+ safe_base = safe_base [:max_length ]
179+ return safe_base + ".wav"
180+
164181def format_prompt (prompt , voice = DEFAULT_VOICE ):
165182 """Format prompt for Orpheus model with voice prefix and special tokens."""
166183 if voice not in AVAILABLE_VOICES :
@@ -351,55 +368,76 @@ def list_available_voices():
351368 print ("<laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>" )
352369
def _synthesize_to_file(prompt, output_file, args):
    """Run one synthesis for *prompt* targeting *output_file*; return elapsed seconds."""
    start_time = time.time()
    generate_speech_from_api(
        prompt=prompt,
        voice=args.voice,
        temperature=args.temperature,
        top_p=args.top_p,
        repetition_penalty=args.repetition_penalty,
        output_file=output_file,
    )
    return time.time() - start_time


def main():
    """Command-line entry point for text-to-speech generation.

    Modes:
        * ``--list-voices``: print the available voices and exit.
        * ``--batch``: synthesize every entry of ``BATCH_SENTENCES`` into its
          own file inside ``--output`` (treated as a directory, default
          ``outputs``).
        * otherwise: synthesize the text given positionally or through
          ``--text`` (``DEFAULT_TEXT`` when none is given) into ``--output``
          or a timestamped file under ``outputs/``.
    """
    parser = argparse.ArgumentParser(description="Generate speech from text.")
    parser.add_argument("text", nargs="*", help="Text to convert to speech")
    # Kept so that invocations written for the earlier "--text" interface still work.
    parser.add_argument("--text", dest="text_option", type=str,
                        help="Text to convert to speech (alternative to the positional form)")
    # Defaults come from the module-level constants so the CLI cannot drift
    # away from the values the rest of the module is tuned for.
    parser.add_argument("--voice", default=DEFAULT_VOICE,
                        help=f"Voice to use (default: {DEFAULT_VOICE})")
    parser.add_argument("--output", help="Output file or directory (in batch mode)")
    parser.add_argument("--batch", action="store_true",
                        help="Process predefined batch of sentences")
    parser.add_argument("--temperature", type=float, default=TEMPERATURE,
                        help="Temperature for generation")
    parser.add_argument("--top_p", type=float, default=TOP_P,
                        help="Top-p sampling parameter")
    parser.add_argument("--repetition_penalty", type=float, default=REPETITION_PENALTY,
                        help="Repetition penalty (>=1.1 required for stable generation)")
    parser.add_argument("--list-voices", action="store_true", help="List available voices")

    args = parser.parse_args()

    if args.list_voices:
        list_available_voices()
        return

    if args.batch:
        # In batch mode --output names a directory, not a file.
        batch_dir = args.output or "outputs"
        os.makedirs(batch_dir, exist_ok=True)

        used_names = set()
        for sentence in BATCH_SENTENCES:
            filename = create_filename(sentence)
            # Sentences sharing their first words map to the same name; add a
            # numeric suffix instead of silently overwriting the earlier file.
            stem, ext = os.path.splitext(filename)
            suffix = 2
            while filename in used_names:
                filename = f"{stem}_{suffix}{ext}"
                suffix += 1
            used_names.add(filename)

            output_file = os.path.join(batch_dir, filename)
            print(f"Generating audio for: {sentence}")
            elapsed = _synthesize_to_file(sentence, output_file, args)
            print(f"Speech generation for '{sentence}' completed in {elapsed:.2f} seconds")
            print(f"Audio saved to {output_file}")
        return

    # Single-utterance mode.
    if args.text_option:
        prompt = args.text_option
    elif args.text:
        prompt = " ".join(args.text)
    else:
        prompt = DEFAULT_TEXT
        print(f"No text provided. Using default text: {DEFAULT_TEXT}")

    if args.output:
        output_file = args.output
    else:
        os.makedirs("outputs", exist_ok=True)
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        output_file = f"outputs/{args.voice}_{timestamp}.wav"
        print(f"No output file specified. Saving to {output_file}")

    elapsed = _synthesize_to_file(prompt, output_file, args)
    print(f"Speech generation completed in {elapsed:.2f} seconds")
    print(f"Audio saved to {output_file}")
403441
404442if __name__ == "__main__" :
405443 main ()
0 commit comments