From 1b32415a237b94fe33ae0ff4de2a18c323f5e51d Mon Sep 17 00:00:00 2001
From: eliranwong
Date: Thu, 16 May 2024 20:57:24 +0100
Subject: [PATCH 1/2] added method 'recognize_whispercpp' to support whisper.cpp

---
 speech_recognition/__init__.py | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
index 022cd7d5..825bf7f0 100644
--- a/speech_recognition/__init__.py
+++ b/speech_recognition/__init__.py
@@ -1430,6 +1430,67 @@ def recognize_whisper(self, audio_data, model="base", show_dict=False, load_opti
         else:
             return result["text"]
 
+    def recognize_whispercpp(self, audio_data, whispercpp_main, model_path, language="en", additional_options=""):
+        """
+        Adapted from code: https://github.com/eliranwong/freegenius/blob/96d2fd7751ca26f2c7adaa63082a3cb79681f3ed/package/freegenius/utils/prompts.py#L118
+
+        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using a local build of whisper.cpp.
+
+        ``whispercpp_main`` is the local path of the main executable of whisper.cpp; it depends on how users set up their local copies of whisper.cpp. A leading ``~`` is expanded to the user's home directory.
+
+        e.g., with the following setup, set '~/whisper.cpp/main' as ``whispercpp_main``:
+
+        > cd ~
+
+        > git clone https://github.com/ggerganov/whisper.cpp.git
+
+        > cd whisper.cpp
+
+        > make
+
+        ``model_path`` is the local file path of any of the *.bin files downloaded from https://huggingface.co/ggerganov/whisper.cpp/tree/main. A leading ``~`` is expanded as well.
+
+        e.g. download 'ggml-large-v3-q5_0.bin' to home directory, then ``model_path`` is '~/ggml-large-v3-q5_0.bin'
+
+        The recognition language is determined by ``language``, a lowercase language code like "en" or "zh". 'auto' for auto-detect. See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
+
+        e.g. set 'en' as ``language`` for English
+
+        e.g. set 'auto' as ``language`` for non-English languages
+
+        ``additional_options`` are additional options that are passed directly to whisper.cpp. See https://github.com/ggerganov/whisper.cpp/tree/master/examples/main for all options
+
+        e.g. set '-t 12' as ``additional_options``, to use 12 threads during computation
+
+        e.g. set '-tr' as ``additional_options``, to translate from the speech to English
+
+        Returns the text that whisper.cpp prints to standard output, with surrounding whitespace removed. If nothing was printed to standard output but something was printed to standard error, the standard error text is returned instead.
+
+        ``whispercpp_main``, ``model_path``, ``language`` and ``additional_options`` are interpolated into a shell command line, so they must come from a trusted source.
+        """
+        import tempfile
+
+        # the examples above use '~', which ``os.path.isfile`` does not expand on its own
+        whispercpp_main = os.path.expanduser(whispercpp_main)
+        model_path = os.path.expanduser(model_path)
+        assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
+        assert os.path.isfile(whispercpp_main), "``whispercpp_main`` must be a valid file path"
+        assert os.path.isfile(model_path), "``model_path`` must be a valid file path"
+        wav_data = audio.get_wav_data(
+            convert_rate=16000,  # audio samples must be 8kHz or 16 kHz
+            convert_width=2  # audio samples should be 16-bit
+        )
+        # use a private temporary directory: the package directory may not be writable, and a fixed file name there would be shared by concurrent calls
+        with tempfile.TemporaryDirectory() as folder:
+            wav_file = os.path.join(folder, "speech.wav")
+            with open(wav_file, "wb") as fileObj:
+                fileObj.write(wav_data)
+            # each path is quoted on its own, so the flags after ``-f`` are not swallowed into the file name
+            cli = f'''"{whispercpp_main}" -m "{model_path}" -f "{wav_file}" -np -nt -l {language} {additional_options}'''
+            process = subprocess.Popen(cli.rstrip(), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            stdout, stderr = process.communicate()
+        return stderr.decode("utf-8") if stderr and not stdout else stdout.decode("utf-8").strip()
+
     def recognize_vosk(self, audio_data, language='en'):
         from vosk import KaldiRecognizer, Model
 

From d0337b17eb9928cb7b024150eda42783d9cdcdfb Mon Sep 17 00:00:00 2001
From: eliranwong
Date: Thu, 16 May 2024 21:28:23 +0100
Subject: [PATCH 2/2] Fix mistake in previous commit

---
 speech_recognition/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
index 825bf7f0..165e9960 100644
--- a/speech_recognition/__init__.py
+++ b/speech_recognition/__init__.py
@@ -1476,7 +1476,7 @@ def recognize_whispercpp(self, audio_data, whispercpp_main, model_path, language
         assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
         assert os.path.isfile(whispercpp_main), "``whispercpp_main`` must be a valid file path"
         assert os.path.isfile(model_path), "``model_path`` must be a valid file path"
-        wav_data = audio.get_wav_data(
+        wav_data = audio_data.get_wav_data(
             convert_rate=16000,  # audio samples must be 8kHz or 16 kHz
             convert_width=2  # audio samples should be 16-bit
         )