From 1b32415a237b94fe33ae0ff4de2a18c323f5e51d Mon Sep 17 00:00:00 2001
From: eliranwong <hkwebtech@gmail.com>
Date: Thu, 16 May 2024 20:57:24 +0100
Subject: [PATCH 1/2] added method 'recognize_whispercpp' to support
 whisper.cpp

---
 speech_recognition/__init__.py | 50 ++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
index 022cd7d5..825bf7f0 100644
--- a/speech_recognition/__init__.py
+++ b/speech_recognition/__init__.py
@@ -1430,6 +1430,56 @@ def recognize_whisper(self, audio_data, model="base", show_dict=False, load_opti
         else:
             return result["text"]
 
+    def recognize_whispercpp(self, audio_data, whispercpp_main, model_path, language="en", additional_options=""):
+        """
+        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using a local build of whisper.cpp.
+
+        Adapted from code: https://github.com/eliranwong/freegenius/blob/96d2fd7751ca26f2c7adaa63082a3cb79681f3ed/package/freegenius/utils/prompts.py#L118
+
+        ``whispercpp_main`` is the local path of the ``main`` executable of whisper.cpp; its location depends on how users set up their local copies of whisper.cpp.
+
+        e.g., after the following setup, the executable is the file ``main`` inside the ``whisper.cpp`` folder of the home directory:
+
+        > cd ~
+
+        > git clone https://github.com/ggerganov/whisper.cpp.git
+
+        > cd whisper.cpp
+
+        > make
+
+        ``model_path`` is the local file path of any of the *.bin files downloaded from https://huggingface.co/ggerganov/whisper.cpp/tree/main.
+
+        e.g. download 'ggml-large-v3-q5_0.bin' to the home directory, then point ``model_path`` at that file
+
+        Both paths are checked with ``os.path.isfile``, which does not expand ``~``; pass expanded paths, e.g. ``os.path.expanduser("~/whisper.cpp/main")``.
+
+        The recognition language is determined by ``language``, an uncapitalized language code like "en" or "zh", or "auto" for auto-detection. See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
+
+        ``additional_options`` is a string of extra options appended unchanged to the whisper.cpp command line. See https://github.com/ggerganov/whisper.cpp/tree/master/examples/main for all options
+
+        e.g. set '-t 12' as ``additional_options``, to use 12 threads during computation
+
+        e.g. set '-tr' as ``additional_options``, to translate from the speech to English
+
+        The audio is written to ``speech.wav`` next to this module and the command is run through the shell, so only pass trusted values. Returns the stripped standard output, or the standard error text when only that was produced.
+        """
+        assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
+        assert os.path.isfile(whispercpp_main), "``whispercpp_main`` must be a valid file path"
+        assert os.path.isfile(model_path), "``model_path`` must be a valid file path"
+        wav_data = audio.get_wav_data(
+            convert_rate=16000,  # audio samples must be 8kHz or 16 kHz
+            convert_width=2  # audio samples should be 16-bit
+        )
+        wav_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "speech.wav")
+        with open(wav_file, "wb") as fileObj:
+            fileObj.write(wav_data)
+        # each path is quoted on its own so that the flags following -f stay separate arguments
+        cli = f'''"{whispercpp_main}" -m "{model_path}" -f "{wav_file}" -np -nt -l {language} {additional_options}'''
+        process = subprocess.Popen(cli.rstrip(), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stdout, stderr = process.communicate()
+        return stderr.decode("utf-8") if stderr and not stdout else stdout.decode("utf-8").strip()
+
     def recognize_vosk(self, audio_data, language='en'):
         from vosk import KaldiRecognizer, Model
 

From d0337b17eb9928cb7b024150eda42783d9cdcdfb Mon Sep 17 00:00:00 2001
From: eliranwong <hkwebtech@gmail.com>
Date: Thu, 16 May 2024 21:28:23 +0100
Subject: [PATCH 2/2] Fix mistake in previous commit

---
 speech_recognition/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
index 825bf7f0..165e9960 100644
--- a/speech_recognition/__init__.py
+++ b/speech_recognition/__init__.py
@@ -1467,7 +1467,7 @@ def recognize_whispercpp(self, audio_data, whispercpp_main, model_path, language
         assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
         assert os.path.isfile(whispercpp_main), "``whispercpp_main`` must be a valid file path"
         assert os.path.isfile(model_path), "``model_path`` must be a valid file path"
-        wav_data = audio.get_wav_data(
+        wav_data = audio_data.get_wav_data(
             convert_rate=16000,  # audio samples must be 8kHz or 16 kHz
             convert_width=2  # audio samples should be 16-bit
         )