Merge pull request #889 from ftnext/feature/cohere-transcribe-api

ftnext · web-flow · commit 920526d925f6 · 2026-03-29T13:48:24.000+09:00
feat: Add Cohere Transcribe API support (recognize_cohere_api)
diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
@@ -46,16 +46,16 @@ jobs:
       - name: Install Python dependencies (Ubuntu, <=3.12)
         if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.13'
         run: |
-          python -m pip install .[dev,audio,pocketsphinx,google-cloud,whisper-local,faster-whisper,openai,groq,vosk]
+          python -m pip install .[dev,audio,pocketsphinx,google-cloud,whisper-local,faster-whisper,openai,groq,vosk,cohere-api]
       - name: Install Python dependencies (Ubuntu, 3.13)
         if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13'
         run: |
           python -m pip install standard-aifc setuptools
-          python -m pip install .[dev,audio,pocketsphinx,google-cloud,openai,groq,vosk]
+          python -m pip install .[dev,audio,pocketsphinx,google-cloud,openai,groq,vosk,cohere-api]
       - name: Install Python dependencies (Windows)
         if: matrix.os == 'windows-latest'
         run: |
-          python -m pip install .[dev,whisper-local,faster-whisper,google-cloud,openai,groq,vosk]
+          python -m pip install .[dev,whisper-local,faster-whisper,google-cloud,openai,groq,vosk,cohere-api]
       - name: Set up vosk model
         run: python -m speech_recognition.cli download vosk
       - name: Test with unittest
diff --git a/README.rst b/README.rst
@@ -65,6 +65,7 @@ Speech recognition engine/API support:
 * `OpenAI Whisper API <https://platform.openai.com/docs/guides/speech-to-text>`__
     * OpenAI compatible self-hosted endpoints (e.g. vLLM, Ollama)
 * `Groq Whisper API <https://console.groq.com/docs/speech-to-text>`__
+* `Cohere Transcribe API <https://docs.cohere.com/docs/transcribe>`__
 
 **Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details.
 
@@ -123,6 +124,7 @@ To use all of the functionality of the library, you should have:
 * **openai** (required only if you need to use OpenAI Whisper API speech recognition ``recognizer_instance.recognize_openai``)
     * includes OpenAI compatible self-hosted endpoints (e.g. vLLM, Ollama)
 * **groq** (required only if you need to use Groq Whisper API speech recognition ``recognizer_instance.recognize_groq``)
+* **cohere** (required only if you need to use Cohere Transcribe API speech recognition ``recognizer_instance.recognize_cohere_api``; install with ``pip install SpeechRecognition[cohere-api]``. Set ``CO_API_KEY`` as documented by the Cohere SDK.)
 
 The following requirements are optional, but can improve or extend functionality in some situations:
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -75,6 +75,9 @@ groq = [
   "groq",
   "httpx < 0.28",
 ]
+cohere-api = [
+  "cohere>=5.21.0",
+]
 assemblyai = ["requests"]
 vosk = ["vosk"]
 
diff --git a/reference/library-reference.rst b/reference/library-reference.rst
@@ -291,6 +291,11 @@ Raises a ``speech_recognition.UnknownValueError`` exception if the speech is uni
 
 .. autofunction:: speech_recognition.recognizers.whisper_api.groq.recognize
 
+``recognizer_instance.recognize_cohere_api(audio_data: AudioData, *, language: str, model = "cohere-transcribe-03-2026")``
+--------------------------------------------------------------------------------------------------------------------------
+
+.. autofunction:: speech_recognition.recognizers.cohere_api.recognize
+
 ``AudioSource``
 ---------------
 
diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
@@ -1281,7 +1281,7 @@ def flush(self, *args, **kwargs):
 # At this time, the dependencies are not yet installed, resulting in a ModuleNotFoundError.
 # This is a workaround to resolve this issue
 try:
-    from .recognizers import google, google_cloud, pocketsphinx, vosk
+    from .recognizers import cohere_api, google, google_cloud, pocketsphinx, vosk
     from .recognizers.whisper_api import groq, openai
     from .recognizers.whisper_local import faster_whisper, whisper
 except (ModuleNotFoundError, ImportError):
@@ -1293,6 +1293,7 @@ def flush(self, *args, **kwargs):
     Recognizer.recognize_faster_whisper = faster_whisper.recognize  # type: ignore[attr-defined]
     Recognizer.recognize_openai = openai.recognize  # type: ignore[attr-defined]
     Recognizer.recognize_groq = groq.recognize  # type: ignore[attr-defined]
+    Recognizer.recognize_cohere_api = cohere_api.recognize  # type: ignore[attr-defined]
     Recognizer.recognize_sphinx = pocketsphinx.recognize  # type: ignore[attr-defined]
     Recognizer.recognize_vosk = vosk.recognize  # type: ignore[attr-defined]
 
diff --git a/speech_recognition/recognizers/cohere_api.py b/speech_recognition/recognizers/cohere_api.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import logging
+from io import BytesIO
+
+from speech_recognition.audio import AudioData
+from speech_recognition.exceptions import SetupError
+
+logger = logging.getLogger(__name__)
+
+
+def recognize(
+    recognizer,
+    audio_data: AudioData,
+    *,
+    language: str,
+    model: str = "cohere-transcribe-03-2026",
+) -> str:
+    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the `Cohere Transcribe <https://docs.cohere.com/docs/transcribe>`__ API via the official Python SDK.
+
+    Requires the ``cohere`` package (install with ``pip install SpeechRecognition[cohere-api]``). Set environment variable ``CO_API_KEY`` as documented by Cohere; this library does not read or override it in code.
+
+    ``language`` is required by the Cohere transcription API (e.g. ``"en"``, ``"ja"``); ``model`` is the model name sent with the request. ``recognizer`` is accepted but not used by this function.
+    Returns the ``text`` attribute of the API response. Raises ``SetupError`` if the ``cohere`` package cannot be imported, and ``ValueError`` if ``audio_data`` is not an ``AudioData`` instance.
+
+    Detail: https://docs.cohere.com/reference/create-audio-transcription
+    """
+    try:
+        import cohere
+    except ImportError as err:
+        raise SetupError(
+            "missing cohere module: ensure that cohere is set up correctly "
+            "(e.g. pip install SpeechRecognition[cohere-api])."
+        ) from err
+
+    if not isinstance(audio_data, AudioData):
+        raise ValueError("``audio_data`` must be an ``AudioData`` instance")
+
+    wav_data = BytesIO(audio_data.get_wav_data())
+    wav_data.name = "SpeechRecognition_audio.wav"
+    # No API key is passed here: credentials are left to the Cohere SDK.
+    client = cohere.ClientV2()
+    logger.debug(
+        "cohere audio.transcriptions.create: model=%r language=%r",
+        model,
+        language,
+    )
+    response = client.audio.transcriptions.create(
+        model=model,
+        file=wav_data,
+        language=language,
+    )
+    return response.text
diff --git a/tests/recognizers/test_cohere_api.py b/tests/recognizers/test_cohere_api.py
@@ -0,0 +1,50 @@
+from unittest.mock import MagicMock, patch
+
+from speech_recognition import AudioData, Recognizer
+from speech_recognition.recognizers import cohere_api
+
+
+@patch("cohere.ClientV2")
+def test_transcribe_default_model(mock_client_cls):
+    """``recognize`` sends the default model and returns the response text."""
+    create = mock_client_cls.return_value.audio.transcriptions.create
+    create.return_value.text = "Transcription by Cohere"
+
+    audio_data = MagicMock(spec=AudioData)
+    audio_data.get_wav_data.return_value = b"fake_wav"
+
+    actual = cohere_api.recognize(
+        MagicMock(spec=Recognizer), audio_data, language="en"
+    )
+
+    assert actual == "Transcription by Cohere"
+    audio_data.get_wav_data.assert_called_once()
+    mock_client_cls.assert_called_once_with()
+    create.assert_called_once()
+    call_kw = create.call_args.kwargs
+    assert call_kw["model"] == "cohere-transcribe-03-2026"
+    assert call_kw["language"] == "en"
+    # Check the uploaded payload itself, not just that a ``file`` was passed.
+    assert call_kw["file"].getvalue() == b"fake_wav"
+    assert call_kw["file"].name == "SpeechRecognition_audio.wav"
+
+
+@patch("cohere.ClientV2")
+def test_transcribe_with_language(mock_client_cls):
+    """The requested ``language`` (here ``"ja"``) is passed to the client."""
+    # Stub the client that ``cohere.ClientV2()`` returns with a canned reply.
+    create = mock_client_cls.return_value.audio.transcriptions.create
+    create.return_value.text = "Japanese transcription"
+
+    fake_audio = MagicMock(spec=AudioData)
+    fake_audio.get_wav_data.return_value = b"fake_wav"
+
+    actual = cohere_api.recognize(
+        MagicMock(spec=Recognizer), fake_audio, language="ja"
+    )
+
+    assert actual == "Japanese transcription"
+    sent = create.call_args.kwargs
+    # The default model is still used when only the language differs.
+    assert sent["model"] == "cohere-transcribe-03-2026"
+    assert sent["language"] == "ja"

Original file line number	Diff line number	Diff line change
`@@ -75,6 +75,9 @@ groq = [`
`75`	`75`	`"groq",`
`76`	`76`	`"httpx < 0.28",`
`77`	`77`	`]`
	`78`	`+cohere-api = [`
	`79`	`+ "cohere>=5.21.0",`
	`80`	`+]`
`78`	`81`	`assemblyai = ["requests"]`
`79`	`82`	`vosk = ["vosk"]`
`80`	`83`