feat: add speech-to-text for audio (WAV) via partition_audio and optional Whisper STT agent

claytonlin1110 · claytonlin1110 · commit 9b75c94c2149 · 2026-02-24T13:30:13.000-06:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.21.7
+
+### Enhancements
+- **Add speech-to-text to multimodal pipeline**: Audio files (WAV) can now be partitioned into document elements via speech-to-text. Install the optional `audio` extra (`pip install "unstructured[audio]"`) to use the Whisper-based partitioner. Call `partition()` or `partition_audio()` with a WAV file to get the full transcript as a single `NarrativeText` element (no element is produced when the transcript is empty). The `STT_AGENT` environment variable selects the speech-to-text implementation (default: Whisper).
+
 ## 0.21.6
 
 ### Enhancements
diff --git a/pyproject.toml b/pyproject.toml
@@ -111,8 +111,12 @@ xlsx = [
     "pandas>=2.0.0, <4.0.0",
     "xlrd>=2.0.1, <3.0.0",
 ]
+# Speech-to-text for partition_audio (multimodal: audio -> elements)
+audio = [
+    "openai-whisper>=20231117, <20260000",
+]
 all-docs = [
-    "unstructured[csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",
+    "unstructured[audio,csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",
 ]
 # Feature extras
 chunking-tokens = [
diff --git a/test_unstructured/partition/test_audio.py b/test_unstructured/partition/test_audio.py
@@ -0,0 +1,94 @@
+# pyright: reportPrivateUsage=false
+
+"""Tests for partition_audio (speech-to-text in multimodal pipeline)."""
+
+from __future__ import annotations
+
+import tempfile
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+from unstructured.documents.elements import NarrativeText
+from unstructured.file_utils.model import FileType
+from unstructured.partition.audio import partition_audio
+
+
+def test_partition_audio_raises_with_neither_filename_nor_file():
+    with pytest.raises(ValueError, match="Exactly one of .* must be specified"):
+        partition_audio()
+
+
+def test_partition_audio_raises_with_both_filename_and_file():
+    with pytest.raises(ValueError, match="Exactly one of .* must be specified"):
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            partition_audio(filename=tmp.name, file=tmp)
+
+
+@patch(
+    "unstructured.partition.audio.SpeechToTextAgent.get_instance",
+)
+def test_partition_audio_from_filename_returns_transcript_elements(mock_get_instance):
+    mock_agent = mock_get_instance.return_value
+    mock_agent.transcribe.return_value = "Hello, this is a test transcript."
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        path = tmp.name
+        tmp.write(b"\x00" * 44)  # minimal WAV-like header
+        tmp.flush()
+
+    try:
+        elements = partition_audio(filename=path)
+    finally:
+        Path(path).unlink(missing_ok=True)
+
+    assert len(elements) == 1
+    assert isinstance(elements[0], NarrativeText)
+    assert elements[0].text == "Hello, this is a test transcript."
+    assert elements[0].metadata.detection_origin == "speech_to_text"
+    mock_agent.transcribe.assert_called_once_with(path, language=None)
+
+
+@patch(
+    "unstructured.partition.audio.SpeechToTextAgent.get_instance",
+)
+def test_partition_audio_from_file_uses_temp_path_and_cleans_up(mock_get_instance):
+    mock_agent = mock_get_instance.return_value
+    mock_agent.transcribe.return_value = "From file object."
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        tmp.write(b"\x00" * 44)
+        tmp.flush()
+        tmp.seek(0)
+        elements = partition_audio(file=tmp, metadata_filename="recording.wav")
+
+    assert len(elements) == 1
+    assert elements[0].text == "From file object."
+    assert elements[0].metadata.filename == "recording.wav"
+
+
+@patch(
+    "unstructured.partition.audio.SpeechToTextAgent.get_instance",
+)
+def test_partition_audio_empty_transcript_returns_empty_list(mock_get_instance):
+    mock_agent = mock_get_instance.return_value
+    mock_agent.transcribe.return_value = "   "
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        path = tmp.name
+        tmp.write(b"\x00" * 44)
+        tmp.flush()
+
+    try:
+        elements = partition_audio(filename=path)
+    finally:
+        Path(path).unlink(missing_ok=True)
+
+    assert elements == []
+
+
+def test_wav_file_type_is_partitionable():
+    assert FileType.WAV.is_partitionable
+    assert FileType.WAV.partitioner_shortname == "audio"
+    assert FileType.WAV.partitioner_function_name == "partition_audio"
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.21.6"  # pragma: no cover
+__version__ = "0.21.7"  # pragma: no cover
diff --git a/unstructured/file_utils/model.py b/unstructured/file_utils/model.py
@@ -441,9 +441,9 @@ def partitioner_shortname(self) -> str | None:
     )
     WAV = (
         "wav",
-        None,
-        cast(list[str], []),
-        None,
+        "audio",
+        ["whisper"],
+        "audio",
         [".wav"],
         "audio/wav",
         [
diff --git a/unstructured/partition/audio.py b/unstructured/partition/audio.py
@@ -0,0 +1,95 @@
+"""Partition audio files into elements using speech-to-text transcription."""
+
+from __future__ import annotations
+
+import tempfile
+from pathlib import Path
+from typing import IO, Any
+
+from unstructured.chunking import add_chunking_strategy
+from unstructured.documents.elements import Element, NarrativeText
+from unstructured.file_utils.model import FileType
+from unstructured.partition.common.common import exactly_one
+from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
+from unstructured.partition.utils.config import env_config
+from unstructured.partition.utils.speech_to_text.speech_to_text_interface import (
+    SpeechToTextAgent,
+)
+
+
+@apply_metadata(FileType.WAV)
+@add_chunking_strategy
+def partition_audio(
+    filename: str | None = None,
+    *,
+    file: IO[bytes] | None = None,
+    language: str | None = None,
+    stt_agent: str | None = None,
+    metadata_filename: str | None = None,
+    metadata_last_modified: str | None = None,
+    **kwargs: Any,
+) -> list[Element]:
+    """Partition an audio file (e.g. WAV) into elements using speech-to-text.
+
+    Transcribes the audio and returns a single NarrativeText element containing
+    the full transcript. Requires the optional `audio` extra with Whisper:
+    ``pip install "unstructured[audio]"``.
+
+    Parameters
+    ----------
+    filename
+        Path to the audio file.
+    file
+        File-like object opened in binary mode (e.g. ``open("audio.wav", "rb")``).
+    language
+        Optional ISO 639-1 language code for the spoken language (e.g. "en").
+        When None, the speech-to-text agent may auto-detect.
+    stt_agent
+        Optional fully-qualified class name of the SpeechToTextAgent implementation.
+        Defaults to the Whisper agent when the audio extra is installed.
+    metadata_filename
+        Filename to store in element metadata when partitioning from a file object.
+    metadata_last_modified
+        Last modified date to store in element metadata.
+    """
+    exactly_one(filename=filename, file=file)
+
+    audio_path: str
+    if filename is not None:
+        audio_path = filename
+    else:
+        if file is None:
+            raise ValueError("Either filename or file must be provided.")
+        file.seek(0)
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            tmp.write(file.read())
+            audio_path = tmp.name
+
+    try:
+        agent_module = stt_agent or env_config.STT_AGENT
+        agent = SpeechToTextAgent.get_instance(agent_module)
+        text = agent.transcribe(audio_path, language=language)
+    finally:
+        if filename is None and audio_path.startswith(tempfile.gettempdir()):
+            Path(audio_path).unlink(missing_ok=True)
+
+    if not text.strip():
+        return []
+
+    metadata_kwargs: dict[str, Any] = {}
+    if metadata_filename:
+        metadata_kwargs["filename"] = metadata_filename
+    elif filename:
+        metadata_kwargs["filename"] = filename
+    if metadata_last_modified:
+        metadata_kwargs["last_modified"] = metadata_last_modified
+    elif filename:
+        last_modified = get_last_modified_date(filename)
+        if last_modified:
+            metadata_kwargs["last_modified"] = last_modified
+
+    element = NarrativeText(text=text)
+    element.metadata.detection_origin = "speech_to_text"
+    element.metadata.update(element.metadata.__class__(**metadata_kwargs))
+
+    return [element]
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
@@ -116,6 +116,13 @@ def OCR_AGENT_CACHE_SIZE(self) -> int:
         """Maximum number of OCR agents to cache per process"""
         return self._get_int("OCR_AGENT_CACHE_SIZE", 1)
 
+    @property
+    def STT_AGENT(self) -> str:
+        """Speech-to-text agent module for partition_audio (e.g. Whisper)."""
+        from unstructured.partition.utils.constants import STT_AGENT_WHISPER
+
+        return self._get_string("STT_AGENT", STT_AGENT_WHISPER)
+
     @property
     def EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD(self) -> int:
         """extra image block content to add around an identified element(`Image`, `Table`) region
diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py
@@ -41,6 +41,15 @@ class PartitionStrategy:
     "unstructured.partition.utils.ocr_models.google_vision_ocr",
 ).split(",")
 
+# Speech-to-text agent (used by partition_audio)
+STT_AGENT_WHISPER = "unstructured.partition.utils.speech_to_text.whisper_stt.SpeechToTextAgentWhisper"
+STT_AGENT_MODULES_WHITELIST = (
+    os.getenv(
+        "STT_AGENT_MODULES_WHITELIST",
+        "unstructured.partition.utils.speech_to_text.whisper_stt",
+    ).split(",")
+)
+
 UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)
 
 # this field is defined by unstructured_pytesseract
diff --git a/unstructured/partition/utils/speech_to_text/__init__.py b/unstructured/partition/utils/speech_to_text/__init__.py
@@ -0,0 +1,7 @@
+"""Speech-to-text agents for transcribing audio in the multimodal partition pipeline."""
+
+from unstructured.partition.utils.speech_to_text.speech_to_text_interface import (
+    SpeechToTextAgent,
+)
+
+__all__ = ["SpeechToTextAgent"]
diff --git a/unstructured/partition/utils/speech_to_text/speech_to_text_interface.py b/unstructured/partition/utils/speech_to_text/speech_to_text_interface.py
@@ -0,0 +1,61 @@
+"""Abstract interface for speech-to-text (STT) agents used by the audio partitioner."""
+
+from __future__ import annotations
+
+import functools
+import importlib
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
+
+from unstructured.logger import logger
+from unstructured.partition.utils.constants import STT_AGENT_MODULES_WHITELIST
+
+if TYPE_CHECKING:
+    pass
+
+
+class SpeechToTextAgent(ABC):
+    """Defines the interface for a speech-to-text transcription service."""
+
+    @staticmethod
+    @functools.lru_cache(maxsize=1)
+    def get_instance(agent_module: str) -> "SpeechToTextAgent":
+        """Load and return the configured SpeechToTextAgent implementation.
+
+        The implementation is determined by the `STT_AGENT` environment variable
+        or the passed `agent_module` (e.g. whisper implementation).
+        """
+        module_name, class_name = agent_module.rsplit(".", 1)
+        if module_name not in STT_AGENT_MODULES_WHITELIST:
+            raise ValueError(
+                f"Speech-to-text agent module {module_name} must be in the whitelist: "
+                f"{STT_AGENT_MODULES_WHITELIST}."
+            )
+        try:
+            mod = importlib.import_module(module_name)
+            cls = getattr(mod, class_name)
+            return cls()
+        except (ImportError, AttributeError) as e:
+            logger.error(f"Failed to get SpeechToTextAgent instance: {e}")
+            raise RuntimeError(
+                "Could not load the SpeechToText agent. Install the audio extra: "
+                'pip install "unstructured[audio]"'
+            ) from e
+
+    @abstractmethod
+    def transcribe(self, audio_path: str, *, language: str | None = None) -> str:
+        """Transcribe audio from a file path to text.
+
+        Parameters
+        ----------
+        audio_path
+            Path to an audio file (e.g. WAV, MP3).
+        language
+            Optional ISO 639-1 language code for the spoken language (e.g. "en").
+            When None, the agent may auto-detect.
+
+        Returns
+        -------
+        Transcribed text.
+        """
+        pass
diff --git a/unstructured/partition/utils/speech_to_text/whisper_stt.py b/unstructured/partition/utils/speech_to_text/whisper_stt.py
@@ -0,0 +1,32 @@
+"""Whisper-based speech-to-text agent for the audio partitioner."""
+
+from __future__ import annotations
+
+from unstructured.partition.utils.speech_to_text.speech_to_text_interface import (
+    SpeechToTextAgent,
+)
+
+
+class SpeechToTextAgentWhisper(SpeechToTextAgent):
+    """Speech-to-text implementation using OpenAI Whisper."""
+
+    def __init__(self, model_size: str = "base") -> None:
+        """Initialize the Whisper model.
+
+        Parameters
+        ----------
+        model_size
+            Whisper model size: "tiny", "base", "small", "medium", "large", or "large-v3".
+            Larger models are more accurate but slower and use more memory.
+        """
+        import whisper
+
+        self._model = whisper.load_model(model_size)
+
+    def transcribe(self, audio_path: str, *, language: str | None = None) -> str:
+        """Transcribe audio file to text using Whisper."""
+        options: dict = {}
+        if language is not None:
+            options["language"] = language
+        result = self._model.transcribe(audio_path, **options)
+        return result.get("text", "").strip()

Original file line number	Diff line number	Diff line change
`@@ -111,8 +111,12 @@ xlsx = [`
`111`	`111`	`"pandas>=2.0.0, <4.0.0",`
`112`	`112`	`"xlrd>=2.0.1, <3.0.0",`
`113`	`113`	`]`
	`114`	`+# Speech-to-text for partition_audio (multimodal: audio -> elements)`
	`115`	`+audio = [`
	`116`	`+ "openai-whisper>=20231117, <20260000",`
	`117`	`+]`
`114`	`118`	`all-docs = [`
`115`		`- "unstructured[csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",`
	`119`	`+ "unstructured[audio,csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",`
`116`	`120`	`]`
`117`	`121`	`# Feature extras`
`118`	`122`	`chunking-tokens = [`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.21.6" # pragma: no cover`
	`1`	`+__version__ = "0.21.7" # pragma: no cover`
Original file line number	Diff line number	Diff line change
`@@ -441,9 +441,9 @@ def partitioner_shortname(self) -> str \| None:`
`441`	`441`	`)`
`442`	`442`	`WAV = (`
`443`	`443`	`"wav",`
`444`		`- None,`
`445`		`- cast(list[str], []),`
`446`		`- None,`
	`444`	`+ "audio",`
	`445`	`+ ["whisper"],`
	`446`	`+ "audio",`
`447`	`447`	`[".wav"],`
`448`	`448`	`"audio/wav",`
`449`	`449`	`[`