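fix: harden partition_audio temp-file cleanup, auto-detect WHISPER_FP16, widen openai-whisper pin, and expand audio tests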

claytonlin1110 · claytonlin1110 · commit 282431b12e96 · 2026-02-26T16:42:35.000-06:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -113,7 +113,7 @@ xlsx = [
 ]
 # Speech-to-text for partition_audio (multimodal: audio -> elements)
 audio = [
-    "openai-whisper>=20231117, <20260000",
+    "openai-whisper>=20231117, <20270000",
 ]
 all-docs = [
     "unstructured[audio,csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",
diff --git a/test_unstructured/partition/test_audio.py b/test_unstructured/partition/test_audio.py
@@ -22,16 +22,21 @@ def test_partition_audio_raises_with_neither_filename_nor_file():
 
 
 def test_partition_audio_raises_with_both_filename_and_file():
-    with pytest.raises(ValueError, match="Exactly one of .* must be specified"):
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-            partition_audio(filename=tmp.name, file=tmp)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        path = tmp.name
+    try:
+        with pytest.raises(ValueError, match="Exactly one of .* must be specified"):
+            with open(path, "rb") as f:
+                partition_audio(filename=path, file=f)
+    finally:
+        Path(path).unlink(missing_ok=True)
 
 
 @patch(
-    "unstructured.partition.audio.SpeechToTextAgent.get_instance",
+    "unstructured.partition.audio.SpeechToTextAgent.get_agent",
 )
-def test_partition_audio_from_filename_returns_transcript_elements(mock_get_instance):
-    mock_agent = mock_get_instance.return_value
+def test_partition_audio_from_filename_returns_transcript_elements(mock_get_agent):
+    mock_agent = mock_get_agent.return_value
     mock_agent.transcribe_segments.return_value = [
         {"text": "Hello, this is a test transcript.", "start": 0.0, "end": 2.5},
     ]
@@ -52,34 +57,73 @@ def test_partition_audio_from_filename_returns_transcript_elements(mock_get_inst
     assert elements[0].metadata.detection_origin == "speech_to_text"
     assert elements[0].metadata.segment_start_seconds == 0.0
     assert elements[0].metadata.segment_end_seconds == 2.5
+    mock_get_agent.assert_called_once_with(None)
     mock_agent.transcribe_segments.assert_called_once_with(path, language=None)
 
 
 @patch(
-    "unstructured.partition.audio.SpeechToTextAgent.get_instance",
+    "unstructured.partition.audio.SpeechToTextAgent.get_agent",
 )
-def test_partition_audio_from_file_uses_temp_path_and_cleans_up(mock_get_instance):
-    mock_agent = mock_get_instance.return_value
+def test_partition_audio_from_file_uses_temp_path_and_cleans_up(mock_get_agent):
+    mock_agent = mock_get_agent.return_value
     mock_agent.transcribe_segments.return_value = [
         {"text": "From file object.", "start": 0.0, "end": 1.0},
     ]
 
+    captured_temp_path: list[str] = []
+    real_named_temp = tempfile.NamedTemporaryFile
+
+    def spy_named_temp(*args, **kwargs):
+        ctx = real_named_temp(*args, **kwargs)
+        captured_temp_path.append(ctx.name)
+        return ctx
+
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         tmp.write(b"\x00" * 44)
         tmp.flush()
         tmp.seek(0)
-        elements = partition_audio(file=tmp, metadata_filename="recording.wav")
+        with patch("unstructured.partition.audio.tempfile.NamedTemporaryFile", spy_named_temp):
+            elements = partition_audio(file=tmp, metadata_filename="recording.wav")
 
     assert len(elements) == 1
     assert elements[0].text == "From file object."
     assert elements[0].metadata.filename == "recording.wav"
+    assert len(captured_temp_path) == 1, "expected exactly one temp file to be created"
+    assert not Path(captured_temp_path[0]).exists(), "temp file was not deleted after partitioning"
+
+
+@patch(
+    "unstructured.partition.audio.SpeechToTextAgent.get_agent",
+)
+def test_partition_audio_cleans_up_temp_file_when_transcription_raises(mock_get_agent):
+    mock_agent = mock_get_agent.return_value
+    mock_agent.transcribe_segments.side_effect = RuntimeError("transcription failed")
+
+    captured_temp_path: list[str] = []
+    real_named_temp = tempfile.NamedTemporaryFile
+
+    def spy_named_temp(*args, **kwargs):
+        ctx = real_named_temp(*args, **kwargs)
+        captured_temp_path.append(ctx.name)
+        return ctx
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        tmp.write(b"\x00" * 44)
+        tmp.flush()
+        tmp.seek(0)
+        with patch("unstructured.partition.audio.tempfile.NamedTemporaryFile", spy_named_temp):
+            with pytest.raises(RuntimeError, match="transcription failed"):
+                partition_audio(file=tmp)
+
+    assert len(captured_temp_path) == 1, "expected exactly one temp file to be created"
+    assert not Path(captured_temp_path[0]).exists(), "temp file was not deleted after exception"
 
 
 @patch(
-    "unstructured.partition.audio.SpeechToTextAgent.get_instance",
+    "unstructured.partition.audio.SpeechToTextAgent.get_agent",
 )
-def test_partition_audio_empty_transcript_returns_empty_list(mock_get_instance):
-    mock_agent = mock_get_instance.return_value
+def test_partition_audio_empty_transcript_returns_empty_list(mock_get_agent):
+    mock_agent = mock_get_agent.return_value
     mock_agent.transcribe_segments.return_value = []
 
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
@@ -96,10 +140,10 @@ def test_partition_audio_empty_transcript_returns_empty_list(mock_get_instance):
 
 
 @patch(
-    "unstructured.partition.audio.SpeechToTextAgent.get_instance",
+    "unstructured.partition.audio.SpeechToTextAgent.get_agent",
 )
-def test_partition_audio_returns_one_element_per_segment(mock_get_instance):
-    mock_agent = mock_get_instance.return_value
+def test_partition_audio_returns_one_element_per_segment(mock_get_agent):
+    mock_agent = mock_get_agent.return_value
     mock_agent.transcribe_segments.return_value = [
         {"text": "First segment.", "start": 0.0, "end": 1.0},
         {"text": "Second segment.", "start": 1.0, "end": 2.5},
@@ -153,3 +197,152 @@ def test_wav_file_type_is_partitionable():
     assert FileType.WAV.is_partitionable
     assert FileType.WAV.partitioner_shortname == "audio"
     assert FileType.WAV.partitioner_function_name == "partition_audio"
+
+
+# ================================================================================================
+# partition_audio parameter forwarding
+# ================================================================================================
+
+
+@patch("unstructured.partition.audio.SpeechToTextAgent.get_agent")
+def test_partition_audio_forwards_custom_stt_agent_to_get_agent(mock_get_agent):
+    mock_agent = mock_get_agent.return_value
+    mock_agent.transcribe_segments.return_value = [
+        {"text": "Custom agent output.", "start": 0.0, "end": 1.0},
+    ]
+    custom_module = (
+        "unstructured.partition.utils.speech_to_text.whisper_stt.SpeechToTextAgentWhisper"
+    )
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        path = tmp.name
+        tmp.write(b"\x00" * 44)
+        tmp.flush()
+
+    try:
+        partition_audio(filename=path, stt_agent=custom_module)
+    finally:
+        Path(path).unlink(missing_ok=True)
+
+    mock_get_agent.assert_called_once_with(custom_module)
+
+
+@patch("unstructured.partition.audio.SpeechToTextAgent.get_agent")
+def test_partition_audio_forwards_language_to_transcribe_segments(mock_get_agent):
+    mock_agent = mock_get_agent.return_value
+    mock_agent.transcribe_segments.return_value = [
+        {"text": "Hola mundo.", "start": 0.0, "end": 1.5},
+    ]
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        path = tmp.name
+        tmp.write(b"\x00" * 44)
+        tmp.flush()
+
+    try:
+        elements = partition_audio(filename=path, language="es")
+    finally:
+        Path(path).unlink(missing_ok=True)
+
+    mock_agent.transcribe_segments.assert_called_once_with(path, language="es")
+    assert elements[0].text == "Hola mundo."
+
+
+# ================================================================================================
+# Whitespace-only segment filtering
+# ================================================================================================
+
+
+@patch("unstructured.partition.audio.SpeechToTextAgent.get_agent")
+def test_partition_audio_filters_whitespace_only_segments(mock_get_agent):
+    mock_agent = mock_get_agent.return_value
+    mock_agent.transcribe_segments.return_value = [
+        {"text": "  ", "start": 0.0, "end": 0.5},
+        {"text": "Real content.", "start": 0.5, "end": 2.0},
+        {"text": "\t\n", "start": 2.0, "end": 2.5},
+    ]
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        path = tmp.name
+        tmp.write(b"\x00" * 44)
+        tmp.flush()
+
+    try:
+        elements = partition_audio(filename=path)
+    finally:
+        Path(path).unlink(missing_ok=True)
+
+    assert len(elements) == 1
+    assert elements[0].text == "Real content."
+
+
+# ================================================================================================
+# SpeechToTextAgent unit tests
+# ================================================================================================
+
+
+class TestSpeechToTextAgentInterface:
+    """Unit tests for the SpeechToTextAgent base class."""
+
+    def test_get_agent_uses_env_config_when_no_module_given(self):
+        from unittest.mock import patch as _patch
+
+        from unstructured.partition.utils.speech_to_text.speech_to_text_interface import (
+            SpeechToTextAgent,
+        )
+
+        with _patch.object(SpeechToTextAgent, "get_instance") as mock_get_instance:
+            SpeechToTextAgent.get_agent(None)
+            called_with = mock_get_instance.call_args[0][0]
+            assert "SpeechToTextAgent" in called_with or "Whisper" in called_with
+
+    def test_get_agent_passes_explicit_module_to_get_instance(self):
+        from unittest.mock import patch as _patch
+
+        from unstructured.partition.utils.speech_to_text.speech_to_text_interface import (
+            SpeechToTextAgent,
+        )
+
+        custom = "unstructured.partition.utils.speech_to_text.whisper_stt.SpeechToTextAgentWhisper"
+        with _patch.object(SpeechToTextAgent, "get_instance") as mock_get_instance:
+            SpeechToTextAgent.get_agent(custom)
+            mock_get_instance.assert_called_once_with(custom)
+
+    def test_get_instance_rejects_non_whitelisted_module(self):
+        from unstructured.partition.utils.speech_to_text.speech_to_text_interface import (
+            SpeechToTextAgent,
+        )
+
+        with pytest.raises(ValueError, match="must be in the whitelist"):
+            SpeechToTextAgent.get_instance("evil.module.EvilAgent")
+
+    def test_transcribe_segments_default_delegates_to_transcribe(self):
+        """Base transcribe_segments() wraps transcribe() in a single segment."""
+
+        from unstructured.partition.utils.speech_to_text.speech_to_text_interface import (
+            SpeechToTextAgent,
+        )
+
+        # Create a minimal concrete subclass
+        class _StubAgent(SpeechToTextAgent):
+            def transcribe(self, audio_path: str, *, language=None) -> str:
+                return "stub text"
+
+        agent = _StubAgent()
+        segments = agent.transcribe_segments("fake.wav")
+        assert len(segments) == 1
+        assert segments[0]["text"] == "stub text"
+        assert segments[0]["start"] == 0.0
+        assert segments[0]["end"] == 0.0
+
+    def test_transcribe_segments_default_returns_empty_for_blank_text(self):
+        from unstructured.partition.utils.speech_to_text.speech_to_text_interface import (
+            SpeechToTextAgent,
+        )
+
+        class _BlankAgent(SpeechToTextAgent):
+            def transcribe(self, audio_path: str, *, language=None) -> str:
+                return "   "
+
+        agent = _BlankAgent()
+        assert agent.transcribe_segments("fake.wav") == []
diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
@@ -527,6 +527,10 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
             "text_as_html": cls.STRING_CONCATENATE,
             "table_as_cells": cls.FIRST,  # -- only occurs in Table --
             "url": cls.FIRST,
+            # TODO: ideally a chunk spanning multiple audio segments would keep min(start) and
+            # max(end) across its constituent elements. ConsolidationStrategy currently has no
+            # MIN/MAX variants, so DROP is the safe fallback for now. Add MIN/MAX strategies
+            # and switch these to cls.MIN / cls.MAX when that work is done.
             "segment_start_seconds": cls.DROP,
             "segment_end_seconds": cls.DROP,
             "key_value_pairs": cls.DROP,  # -- only occurs in FormKeysValues --
diff --git a/unstructured/partition/audio.py b/unstructured/partition/audio.py
@@ -15,6 +15,7 @@
 from unstructured.partition.utils.speech_to_text.speech_to_text_interface import (
     SpeechToTextAgent,
 )
+from unstructured.utils import is_temp_file_path
 
 
 @apply_metadata(FileType.WAV)
@@ -70,16 +71,13 @@ def partition_audio(
         agent = SpeechToTextAgent.get_agent(stt_agent)
         segments = agent.transcribe_segments(audio_path, language=language)
     finally:
-        if filename is None and audio_path.startswith(tempfile.gettempdir()):
+        if filename is None and is_temp_file_path(audio_path):
             Path(audio_path).unlink(missing_ok=True)
 
     if not segments:
         return []
 
-    base_metadata = ElementMetadata(
-        last_modified=get_last_modified_date(filename) if filename else None,
-    )
-    base_metadata.detection_origin = "speech_to_text"
+    last_modified = get_last_modified_date(filename) if filename else None
 
     elements: list[Element] = []
     for seg in segments:
@@ -88,7 +86,7 @@ def partition_audio(
             continue
         element = NarrativeText(text=text)
         element.metadata = ElementMetadata(
-            last_modified=base_metadata.last_modified,
+            last_modified=last_modified,
             segment_start_seconds=seg["start"],
             segment_end_seconds=seg["end"],
         )
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
@@ -13,7 +13,7 @@
 from pathlib import Path
 from typing import Optional
 
-from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT
+from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT, STT_AGENT_WHISPER
 
 
 @lru_cache(maxsize=1)
@@ -124,8 +124,6 @@ def STT_AGENT_CACHE_SIZE(self) -> int:
     @property
     def STT_AGENT(self) -> str:
         """Speech-to-text agent module for partition_audio (e.g. Whisper)."""
-        from unstructured.partition.utils.constants import STT_AGENT_WHISPER
-
         return self._get_string("STT_AGENT", STT_AGENT_WHISPER)
 
     @property
@@ -149,12 +147,22 @@ def WHISPER_DEVICE(self) -> str:
 
     @property
     def WHISPER_FP16(self) -> bool:
-        """Use FP16 for Whisper transcription when True (default).
+        """Use FP16 for Whisper transcription.
 
-        FP16 gives roughly 2x GPU speedup on CUDA with minimal quality impact.
-        Set WHISPER_FP16=false to disable (e.g. for CPU or compatibility).
+        FP16 gives roughly 2x GPU speedup on CUDA with minimal quality impact, but is
+        unsupported on CPU (Whisper warns and falls back to FP32). The default is auto-detected:
+        True when a CUDA GPU is available, False otherwise.
+        Set WHISPER_FP16=true/false explicitly to override.
         """
-        return self._get_bool("WHISPER_FP16", True)
+        env_val = self._get_string("WHISPER_FP16")
+        if env_val:
+            return env_val.lower() in ("true", "1", "t")
+        try:
+            import torch
+
+            return bool(torch.cuda.is_available())
+        except ImportError:
+            return False
 
     @property
     def EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD(self) -> int:
diff --git a/unstructured/partition/utils/speech_to_text/speech_to_text_interface.py b/unstructured/partition/utils/speech_to_text/speech_to_text_interface.py
diff --git a/unstructured/partition/utils/speech_to_text/whisper_stt.py b/unstructured/partition/utils/speech_to_text/whisper_stt.py

Original file line number	Diff line number	Diff line change
`@@ -113,7 +113,7 @@ xlsx = [`
`113`	`113`	`]`
`114`	`114`	`# Speech-to-text for partition_audio (multimodal: audio -> elements)`
`115`	`115`	`audio = [`
`116`		`- "openai-whisper>=20231117, <20260000",`
	`116`	`+ "openai-whisper>=20231117, <20270000",`
`117`	`117`	`]`
`118`	`118`	`all-docs = [`
`119`	`119`	`"unstructured[audio,csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",`