Fix speecht5_tts pipeline (huggingface#42830)

jiqing-feng · vasqu · web-flow · commit 66623a1fd62d · 2025-12-15T09:51:03.000Z
* Fix speecht5_tts pipeline

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* Update src/transformers/pipelines/text_to_audio.py

Co-authored-by: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
Co-authored-by: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com>
diff --git a/src/transformers/pipelines/text_to_audio.py b/src/transformers/pipelines/text_to_audio.py
@@ -117,8 +117,8 @@ def __init__(self, *args, vocoder=None, sampling_rate=None, **kwargs):
                 else vocoder
             )
 
-        if self.model.config.model_type in ["musicgen"]:
-            # MusicGen expect to use the tokenizer
+        if self.model.config.model_type in ["musicgen", "speecht5"]:
+            # MusicGen and SpeechT5 expect to use their tokenizer instead
             self.processor = None
 
         self.sampling_rate = sampling_rate
diff --git a/tests/pipelines/test_pipelines_text_to_audio.py b/tests/pipelines/test_pipelines_text_to_audio.py
@@ -15,6 +15,7 @@
 import unittest
 
 import numpy as np
+import torch
 
 from transformers import (
     MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING,
@@ -40,6 +41,38 @@ class TextToAudioPipelineTests(unittest.TestCase):
     model_mapping = MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING
     # for now only test text_to_waveform and not text_to_spectrogram
 
+    @require_torch
+    def test_small_speecht5_pt(self):
+        audio_generator = pipeline(task="text-to-audio", model="microsoft/speecht5_tts")
+        num_channels = 1  # model generates mono audio
+        forward_params = {
+            "do_sample": True,
+            "semantic_max_new_tokens": 5,
+            "speaker_embeddings": torch.rand(1, 512) * 0.2 - 0.1,
+        }
+
+        outputs = audio_generator("This is a test", forward_params=forward_params)
+        self.assertEqual({"audio": ANY(np.ndarray), "sampling_rate": 16000}, outputs)
+        self.assertEqual(len(outputs["audio"].shape), num_channels)
+
+        # test two examples side-by-side
+        outputs = audio_generator(["This is a test", "This is a second test"], forward_params=forward_params)
+        audio = [output["audio"] for output in outputs]
+        self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio)
+
+        # test batching, this time with parameterization in the forward pass
+        audio_generator = pipeline(task="text-to-audio", model="microsoft/speecht5_tts")
+        forward_params = {
+            "do_sample": False,
+            "max_new_tokens": 5,
+            "speaker_embeddings": torch.rand(1, 512) * 0.2 - 0.1,
+        }
+        outputs = audio_generator(
+            ["This is a test", "This is a second test"], forward_params=forward_params, batch_size=2
+        )
+        audio = [output["audio"] for output in outputs]
+        self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio)
+
     @require_torch
     def test_small_musicgen_pt(self):
         music_generator = pipeline(

Original file line number	Diff line number	Diff line change
`@@ -117,8 +117,8 @@ def __init__(self, *args, vocoder=None, sampling_rate=None, **kwargs):`
`117`	`117`	`else vocoder`
`118`	`118`	`)`
`119`	`119`
`120`		`- if self.model.config.model_type in ["musicgen"]:`
`121`		`- # MusicGen expect to use the tokenizer`
	`120`	`+ if self.model.config.model_type in ["musicgen", "speecht5"]:`
	`121`	`+ # MusicGen and SpeechT5 expect to use their tokenizer instead`
`122`	`122`	`self.processor = None`
`123`	`123`
`124`	`124`	`self.sampling_rate = sampling_rate`