Remove `seed` argument from LongCatAudioDiT pipeline in favor of `generator`; update docs

RuixiangMa · RuixiangMa · commit e2ac8bd2f9cf · 2026-04-16T10:25:38.000+08:00
Signed-off-by: Lancer <maruixiang6688@gmail.com>
diff --git a/docs/source/en/api/pipelines/longcat_audio_dit.md b/docs/source/en/api/pipelines/longcat_audio_dit.md
@@ -46,8 +46,9 @@ sf.write("longcat.wav", audio, pipeline.sample_rate)
 ## Tips
 
 - `audio_duration_s` is the most direct way to control output duration.
-- `seed` makes generation reproducible (optional, defaults to None).
+- Use `generator=torch.Generator("cuda").manual_seed(42)` to make generation reproducible.
 - Output shape is `(batch, channels, samples)` - use `.audios[0, 0]` to get a single audio sample.
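+- The pipeline outputs mono audio (1 channel). If you need stereo, duplicate the channel axis of the `(batch, channels, samples)` output: `np.repeat(audios, 2, axis=1)` for the default NumPy output, or `audios.repeat(1, 2, 1)` when using `output_type="pt"`.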
 
 ## LongCatAudioDiTPipeline
 
diff --git a/src/diffusers/pipelines/longcat_audio_dit/pipeline_longcat_audio_dit.py b/src/diffusers/pipelines/longcat_audio_dit/pipeline_longcat_audio_dit.py
@@ -43,9 +43,7 @@
         >>> pipe.to("cuda")
 
         >>> prompt = "A calm ocean wave ambience with soft wind in the background."
-        >>> audio = pipe(prompt, audio_duration_s=5.0, num_inference_steps=20, guidance_scale=4.0, seed=42).audios[
-        ...     0, 0
-        ... ]
+        >>> audio = pipe(prompt, audio_duration_s=5.0, num_inference_steps=20, guidance_scale=4.0, generator=torch.Generator("cuda").manual_seed(42)).audios[0, 0]
         >>> sf.write("output.wav", audio, pipe.sample_rate)
         ```
 """
@@ -240,7 +238,6 @@ def __call__(
                 Pre-generated noisy latents of shape `(batch_size, duration, latent_dim)`.
             num_inference_steps (`int`, defaults to 16): Number of denoising steps.
             guidance_scale (`float`, defaults to 4.0): Guidance scale for classifier-free guidance.
-            seed (`int`, *optional*): A seed used to make generation deterministic.
             generator (`torch.Generator` or `list[torch.Generator]`, *optional*): Random generator(s).
             output_type (`str`, defaults to `"np"`): Output format: `"np"`, `"pt"`, or `"latent"`.
             return_dict (`bool`, defaults to `True`): Whether to return `AudioPipelineOutput`.
@@ -252,10 +249,6 @@ def __call__(
 
         Examples:
         """
-        # Create generator from seed if provided
-        if generator is None and seed is not None:
-            generator = torch.Generator(device=self.device).manual_seed(seed)
-
         if prompt is None:
             prompt = []
         elif isinstance(prompt, str):