temp commit

mi804 · mi804 · commit 8b0c7fca5e1e · 2026-03-11T10:23:44.000+08:00
diff --git a/diffsynth/utils/data/media_io_ltx2.py b/diffsynth/utils/data/media_io_ltx2.py
@@ -7,6 +7,7 @@
 import numpy as np
 from io import BytesIO
 from collections.abc import Generator, Iterator
+import torchaudio
 
 
 def _resample_audio(
@@ -137,6 +138,15 @@ def write_video_audio_ltx2(
     container.close()
 
 
+def read_audio_with_torchaudio(path: str, start_time: float = 0, duration: float | None = None) -> torch.Tensor:
+    """Load an audio file with torchaudio and return the requested time window.
+
+    Args:
+        path: Location of the audio file, passed straight to ``torchaudio.load``.
+        start_time: Offset of the window from the beginning of the file, in seconds.
+        duration: Length of the window in seconds, counted from ``start_time``.
+            ``None`` reads through to the end of the file.
+
+    Returns:
+        The waveform sliced along its last axis. The file is loaded with
+        ``channels_first=True``, so the last axis is expected to be time
+        (see the torchaudio documentation). The sample rate is not returned.
+
+    Raises:
+        ValueError: If ``start_time`` or ``duration`` is negative, or if
+            ``start_time`` lies beyond the end of the audio.
+    """
+    # Negative values would silently turn into "count from the end" slicing below.
+    if start_time < 0:
+        raise ValueError(f"start_time must be non-negative, got {start_time}")
+    if duration is not None and duration < 0:
+        raise ValueError(f"duration must be non-negative, got {duration}")
+    waveform, sample_rate = torchaudio.load(path, channels_first=True)
+    start_frame = int(start_time * sample_rate)
+    if start_frame > waveform.shape[-1]:
+        raise ValueError(f"start_time of {start_time} exceeds max duration of {waveform.shape[-1] / sample_rate:.2f}")
+    # An open-ended slice (None) keeps the final sample; -1 would drop it.
+    # The end index is offset by start_frame so `duration` is the window length,
+    # not an absolute position in the file. Slicing clamps an overlong window.
+    end_frame = None if duration is None else start_frame + int(duration * sample_rate)
+    return waveform[..., start_frame:end_frame]
+
+
 def encode_single_frame(output_file: str, image_array: np.ndarray, crf: float) -> None:
     container = av.open(output_file, "w", format="mp4")
     try:
diff --git a/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py b/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py
@@ -0,0 +1,59 @@
+import torch
+from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
+from diffsynth.utils.data.media_io_ltx2 import read_audio_with_torchaudio, write_video_audio_ltx2
+
+# Load the example audio clip.
+# NOTE(review): this waveform is never handed to `pipe(...)` below, so it has no
+# effect on the generated result -- confirm how the pipeline takes input audio.
+input_waveform = read_audio_with_torchaudio("data/example_video_dataset/ltx2/sing.MP3")
+
+# All four dtype settings are bfloat16; only the offload device is the CPU,
+# the remaining devices are CUDA.
+bf16 = torch.bfloat16
+vram_config = dict(
+    offload_dtype=bf16,
+    offload_device="cpu",
+    onload_dtype=bf16,
+    onload_device="cuda",
+    preparing_dtype=bf16,
+    preparing_device="cuda",
+    computation_dtype=bf16,
+    computation_device="cuda",
+)
+
+# (model_id, origin_file_pattern) pairs, in the order they are given to the pipeline.
+checkpoint_specs = [
+    ("google/gemma-3-12b-it-qat-q4_0-unquantized", "model-*.safetensors"),
+    ("Lightricks/LTX-2.3", "ltx-2.3-22b-dev.safetensors"),
+    ("Lightricks/LTX-2.3", "ltx-2.3-spatial-upscaler-x2-1.0.safetensors"),
+]
+pipe = LTX2AudioVideoPipeline.from_pretrained(
+    torch_dtype=bf16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id=repo_id, origin_file_pattern=file_pattern, **vram_config)
+        for repo_id, file_pattern in checkpoint_specs
+    ],
+    tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
+    stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-distilled-lora-384.safetensors"),
+)
+
+prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
+# The fragments carry their own trailing spaces, so they are joined with no separator.
+negative_prompt = "".join((
+    "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, ",
+    "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, ",
+    "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, ",
+    "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of ",
+    "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent ",
+    "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny ",
+    "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, ",
+    "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, ",
+    "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward ",
+    "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, ",
+    "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts.",
+))
+
+# 1024 x 1536 output, 121 frames.
+height = 2 * 512
+width = 2 * 768
+num_frames = 121
+
+generation_kwargs = dict(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    seed=43,
+    height=height,
+    width=width,
+    num_frames=num_frames,
+    tiled=True,
+    use_two_stage_pipeline=True,
+)
+video, generated_audio = pipe(**generation_kwargs)
+
+# The audio sample rate is read from the pipeline's vocoder.
+write_video_audio_ltx2(
+    video=video,
+    audio=generated_audio,
+    output_path='ltx2.3_twostage.mp4',
+    fps=24,
+    audio_sample_rate=pipe.audio_vocoder.output_sampling_rate,
+)
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,7 @@ requires-python = ">=3.10.1"
 dependencies = [
     "torch>=2.0.0",
     "torchvision",
+    "torchaudio",
     "transformers",
     "imageio",
     "imageio[ffmpeg]",