|
# Two-stage LTX-2.3 audio-video generation example.
#
# Builds an LTX2AudioVideoPipeline from three checkpoints (the Gemma text
# model, the LTX-2.3 22B "dev" weights and the x2 spatial upscaler), runs it
# once with a fixed seed, and writes the returned video and audio to an MP4.
import torch
from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
from diffsynth.utils.data.media_io_ltx2 import read_audio_with_torchaudio, write_video_audio_ltx2

# NOTE(review): this clip is loaded but never handed to the pipeline, and the
# name `audio` is rebound by the pipeline's return value further down. Either
# the load is leftover code or an audio-conditioning argument is missing from
# the pipeline call -- confirm against the pipeline's signature.
audio = read_audio_with_torchaudio("data/example_video_dataset/ltx2/sing.MP3")

# Repository identifiers shared by several ModelConfig entries below.
GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
LTX_REPO = "Lightricks/LTX-2.3"

# Every entry uses bfloat16; only the offload device is the CPU, all other
# devices are CUDA.
vram_config = dict(
    offload_dtype=torch.bfloat16,
    offload_device="cpu",
    onload_dtype=torch.bfloat16,
    onload_device="cuda",
    preparing_dtype=torch.bfloat16,
    preparing_device="cuda",
    computation_dtype=torch.bfloat16,
    computation_device="cuda",
)

# (repository, file pattern) for each checkpoint loaded with `vram_config`.
checkpoints = [
    (GEMMA_REPO, "model-*.safetensors"),
    (LTX_REPO, "ltx-2.3-22b-dev.safetensors"),
    (LTX_REPO, "ltx-2.3-spatial-upscaler-x2-1.0.safetensors"),
]
model_configs = [
    ModelConfig(model_id=repo, origin_file_pattern=pattern, **vram_config)
    for repo, pattern in checkpoints
]

# The tokenizer and stage-2 LoRA configs are created without `vram_config`,
# exactly as in the original call.
pipe = LTX2AudioVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=model_configs,
    tokenizer_config=ModelConfig(model_id=GEMMA_REPO),
    stage2_lora_config=ModelConfig(
        model_id=LTX_REPO,
        origin_file_pattern="ltx-2.3-22b-distilled-lora-384.safetensors",
    ),
)

prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
# Adjacent literals are concatenated into one string; each fragment except the
# last ends with a space so the joined text stays correctly separated.
negative_prompt = (
    "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
    "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
    "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
    "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
    "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
    "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
    "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
    "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
    "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
    "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
    "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
)

# Requested output size: 1024 x 1536 (height x width), 121 frames.
height = 512 * 2
width = 768 * 2
num_frames = 121

generation_args = dict(
    prompt=prompt,
    negative_prompt=negative_prompt,
    seed=43,
    height=height,
    width=width,
    num_frames=num_frames,
    tiled=True,
    use_two_stage_pipeline=True,
)
# The pipeline returns a (video, audio) pair; `audio` is rebound here.
video, audio = pipe(**generation_args)

# The audio sample rate is read from the pipeline's vocoder rather than
# hard-coded.
write_video_audio_ltx2(
    video=video,
    audio=audio,
    output_path="ltx2.3_twostage.mp4",
    fps=24,
    audio_sample_rate=pipe.audio_vocoder.output_sampling_rate,
)
0 commit comments