Skip to content

Commit 8b0c7fc

Browse files
committed
temp commit
1 parent c927062 commit 8b0c7fc

3 files changed

Lines changed: 70 additions & 0 deletions

File tree

diffsynth/utils/data/media_io_ltx2.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import numpy as np
88
from io import BytesIO
99
from collections.abc import Generator, Iterator
10+
import torchaudio
1011

1112

1213
def _resample_audio(
@@ -137,6 +138,15 @@ def write_video_audio_ltx2(
137138
container.close()
138139

139140

141+
def read_audio_with_torchaudio(path: str, start_time: float = 0, duration: float | None = None) -> torch.Tensor:
    """Load an audio file with torchaudio and return the requested time window.

    Args:
        path: Path of the audio file passed to ``torchaudio.load``.
        start_time: Offset of the window from the start of the file, in seconds.
        duration: Length of the window in seconds. ``None`` keeps everything
            from ``start_time`` up to and including the last sample.

    Returns:
        The waveform (loaded with ``channels_first=True``) sliced along its
        last dimension to ``[start_time, start_time + duration)``. A window
        that runs past the end of the file is truncated at the end.

    Raises:
        ValueError: If ``start_time`` or ``duration`` is negative, or if
            ``start_time`` lies beyond the end of the audio.
    """
    # Negative values would silently wrap around through negative indexing.
    if start_time < 0:
        raise ValueError(f"start_time must be non-negative, got {start_time}")
    if duration is not None and duration < 0:
        raise ValueError(f"duration must be non-negative, got {duration}")

    waveform, sample_rate = torchaudio.load(path, channels_first=True)
    total_frames = waveform.shape[-1]

    start_frame = int(start_time * sample_rate)
    if start_frame > total_frames:
        raise ValueError(f"start_time of {start_time} exceeds max duration of {total_frames / sample_rate:.2f}")

    # An open-ended slice (None) keeps the final sample; -1 would drop it.
    # The end is measured from start_frame, not from the start of the file.
    end_frame = None if duration is None else start_frame + int(duration * sample_rate)
    return waveform[..., start_frame:end_frame]
148+
149+
140150
def encode_single_frame(output_file: str, image_array: np.ndarray, crf: float) -> None:
141151
container = av.open(output_file, "w", format="mp4")
142152
try:
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import torch
2+
from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
3+
from diffsynth.utils.data.media_io_ltx2 import read_audio_with_torchaudio, write_video_audio_ltx2
4+
5+
# Example: two-stage LTX-2.3 audio/video generation with VRAM offloading.

# NOTE(review): this clip is loaded but never handed to the pipeline, and the
# name `audio` is rebound by the pipeline output below — confirm the intent.
audio = read_audio_with_torchaudio("data/example_video_dataset/ltx2/sing.MP3")

GEMMA_MODEL_ID = "google/gemma-3-12b-it-qat-q4_0-unquantized"
LTX_MODEL_ID = "Lightricks/LTX-2.3"

# Every stage runs in bfloat16; only the offload stage is placed on the CPU.
vram_config = dict(
    offload_dtype=torch.bfloat16,
    offload_device="cpu",
    onload_dtype=torch.bfloat16,
    onload_device="cuda",
    preparing_dtype=torch.bfloat16,
    preparing_device="cuda",
    computation_dtype=torch.bfloat16,
    computation_device="cuda",
)

# (model id, checkpoint file pattern) for each component loaded with offloading.
offloaded_checkpoints = (
    (GEMMA_MODEL_ID, "model-*.safetensors"),
    (LTX_MODEL_ID, "ltx-2.3-22b-dev.safetensors"),
    (LTX_MODEL_ID, "ltx-2.3-spatial-upscaler-x2-1.0.safetensors"),
)
model_configs = [
    ModelConfig(model_id=model_id, origin_file_pattern=file_pattern, **vram_config)
    for model_id, file_pattern in offloaded_checkpoints
]

pipe = LTX2AudioVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=model_configs,
    tokenizer_config=ModelConfig(model_id=GEMMA_MODEL_ID),
    stage2_lora_config=ModelConfig(
        model_id=LTX_MODEL_ID,
        origin_file_pattern="ltx-2.3-22b-distilled-lora-384.safetensors",
    ),
)

prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"

# The fragments are concatenated verbatim; each one carries its own trailing
# separator, so they are joined with the empty string.
negative_prompt = "".join(
    [
        "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, ",
        "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, ",
        "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, ",
        "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of ",
        "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent ",
        "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny ",
        "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, ",
        "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, ",
        "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward ",
        "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, ",
        "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts.",
    ]
)

# Output resolution and clip length.
height = 512 * 2
width = 768 * 2
num_frames = 121

generation_kwargs = {
    "prompt": prompt,
    "negative_prompt": negative_prompt,
    "seed": 43,
    "height": height,
    "width": width,
    "num_frames": num_frames,
    "tiled": True,
    "use_two_stage_pipeline": True,
}
video, audio = pipe(**generation_kwargs)

# Mux the generated frames and audio track into a single MP4 file.
output_sample_rate = pipe.audio_vocoder.output_sampling_rate
write_video_audio_ltx2(
    video=video,
    audio=audio,
    output_path='ltx2.3_twostage.mp4',
    fps=24,
    audio_sample_rate=output_sample_rate,
)

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ requires-python = ">=3.10.1"
1212
dependencies = [
1313
"torch>=2.0.0",
1414
"torchvision",
15+
"torchaudio",
1516
"transformers",
1617
"imageio",
1718
"imageio[ffmpeg]",

0 commit comments

Comments
 (0)