Commit e8f99f9

Add HunyuanVideo 1.5 image-to-video modular blocks
1 parent e8176d2 commit e8f99f9

6 files changed: 287 additions & 4 deletions

src/diffusers/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -453,6 +453,7 @@
     "HeliosPyramidDistilledModularPipeline",
     "HeliosPyramidModularPipeline",
     "HunyuanVideo15Blocks",
+    "HunyuanVideo15Image2VideoBlocks",
     "HunyuanVideo15ModularPipeline",
     "QwenImageAutoBlocks",
     "QwenImageEditAutoBlocks",

src/diffusers/modular_pipelines/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -90,6 +90,7 @@
 ]
 _import_structure["hunyuan_video1_5"] = [
     "HunyuanVideo15Blocks",
+    "HunyuanVideo15Image2VideoBlocks",
     "HunyuanVideo15ModularPipeline",
 ]
 _import_structure["z_image"] = [
@@ -144,7 +145,7 @@
         QwenImageLayeredModularPipeline,
         QwenImageModularPipeline,
     )
-    from .hunyuan_video1_5 import HunyuanVideo15Blocks, HunyuanVideo15ModularPipeline
+    from .hunyuan_video1_5 import HunyuanVideo15Blocks, HunyuanVideo15Image2VideoBlocks, HunyuanVideo15ModularPipeline
     from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline
     from .wan import (
         Wan22Blocks,

src/diffusers/modular_pipelines/hunyuan_video1_5/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -21,7 +21,7 @@
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
-    _import_structure["modular_blocks_hunyuan_video1_5"] = ["HunyuanVideo15Blocks"]
+    _import_structure["modular_blocks_hunyuan_video1_5"] = ["HunyuanVideo15Blocks", "HunyuanVideo15Image2VideoBlocks"]
     _import_structure["modular_pipeline"] = ["HunyuanVideo15ModularPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -31,7 +31,7 @@
     except OptionalDependencyNotAvailable:
         from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
     else:
-        from .modular_blocks_hunyuan_video1_5 import HunyuanVideo15Blocks
+        from .modular_blocks_hunyuan_video1_5 import HunyuanVideo15Blocks, HunyuanVideo15Image2VideoBlocks
         from .modular_pipeline import HunyuanVideo15ModularPipeline
 else:
     import sys
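
Both branches of this file change for a reason: diffusers `__init__` modules are lazy, so a new public name has to be registered in `_import_structure` (used at runtime) and imported under `TYPE_CHECKING` (used by IDEs and type checkers). A minimal sketch of that pattern, assuming diffusers' `_LazyModule` helper and written as a standalone illustration rather than the file's exact contents:

    import sys
    from typing import TYPE_CHECKING

    from diffusers.utils import _LazyModule

    # Names are registered per-submodule; _LazyModule resolves them on first access.
    _import_structure = {
        "modular_blocks_hunyuan_video1_5": ["HunyuanVideo15Blocks", "HunyuanVideo15Image2VideoBlocks"],
        "modular_pipeline": ["HunyuanVideo15ModularPipeline"],
    }

    if TYPE_CHECKING:
        # Real imports so static analyzers see the symbols.
        from .modular_blocks_hunyuan_video1_5 import HunyuanVideo15Blocks, HunyuanVideo15Image2VideoBlocks
        from .modular_pipeline import HunyuanVideo15ModularPipeline
    else:
        # At runtime, replace this module with a lazy proxy.
        sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)

Forgetting either branch leaves the name importable in one context but not the other, which is why this commit edits both.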

src/diffusers/modular_pipelines/hunyuan_video1_5/before_denoise.py

Lines changed: 122 additions & 0 deletions

@@ -222,3 +222,125 @@ def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState
 
         self.set_block_state(state, block_state)
         return components, state
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(encoder_output, generator=None, sample_mode="sample"):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class HunyuanVideo15Image2VideoPrepareLatentsStep(ModularPipelineBlocks):
+    model_name = "hunyuan-video-1.5"
+
+    @property
+    def description(self) -> str:
+        return "Prepare latents, conditioning latents, mask, and image_embeds for I2V"
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        from transformers import SiglipImageProcessor, SiglipVisionModel
+
+        from ...models import AutoencoderKLHunyuanVideo15
+        from ...pipelines.hunyuan_video1_5.image_processor import HunyuanVideo15ImageProcessor
+
+        return [
+            ComponentSpec("vae", AutoencoderKLHunyuanVideo15),
+            ComponentSpec(
+                "video_processor",
+                HunyuanVideo15ImageProcessor,
+                config=FrozenDict({"vae_scale_factor": 16}),
+                default_creation_method="from_config",
+            ),
+            ComponentSpec("image_encoder", SiglipVisionModel),
+            ComponentSpec("feature_extractor", SiglipImageProcessor),
+        ]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam("image", required=True),
+            InputParam("num_frames", type_hint=int, default=121),
+            InputParam("latents", type_hint=torch.Tensor | None),
+            InputParam("num_videos_per_prompt", type_hint=int, default=1),
+            InputParam("generator"),
+            InputParam("batch_size", required=True, type_hint=int),
+            InputParam("dtype", type_hint=torch.dtype),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam("latents", type_hint=torch.Tensor),
+            OutputParam("cond_latents_concat", type_hint=torch.Tensor),
+            OutputParam("mask_concat", type_hint=torch.Tensor),
+            OutputParam("image_embeds", type_hint=torch.Tensor),
+        ]
+
+    # Copied from pipeline_hunyuan_video1_5_image2video.py lines 756-839 with self->components
+    @torch.no_grad()
+    def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        device = components._execution_device
+        dtype = block_state.dtype
+
+        image = block_state.image
+        batch_size = block_state.batch_size * block_state.num_videos_per_prompt
+        num_frames = block_state.num_frames
+
+        # Resize/crop image to target resolution (lines 756-759)
+        height, width = components.video_processor.calculate_default_height_width(
+            height=image.size[1], width=image.size[0], target_size=components.target_size
+        )
+        image = components.video_processor.resize(image, height=height, width=width, resize_mode="crop")
+
+        # Encode image with Siglip (lines 776-781)
+        image_encoder_dtype = next(components.image_encoder.parameters()).dtype
+        image_inputs = components.feature_extractor.preprocess(
+            images=image, do_resize=True, return_tensors="pt", do_convert_rgb=True
+        )
+        image_inputs = image_inputs.to(device=device, dtype=image_encoder_dtype)
+        image_embeds = components.image_encoder(**image_inputs).last_hidden_state
+        image_embeds = image_embeds.repeat(batch_size, 1, 1)
+        block_state.image_embeds = image_embeds.to(device=device, dtype=dtype)
+
+        # Prepare latents (lines 818-829)
+        latents = block_state.latents
+        if latents is not None:
+            latents = latents.to(device=device, dtype=dtype)
+        else:
+            shape = (
+                batch_size,
+                components.num_channels_latents,
+                (num_frames - 1) // components.vae_scale_factor_temporal + 1,
+                int(height) // components.vae_scale_factor_spatial,
+                int(width) // components.vae_scale_factor_spatial,
+            )
+            latents = randn_tensor(shape, generator=block_state.generator, device=device, dtype=dtype)
+        block_state.latents = latents
+
+        # Prepare cond latents and mask (lines 594-632, 831-839)
+        b, c, f, h, w = latents.shape
+
+        # Copied from _get_image_latents (lines 375-388) with self->components
+        vae_dtype = components.vae.dtype
+        image_tensor = components.video_processor.preprocess(
+            image, height=h * components.vae_scale_factor_spatial, width=w * components.vae_scale_factor_spatial
+        ).to(device, dtype=vae_dtype)
+        image_tensor = image_tensor.unsqueeze(2)
+        image_latents = retrieve_latents(components.vae.encode(image_tensor), sample_mode="argmax")
+        image_latents = image_latents * components.vae.config.scaling_factor
+
+        latent_condition = image_latents.repeat(batch_size, 1, f, 1, 1)
+        latent_condition[:, :, 1:, :, :] = 0
+        block_state.cond_latents_concat = latent_condition.to(device=device, dtype=dtype)
+
+        latent_mask = torch.zeros(b, 1, f, h, w, dtype=dtype, device=device)
+        latent_mask[:, :, 0, :, :] = 1.0
+        block_state.mask_concat = latent_mask
+
+        self.set_block_state(state, block_state)
+        return components, state
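
The conditioning construction at the end of this step is the heart of the I2V variant: the VAE latents of the first frame are broadcast across every latent frame, all frames after the first are zeroed out, and a one-channel mask flags frame 0 as the conditioned frame. The `_concat` output names suggest both tensors are later concatenated with the noisy latents along the channel axis by the loop's before-denoiser step. A standalone sketch of just that construction, with illustrative shapes not taken from any real checkpoint:

    import torch

    # Illustrative latent shape: (batch, channels, frames, height, width).
    b, c, f, h, w = 1, 16, 31, 44, 80
    dtype = torch.float32

    # Stand-in for the VAE-encoded, scaled first-frame latents: (b, c, 1, h, w).
    image_latents = torch.randn(b, c, 1, h, w, dtype=dtype)

    # Broadcast the single conditioning frame across all latent frames,
    # then zero frames 1..f-1, as in HunyuanVideo15Image2VideoPrepareLatentsStep.
    latent_condition = image_latents.repeat(1, 1, f, 1, 1)
    latent_condition[:, :, 1:, :, :] = 0

    # One-channel mask: 1.0 marks the conditioned frame, 0.0 the frames to denoise.
    latent_mask = torch.zeros(b, 1, f, h, w, dtype=dtype)
    latent_mask[:, :, 0, :, :] = 1.0

    print(latent_condition.shape, latent_mask.shape)
    # torch.Size([1, 16, 31, 44, 80]) torch.Size([1, 1, 31, 44, 80])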

src/diffusers/modular_pipelines/hunyuan_video1_5/denoise.py

Lines changed: 119 additions & 0 deletions

@@ -237,3 +237,122 @@ def description(self) -> str:
             " - `HunyuanVideo15LoopAfterDenoiser`\n"
             "This block supports text-to-video tasks."
         )
+
+
+class HunyuanVideo15Image2VideoLoopDenoiser(ModularPipelineBlocks):
+    model_name = "hunyuan-video-1.5"
+
+    def __init__(self, guider_input_fields=None):
+        if guider_input_fields is None:
+            guider_input_fields = {
+                "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
+                "encoder_attention_mask": ("prompt_embeds_mask", "negative_prompt_embeds_mask"),
+                "encoder_hidden_states_2": ("prompt_embeds_2", "negative_prompt_embeds_2"),
+                "encoder_attention_mask_2": ("prompt_embeds_mask_2", "negative_prompt_embeds_mask_2"),
+            }
+        if not isinstance(guider_input_fields, dict):
+            raise ValueError(f"guider_input_fields must be a dictionary but is {type(guider_input_fields)}")
+        self._guider_input_fields = guider_input_fields
+        super().__init__()
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [
+            ComponentSpec(
+                "guider",
+                ClassifierFreeGuidance,
+                config=FrozenDict({"guidance_scale": 7.5}),
+                default_creation_method="from_config",
+            ),
+            ComponentSpec("transformer", HunyuanVideo15Transformer3DModel),
+        ]
+
+    @property
+    def description(self) -> str:
+        return "I2V denoiser with MeanFlow timestep_r support"
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        inputs = [
+            InputParam("attention_kwargs"),
+            InputParam("num_inference_steps", required=True, type_hint=int),
+            InputParam("image_embeds", type_hint=torch.Tensor),
+            InputParam("timesteps", required=True, type_hint=torch.Tensor),
+        ]
+        for value in self._guider_input_fields.values():
+            if isinstance(value, tuple):
+                inputs.append(InputParam(name=value[0], required=True, type_hint=torch.Tensor))
+                for neg_name in value[1:]:
+                    inputs.append(InputParam(name=neg_name, type_hint=torch.Tensor))
+            else:
+                inputs.append(InputParam(name=value, required=True, type_hint=torch.Tensor))
+        return inputs
+
+    # Copied from pipeline_hunyuan_video1_5_image2video.py lines 853-912 with self->components
+    @torch.no_grad()
+    def __call__(
+        self, components: HunyuanVideo15ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor
+    ) -> PipelineState:
+        timestep = t.expand(block_state.latent_model_input.shape[0]).to(block_state.latent_model_input.dtype)
+
+        # MeanFlow timestep_r (lines 855-862)
+        if components.transformer.config.use_meanflow:
+            if i == len(block_state.timesteps) - 1:
+                timestep_r = torch.tensor([0.0], device=timestep.device)
+            else:
+                timestep_r = block_state.timesteps[i + 1]
+            timestep_r = timestep_r.expand(block_state.latents.shape[0]).to(block_state.latents.dtype)
+        else:
+            timestep_r = None
+
+        guider_inputs = {
+            input_name: tuple(getattr(block_state, v) for v in value)
+            if isinstance(value, tuple)
+            else getattr(block_state, value)
+            for input_name, value in self._guider_input_fields.items()
+        }
+
+        components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
+        guider_state = components.guider.prepare_inputs(guider_inputs)
+
+        for guider_state_batch in guider_state:
+            components.guider.prepare_models(components.transformer)
+
+            cond_kwargs = {input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs}
+
+            context_name = getattr(guider_state_batch, components.guider._identifier_key)
+            with components.transformer.cache_context(context_name):
+                guider_state_batch.noise_pred = components.transformer(
+                    hidden_states=block_state.latent_model_input,
+                    image_embeds=block_state.image_embeds,
+                    timestep=timestep,
+                    timestep_r=timestep_r,
+                    attention_kwargs=block_state.attention_kwargs,
+                    return_dict=False,
+                    **cond_kwargs,
+                )[0]
+
+            components.guider.cleanup_models(components.transformer)
+
+        block_state.noise_pred = components.guider(guider_state)[0]
+
+        return components, block_state
+
+
+class HunyuanVideo15Image2VideoDenoiseStep(HunyuanVideo15DenoiseLoopWrapper):
+    block_classes = [
+        HunyuanVideo15LoopBeforeDenoiser,
+        HunyuanVideo15Image2VideoLoopDenoiser,
+        HunyuanVideo15LoopAfterDenoiser,
+    ]
+    block_names = ["before_denoiser", "denoiser", "after_denoiser"]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step for image-to-video with MeanFlow support.\n"
+            "At each iteration:\n"
+            " - `HunyuanVideo15LoopBeforeDenoiser`\n"
+            " - `HunyuanVideo15Image2VideoLoopDenoiser`\n"
+            " - `HunyuanVideo15LoopAfterDenoiser`"
+        )
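
The only substantive difference from the text-to-video loop denoiser is the MeanFlow branch: when `transformer.config.use_meanflow` is set, each step passes a second timestep `timestep_r`, which is simply the next timestep in the schedule (0.0 on the final step), so a MeanFlow-style model can condition on the interval it averages velocity over rather than a single instant. A toy sketch of that pairing, using a made-up schedule rather than real scheduler output:

    import torch

    # Hypothetical schedule; real timesteps come from the pipeline's scheduler.
    timesteps = torch.tensor([1000.0, 750.0, 500.0, 250.0])

    for i, t in enumerate(timesteps):
        # timestep_r is the *next* timestep, closing at 0.0 on the final step,
        # mirroring the branch in HunyuanVideo15Image2VideoLoopDenoiser above.
        if i == len(timesteps) - 1:
            timestep_r = torch.tensor([0.0])
        else:
            timestep_r = timesteps[i + 1]
        print(f"step {i}: t={t.item():.0f}, timestep_r={timestep_r.item():.0f}")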

src/diffusers/modular_pipelines/hunyuan_video1_5/modular_blocks_hunyuan_video1_5.py

Lines changed: 41 additions & 1 deletion

@@ -16,12 +16,13 @@
 from ..modular_pipeline import SequentialPipelineBlocks
 from ..modular_pipeline_utils import OutputParam
 from .before_denoise import (
+    HunyuanVideo15Image2VideoPrepareLatentsStep,
     HunyuanVideo15PrepareLatentsStep,
     HunyuanVideo15SetTimestepsStep,
     HunyuanVideo15TextInputStep,
 )
 from .decoders import HunyuanVideo15VaeDecoderStep
-from .denoise import HunyuanVideo15DenoiseStep
+from .denoise import HunyuanVideo15DenoiseStep, HunyuanVideo15Image2VideoDenoiseStep
 from .encoders import HunyuanVideo15TextEncoderStep
 
 
@@ -65,3 +66,42 @@ def description(self):
     @property
     def outputs(self):
         return [OutputParam.template("videos")]
+
+
+# auto_docstring
+class HunyuanVideo15Image2VideoCoreDenoiseStep(SequentialPipelineBlocks):
+    model_name = "hunyuan-video-1.5"
+    block_classes = [
+        HunyuanVideo15TextInputStep,
+        HunyuanVideo15SetTimestepsStep,
+        HunyuanVideo15Image2VideoPrepareLatentsStep,
+        HunyuanVideo15Image2VideoDenoiseStep,
+    ]
+    block_names = ["input", "set_timesteps", "prepare_latents", "denoise"]
+
+    @property
+    def description(self):
+        return "Denoise block for image-to-video that takes encoded conditions and runs the denoising process."
+
+    @property
+    def outputs(self):
+        return [OutputParam.template("latents")]
+
+
+# auto_docstring
+class HunyuanVideo15Image2VideoBlocks(SequentialPipelineBlocks):
+    model_name = "hunyuan-video-1.5"
+    block_classes = [
+        HunyuanVideo15TextEncoderStep,
+        HunyuanVideo15Image2VideoCoreDenoiseStep,
+        HunyuanVideo15VaeDecoderStep,
+    ]
+    block_names = ["text_encoder", "denoise", "decode"]
+
+    @property
+    def description(self):
+        return "Modular pipeline blocks for HunyuanVideo 1.5 image-to-video."
+
+    @property
+    def outputs(self):
+        return [OutputParam.template("videos")]
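
End to end, `HunyuanVideo15Image2VideoBlocks` chains text encoding, I2V latent preparation, the MeanFlow-aware denoise loop, and VAE decoding. A hedged usage sketch following the usual modular-pipelines flow; the repo id, the image URL, and the `output="videos"` selector are assumptions for illustration, not confirmed by this commit:

    import torch
    from diffusers.modular_pipelines import HunyuanVideo15Image2VideoBlocks
    from diffusers.utils import load_image

    # Assemble the preset blocks into a runnable modular pipeline.
    blocks = HunyuanVideo15Image2VideoBlocks()
    pipe = blocks.init_pipeline("hunyuanvideo-community/HunyuanVideo-1.5")  # placeholder repo id
    pipe.load_components(torch_dtype=torch.bfloat16)
    pipe.to("cuda")

    image = load_image("https://example.com/first_frame.png")  # placeholder URL
    video = pipe(
        prompt="A cat walks along a garden wall at sunset",
        image=image,
        num_frames=121,  # default from HunyuanVideo15Image2VideoPrepareLatentsStep
        num_inference_steps=30,
        output="videos",  # the final OutputParam template is "videos"
    )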
