Skip to content

Commit 77cd7d6

Browse files
committed
Add workflow map, workflow tests, auto docstrings, export only AutoBlocks
1 parent 86fe24a commit 77cd7d6

5 files changed

Lines changed: 242 additions & 62 deletions

File tree

src/diffusers/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -456,8 +456,7 @@
456456
"HeliosPyramidDistilledAutoBlocks",
457457
"HeliosPyramidDistilledModularPipeline",
458458
"HeliosPyramidModularPipeline",
459-
"HunyuanVideo15Blocks",
460-
"HunyuanVideo15Image2VideoBlocks",
459+
"HunyuanVideo15AutoBlocks",
461460
"HunyuanVideo15ModularPipeline",
462461
"LTXAutoBlocks",
463462
"LTXModularPipeline",
@@ -1242,8 +1241,7 @@
12421241
HeliosPyramidDistilledAutoBlocks,
12431242
HeliosPyramidDistilledModularPipeline,
12441243
HeliosPyramidModularPipeline,
1245-
HunyuanVideo15Blocks,
1246-
HunyuanVideo15Image2VideoBlocks,
1244+
HunyuanVideo15AutoBlocks,
12471245
HunyuanVideo15ModularPipeline,
12481246
LTXAutoBlocks,
12491247
LTXModularPipeline,

src/diffusers/modular_pipelines/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,6 @@
9090
]
9191
_import_structure["hunyuan_video1_5"] = [
9292
"HunyuanVideo15AutoBlocks",
93-
"HunyuanVideo15Blocks",
94-
"HunyuanVideo15Image2VideoBlocks",
9593
"HunyuanVideo15ModularPipeline",
9694
]
9795
_import_structure["ltx"] = [
@@ -131,8 +129,6 @@
131129
)
132130
from .hunyuan_video1_5 import (
133131
HunyuanVideo15AutoBlocks,
134-
HunyuanVideo15Blocks,
135-
HunyuanVideo15Image2VideoBlocks,
136132
HunyuanVideo15ModularPipeline,
137133
)
138134
from .ltx import LTXAutoBlocks, LTXModularPipeline

src/diffusers/modular_pipelines/hunyuan_video1_5/modular_blocks_hunyuan_video1_5.py

Lines changed: 184 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -39,18 +39,17 @@ class HunyuanVideo15CoreDenoiseStep(SequentialPipelineBlocks):
3939
Denoise block that takes encoded conditions and runs the denoising process.
4040
4141
Components:
42-
transformer (`HunyuanVideo15Transformer3DModel`)
43-
scheduler (`FlowMatchEulerDiscreteScheduler`)
44-
guider (`ClassifierFreeGuidance`)
42+
scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`HunyuanVideo15Transformer3DModel`) guider
43+
(`ClassifierFreeGuidance`)
4544
4645
Inputs:
4746
num_videos_per_prompt (`int`, *optional*, defaults to 1):
4847
The number of videos to generate per prompt.
4948
prompt_embeds (`Tensor`):
5049
text embeddings used to guide the image generation. Can be generated from text_encoder step.
5150
batch_size (`int`, *optional*):
52-
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
53-
generated in input step.
51+
Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt. Can
52+
be generated in input step.
5453
num_inference_steps (`int`, *optional*, defaults to 50):
5554
The number of denoising steps.
5655
sigmas (`list`, *optional*):
@@ -111,15 +110,10 @@ class HunyuanVideo15Blocks(SequentialPipelineBlocks):
111110
Modular pipeline blocks for HunyuanVideo 1.5 text-to-video.
112111
113112
Components:
114-
text_encoder (`Qwen2_5_VLTextModel`)
115-
tokenizer (`Qwen2TokenizerFast`)
116-
text_encoder_2 (`T5EncoderModel`)
117-
tokenizer_2 (`ByT5Tokenizer`)
118-
guider (`ClassifierFreeGuidance`)
119-
transformer (`HunyuanVideo15Transformer3DModel`)
120-
scheduler (`FlowMatchEulerDiscreteScheduler`)
121-
vae (`AutoencoderKLHunyuanVideo15`)
122-
video_processor (`HunyuanVideo15ImageProcessor`)
113+
text_encoder (`Qwen2_5_VLTextModel`) tokenizer (`Qwen2TokenizerFast`) text_encoder_2 (`T5EncoderModel`)
114+
tokenizer_2 (`ByT5Tokenizer`) guider (`ClassifierFreeGuidance`) scheduler (`FlowMatchEulerDiscreteScheduler`)
115+
transformer (`HunyuanVideo15Transformer3DModel`) vae (`AutoencoderKLHunyuanVideo15`) video_processor
116+
(`HunyuanVideo15ImageProcessor`)
123117
124118
Inputs:
125119
prompt (`str`, *optional*):
@@ -145,8 +139,8 @@ class HunyuanVideo15Blocks(SequentialPipelineBlocks):
145139
num_videos_per_prompt (`int`, *optional*, defaults to 1):
146140
The number of videos to generate per prompt.
147141
batch_size (`int`, *optional*):
148-
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
149-
generated in input step.
142+
Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt. Can
143+
be generated in input step.
150144
num_inference_steps (`int`, *optional*, defaults to 50):
151145
The number of denoising steps.
152146
sigmas (`list`, *optional*):
@@ -194,34 +188,33 @@ class HunyuanVideo15Image2VideoCoreDenoiseStep(SequentialPipelineBlocks):
194188
Denoise block for image-to-video that takes encoded conditions and runs the denoising process.
195189
196190
Components:
197-
transformer (`HunyuanVideo15Transformer3DModel`)
198-
scheduler (`FlowMatchEulerDiscreteScheduler`)
199-
vae (`AutoencoderKLHunyuanVideo15`)
200-
video_processor (`HunyuanVideo15ImageProcessor`)
201-
image_encoder (`SiglipVisionModel`)
202-
feature_extractor (`SiglipImageProcessor`)
203-
guider (`ClassifierFreeGuidance`)
191+
scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`HunyuanVideo15Transformer3DModel`) guider
192+
(`ClassifierFreeGuidance`)
204193
205194
Inputs:
206195
num_videos_per_prompt (`int`, *optional*, defaults to 1):
207196
The number of videos to generate per prompt.
208197
prompt_embeds (`Tensor`):
209198
text embeddings used to guide the image generation. Can be generated from text_encoder step.
210199
batch_size (`int`, *optional*):
211-
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
212-
generated in input step.
200+
Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt. Can
201+
be generated in input step.
213202
num_inference_steps (`int`, *optional*, defaults to 50):
214203
The number of denoising steps.
215204
sigmas (`list`, *optional*):
216205
Custom sigmas for the denoising process.
217-
image (`Image | list`):
218-
Reference image(s) for denoising. Can be a single image or list of images.
206+
height (`int`, *optional*):
207+
The height in pixels of the generated image.
208+
width (`int`, *optional*):
209+
The width in pixels of the generated image.
219210
num_frames (`int`, *optional*, defaults to 121):
220211
TODO: Add description.
221212
latents (`Tensor`, *optional*):
222213
Pre-generated noisy latents for image generation.
223214
generator (`Generator`, *optional*):
224215
Torch generator for deterministic generation.
216+
image_latents (`Tensor`):
217+
Encoded image latents from the VAE encoder. Can be generated from vae_encoder step.
225218
attention_kwargs (`dict`, *optional*):
226219
Additional kwargs for attention processors.
227220
negative_prompt_embeds (`Tensor`, *optional*):
@@ -270,6 +263,25 @@ class HunyuanVideo15AutoVaeEncoderStep(AutoPipelineBlocks):
270263
This is an auto pipeline block that works for image-to-video tasks.
271264
- `HunyuanVideo15VaeEncoderStep` is used when `image` is provided.
272265
- If `image` is not provided, step will be skipped.
266+
267+
Components:
268+
vae (`AutoencoderKLHunyuanVideo15`) video_processor (`HunyuanVideo15ImageProcessor`)
269+
270+
Inputs:
271+
image (`Image | list`, *optional*):
272+
Reference image(s) for denoising. Can be a single image or list of images.
273+
height (`int`, *optional*):
274+
The height in pixels of the generated image.
275+
width (`int`, *optional*):
276+
The width in pixels of the generated image.
277+
278+
Outputs:
279+
image_latents (`Tensor`):
280+
Encoded image latents from the VAE encoder
281+
height (`int`):
282+
Target height resolved from image
283+
width (`int`):
284+
Target width resolved from image
273285
"""
274286

275287
model_name = "hunyuan-video-1.5"
@@ -294,6 +306,17 @@ class HunyuanVideo15AutoImageEncoderStep(AutoPipelineBlocks):
294306
This is an auto pipeline block that works for image-to-video tasks.
295307
- `HunyuanVideo15ImageEncoderStep` is used when `image` is provided.
296308
- If `image` is not provided, step will be skipped.
309+
310+
Components:
311+
image_encoder (`SiglipVisionModel`) feature_extractor (`SiglipImageProcessor`)
312+
313+
Inputs:
314+
image (`Image | list`, *optional*):
315+
Reference image(s) for denoising. Can be a single image or list of images.
316+
317+
Outputs:
318+
image_embeds (`Tensor`):
319+
Image embeddings from the Siglip vision encoder
297320
"""
298321

299322
model_name = "hunyuan-video-1.5"
@@ -315,8 +338,57 @@ def description(self):
315338
class HunyuanVideo15AutoCoreDenoiseStep(AutoPipelineBlocks):
316339
"""
317340
Auto denoise block that selects the appropriate denoise pipeline based on inputs.
318-
- `HunyuanVideo15Image2VideoCoreDenoiseStep` is used when `image_latents` is provided.
319-
- `HunyuanVideo15CoreDenoiseStep` is used otherwise (text-to-video).
341+
- `HunyuanVideo15Image2VideoCoreDenoiseStep` is used when `image_latents` is provided.
342+
- `HunyuanVideo15CoreDenoiseStep` is used otherwise (text-to-video).
343+
344+
Components:
345+
scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`HunyuanVideo15Transformer3DModel`) guider
346+
(`ClassifierFreeGuidance`)
347+
348+
Inputs:
349+
num_videos_per_prompt (`int`, *optional*, defaults to 1):
350+
The number of videos to generate per prompt.
351+
prompt_embeds (`Tensor`):
352+
text embeddings used to guide the image generation. Can be generated from text_encoder step.
353+
batch_size (`int`):
354+
Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt. Can
355+
be generated in input step.
356+
num_inference_steps (`int`):
357+
The number of denoising steps.
358+
sigmas (`list`, *optional*):
359+
Custom sigmas for the denoising process.
360+
height (`int`, *optional*):
361+
The height in pixels of the generated image.
362+
width (`int`, *optional*):
363+
The width in pixels of the generated image.
364+
num_frames (`int`, *optional*, defaults to 121):
365+
TODO: Add description.
366+
latents (`Tensor`):
367+
Pre-generated noisy latents for image generation.
368+
generator (`Generator`, *optional*):
369+
Torch generator for deterministic generation.
370+
image_latents (`Tensor`, *optional*):
371+
Encoded image latents from the VAE encoder. Can be generated from vae_encoder step.
372+
attention_kwargs (`dict`, *optional*):
373+
Additional kwargs for attention processors.
374+
negative_prompt_embeds (`Tensor`, *optional*):
375+
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
376+
prompt_embeds_mask (`Tensor`):
377+
mask for the text embeddings. Can be generated from text_encoder step.
378+
negative_prompt_embeds_mask (`Tensor`, *optional*):
379+
mask for the negative text embeddings. Can be generated from text_encoder step.
380+
prompt_embeds_2 (`Tensor`):
381+
TODO: Add description.
382+
negative_prompt_embeds_2 (`Tensor`, *optional*):
383+
TODO: Add description.
384+
prompt_embeds_mask_2 (`Tensor`):
385+
TODO: Add description.
386+
negative_prompt_embeds_mask_2 (`Tensor`, *optional*):
387+
TODO: Add description.
388+
389+
Outputs:
390+
latents (`Tensor`):
391+
Denoised latents.
320392
"""
321393

322394
model_name = "hunyuan-video-1.5"
@@ -338,9 +410,69 @@ class HunyuanVideo15AutoBlocks(SequentialPipelineBlocks):
338410
"""
339411
Auto blocks for HunyuanVideo 1.5 that support both text-to-video and image-to-video workflows.
340412
341-
Supported workflows:
342-
- `text2video`: requires `prompt`
343-
- `image2video`: requires `image`, `prompt`
413+
Supported workflows:
414+
- `text2video`: requires `prompt`
415+
- `image2video`: requires `image`, `prompt`
416+
417+
Components:
418+
text_encoder (`Qwen2_5_VLTextModel`) tokenizer (`Qwen2TokenizerFast`) text_encoder_2 (`T5EncoderModel`)
419+
tokenizer_2 (`ByT5Tokenizer`) guider (`ClassifierFreeGuidance`) vae (`AutoencoderKLHunyuanVideo15`)
420+
video_processor (`HunyuanVideo15ImageProcessor`) image_encoder (`SiglipVisionModel`) feature_extractor
421+
(`SiglipImageProcessor`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer
422+
(`HunyuanVideo15Transformer3DModel`)
423+
424+
Inputs:
425+
prompt (`str`, *optional*):
426+
The prompt or prompts to guide image generation.
427+
negative_prompt (`str`, *optional*):
428+
The prompt or prompts not to guide the image generation.
429+
prompt_embeds (`Tensor`, *optional*):
430+
text embeddings used to guide the image generation. Can be generated from text_encoder step.
431+
prompt_embeds_mask (`Tensor`, *optional*):
432+
mask for the text embeddings. Can be generated from text_encoder step.
433+
negative_prompt_embeds (`Tensor`, *optional*):
434+
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
435+
negative_prompt_embeds_mask (`Tensor`, *optional*):
436+
mask for the negative text embeddings. Can be generated from text_encoder step.
437+
prompt_embeds_2 (`Tensor`, *optional*):
438+
TODO: Add description.
439+
prompt_embeds_mask_2 (`Tensor`, *optional*):
440+
TODO: Add description.
441+
negative_prompt_embeds_2 (`Tensor`, *optional*):
442+
TODO: Add description.
443+
negative_prompt_embeds_mask_2 (`Tensor`, *optional*):
444+
TODO: Add description.
445+
num_videos_per_prompt (`int`, *optional*, defaults to 1):
446+
The number of videos to generate per prompt.
447+
image (`Image | list`, *optional*):
448+
Reference image(s) for denoising. Can be a single image or list of images.
449+
height (`int`, *optional*):
450+
The height in pixels of the generated image.
451+
width (`int`, *optional*):
452+
The width in pixels of the generated image.
453+
batch_size (`int`):
454+
Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt. Can
455+
be generated in input step.
456+
num_inference_steps (`int`):
457+
The number of denoising steps.
458+
sigmas (`list`, *optional*):
459+
Custom sigmas for the denoising process.
460+
num_frames (`int`, *optional*, defaults to 121):
461+
TODO: Add description.
462+
latents (`Tensor`):
463+
Pre-generated noisy latents for image generation.
464+
generator (`Generator`, *optional*):
465+
Torch generator for deterministic generation.
466+
image_latents (`Tensor`, *optional*):
467+
Encoded image latents from the VAE encoder. Can be generated from vae_encoder step.
468+
attention_kwargs (`dict`, *optional*):
469+
Additional kwargs for attention processors.
470+
output_type (`str`, *optional*, defaults to np):
471+
Output format: 'pil', 'np', 'pt'.
472+
473+
Outputs:
474+
videos (`list`):
475+
The generated videos.
344476
"""
345477

346478
model_name = "hunyuan-video-1.5"
@@ -352,14 +484,14 @@ class HunyuanVideo15AutoBlocks(SequentialPipelineBlocks):
352484
HunyuanVideo15VaeDecoderStep,
353485
]
354486
block_names = ["text_encoder", "vae_encoder", "image_encoder", "denoise", "decode"]
487+
_workflow_map = {
488+
"text2video": {"prompt": True},
489+
"image2video": {"image": True, "prompt": True},
490+
}
355491

356492
@property
357493
def description(self):
358-
return (
359-
"Auto blocks for HunyuanVideo 1.5 that support both text-to-video and image-to-video workflows.\n"
360-
" - text2video: requires `prompt`\n"
361-
" - image2video: requires `image`, `prompt`"
362-
)
494+
return "Auto blocks for HunyuanVideo 1.5 that support both text-to-video and image-to-video workflows."
363495

364496
@property
365497
def outputs(self):
@@ -372,17 +504,11 @@ class HunyuanVideo15Image2VideoBlocks(SequentialPipelineBlocks):
372504
Modular pipeline blocks for HunyuanVideo 1.5 image-to-video.
373505
374506
Components:
375-
text_encoder (`Qwen2_5_VLTextModel`)
376-
tokenizer (`Qwen2TokenizerFast`)
377-
text_encoder_2 (`T5EncoderModel`)
378-
tokenizer_2 (`ByT5Tokenizer`)
379-
guider (`ClassifierFreeGuidance`)
380-
transformer (`HunyuanVideo15Transformer3DModel`)
381-
scheduler (`FlowMatchEulerDiscreteScheduler`)
382-
vae (`AutoencoderKLHunyuanVideo15`)
383-
video_processor (`HunyuanVideo15ImageProcessor`)
384-
image_encoder (`SiglipVisionModel`)
385-
feature_extractor (`SiglipImageProcessor`)
507+
text_encoder (`Qwen2_5_VLTextModel`) tokenizer (`Qwen2TokenizerFast`) text_encoder_2 (`T5EncoderModel`)
508+
tokenizer_2 (`ByT5Tokenizer`) guider (`ClassifierFreeGuidance`) vae (`AutoencoderKLHunyuanVideo15`)
509+
video_processor (`HunyuanVideo15ImageProcessor`) image_encoder (`SiglipVisionModel`) feature_extractor
510+
(`SiglipImageProcessor`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer
511+
(`HunyuanVideo15Transformer3DModel`)
386512
387513
Inputs:
388514
prompt (`str`, *optional*):
@@ -407,21 +533,27 @@ class HunyuanVideo15Image2VideoBlocks(SequentialPipelineBlocks):
407533
TODO: Add description.
408534
num_videos_per_prompt (`int`, *optional*, defaults to 1):
409535
The number of videos to generate per prompt.
536+
image (`Image | list`, *optional*):
537+
Reference image(s) for denoising. Can be a single image or list of images.
538+
height (`int`, *optional*):
539+
The height in pixels of the generated image.
540+
width (`int`, *optional*):
541+
The width in pixels of the generated image.
410542
batch_size (`int`, *optional*):
411-
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
412-
generated in input step.
543+
Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt. Can
544+
be generated in input step.
413545
num_inference_steps (`int`, *optional*, defaults to 50):
414546
The number of denoising steps.
415547
sigmas (`list`, *optional*):
416548
Custom sigmas for the denoising process.
417-
image (`Image | list`):
418-
Reference image(s) for denoising. Can be a single image or list of images.
419549
num_frames (`int`, *optional*, defaults to 121):
420550
TODO: Add description.
421551
latents (`Tensor`, *optional*):
422552
Pre-generated noisy latents for image generation.
423553
generator (`Generator`, *optional*):
424554
Torch generator for deterministic generation.
555+
image_latents (`Tensor`):
556+
Encoded image latents from the VAE encoder. Can be generated from vae_encoder step.
425557
attention_kwargs (`dict`, *optional*):
426558
Additional kwargs for attention processors.
427559
output_type (`str`, *optional*, defaults to np):

0 commit comments

Comments
 (0)