@@ -39,18 +39,17 @@ class HunyuanVideo15CoreDenoiseStep(SequentialPipelineBlocks):
3939 Denoise block that takes encoded conditions and runs the denoising process.
4040
4141 Components:
42- transformer (`HunyuanVideo15Transformer3DModel`)
43- scheduler (`FlowMatchEulerDiscreteScheduler`)
44- guider (`ClassifierFreeGuidance`)
42+ scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`HunyuanVideo15Transformer3DModel`) guider
43+ (`ClassifierFreeGuidance`)
4544
4645 Inputs:
4746 num_videos_per_prompt (`int`, *optional*, defaults to 1):
4847 The number of videos to generate per prompt.
4948 prompt_embeds (`Tensor`):
5049 text embeddings used to guide the image generation. Can be generated from text_encoder step.
5150 batch_size (`int`, *optional*):
52- Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
53- generated in input step.
51+ Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt. Can
52+ be generated in input step.
5453 num_inference_steps (`int`, *optional*, defaults to 50):
5554 The number of denoising steps.
5655 sigmas (`list`, *optional*):
@@ -111,15 +110,10 @@ class HunyuanVideo15Blocks(SequentialPipelineBlocks):
111110 Modular pipeline blocks for HunyuanVideo 1.5 text-to-video.
112111
113112 Components:
114- text_encoder (`Qwen2_5_VLTextModel`)
115- tokenizer (`Qwen2TokenizerFast`)
116- text_encoder_2 (`T5EncoderModel`)
117- tokenizer_2 (`ByT5Tokenizer`)
118- guider (`ClassifierFreeGuidance`)
119- transformer (`HunyuanVideo15Transformer3DModel`)
120- scheduler (`FlowMatchEulerDiscreteScheduler`)
121- vae (`AutoencoderKLHunyuanVideo15`)
122- video_processor (`HunyuanVideo15ImageProcessor`)
113+ text_encoder (`Qwen2_5_VLTextModel`) tokenizer (`Qwen2TokenizerFast`) text_encoder_2 (`T5EncoderModel`)
114+ tokenizer_2 (`ByT5Tokenizer`) guider (`ClassifierFreeGuidance`) scheduler (`FlowMatchEulerDiscreteScheduler`)
115+ transformer (`HunyuanVideo15Transformer3DModel`) vae (`AutoencoderKLHunyuanVideo15`) video_processor
116+ (`HunyuanVideo15ImageProcessor`)
123117
124118 Inputs:
125119 prompt (`str`, *optional*):
@@ -145,8 +139,8 @@ class HunyuanVideo15Blocks(SequentialPipelineBlocks):
145139 num_videos_per_prompt (`int`, *optional*, defaults to 1):
146140 The number of videos to generate per prompt.
147141 batch_size (`int`, *optional*):
148- Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
149- generated in input step.
142+ Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt. Can
143+ be generated in input step.
150144 num_inference_steps (`int`, *optional*, defaults to 50):
151145 The number of denoising steps.
152146 sigmas (`list`, *optional*):
@@ -194,34 +188,33 @@ class HunyuanVideo15Image2VideoCoreDenoiseStep(SequentialPipelineBlocks):
194188 Denoise block for image-to-video that takes encoded conditions and runs the denoising process.
195189
196190 Components:
197- transformer (`HunyuanVideo15Transformer3DModel`)
198- scheduler (`FlowMatchEulerDiscreteScheduler`)
199- vae (`AutoencoderKLHunyuanVideo15`)
200- video_processor (`HunyuanVideo15ImageProcessor`)
201- image_encoder (`SiglipVisionModel`)
202- feature_extractor (`SiglipImageProcessor`)
203- guider (`ClassifierFreeGuidance`)
191+ scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`HunyuanVideo15Transformer3DModel`) guider
192+ (`ClassifierFreeGuidance`)
204193
205194 Inputs:
206195 num_videos_per_prompt (`int`, *optional*, defaults to 1):
207196 The number of videos to generate per prompt.
208197 prompt_embeds (`Tensor`):
209198 text embeddings used to guide the image generation. Can be generated from text_encoder step.
210199 batch_size (`int`, *optional*):
211- Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
212- generated in input step.
200+ Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt. Can
201+ be generated in input step.
213202 num_inference_steps (`int`, *optional*, defaults to 50):
214203 The number of denoising steps.
215204 sigmas (`list`, *optional*):
216205 Custom sigmas for the denoising process.
217- image (`Image | list`):
218- Reference image(s) for denoising. Can be a single image or list of images.
206+ height (`int`, *optional*):
207+ The height in pixels of the generated image.
208+ width (`int`, *optional*):
209+ The width in pixels of the generated image.
219210 num_frames (`int`, *optional*, defaults to 121):
220211 TODO: Add description.
221212 latents (`Tensor`, *optional*):
222213 Pre-generated noisy latents for image generation.
223214 generator (`Generator`, *optional*):
224215 Torch generator for deterministic generation.
216+ image_latents (`Tensor`):
217+ Encoded image latents from the VAE encoder.
225218 attention_kwargs (`dict`, *optional*):
226219 Additional kwargs for attention processors.
227220 negative_prompt_embeds (`Tensor`, *optional*):
@@ -270,6 +263,25 @@ class HunyuanVideo15AutoVaeEncoderStep(AutoPipelineBlocks):
270263 This is an auto pipeline block that works for image-to-video tasks.
271264 - `HunyuanVideo15VaeEncoderStep` is used when `image` is provided.
272265 - If `image` is not provided, step will be skipped.
266+
267+ Components:
268+ vae (`AutoencoderKLHunyuanVideo15`) video_processor (`HunyuanVideo15ImageProcessor`)
269+
270+ Inputs:
271+ image (`Image | list`, *optional*):
272+ Reference image(s) for denoising. Can be a single image or list of images.
273+ height (`int`, *optional*):
274+ The height in pixels of the generated image.
275+ width (`int`, *optional*):
276+ The width in pixels of the generated image.
277+
278+ Outputs:
279+ image_latents (`Tensor`):
280+ Encoded image latents from the VAE encoder
281+ height (`int`):
282+ Target height resolved from image
283+ width (`int`):
284+ Target width resolved from image
273285 """
274286
275287 model_name = "hunyuan-video-1.5"
@@ -294,6 +306,17 @@ class HunyuanVideo15AutoImageEncoderStep(AutoPipelineBlocks):
294306 This is an auto pipeline block that works for image-to-video tasks.
295307 - `HunyuanVideo15ImageEncoderStep` is used when `image` is provided.
296308 - If `image` is not provided, step will be skipped.
309+
310+ Components:
311+ image_encoder (`SiglipVisionModel`) feature_extractor (`SiglipImageProcessor`)
312+
313+ Inputs:
314+ image (`Image | list`, *optional*):
315+ Reference image(s) for denoising. Can be a single image or list of images.
316+
317+ Outputs:
318+ image_embeds (`Tensor`):
319+ Image embeddings from the Siglip vision encoder
297320 """
298321
299322 model_name = "hunyuan-video-1.5"
@@ -315,8 +338,57 @@ def description(self):
315338class HunyuanVideo15AutoCoreDenoiseStep (AutoPipelineBlocks ):
316339 """
317340 Auto denoise block that selects the appropriate denoise pipeline based on inputs.
318- - `HunyuanVideo15Image2VideoCoreDenoiseStep` is used when `image_latents` is provided.
319- - `HunyuanVideo15CoreDenoiseStep` is used otherwise (text-to-video).
341+ - `HunyuanVideo15Image2VideoCoreDenoiseStep` is used when `image_latents` is provided.
342+ - `HunyuanVideo15CoreDenoiseStep` is used otherwise (text-to-video).
343+
344+ Components:
345+ scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`HunyuanVideo15Transformer3DModel`) guider
346+ (`ClassifierFreeGuidance`)
347+
348+ Inputs:
349+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
350+ The number of videos to generate per prompt.
351+ prompt_embeds (`Tensor`):
352+ text embeddings used to guide the image generation. Can be generated from text_encoder step.
353+ batch_size (`int`):
354+ Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt. Can
355+ be generated in input step.
356+ num_inference_steps (`int`):
357+ The number of denoising steps.
358+ sigmas (`list`, *optional*):
359+ Custom sigmas for the denoising process.
360+ height (`int`, *optional*):
361+ The height in pixels of the generated image.
362+ width (`int`, *optional*):
363+ The width in pixels of the generated image.
364+ num_frames (`int`, *optional*, defaults to 121):
365+ TODO: Add description.
366+ latents (`Tensor`):
367+ Pre-generated noisy latents for image generation.
368+ generator (`Generator`, *optional*):
369+ Torch generator for deterministic generation.
370+ image_latents (`Tensor`, *optional*):
371+ Encoded image latents from the VAE encoder.
372+ attention_kwargs (`dict`, *optional*):
373+ Additional kwargs for attention processors.
374+ negative_prompt_embeds (`Tensor`, *optional*):
375+ negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
376+ prompt_embeds_mask (`Tensor`):
377+ mask for the text embeddings. Can be generated from text_encoder step.
378+ negative_prompt_embeds_mask (`Tensor`, *optional*):
379+ mask for the negative text embeddings. Can be generated from text_encoder step.
380+ prompt_embeds_2 (`Tensor`):
381+ TODO: Add description.
382+ negative_prompt_embeds_2 (`Tensor`, *optional*):
383+ TODO: Add description.
384+ prompt_embeds_mask_2 (`Tensor`):
385+ TODO: Add description.
386+ negative_prompt_embeds_mask_2 (`Tensor`, *optional*):
387+ TODO: Add description.
388+
389+ Outputs:
390+ latents (`Tensor`):
391+ Denoised latents.
320392 """
321393
322394 model_name = "hunyuan-video-1.5"
@@ -338,9 +410,69 @@ class HunyuanVideo15AutoBlocks(SequentialPipelineBlocks):
338410 """
339411 Auto blocks for HunyuanVideo 1.5 that support both text-to-video and image-to-video workflows.
340412
341- Supported workflows:
342- - `text2video`: requires `prompt`
343- - `image2video`: requires `image`, `prompt`
413+ Supported workflows:
414+ - `text2video`: requires `prompt`
415+ - `image2video`: requires `image`, `prompt`
416+
417+ Components:
418+ text_encoder (`Qwen2_5_VLTextModel`) tokenizer (`Qwen2TokenizerFast`) text_encoder_2 (`T5EncoderModel`)
419+ tokenizer_2 (`ByT5Tokenizer`) guider (`ClassifierFreeGuidance`) vae (`AutoencoderKLHunyuanVideo15`)
420+ video_processor (`HunyuanVideo15ImageProcessor`) image_encoder (`SiglipVisionModel`) feature_extractor
421+ (`SiglipImageProcessor`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer
422+ (`HunyuanVideo15Transformer3DModel`)
423+
424+ Inputs:
425+ prompt (`str`, *optional*):
426+ The prompt or prompts to guide image generation.
427+ negative_prompt (`str`, *optional*):
428+ The prompt or prompts not to guide the image generation.
429+ prompt_embeds (`Tensor`, *optional*):
430+ text embeddings used to guide the image generation. Can be generated from text_encoder step.
431+ prompt_embeds_mask (`Tensor`, *optional*):
432+ mask for the text embeddings. Can be generated from text_encoder step.
433+ negative_prompt_embeds (`Tensor`, *optional*):
434+ negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
435+ negative_prompt_embeds_mask (`Tensor`, *optional*):
436+ mask for the negative text embeddings. Can be generated from text_encoder step.
437+ prompt_embeds_2 (`Tensor`, *optional*):
438+ TODO: Add description.
439+ prompt_embeds_mask_2 (`Tensor`, *optional*):
440+ TODO: Add description.
441+ negative_prompt_embeds_2 (`Tensor`, *optional*):
442+ TODO: Add description.
443+ negative_prompt_embeds_mask_2 (`Tensor`, *optional*):
444+ TODO: Add description.
445+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
446+ The number of videos to generate per prompt.
447+ image (`Image | list`, *optional*):
448+ Reference image(s) for denoising. Can be a single image or list of images.
449+ height (`int`, *optional*):
450+ The height in pixels of the generated image.
451+ width (`int`, *optional*):
452+ The width in pixels of the generated image.
453+ batch_size (`int`):
454+ Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt. Can
455+ be generated in input step.
456+ num_inference_steps (`int`):
457+ The number of denoising steps.
458+ sigmas (`list`, *optional*):
459+ Custom sigmas for the denoising process.
460+ num_frames (`int`, *optional*, defaults to 121):
461+ TODO: Add description.
462+ latents (`Tensor`):
463+ Pre-generated noisy latents for image generation.
464+ generator (`Generator`, *optional*):
465+ Torch generator for deterministic generation.
466+ image_latents (`Tensor`, *optional*):
467+ Encoded image latents from the VAE encoder.
468+ attention_kwargs (`dict`, *optional*):
469+ Additional kwargs for attention processors.
470+ output_type (`str`, *optional*, defaults to np):
471+ Output format: 'pil', 'np', 'pt'.
472+
473+ Outputs:
474+ videos (`list`):
475+ The generated videos.
344476 """
345477
346478 model_name = "hunyuan-video-1.5"
@@ -352,14 +484,14 @@ class HunyuanVideo15AutoBlocks(SequentialPipelineBlocks):
352484 HunyuanVideo15VaeDecoderStep ,
353485 ]
354486 block_names = ["text_encoder" , "vae_encoder" , "image_encoder" , "denoise" , "decode" ]
487+ _workflow_map = {
488+ "text2video" : {"prompt" : True },
489+ "image2video" : {"image" : True , "prompt" : True },
490+ }
355491
356492 @property
357493 def description (self ):
358- return (
359- "Auto blocks for HunyuanVideo 1.5 that support both text-to-video and image-to-video workflows.\n "
360- " - text2video: requires `prompt`\n "
361- " - image2video: requires `image`, `prompt`"
362- )
494+ return "Auto blocks for HunyuanVideo 1.5 that support both text-to-video and image-to-video workflows."
363495
364496 @property
365497 def outputs (self ):
@@ -372,17 +504,11 @@ class HunyuanVideo15Image2VideoBlocks(SequentialPipelineBlocks):
372504 Modular pipeline blocks for HunyuanVideo 1.5 image-to-video.
373505
374506 Components:
375- text_encoder (`Qwen2_5_VLTextModel`)
376- tokenizer (`Qwen2TokenizerFast`)
377- text_encoder_2 (`T5EncoderModel`)
378- tokenizer_2 (`ByT5Tokenizer`)
379- guider (`ClassifierFreeGuidance`)
380- transformer (`HunyuanVideo15Transformer3DModel`)
381- scheduler (`FlowMatchEulerDiscreteScheduler`)
382- vae (`AutoencoderKLHunyuanVideo15`)
383- video_processor (`HunyuanVideo15ImageProcessor`)
384- image_encoder (`SiglipVisionModel`)
385- feature_extractor (`SiglipImageProcessor`)
507+ text_encoder (`Qwen2_5_VLTextModel`) tokenizer (`Qwen2TokenizerFast`) text_encoder_2 (`T5EncoderModel`)
508+ tokenizer_2 (`ByT5Tokenizer`) guider (`ClassifierFreeGuidance`) vae (`AutoencoderKLHunyuanVideo15`)
509+ video_processor (`HunyuanVideo15ImageProcessor`) image_encoder (`SiglipVisionModel`) feature_extractor
510+ (`SiglipImageProcessor`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer
511+ (`HunyuanVideo15Transformer3DModel`)
386512
387513 Inputs:
388514 prompt (`str`, *optional*):
@@ -407,21 +533,27 @@ class HunyuanVideo15Image2VideoBlocks(SequentialPipelineBlocks):
407533 TODO: Add description.
408534 num_videos_per_prompt (`int`, *optional*, defaults to 1):
409535 The number of videos to generate per prompt.
536+ image (`Image | list`, *optional*):
537+ Reference image(s) for denoising. Can be a single image or list of images.
538+ height (`int`, *optional*):
539+ The height in pixels of the generated image.
540+ width (`int`, *optional*):
541+ The width in pixels of the generated image.
410542 batch_size (`int`, *optional*):
411- Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
412- generated in input step.
543+ Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt. Can
544+ be generated in input step.
413545 num_inference_steps (`int`, *optional*, defaults to 50):
414546 The number of denoising steps.
415547 sigmas (`list`, *optional*):
416548 Custom sigmas for the denoising process.
417- image (`Image | list`):
418- Reference image(s) for denoising. Can be a single image or list of images.
419549 num_frames (`int`, *optional*, defaults to 121):
420550 TODO: Add description.
421551 latents (`Tensor`, *optional*):
422552 Pre-generated noisy latents for image generation.
423553 generator (`Generator`, *optional*):
424554 Torch generator for deterministic generation.
555+ image_latents (`Tensor`):
556+ Encoded image latents from the VAE encoder.
425557 attention_kwargs (`dict`, *optional*):
426558 Additional kwargs for attention processors.
427559 output_type (`str`, *optional*, defaults to np):
0 commit comments