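update: simplify Flux2 modular blocks — remove the Flux2Auto{TextInput,ImageInput,TextEncoder,BeforeDenoise,Denoise,Decode}Step wrappers and Flux2ImageInputStep, build AUTO_BLOCKS from the concrete steps, add a REMOTE_AUTO_BLOCKS preset registered as "remote", return early from the VAE encoder step when no condition images are given, and tighten the save/load test tolerance to 1e-5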

DN6 · DN6 · commit b0f50c64e1fa · 2025-12-09T12:18:30.000+05:30
diff --git a/src/diffusers/modular_pipelines/flux2/encoders.py b/src/diffusers/modular_pipelines/flux2/encoders.py
@@ -399,22 +399,22 @@ def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> Pi
         condition_images = block_state.condition_images
 
         if condition_images is None:
-            block_state.image_latents = None
-        else:
-            device = components._execution_device
-            dtype = components.vae.dtype
-
-            image_latents = []
-            for image in condition_images:
-                image = image.to(device=device, dtype=dtype)
-                latent = self._encode_vae_image(
-                    vae=components.vae,
-                    image=image,
-                    generator=block_state.generator,
-                )
-                image_latents.append(latent)
+            return components, state
+
+        device = components._execution_device
+        dtype = components.vae.dtype
+
+        image_latents = []
+        for image in condition_images:
+            image = image.to(device=device, dtype=dtype)
+            latent = self._encode_vae_image(
+                vae=components.vae,
+                image=image,
+                generator=block_state.generator,
+            )
+            image_latents.append(latent)
 
-            block_state.image_latents = image_latents
+        block_state.image_latents = image_latents
 
         self.set_block_state(state, block_state)
         return components, state
diff --git a/src/diffusers/modular_pipelines/flux2/inputs.py b/src/diffusers/modular_pipelines/flux2/inputs.py
@@ -31,7 +31,6 @@ class Flux2TextInputStep(ModularPipelineBlocks):
     @property
     def description(self) -> str:
         return (
-            "Text input processing step that standardizes text embeddings for Flux2 pipeline.\n"
             "This step:\n"
             "  1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n"
             "  2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)"
@@ -86,55 +85,3 @@ def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> Pi
 
         self.set_block_state(state, block_state)
         return components, state
-
-
-class Flux2ImageInputStep(ModularPipelineBlocks):
-    model_name = "flux2"
-
-    @property
-    def description(self) -> str:
-        return (
-            "Image input processing step that prepares image latents for Flux2 conditioning.\n"
-            "This step expands image latents to match the batch size."
-        )
-
-    @property
-    def inputs(self) -> List[InputParam]:
-        return [
-            InputParam("num_images_per_prompt", default=1),
-            InputParam("batch_size", required=True, type_hint=int),
-            InputParam("image_latents", type_hint=torch.Tensor),
-            InputParam("image_latent_ids", type_hint=torch.Tensor),
-        ]
-
-    @property
-    def intermediate_outputs(self) -> List[OutputParam]:
-        return [
-            OutputParam(
-                "image_latents",
-                type_hint=torch.Tensor,
-                description="Packed image latents expanded to batch size",
-            ),
-            OutputParam(
-                "image_latent_ids",
-                type_hint=torch.Tensor,
-                description="Image latent position IDs expanded to batch size",
-            ),
-        ]
-
-    @torch.no_grad()
-    def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState:
-        block_state = self.get_block_state(state)
-
-        image_latents = block_state.image_latents
-        image_latent_ids = block_state.image_latent_ids
-        target_batch_size = block_state.batch_size * block_state.num_images_per_prompt
-
-        if image_latents is not None:
-            block_state.image_latents = image_latents.repeat(target_batch_size, 1, 1)
-
-        if image_latent_ids is not None:
-            block_state.image_latent_ids = image_latent_ids.repeat(target_batch_size, 1, 1)
-
-        self.set_block_state(state, block_state)
-        return components, state
diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks.py b/src/diffusers/modular_pipelines/flux2/modular_blocks.py
@@ -30,38 +30,10 @@
     Flux2VaeEncoderStep,
 )
 from .inputs import (
-    Flux2ImageInputStep,
     Flux2TextInputStep,
 )
 
 
-class Flux2AutoTextInputStep(AutoPipelineBlocks):
-    block_classes = [Flux2TextInputStep]
-    block_names = ["text_input"]
-    block_trigger_inputs = [None]
-
-    @property
-    def description(self):
-        return (
-            "Text input step that processes text embeddings and determines batch size.\n"
-            " - `Flux2TextInputStep` is always used."
-        )
-
-
-class Flux2AutoImageInputStep(AutoPipelineBlocks):
-    block_classes = [Flux2ImageInputStep]
-    block_names = ["image_input"]
-    block_trigger_inputs = ["image_latents"]
-
-    @property
-    def description(self):
-        return (
-            "Image input step that expands image latents to match batch size.\n"
-            " - `Flux2ImageInputStep` is used when `image_latents` is provided.\n"
-            " - Skipped when no image conditioning is used."
-        )
-
-
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
@@ -100,21 +72,6 @@ def description(self):
         )
 
 
-class Flux2AutoTextEncoderStep(AutoPipelineBlocks):
-    block_classes = [Flux2RemoteTextEncoderStep, Flux2TextEncoderStep]
-    block_names = ["remote", "local"]
-    block_trigger_inputs = ["remote_text_encoder", None]
-
-    @property
-    def description(self):
-        return (
-            "Text encoder step that generates text embeddings to guide the image generation.\n"
-            "This is an auto pipeline block that selects between local and remote text encoding.\n"
-            " - `Flux2RemoteTextEncoderStep` is used when `remote_text_encoder=True`.\n"
-            " - `Flux2TextEncoderStep` is used otherwise (default)."
-        )
-
-
 Flux2BeforeDenoiseBlocks = InsertableDict(
     [
         ("prepare_latents", Flux2PrepareLatentsStep()),
@@ -135,53 +92,25 @@ def description(self):
         return "Before denoise step that prepares the inputs for the denoise step in Flux2 generation."
 
 
-class Flux2AutoBeforeDenoiseStep(AutoPipelineBlocks):
-    model_name = "flux2"
-    block_classes = [Flux2BeforeDenoiseStep]
-    block_names = ["before_denoise"]
-    block_trigger_inputs = [None]
-
-    @property
-    def description(self):
-        return (
-            "Before denoise step that prepares the inputs for the denoise step.\n"
-            "This is an auto pipeline block for Flux2.\n"
-            " - `Flux2BeforeDenoiseStep` is used for both text-to-image and image-conditioned generation."
-        )
-
-
-class Flux2AutoDenoiseStep(AutoPipelineBlocks):
-    block_classes = [Flux2DenoiseStep]
-    block_names = ["denoise"]
-    block_trigger_inputs = [None]
-
-    @property
-    def description(self) -> str:
-        return (
-            "Denoise step that iteratively denoises the latents. "
-            "This is an auto pipeline block that works for Flux2 text-to-image and image-conditioned tasks."
-            " - `Flux2DenoiseStep` (denoise) for text-to-image and image-conditioned tasks."
-        )
-
-
-class Flux2AutoDecodeStep(AutoPipelineBlocks):
-    block_classes = [Flux2DecodeStep]
-    block_names = ["decode"]
-    block_trigger_inputs = [None]
-
-    @property
-    def description(self):
-        return "Decode step that decodes the denoised latents into image outputs.\n - `Flux2DecodeStep`"
+AUTO_BLOCKS = InsertableDict(
+    [
+        ("text_encoder", Flux2TextEncoderStep()),
+        ("text_input", Flux2TextInputStep()),
+        ("vae_image_encoder", Flux2AutoVaeEncoderStep()),
+        ("before_denoise", Flux2BeforeDenoiseStep()),
+        ("denoise", Flux2DenoiseStep()),
+        ("decode", Flux2DecodeStep()),
+    ]
+)
 
 
-AUTO_BLOCKS = InsertableDict(
+REMOTE_AUTO_BLOCKS = InsertableDict(
     [
-        ("text_encoder", Flux2AutoTextEncoderStep()),
-        ("text_input", Flux2AutoTextInputStep()),
-        ("image_encoder", Flux2AutoVaeEncoderStep()),
-        ("image_input", Flux2AutoImageInputStep()),
-        ("before_denoise", Flux2AutoBeforeDenoiseStep()),
-        ("denoise", Flux2AutoDenoiseStep()),
+        ("text_encoder", Flux2RemoteTextEncoderStep()),
+        ("text_input", Flux2TextInputStep()),
+        ("vae_image_encoder", Flux2AutoVaeEncoderStep()),
+        ("before_denoise", Flux2BeforeDenoiseStep()),
+        ("denoise", Flux2DenoiseStep()),
         ("decode", Flux2DecodeStep()),
     ]
 )
@@ -221,7 +150,6 @@ def description(self):
         ("preprocess_images", Flux2ProcessImagesInputStep()),
         ("vae_encoder", Flux2VaeEncoderStep()),
         ("prepare_image_latents", Flux2PrepareImageLatentsStep()),
-        ("image_input", Flux2ImageInputStep()),
         ("prepare_latents", Flux2PrepareLatentsStep()),
         ("set_timesteps", Flux2SetTimestepsStep()),
         ("prepare_rope_inputs", Flux2RoPEInputsStep()),
@@ -234,4 +162,5 @@ def description(self):
     "text2image": TEXT2IMAGE_BLOCKS,
     "image_conditioned": IMAGE_CONDITIONED_BLOCKS,
     "auto": AUTO_BLOCKS,
+    "remote": REMOTE_AUTO_BLOCKS,
 }
diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py
@@ -15,7 +15,6 @@
 
 import random
 import tempfile
-import unittest
 
 import numpy as np
 import PIL
@@ -26,11 +25,6 @@
     Flux2ModularPipeline,
     ModularPipeline,
 )
-from diffusers.modular_pipelines.flux2 import (
-    Flux2AutoTextEncoderStep,
-    Flux2RemoteTextEncoderStep,
-    Flux2TextEncoderStep,
-)
 
 from ...testing_utils import floats_tensor, torch_device
 from ..test_modular_pipelines_common import ModularPipelineTesterMixin
@@ -114,26 +108,7 @@ def test_save_from_pretrained(self):
 
             image_slices.append(image[0, -3:, -3:, -1].flatten())
 
-        assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-5
 
     def test_float16_inference(self):
         super().test_float16_inference(9e-2)
-
-
-class TestFlux2AutoTextEncoderStep(unittest.TestCase):
-    def test_auto_text_encoder_block_classes(self):
-        auto_step = Flux2AutoTextEncoderStep()
-
-        assert len(auto_step.block_classes) == 2
-        assert Flux2RemoteTextEncoderStep in auto_step.block_classes
-        assert Flux2TextEncoderStep in auto_step.block_classes
-
-    def test_auto_text_encoder_trigger_inputs(self):
-        auto_step = Flux2AutoTextEncoderStep()
-
-        assert auto_step.block_trigger_inputs == ["remote_text_encoder", None]
-
-    def test_auto_text_encoder_block_names(self):
-        auto_step = Flux2AutoTextEncoderStep()
-
-        assert auto_step.block_names == ["remote", "local"]
diff --git a/tests/modular_pipelines/test_modular_pipelines_common.py b/tests/modular_pipelines/test_modular_pipelines_common.py
@@ -165,7 +165,6 @@ def test_inference_batch_single_identical(
         expected_max_diff=1e-4,
     ):
         pipe = self.get_pipeline().to(torch_device)
-
         inputs = self.get_dummy_inputs()
 
         # Reset generator in case it is has been used in self.get_dummy_inputs