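Flux2 modular blocks: split the auto input step into separate text/image input steps, flatten AUTO_BLOCKS (drop Flux2CoreDenoiseStep and Flux2InputSequentialStep), rename the `img_ids` output to `latent_ids`, and add modular pipeline tests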

DN6 · DN6 · commit 921b959b9aba · 2025-12-01T10:42:35.000+05:30
diff --git a/src/diffusers/modular_pipelines/flux2/__init__.py b/src/diffusers/modular_pipelines/flux2/__init__.py
@@ -53,12 +53,11 @@
         "Flux2AutoBlocks",
         "Flux2AutoDecodeStep",
         "Flux2AutoDenoiseStep",
-        "Flux2AutoInputStep",
+        "Flux2AutoImageInputStep",
         "Flux2AutoTextEncoderStep",
+        "Flux2AutoTextInputStep",
         "Flux2AutoVaeEncoderStep",
         "Flux2BeforeDenoiseStep",
-        "Flux2CoreDenoiseStep",
-        "Flux2InputSequentialStep",
         "Flux2VaeEncoderSequentialStep",
     ]
     _import_structure["modular_pipeline"] = ["Flux2ModularPipeline"]
@@ -102,12 +101,11 @@
             Flux2AutoBlocks,
             Flux2AutoDecodeStep,
             Flux2AutoDenoiseStep,
-            Flux2AutoInputStep,
+            Flux2AutoImageInputStep,
             Flux2AutoTextEncoderStep,
+            Flux2AutoTextInputStep,
             Flux2AutoVaeEncoderStep,
             Flux2BeforeDenoiseStep,
-            Flux2CoreDenoiseStep,
-            Flux2InputSequentialStep,
             Flux2VaeEncoderSequentialStep,
         )
         from .modular_pipeline import Flux2ModularPipeline
diff --git a/src/diffusers/modular_pipelines/flux2/before_denoise.py b/src/diffusers/modular_pipelines/flux2/before_denoise.py
@@ -366,7 +366,7 @@ def intermediate_outputs(self) -> List[OutputParam]:
                 description="4D position IDs (T, H, W, L) for text tokens, used for RoPE calculation.",
             ),
             OutputParam(
-                name="img_ids",
+                name="latent_ids",
                 kwargs_type="denoiser_input_fields",
                 type_hint=torch.Tensor,
                 description="4D position IDs (T, H, W, L) for image latents, used for RoPE calculation.",
diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks.py b/src/diffusers/modular_pipelines/flux2/modular_blocks.py
@@ -35,6 +35,33 @@
 )
 
 
+class Flux2AutoTextInputStep(AutoPipelineBlocks):  # Auto block whose only candidate is Flux2TextInputStep.
+    block_classes = [Flux2TextInputStep]
+    block_names = ["text_input"]
+    block_trigger_inputs = [None]  # no named trigger input; per the description below, this block is always used
+
+    @property
+    def description(self):  # human-readable summary of what this step does and when its block is selected
+        return (
+            "Text input step that processes text embeddings and determines batch size.\n"
+            " - `Flux2TextInputStep` is always used."
+        )
+
+
+class Flux2AutoImageInputStep(AutoPipelineBlocks):  # Auto block whose only candidate is Flux2ImageInputStep.
+    block_classes = [Flux2ImageInputStep]
+    block_names = ["image_input"]
+    block_trigger_inputs = ["image_latents"]  # per the description below: used when `image_latents` is provided, skipped otherwise
+
+    @property
+    def description(self):  # human-readable summary of what this step does and when its block is selected
+        return (
+            "Image input step that expands image latents to match batch size.\n"
+            " - `Flux2ImageInputStep` is used when `image_latents` is provided.\n"
+            " - Skipped when no image conditioning is used."
+        )
+
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
@@ -147,66 +174,14 @@ def description(self):
         return "Decode step that decodes the denoised latents into image outputs.\n - `Flux2DecodeStep`"
 
 
-Flux2InputBlocks = InsertableDict(
-    [
-        ("text_inputs", Flux2TextInputStep()),
-        ("image_inputs", Flux2ImageInputStep()),
-    ]
-)
-
-
-class Flux2InputSequentialStep(SequentialPipelineBlocks):
-    model_name = "flux2"
-    block_classes = Flux2InputBlocks.values()
-    block_names = Flux2InputBlocks.keys()
-
-    @property
-    def description(self):
-        return (
-            "Input step that prepares the inputs for the Flux2 denoising step. It:\n"
-            " - Makes sure the text embeddings have consistent batch size.\n"
-            " - Processes image latents if provided."
-        )
-
-
-class Flux2AutoInputStep(AutoPipelineBlocks):
-    block_classes = [Flux2InputSequentialStep, Flux2TextInputStep]
-    block_names = ["img_conditioning", "text2image"]
-    block_trigger_inputs = ["image_latents", None]
-
-    @property
-    def description(self):
-        return (
-            "Input step that standardizes the inputs for the denoising step.\n"
-            "This is an auto pipeline block that works for text-to-image/image-conditioned tasks.\n"
-            " - `Flux2InputSequentialStep` is used when `image_latents` is provided.\n"
-            " - `Flux2TextInputStep` is used when `image_latents` is not provided.\n"
-        )
-
-
-class Flux2CoreDenoiseStep(SequentialPipelineBlocks):
-    model_name = "flux2"
-    block_classes = [Flux2AutoInputStep, Flux2AutoBeforeDenoiseStep, Flux2AutoDenoiseStep]
-    block_names = ["input", "before_denoise", "denoise"]
-
-    @property
-    def description(self):
-        return (
-            "Core step that performs the denoising process for Flux2. \n"
-            " - `Flux2AutoInputStep` (input) standardizes the inputs for the denoising step.\n"
-            " - `Flux2AutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
-            " - `Flux2AutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
-            "This step supports text-to-image and image-conditioned tasks for Flux2:\n"
-            " - For image-conditioned generation, you need to provide `packed_image_latents`.\n"
-            " - For text-to-image generation, all you need to provide is prompt embeddings."
-        )
-
-
 AUTO_BLOCKS = InsertableDict(
     [
         ("text_encoder", Flux2AutoTextEncoderStep()),
+        ("text_input", Flux2AutoTextInputStep()),  # its description states Flux2TextInputStep is always used
         ("image_encoder", Flux2AutoVaeEncoderStep()),
-        ("denoise", Flux2CoreDenoiseStep()),
+        ("image_input", Flux2AutoImageInputStep()),  # its description states it is skipped unless `image_latents` is provided
+        ("before_denoise", Flux2AutoBeforeDenoiseStep()),  # before_denoise/denoise are now top-level entries rather than sub-blocks of Flux2CoreDenoiseStep
+        ("denoise", Flux2AutoDenoiseStep()),
         ("decode", Flux2DecodeStep()),
     ]
 )
@@ -230,7 +205,7 @@ def description(self):
 TEXT2IMAGE_BLOCKS = InsertableDict(
     [
         ("text_encoder", Flux2TextEncoderStep()),
-        ("input", Flux2TextInputStep()),
+        ("text_input", Flux2TextInputStep()),
         ("prepare_latents", Flux2PrepareLatentsStep()),
         ("set_timesteps", Flux2SetTimestepsStep()),
         ("prepare_rope_inputs", Flux2RoPEInputsStep()),
@@ -242,10 +217,11 @@ def description(self):
 IMAGE_CONDITIONED_BLOCKS = InsertableDict(
     [
         ("text_encoder", Flux2TextEncoderStep()),
+        ("text_input", Flux2TextInputStep()),
         ("preprocess_images", Flux2ProcessImagesInputStep()),
         ("vae_encoder", Flux2VaeEncoderStep()),
         ("prepare_image_latents", Flux2PrepareImageLatentsStep()),
-        ("input", Flux2InputSequentialStep()),
+        ("image_input", Flux2ImageInputStep()),
         ("prepare_latents", Flux2PrepareLatentsStep()),
         ("set_timesteps", Flux2SetTimestepsStep()),
         ("prepare_rope_inputs", Flux2RoPEInputsStep()),
diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py
@@ -1586,7 +1586,6 @@ def __init__(
         for name, config_spec in self._config_specs.items():
             default_configs[name] = config_spec.default
         self.register_to_config(**default_configs)
-
         self.register_to_config(_blocks_class_name=self.blocks.__class__.__name__ if self.blocks is not None else None)
 
     @property
diff --git a/tests/modular_pipelines/flux2/__init__.py b/tests/modular_pipelines/flux2/__init__.py
diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py
@@ -0,0 +1,139 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import tempfile
+import unittest
+
+import numpy as np
+import PIL
+import torch
+
+from diffusers.modular_pipelines import (
+    Flux2AutoBlocks,
+    Flux2ModularPipeline,
+    ModularPipeline,
+)
+from diffusers.modular_pipelines.flux2 import (
+    Flux2AutoTextEncoderStep,
+    Flux2RemoteTextEncoderStep,
+    Flux2TextEncoderStep,
+)
+
+from ...testing_utils import floats_tensor, torch_device
+from ..test_modular_pipelines_common import ModularPipelineTesterMixin
+
+
+class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin):  # Prompt-only (no `image` input) run of Flux2AutoBlocks through the shared tester mixin.
+    pipeline_class = Flux2ModularPipeline
+    pipeline_blocks_class = Flux2AutoBlocks
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-modular"  # presumably a tiny test checkpoint consumed by the mixin — TODO confirm
+
+    params = frozenset(["prompt", "height", "width", "guidance_scale"])
+    batch_params = frozenset(["prompt"])
+
+    def get_dummy_inputs(self, seed=0):  # Build the pipeline call kwargs; `seed` is forwarded to self.get_generator.
+        generator = self.get_generator(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            # TODO (Dhruv): Update text encoder config so that vocab_size matches tokenizer
+            "max_sequence_length": 8,  # bit of a hack to workaround vocab size mismatch
+            "text_encoder_out_layers": (1,),
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 4.0,
+            "height": 32,
+            "width": 32,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_float16_inference(self):  # Re-run the mixin's float16 test, passing 9e-2 positionally (presumably the tolerance — TODO confirm).
+        super().test_float16_inference(9e-2)
+
+
+class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin):  # Same pipeline and blocks as the prompt-only tests, but the inputs also carry an `image`.
+    pipeline_class = Flux2ModularPipeline
+    pipeline_blocks_class = Flux2AutoBlocks
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-modular"  # presumably a tiny test checkpoint consumed by the mixin — TODO confirm
+
+    params = frozenset(["prompt", "height", "width", "guidance_scale", "image"])
+    batch_params = frozenset(["prompt", "image"])
+
+    def get_dummy_inputs(self, seed=0):  # Build the pipeline call kwargs, including a PIL `image` generated from `seed`.
+        generator = self.get_generator(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            # TODO (Dhruv): Update text encoder config so that vocab_size matches tokenizer
+            "max_sequence_length": 8,  # bit of a hack to workaround vocab size mismatch
+            "text_encoder_out_layers": (1,),
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 4.0,
+            "height": 32,
+            "width": 32,
+            "output_type": "pt",
+        }
+        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(torch_device)  # seeded random tensor; assumes floats_tensor returns the requested (1, 3, 64, 64) shape
+        image = image.cpu().permute(0, 2, 3, 1)[0]  # move the size-3 axis last and take the first item -> (64, 64, 3) under that assumption
+        init_image = PIL.Image.fromarray(np.uint8(image * 255)).convert("RGB")  # NOTE(review): only `import PIL` is visible above; PIL.Image must already be loaded by another import for this attribute access to work
+        inputs["image"] = init_image
+
+        return inputs
+
+    def test_save_from_pretrained(self):  # Save the pipeline, reload it with ModularPipeline.from_pretrained, and compare output slices of both.
+        pipes = []
+        base_pipe = self.get_pipeline().to(torch_device)
+        pipes.append(base_pipe)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:  # the reloaded `pipe` is still used after this directory is cleaned up
+            base_pipe.save_pretrained(tmpdirname)
+
+            pipe = ModularPipeline.from_pretrained(tmpdirname).to(torch_device)
+            pipe.load_components(torch_dtype=torch.float32)
+            pipe.to(torch_device)  # second .to(): the first ran before load_components(); presumably moves the newly loaded components — TODO confirm
+
+        pipes.append(pipe)
+
+        image_slices = []
+        for pipe in pipes:  # fresh dummy inputs per pipeline (default seed each time)
+            inputs = self.get_dummy_inputs()
+            image = pipe(**inputs, output="images")
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())  # first item, last 3 entries on the next two axes, last index of the final axis
+
+        assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3  # original vs. reloaded pipeline must differ by less than 1e-3 everywhere in the slice
+
+    def test_float16_inference(self):  # Re-run the mixin's float16 test, passing 9e-2 positionally (presumably the tolerance — TODO confirm).
+        super().test_float16_inference(9e-2)
+
+
+class TestFlux2AutoTextEncoderStep(unittest.TestCase):  # Checks the block classes, trigger inputs and block names exposed by Flux2AutoTextEncoderStep.
+    def test_auto_text_encoder_block_classes(self):  # expects exactly two candidates: the remote and the local text encoder steps
+        auto_step = Flux2AutoTextEncoderStep()
+
+        assert len(auto_step.block_classes) == 2
+        assert Flux2RemoteTextEncoderStep in auto_step.block_classes
+        assert Flux2TextEncoderStep in auto_step.block_classes
+
+    def test_auto_text_encoder_trigger_inputs(self):  # expects one named trigger followed by a None entry, in that order
+        auto_step = Flux2AutoTextEncoderStep()
+
+        assert auto_step.block_trigger_inputs == ["remote_text_encoder", None]
+
+    def test_auto_text_encoder_block_names(self):  # expects the names "remote" then "local", in that order
+        auto_step = Flux2AutoTextEncoderStep()
+
+        assert auto_step.block_names == ["remote", "local"]