Merge branch 'main' into cp-fixes-attn-backends

sayakpaul · web-flow · commit 3b1ccd79a5e9 · 2025-12-15T20:30:22.000+08:00
diff --git a/docs/source/en/api/models/controlnet.md b/docs/source/en/api/models/controlnet.md
@@ -33,6 +33,21 @@ url = "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/m
 pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=controlnet)
 ```
 
+## Loading from Control LoRA
+
+Control-LoRA was introduced by Stability AI in [stabilityai/control-lora](https://huggingface.co/stabilityai/control-lora). It adds low-rank, parameter-efficient fine-tuning to ControlNet, offering a more efficient and compact way to bring model control to a wider variety of consumer GPUs.
+
+```py
+import torch
+from diffusers import ControlNetModel, UNet2DConditionModel
+
+lora_id = "stabilityai/control-lora"  # Hub repo that hosts the Control-LoRA files
+lora_filename = "control-LoRAs-rank128/control-lora-canny-rank128.safetensors"
+unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", torch_dtype=torch.bfloat16).to("cuda")
+controlnet = ControlNetModel.from_unet(unet).to(device="cuda", dtype=torch.bfloat16)
+controlnet.load_lora_adapter(lora_id, weight_name=lora_filename, prefix=None, controlnet_config=controlnet.config)
+```
+
 ## ControlNetModel
 
 [[autodoc]] ControlNetModel
diff --git a/examples/research_projects/control_lora/README.md b/examples/research_projects/control_lora/README.md
@@ -0,0 +1,41 @@
+# Control-LoRA inference example
+
+Control-LoRA was introduced by Stability AI in [stabilityai/control-lora](https://huggingface.co/stabilityai/control-lora). It adds low-rank, parameter-efficient fine-tuning to ControlNet, offering a more efficient and compact way to bring model control to a wider variety of consumer GPUs.
+
+## Installing the dependencies
+
+Before running the script, make sure to install the library's dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then `cd` into the example folder and run:
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+## Inference on SDXL
+
+[stabilityai/control-lora](https://huggingface.co/stabilityai/control-lora) provides a set of Control-LoRA weights for SDXL. Here we use the `canny` condition to generate an image from a text prompt and a reference image.
+
+```bash
+python control_lora.py
+```
+
+## Acknowledgements
+
+- [stabilityai/control-lora](https://huggingface.co/stabilityai/control-lora)
+- [comfyanonymous/ControlNet-v1-1_fp16_safetensors](https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors)
+- [HighCWu/control-lora-v2](https://github.com/HighCWu/control-lora-v2)
diff --git a/examples/research_projects/control_lora/control_lora.py b/examples/research_projects/control_lora/control_lora.py
@@ -0,0 +1,58 @@
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    StableDiffusionXLControlNetPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils import load_image, make_image_grid
+
+
+pipe_id = "stabilityai/stable-diffusion-xl-base-1.0"
+lora_id = "stabilityai/control-lora"
+lora_filename = "control-LoRAs-rank128/control-lora-canny-rank128.safetensors"
+# Create a ControlNet from the UNet, then load the canny Control-LoRA file into it as a LoRA adapter.
+unet = UNet2DConditionModel.from_pretrained(pipe_id, subfolder="unet", torch_dtype=torch.bfloat16).to("cuda")
+controlnet = ControlNetModel.from_unet(unet).to(device="cuda", dtype=torch.bfloat16)
+controlnet.load_lora_adapter(lora_id, weight_name=lora_filename, prefix=None, controlnet_config=controlnet.config)
+
+prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
+negative_prompt = "low quality, bad quality, sketches"
+# Reference image; it is converted to a canny edge map further down and passed to the pipeline as `image`.
+image = load_image(
+    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
+)
+
+controlnet_conditioning_scale = 1.0  # recommended for good generalization
+
+vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae", torch_dtype=torch.bfloat16)
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    pipe_id,
+    unet=unet,
+    controlnet=controlnet,
+    vae=vae,
+    torch_dtype=torch.bfloat16,
+    safety_checker=None,  # NOTE(review): possibly unused by SDXL pipelines - confirm
+).to("cuda")
+# Convert the reference image to a 3-channel canny edge map stored as a PIL image.
+image = np.array(image)
+image = cv2.Canny(image, 100, 200)  # the two hysteresis thresholds
+image = image[:, :, None]  # add a trailing channel axis
+image = np.concatenate([image, image, image], axis=2)  # repeat the edge map on 3 channels
+image = Image.fromarray(image)
+
+images = pipe(
+    prompt,
+    negative_prompt=negative_prompt,
+    image=image,
+    controlnet_conditioning_scale=controlnet_conditioning_scale,
+    num_images_per_prompt=4,
+).images
+# Save the control image followed by the 4 generated images as a single grid.
+final_image = [image] + images
+grid = make_image_grid(final_image, 1, 5)  # presumably 1 row x 5 columns - confirm argument order
+grid.save("hf-logo_canny.png")
diff --git a/src/diffusers/loaders/peft.py b/src/diffusers/loaders/peft.py
@@ -27,6 +27,7 @@
     MIN_PEFT_VERSION,
     USE_PEFT_BACKEND,
     check_peft_version,
+    convert_sai_sd_control_lora_state_dict_to_peft,
     convert_unet_state_dict_to_peft,
     delete_adapter_layers,
     get_adapter_name,
@@ -232,6 +233,13 @@ def load_lora_adapter(
             if "lora_A" not in first_key:
                 state_dict = convert_unet_state_dict_to_peft(state_dict)
 
+            # Control LoRA from SAI is different from BFL Control LoRA
+            # https://huggingface.co/stabilityai/control-lora
+            # https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors
+            is_sai_sd_control_lora = "lora_controlnet" in state_dict
+            if is_sai_sd_control_lora:
+                state_dict = convert_sai_sd_control_lora_state_dict_to_peft(state_dict)
+
             rank = {}
             for key, val in state_dict.items():
                 # Cannot figure out rank from lora layers that don't have at least 2 dimensions.
@@ -263,6 +271,14 @@ def load_lora_adapter(
                 adapter_name=adapter_name,
             )
 
+            # Adjust LoRA config for Control LoRA
+            if is_sai_sd_control_lora:
+                lora_config.lora_alpha = lora_config.r
+                lora_config.alpha_pattern = lora_config.rank_pattern
+                lora_config.bias = "all"
+                lora_config.modules_to_save = lora_config.exclude_modules
+                lora_config.exclude_modules = None
+
             # <Unsafe code
             # We can be sure that the following works as it just sets attention processors, lora layers and puts all in the same dtype
             # Now we remove any existing hooks to `_pipeline`.
diff --git a/src/diffusers/models/controlnets/controlnet.py b/src/diffusers/models/controlnets/controlnet.py
@@ -19,6 +19,7 @@
 from torch.nn import functional as F
 
 from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import PeftAdapterMixin
 from ...loaders.single_file_model import FromOriginalModelMixin
 from ...utils import BaseOutput, logging
 from ..attention import AttentionMixin
@@ -106,7 +107,7 @@ def forward(self, conditioning):
         return embedding
 
 
-class ControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, FromOriginalModelMixin):
+class ControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin):
     """
     A ControlNet model.
 
diff --git a/src/diffusers/models/transformers/transformer_qwenimage.py b/src/diffusers/models/transformers/transformer_qwenimage.py
@@ -14,6 +14,7 @@
 
 import functools
 import math
+from math import prod
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
@@ -363,7 +364,13 @@ def __call__(
 @maybe_allow_in_graph
 class QwenImageTransformerBlock(nn.Module):
     def __init__(
-        self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        qk_norm: str = "rms_norm",
+        eps: float = 1e-6,
+        zero_cond_t: bool = False,
     ):
         super().__init__()
 
@@ -403,10 +410,43 @@ def __init__(
         self.txt_norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
         self.txt_mlp = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
 
-    def _modulate(self, x, mod_params):
+        self.zero_cond_t = zero_cond_t
+
+    def _modulate(self, x, mod_params, index=None):
         """Apply modulation to input tensor"""
+        # x: [b, l, d]; shift/scale/gate: [b, d] each (assumed [2*b, d] each when `index` is given)
         shift, scale, gate = mod_params.chunk(3, dim=-1)
-        return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1)
+
+        if index is not None:
+            # Assumes mod_params stacks two parameter sets along the batch dim, so shift, scale and gate
+            # are [2*actual_batch, d]: first half is used where index == 0, second half elsewhere.
+            actual_batch = shift.size(0) // 2
+            shift_0, shift_1 = shift[:actual_batch], shift[actual_batch:]  # each: [actual_batch, d]
+            scale_0, scale_1 = scale[:actual_batch], scale[actual_batch:]
+            gate_0, gate_1 = gate[:actual_batch], gate[actual_batch:]
+
+            # index: assumed [b, l] (one selector per token), where b is the actual batch size
+            # Add a trailing axis -> [b, l, 1] so the comparison broadcasts over the feature dimension
+            index_expanded = index.unsqueeze(-1)  # [b, l, 1]
+
+            # Expand both parameter sets to [b, 1, d]; torch.where then broadcasts them to [b, l, d]
+            shift_0_exp = shift_0.unsqueeze(1)  # [b, 1, d]
+            shift_1_exp = shift_1.unsqueeze(1)  # [b, 1, d]
+            scale_0_exp = scale_0.unsqueeze(1)
+            scale_1_exp = scale_1.unsqueeze(1)
+            gate_0_exp = gate_0.unsqueeze(1)
+            gate_1_exp = gate_1.unsqueeze(1)
+
+            # Per-token select: index == 0 takes the first parameter set, any other value takes the second
+            shift_result = torch.where(index_expanded == 0, shift_0_exp, shift_1_exp)
+            scale_result = torch.where(index_expanded == 0, scale_0_exp, scale_1_exp)
+            gate_result = torch.where(index_expanded == 0, gate_0_exp, gate_1_exp)
+        else:
+            shift_result = shift.unsqueeze(1)
+            scale_result = scale.unsqueeze(1)
+            gate_result = gate.unsqueeze(1)
+
+        return x * (1 + scale_result) + shift_result, gate_result  # (modulated x, gate)
 
     def forward(
         self,
@@ -416,9 +456,13 @@ def forward(
         temb: torch.Tensor,
         image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        modulate_index: Optional[List[int]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         # Get modulation parameters for both streams
         img_mod_params = self.img_mod(temb)  # [B, 6*dim]
+
+        if self.zero_cond_t:
+            temb = torch.chunk(temb, 2, dim=0)[0]
         txt_mod_params = self.txt_mod(temb)  # [B, 6*dim]
 
         # Split modulation parameters for norm1 and norm2
@@ -427,7 +471,7 @@ def forward(
 
         # Process image stream - norm1 + modulation
         img_normed = self.img_norm1(hidden_states)
-        img_modulated, img_gate1 = self._modulate(img_normed, img_mod1)
+        img_modulated, img_gate1 = self._modulate(img_normed, img_mod1, modulate_index)
 
         # Process text stream - norm1 + modulation
         txt_normed = self.txt_norm1(encoder_hidden_states)
@@ -457,7 +501,7 @@ def forward(
 
         # Process image stream - norm2 + MLP
         img_normed2 = self.img_norm2(hidden_states)
-        img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2)
+        img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2, modulate_index)
         img_mlp_output = self.img_mlp(img_modulated2)
         hidden_states = hidden_states + img_gate2 * img_mlp_output
 
@@ -533,6 +577,7 @@ def __init__(
         joint_attention_dim: int = 3584,
         guidance_embeds: bool = False,  # TODO: this should probably be removed
         axes_dims_rope: Tuple[int, int, int] = (16, 56, 56),
+        zero_cond_t: bool = False,
     ):
         super().__init__()
         self.out_channels = out_channels or in_channels
@@ -553,6 +598,7 @@ def __init__(
                     dim=self.inner_dim,
                     num_attention_heads=num_attention_heads,
                     attention_head_dim=attention_head_dim,
+                    zero_cond_t=zero_cond_t,
                 )
                 for _ in range(num_layers)
             ]
@@ -562,6 +608,7 @@ def __init__(
         self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
 
         self.gradient_checkpointing = False
+        self.zero_cond_t = zero_cond_t
 
     def forward(
         self,
@@ -618,6 +665,17 @@ def forward(
         hidden_states = self.img_in(hidden_states)
 
         timestep = timestep.to(hidden_states.dtype)
+
+        if self.zero_cond_t:
+            timestep = torch.cat([timestep, timestep * 0], dim=0)
+            modulate_index = torch.tensor(
+                [[0] * prod(sample[0]) + [1] * sum([prod(s) for s in sample[1:]]) for sample in img_shapes],
+                device=timestep.device,
+                dtype=torch.int,
+            )
+        else:
+            modulate_index = None
+
         encoder_hidden_states = self.txt_norm(encoder_hidden_states)
         encoder_hidden_states = self.txt_in(encoder_hidden_states)
 
@@ -641,6 +699,8 @@ def forward(
                     encoder_hidden_states_mask,
                     temb,
                     image_rotary_emb,
+                    attention_kwargs,
+                    modulate_index,
                 )
 
             else:
@@ -651,6 +711,7 @@ def forward(
                     temb=temb,
                     image_rotary_emb=image_rotary_emb,
                     joint_attention_kwargs=attention_kwargs,
+                    modulate_index=modulate_index,
                 )
 
             # controlnet residual
@@ -659,6 +720,8 @@ def forward(
                 interval_control = int(np.ceil(interval_control))
                 hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
 
+        if self.zero_cond_t:
+            temb = temb.chunk(2, dim=0)[0]
         # Use only the image part (hidden_states) from the dual-stream blocks
         hidden_states = self.norm_out(hidden_states, temb)
         output = self.proj_out(hidden_states)
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
@@ -143,6 +143,7 @@
 from .remote_utils import remote_decode
 from .state_dict_utils import (
     convert_all_state_dict_to_peft,
+    convert_sai_sd_control_lora_state_dict_to_peft,
     convert_state_dict_to_diffusers,
     convert_state_dict_to_kohya,
     convert_state_dict_to_peft,
diff --git a/src/diffusers/utils/state_dict_utils.py b/src/diffusers/utils/state_dict_utils.py
diff --git a/tests/pipelines/pixart_sigma/test_pixart.py b/tests/pipelines/pixart_sigma/test_pixart.py