huggingface
diff --git a/‎examples/dreambooth/test_dreambooth_lora_sana.py‎
Lines changed: 42 additions & 0 deletions b/‎examples/dreambooth/test_dreambooth_lora_sana.py‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎examples/dreambooth/train_dreambooth_lora_sana.py‎
Lines changed: 14 additions & 4 deletions b/‎examples/dreambooth/train_dreambooth_lora_sana.py‎
Lines changed: 14 additions & 4 deletions
diff --git a/‎src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py‎
Lines changed: 2 additions & 4 deletions b/‎src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎src/diffusers/pipelines/consisid/consisid_utils.py‎
Lines changed: 1 addition & 1 deletion b/‎src/diffusers/pipelines/consisid/consisid_utils.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/diffusers/pipelines/controlnet/pipeline_controlnet.py‎
Lines changed: 2 additions & 2 deletions b/‎src/diffusers/pipelines/controlnet/pipeline_controlnet.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py‎
Lines changed: 2 additions & 2 deletions b/‎src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py‎
Lines changed: 2 additions & 2 deletions b/‎src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py‎
Lines changed: 2 additions & 2 deletions b/‎src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py‎
Lines changed: 5 additions & 1 deletion b/‎src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py‎
Lines changed: 3 additions & 3 deletions b/‎src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py‎
Lines changed: 3 additions & 3 deletions
@@ -13,13 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
 import logging
 import os
 import sys
 import tempfile
 
 import safetensors
 
+from diffusers.loaders.lora_base import LORA_ADAPTER_METADATA_KEY
+
 
 sys.path.append("..")
 from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
@@ -204,3 +207,42 @@ def test_dreambooth_lora_sana_checkpointing_checkpoints_total_limit_removes_mult
             run_command(self._launch_args + resume_run_args)
 
             self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
+
+    def test_dreambooth_lora_sana_with_metadata(self):
+        lora_alpha = 8
+        rank = 4
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --resolution=32
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=4
+            --lora_alpha={lora_alpha}
+            --rank={rank}
+            --checkpointing_steps=2
+            --max_sequence_length 166
+            """.split()
+
+            test_args.extend(["--instance_prompt", ""])
+            run_command(self._launch_args + test_args)
+
+            state_dict_file = os.path.join(tmpdir, "pytorch_lora_weights.safetensors")
+            self.assertTrue(os.path.isfile(state_dict_file))
+
+            # Check if the metadata was properly serialized.
+            with safetensors.torch.safe_open(state_dict_file, framework="pt", device="cpu") as f:
+                metadata = f.metadata() or {}
+
+            metadata.pop("format", None)
+            raw = metadata.get(LORA_ADAPTER_METADATA_KEY)
+            if raw:
+                raw = json.loads(raw)
+
+            loaded_lora_alpha = raw["transformer.lora_alpha"]
+            self.assertTrue(loaded_lora_alpha == lora_alpha)
+            loaded_lora_rank = raw["transformer.r"]
+            self.assertTrue(loaded_lora_rank == rank)
@@ -52,6 +52,7 @@
 )
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import (
+    _collate_lora_metadata,
     cast_training_params,
     compute_density_for_timestep_sampling,
     compute_loss_weighting_for_sd3,
@@ -323,9 +324,13 @@ def parse_args(input_args=None):
         default=4,
         help=("The dimension of the LoRA update matrices."),
     )
-
+    parser.add_argument(
+        "--lora_alpha",
+        type=int,
+        default=4,
+        help="LoRA alpha to be used for additional scaling.",
+    )
     parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
     parser.add_argument(
         "--with_prior_preservation",
         default=False,
@@ -1023,7 +1028,7 @@ def main(args):
     # now we will add new LoRA weights the transformer layers
     transformer_lora_config = LoraConfig(
         r=args.rank,
-        lora_alpha=args.rank,
+        lora_alpha=args.lora_alpha,
         lora_dropout=args.lora_dropout,
         init_lora_weights="gaussian",
         target_modules=target_modules,
@@ -1039,10 +1044,11 @@ def unwrap_model(model):
     def save_model_hook(models, weights, output_dir):
         if accelerator.is_main_process:
             transformer_lora_layers_to_save = None
-
+            modules_to_save = {}
             for model in models:
                 if isinstance(model, type(unwrap_model(transformer))):
                     transformer_lora_layers_to_save = get_peft_model_state_dict(model)
+                    modules_to_save["transformer"] = model
                 else:
                     raise ValueError(f"unexpected save model: {model.__class__}")
 
@@ -1052,6 +1058,7 @@ def save_model_hook(models, weights, output_dir):
             SanaPipeline.save_lora_weights(
                 output_dir,
                 transformer_lora_layers=transformer_lora_layers_to_save,
+                **_collate_lora_metadata(modules_to_save),
             )
 
     def load_model_hook(models, input_dir):
@@ -1507,15 +1514,18 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
     accelerator.wait_for_everyone()
     if accelerator.is_main_process:
         transformer = unwrap_model(transformer)
+        modules_to_save = {}
         if args.upcast_before_saving:
             transformer.to(torch.float32)
         else:
             transformer = transformer.to(weight_dtype)
         transformer_lora_layers = get_peft_model_state_dict(transformer)
+        modules_to_save["transformer"] = transformer
 
         SanaPipeline.save_lora_weights(
             save_directory=args.output_dir,
             transformer_lora_layers=transformer_lora_layers,
+            **_collate_lora_metadata(modules_to_save),
         )
 
         # Final inference
 
@@ -41,7 +41,7 @@
     replace_example_docstring,
 )
 from ...utils.import_utils import is_transformers_version
-from ...utils.torch_utils import randn_tensor
+from ...utils.torch_utils import empty_device_cache, randn_tensor
 from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
 from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel
 
@@ -267,9 +267,7 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t
 
         if self.device.type != "cpu":
             self.to("cpu", silence_dtype_warnings=True)
-            device_mod = getattr(torch, device.type, None)
-            if hasattr(device_mod, "empty_cache") and device_mod.is_available():
-                device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+            empty_device_cache(device.type)
 
         model_sequence = [
             self.text_encoder.text_model,
 
@@ -294,7 +294,7 @@ def prepare_face_models(model_path, device, dtype):
 
     Parameters:
     - model_path: Path to the directory containing model files.
-    - device: The device (e.g., 'cuda', 'cpu') where models will be loaded.
+    - device: The device (e.g., 'cuda', 'xpu', 'cpu') where models will be loaded.
     - dtype: Data type (e.g., torch.float32) for model inference.
 
     Returns:
 
@@ -37,7 +37,7 @@
     scale_lora_layers,
     unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, is_torch_version, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -1339,7 +1339,7 @@ def __call__(
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.unet.to("cpu")
             self.controlnet.to("cpu")
-            torch.cuda.empty_cache()
+            empty_device_cache()
 
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
 
@@ -36,7 +36,7 @@
     scale_lora_layers,
     unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -1311,7 +1311,7 @@ def __call__(
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.unet.to("cpu")
             self.controlnet.to("cpu")
-            torch.cuda.empty_cache()
+            empty_device_cache()
 
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
 
@@ -38,7 +38,7 @@
     scale_lora_layers,
     unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -1500,7 +1500,7 @@ def __call__(
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.unet.to("cpu")
             self.controlnet.to("cpu")
-            torch.cuda.empty_cache()
+            empty_device_cache()
 
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
 
@@ -51,7 +51,7 @@
     scale_lora_layers,
     unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 
@@ -1858,7 +1858,7 @@ def denoising_value_valid(dnv):
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.unet.to("cpu")
             self.controlnet.to("cpu")
-            torch.cuda.empty_cache()
+            empty_device_cache()
 
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
 
@@ -1465,7 +1465,11 @@ def __call__(
 
                 # Relevant thread:
                 # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
-                if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1:
+                if (
+                    torch.cuda.is_available()
+                    and (is_unet_compiled and is_controlnet_compiled)
+                    and is_torch_higher_equal_2_1
+                ):
                     torch._inductor.cudagraph_mark_step_begin()
                 # expand the latents if we are doing classifier free guidance
                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
 
@@ -53,7 +53,7 @@
     scale_lora_layers,
     unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 
@@ -921,7 +921,7 @@ def prepare_latents(
         # Offload text encoder if `enable_model_cpu_offload` was enabled
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.text_encoder_2.to("cpu")
-            torch.cuda.empty_cache()
+            empty_device_cache()
 
         image = image.to(device=device, dtype=dtype)
 
@@ -1632,7 +1632,7 @@ def __call__(
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.unet.to("cpu")
             self.controlnet.to("cpu")
-            torch.cuda.empty_cache()
+            empty_device_cache()
 
         if not output_type == "latent":
             # make sure the VAE is in float32 mode, as it overflows in float16