Merge branch 'main' into fix-memory-address-problem

sayakpaul · web-flow · commit 9553b7975848 · 2025-06-26T12:30:56.000+05:30
diff --git a/docs/source/en/optimization/fp16.md b/docs/source/en/optimization/fp16.md
@@ -150,11 +150,63 @@ pipeline(prompt, num_inference_steps=30).images[0]
 
 Compilation is slow the first time, but once compiled, it is significantly faster. Try to only use the compiled pipeline on the same type of inference operations. Calling the compiled pipeline on a different image size retriggers compilation which is slow and inefficient.
 
+### Dynamic shape compilation
+
+> [!TIP]
+> Make sure to always use the nightly version of PyTorch for better support.
+
+`torch.compile` keeps track of input shapes and conditions, and if these are different, it recompiles the model. For example, if a model is compiled on a 1024x1024 resolution image and used on an image with a different resolution, it triggers recompilation.
+
+Add `dynamic=True` so the compiler tries to generate a more dynamic kernel that avoids recompilation when conditions change.
+
+```diff
++ torch.fx.experimental._config.use_duck_shape = False
++ pipeline.unet = torch.compile(
+    pipeline.unet, fullgraph=True, dynamic=True
+)
+```
+
+Setting `use_duck_shape=False` instructs the compiler not to use the same symbolic variable to represent input sizes that happen to be the same. For more details, check out this [comment](https://github.com/huggingface/diffusers/pull/11327#discussion_r2047659790).
+
+Not all models may benefit from dynamic compilation out of the box and may require changes. Refer to this [PR](https://github.com/huggingface/diffusers/pull/11297/) that improved the [`AuraFlowPipeline`] implementation to benefit from dynamic compilation.
+
+Feel free to open an issue if dynamic compilation doesn't work as expected for a Diffusers model.
+
 ### Regional compilation
 
-[Regional compilation](https://docs.pytorch.org/tutorials/recipes/regional_compilation.html) reduces the cold start compilation time by only compiling a specific repeated region (or block) of the model instead of the entire model. The compiler reuses the cached and compiled code for the other blocks.
 
-[Accelerate](https://huggingface.co/docs/accelerate/index) provides the [compile_regions](https://github.com/huggingface/accelerate/blob/273799c85d849a1954a4f2e65767216eb37fa089/src/accelerate/utils/other.py#L78) method for automatically compiling the repeated blocks of a `nn.Module` sequentially. The rest of the model is compiled separately.
+[Regional compilation](https://docs.pytorch.org/tutorials/recipes/regional_compilation.html) trims cold-start latency by compiling **only the small, frequently-repeated block(s)** of a model, typically a Transformer layer, enabling reuse of compiled artifacts for every subsequent occurrence.
+For many diffusion architectures this delivers the *same* runtime speed-ups as full-graph compilation yet cuts compile time by **8–10 ×**.
+
+To make this effortless, [`ModelMixin`] exposes the [`ModelMixin.compile_repeated_blocks`] API, a helper that wraps `torch.compile` around any sub-modules you designate as repeatable:
+
+```py
+# pip install -U diffusers
+import torch
+from diffusers import StableDiffusionXLPipeline
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16,
+).to("cuda")
+
+# Compile only the repeated Transformer layers inside the UNet
+pipe.unet.compile_repeated_blocks(fullgraph=True)
+```
+
+To enable a new model with regional compilation, add a `_repeated_blocks` attribute to your model class containing the class names (as strings) of the blocks you want compiled:
+
+
+```py
+class MyUNet(ModelMixin):
+    _repeated_blocks = ("Transformer2DModel",)  # ← compiled by default
+```
+
+For more examples, see the reference [PR](https://github.com/huggingface/diffusers/pull/11705).
+
+**Relation to Accelerate `compile_regions`** There is also a separate API in [accelerate](https://huggingface.co/docs/accelerate/index): [compile_regions](https://github.com/huggingface/accelerate/blob/273799c85d849a1954a4f2e65767216eb37fa089/src/accelerate/utils/other.py#L78). It takes a fully automatic approach: it walks the module, picks candidate blocks, then compiles the remaining graph separately. That hands-off experience is handy for quick experiments, but it also leaves fewer knobs when you want to fine-tune which blocks are compiled or adjust compilation flags.
+
+
 
 ```py
 # pip install -U accelerate
@@ -167,6 +219,8 @@ pipeline = StableDiffusionXLPipeline.from_pretrained(
 ).to("cuda")
 pipeline.unet = compile_regions(pipeline.unet, mode="reduce-overhead", fullgraph=True)
 ```
+`compile_repeated_blocks`, by contrast, is intentionally explicit. You list the repeated blocks once (via `_repeated_blocks`) and the helper compiles exactly those, nothing more. In practice this small dose of control hits a sweet spot for diffusion models: predictable behavior, easy reasoning about cache reuse, and still a one-liner for users.
+
 
 ### Graph breaks
 
@@ -241,4 +295,4 @@ An input is projected into three subspaces, represented by the projection matric
 
 ```py
 pipeline.fuse_qkv_projections()
-```
+```
diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
@@ -137,9 +137,58 @@ def _pinned_memory_tensors(self):
         finally:
             pinned_dict = None
 
+    def _transfer_tensor_to_device(self, tensor, source_tensor, current_stream=None):
+        tensor.data = source_tensor.to(self.onload_device, non_blocking=self.non_blocking)
+        if self.record_stream and current_stream is not None:
+            tensor.data.record_stream(current_stream)
+
+    def _process_tensors_from_modules(self, pinned_memory=None, current_stream=None):
+        for group_module in self.modules:
+            for param in group_module.parameters():
+                source = pinned_memory[param] if pinned_memory else param.data
+                self._transfer_tensor_to_device(param, source, current_stream)
+            for buffer in group_module.buffers():
+                source = pinned_memory[buffer] if pinned_memory else buffer.data
+                self._transfer_tensor_to_device(buffer, source, current_stream)
+
+        for param in self.parameters:
+            source = pinned_memory[param] if pinned_memory else param.data
+            self._transfer_tensor_to_device(param, source, current_stream)
+
+        for buffer in self.buffers:
+            source = pinned_memory[buffer] if pinned_memory else buffer.data
+            self._transfer_tensor_to_device(buffer, source, current_stream)
+
+    def _onload_from_disk(self, current_stream):
+        if self.stream is not None:
+            loaded_cpu_tensors = safetensors.torch.load_file(self.safetensors_file_path, device="cpu")
+
+            for key, tensor_obj in self.key_to_tensor.items():
+                self.cpu_param_dict[tensor_obj] = loaded_cpu_tensors[key]
+
+            with self._pinned_memory_tensors() as pinned_memory:
+                for key, tensor_obj in self.key_to_tensor.items():
+                    self._transfer_tensor_to_device(tensor_obj, pinned_memory[tensor_obj], current_stream)
+
+            self.cpu_param_dict.clear()
+
+        else:
+            onload_device = (
+                self.onload_device.type if isinstance(self.onload_device, torch.device) else self.onload_device
+            )
+            loaded_tensors = safetensors.torch.load_file(self.safetensors_file_path, device=onload_device)
+            for key, tensor_obj in self.key_to_tensor.items():
+                tensor_obj.data = loaded_tensors[key]
+
+    def _onload_from_memory(self, current_stream):
+        if self.stream is not None:
+            with self._pinned_memory_tensors() as pinned_memory:
+                self._process_tensors_from_modules(pinned_memory, current_stream)
+        else:
+            self._process_tensors_from_modules(None, current_stream)
+
     @torch.compiler.disable()
     def onload_(self):
-        r"""Onloads the group of modules to the onload_device."""
         torch_accelerator_module = (
             getattr(torch, torch.accelerator.current_accelerator().type)
             if hasattr(torch, "accelerator")
@@ -177,67 +226,30 @@ def onload_(self):
             self.stream.synchronize()
 
         with context:
-            if self.stream is not None:
-                with self._pinned_memory_tensors() as pinned_memory:
-                    for group_module in self.modules:
-                        for param in group_module.parameters():
-                            param.data = pinned_memory[param].to(self.onload_device, non_blocking=self.non_blocking)
-                            if self.record_stream:
-                                param.data.record_stream(current_stream)
-                        for buffer in group_module.buffers():
-                            buffer.data = pinned_memory[buffer].to(self.onload_device, non_blocking=self.non_blocking)
-                            if self.record_stream:
-                                buffer.data.record_stream(current_stream)
-
-                    for param in self.parameters:
-                        param.data = pinned_memory[param].to(self.onload_device, non_blocking=self.non_blocking)
-                        if self.record_stream:
-                            param.data.record_stream(current_stream)
-
-                    for buffer in self.buffers:
-                        buffer.data = pinned_memory[buffer].to(self.onload_device, non_blocking=self.non_blocking)
-                        if self.record_stream:
-                            buffer.data.record_stream(current_stream)
-
+            if self.offload_to_disk_path:
+                self._onload_from_disk(current_stream)
             else:
-                for group_module in self.modules:
-                    for param in group_module.parameters():
-                        param.data = param.data.to(self.onload_device, non_blocking=self.non_blocking)
-                    for buffer in group_module.buffers():
-                        buffer.data = buffer.data.to(self.onload_device, non_blocking=self.non_blocking)
-
-                for param in self.parameters:
-                    param.data = param.data.to(self.onload_device, non_blocking=self.non_blocking)
-
-                for buffer in self.buffers:
-                    buffer.data = buffer.data.to(self.onload_device, non_blocking=self.non_blocking)
-                    if self.record_stream:
-                        buffer.data.record_stream(current_stream)
-
-    @torch.compiler.disable()
-    def offload_(self):
-        r"""Offloads the group of modules to the offload_device."""
-        if self.offload_to_disk_path:
-            # TODO: we can potentially optimize this code path by checking if the _all_ the desired
-            # safetensor files exist on the disk and if so, skip this step entirely, reducing IO
-            # overhead. Currently, we just check if the given `safetensors_file_path` exists and if not
-            # we perform a write.
-            # Check if the file has been saved in this session or if it already exists on disk.
-            if not self._is_offloaded_to_disk and not os.path.exists(self.safetensors_file_path):
-                os.makedirs(os.path.dirname(self.safetensors_file_path), exist_ok=True)
-                tensors_to_save = {
-                    key: tensor.data.to(self.offload_device) for tensor, key in self.tensor_to_key.items()
-                }
-                safetensors.torch.save_file(tensors_to_save, self.safetensors_file_path)
-
-            # The group is now considered offloaded to disk for the rest of the session.
-            self._is_offloaded_to_disk = True
-
-            # We do this to free up the RAM which is still holding the up tensor data.
-            for tensor_obj in self.tensor_to_key.keys():
-                tensor_obj.data = torch.empty_like(tensor_obj.data, device=self.offload_device)
-            return
-
+                self._onload_from_memory(current_stream)
+
+    def _offload_to_disk(self):
+        # TODO: we can potentially optimize this code path by checking if the _all_ the desired
+        # safetensor files exist on the disk and if so, skip this step entirely, reducing IO
+        # overhead. Currently, we just check if the given `safetensors_file_path` exists and if not
+        # we perform a write.
+        # Check if the file has been saved in this session or if it already exists on disk.
+        if not self._is_offloaded_to_disk and not os.path.exists(self.safetensors_file_path):
+            os.makedirs(os.path.dirname(self.safetensors_file_path), exist_ok=True)
+            tensors_to_save = {key: tensor.data.to(self.offload_device) for tensor, key in self.tensor_to_key.items()}
+            safetensors.torch.save_file(tensors_to_save, self.safetensors_file_path)
+
+        # The group is now considered offloaded to disk for the rest of the session.
+        self._is_offloaded_to_disk = True
+
+        # We do this to free up the RAM which is still holding the tensor data.
+        for tensor_obj in self.tensor_to_key.keys():
+            tensor_obj.data = torch.empty_like(tensor_obj.data, device=self.offload_device)
+
+    def _offload_to_memory(self):
         torch_accelerator_module = (
             getattr(torch, torch.accelerator.current_accelerator().type)
             if hasattr(torch, "accelerator")
@@ -262,6 +274,14 @@ def offload_(self):
             for buffer in self.buffers:
                 buffer.data = buffer.data.to(self.offload_device, non_blocking=self.non_blocking)
 
+    @torch.compiler.disable()
+    def offload_(self):
+        r"""Offloads the group of modules to the offload_device."""
+        if self.offload_to_disk_path:
+            self._offload_to_disk()
+        else:
+            self._offload_to_memory()
+
 
 class GroupOffloadingHook(ModelHook):
     r"""
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
@@ -266,6 +266,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
     _keep_in_fp32_modules = None
     _skip_layerwise_casting_patterns = None
     _supports_group_offloading = True
+    _repeated_blocks = []
 
     def __init__(self):
         super().__init__()
@@ -1404,6 +1405,39 @@ def float(self, *args):
         else:
             return super().float(*args)
 
+    def compile_repeated_blocks(self, *args, **kwargs):
+        """
+        Compiles *only* the frequently repeated sub-modules of a model (e.g. the Transformer layers) instead of
+        compiling the entire model. This technique—often called **regional compilation** (see the PyTorch recipe
+        https://docs.pytorch.org/tutorials/recipes/regional_compilation.html)—can reduce end-to-end compile time
+        substantially, while preserving the runtime speed-ups you would expect from a full `torch.compile`.
+
+        The set of sub-modules to compile is discovered by the presence of **`_repeated_blocks`** attribute in the
+        model definition. Define this attribute on your model subclass as a list/tuple of class names (strings). Every
+        module whose class name matches will be compiled.
+
+        Once discovered, each matching sub-module is compiled by calling `submodule.compile(*args, **kwargs)`. Any
+        positional or keyword arguments you supply to `compile_repeated_blocks` are forwarded verbatim to
+        `torch.compile`.
+        """
+        repeated_blocks = getattr(self, "_repeated_blocks", None)
+
+        if not repeated_blocks:
+            raise ValueError(
+                "`_repeated_blocks` attribute is empty. "
+                f"Set `_repeated_blocks` for the class `{self.__class__.__name__}` to benefit from faster compilation. "
+            )
+        has_compiled_region = False
+        for submod in self.modules():
+            if submod.__class__.__name__ in repeated_blocks:
+                submod.compile(*args, **kwargs)
+                has_compiled_region = True
+
+        if not has_compiled_region:
+            raise ValueError(
+                f"Regional compilation failed because {repeated_blocks} classes are not found in the model. "
+            )
+
     @classmethod
     def _load_pretrained_model(
         cls,
diff --git a/src/diffusers/models/transformers/transformer_chroma.py b/src/diffusers/models/transformers/transformer_chroma.py
@@ -407,6 +407,7 @@ class ChromaTransformer2DModel(
 
     _supports_gradient_checkpointing = True
     _no_split_modules = ["ChromaTransformerBlock", "ChromaSingleTransformerBlock"]
+    _repeated_blocks = ["ChromaTransformerBlock", "ChromaSingleTransformerBlock"]
     _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
 
     @register_to_config
diff --git a/src/diffusers/models/transformers/transformer_flux.py b/src/diffusers/models/transformers/transformer_flux.py
@@ -227,6 +227,7 @@ class FluxTransformer2DModel(
     _supports_gradient_checkpointing = True
     _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
     _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
+    _repeated_blocks = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py
@@ -870,6 +870,12 @@ class HunyuanVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin,
         "HunyuanVideoPatchEmbed",
         "HunyuanVideoTokenRefiner",
     ]
+    _repeated_blocks = [
+        "HunyuanVideoTransformerBlock",
+        "HunyuanVideoSingleTransformerBlock",
+        "HunyuanVideoPatchEmbed",
+        "HunyuanVideoTokenRefiner",
+    ]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/models/transformers/transformer_ltx.py b/src/diffusers/models/transformers/transformer_ltx.py
@@ -328,6 +328,7 @@ class LTXVideoTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin
 
     _supports_gradient_checkpointing = True
     _skip_layerwise_casting_patterns = ["norm"]
+    _repeated_blocks = ["LTXVideoTransformerBlock"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/models/transformers/transformer_wan.py b/src/diffusers/models/transformers/transformer_wan.py
@@ -345,6 +345,7 @@ class WanTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOrigi
     _no_split_modules = ["WanTransformerBlock"]
     _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"]
     _keys_to_ignore_on_load_unexpected = ["norm_added_q"]
+    _repeated_blocks = ["WanTransformerBlock"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py
@@ -167,6 +167,7 @@ class conditioning with `class_embed_type` equal to `None`.
     _supports_gradient_checkpointing = True
     _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D", "CrossAttnUpBlock2D"]
     _skip_layerwise_casting_patterns = ["norm"]
+    _repeated_blocks = ["BasicTransformerBlock"]
 
     @register_to_config
     def __init__(
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
diff --git a/tests/models/transformers/test_models_transformer_flux.py b/tests/models/transformers/test_models_transformer_flux.py