huggingface
diff --git a/‎.ai/models.md‎
Lines changed: 6 additions & 6 deletions b/‎.ai/models.md‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎src/diffusers/models/controlnets/controlnet.py‎
Lines changed: 4 additions & 6 deletions b/‎src/diffusers/models/controlnets/controlnet.py‎
Lines changed: 4 additions & 6 deletions
diff --git a/‎src/diffusers/models/controlnets/controlnet_sparsectrl.py‎
Lines changed: 4 additions & 6 deletions b/‎src/diffusers/models/controlnets/controlnet_sparsectrl.py‎
Lines changed: 4 additions & 6 deletions
diff --git a/‎src/diffusers/models/controlnets/controlnet_union.py‎
Lines changed: 4 additions & 6 deletions b/‎src/diffusers/models/controlnets/controlnet_union.py‎
Lines changed: 4 additions & 6 deletions
diff --git a/‎src/diffusers/models/controlnets/controlnet_xs.py‎
Lines changed: 4 additions & 7 deletions b/‎src/diffusers/models/controlnets/controlnet_xs.py‎
Lines changed: 4 additions & 7 deletions
diff --git a/‎src/diffusers/models/embeddings.py‎
Lines changed: 2 additions & 1 deletion b/‎src/diffusers/models/embeddings.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/diffusers/models/transformers/transformer_anyflow.py‎
Lines changed: 3 additions & 6 deletions b/‎src/diffusers/models/transformers/transformer_anyflow.py‎
Lines changed: 3 additions & 6 deletions
diff --git a/‎src/diffusers/models/transformers/transformer_anyflow_far.py‎
Lines changed: 3 additions & 6 deletions b/‎src/diffusers/models/transformers/transformer_anyflow_far.py‎
Lines changed: 3 additions & 6 deletions
diff --git a/‎src/diffusers/models/transformers/transformer_bria.py‎
Lines changed: 3 additions & 5 deletions b/‎src/diffusers/models/transformers/transformer_bria.py‎
Lines changed: 3 additions & 5 deletions
diff --git a/‎src/diffusers/models/transformers/transformer_bria_fibo.py‎
Lines changed: 2 additions & 3 deletions b/‎src/diffusers/models/transformers/transformer_bria_fibo.py‎
Lines changed: 2 additions & 3 deletions
@@ -163,14 +163,14 @@ Boolean gate. If `False` (default), calling that method raises `ValueError`. All
 3. **Capability flags without matching implementation.** For example, `_supports_gradient_checkpointing = True` only takes effect if `forward` actually has `if self.gradient_checkpointing:` branches calling `self._gradient_checkpointing_func` on each block. Setting the flag without those branches means training code silently no-ops the checkpoint and runs a normal forward.
 4. **Hardcoded dtype in model forward.** Don't hardcode `torch.float32` or `torch.bfloat16`, and don't cast activations by reading a weight's dtype (`self.linear.weight.dtype`) — the stored weight dtype isn't the compute dtype under gguf / quantized loading. Always derive the cast target from the input tensor's dtype or `self.dtype`.
 
-5. **`torch.float64` anywhere in the model.** MPS and several NPU backends don't support float64 -- ops will either error out or silently fall back. Reference repos commonly reach for float64 in RoPE frequency bases, timestep embeddings, sinusoidal position encodings, and similar "precision-sensitive" precompute code (`torch.arange(..., dtype=torch.float64)`, `.double()`, `torch.float64` literals). When porting a model, grep for `float64` / `double()` up front and resolve as follows:
+5. **`torch.float64` anywhere in the model.** MPS, NPU, and Neuron backends don't support float64 -- ops will either error out or silently fall back. Reference repos commonly reach for float64 in RoPE frequency bases, timestep embeddings, sinusoidal position encodings, and similar "precision-sensitive" precompute code (`torch.arange(..., dtype=torch.float64)`, `.double()`, `torch.float64` literals). When porting a model, grep for `float64` / `double()` up front and resolve as follows:
     - **Default: just use `torch.float32`.** For inference it is almost always sufficient -- the precision difference in RoPE angles, timestep embeddings, etc. is immaterial to image/video quality. Flip it and move on.
-    - **Only if float32 visibly degrades output, fall back to the device-gated pattern** we use in the repo:
+    - **Only if float32 visibly degrades output, use the `maybe_adjust_dtype_for_device` helper** from `diffusers.utils.torch_utils`. It centralizes the device-specific dtype downcast (float64→float32, int64→int32) for all restricted backends (mps, npu, neuron):
       ```python
-      is_mps = hidden_states.device.type == "mps"
-      is_npu = hidden_states.device.type == "npu"
-      freqs_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+      from diffusers.utils.torch_utils import maybe_adjust_dtype_for_device
+
+      freqs_dtype = maybe_adjust_dtype_for_device(torch.float64, hidden_states.device)
       ```
-      See `transformer_flux.py`, `transformer_flux2.py`, `transformer_wan.py`, `unet_2d_condition.py` for reference usages. Never leave an unconditional `torch.float64` in the model.
+      See `transformer_flux.py`, `transformer_flux2.py`, `transformer_wan.py`, `unet_2d_condition.py`, and `pipeline_pixart_alpha.py` for reference usages. Never leave an unconditional `torch.float64` in the model.
 
 6. **Using `torch.empty`.** Do not use `torch.empty` to initialize parameters; use `torch.zeros` or `torch.ones` instead.
@@ -22,6 +22,7 @@
 from ...loaders import PeftAdapterMixin
 from ...loaders.single_file_model import FromOriginalModelMixin
 from ...utils import BaseOutput, apply_lora_scale, logging
+from ...utils.torch_utils import maybe_adjust_dtype_for_device
 from ..attention import AttentionMixin
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
@@ -675,12 +676,9 @@ def forward(
         if not torch.is_tensor(timesteps):
             # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
             # This would be a good case for the `match` statement (Python 3.10+)
-            is_mps = sample.device.type == "mps"
-            is_npu = sample.device.type == "npu"
-            if isinstance(timestep, float):
-                dtype = torch.float32 if (is_mps or is_npu) else torch.float64
-            else:
-                dtype = torch.int32 if (is_mps or is_npu) else torch.int64
+            dtype = maybe_adjust_dtype_for_device(
+                torch.float64 if isinstance(timestep, float) else torch.int64, sample.device
+            )
             timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
         elif len(timesteps.shape) == 0:
             timesteps = timesteps[None].to(sample.device)
 
@@ -22,6 +22,7 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin
 from ...utils import BaseOutput, logging
+from ...utils.torch_utils import maybe_adjust_dtype_for_device
 from ..attention import AttentionMixin
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
@@ -604,12 +605,9 @@ def forward(
         if not torch.is_tensor(timesteps):
             # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
             # This would be a good case for the `match` statement (Python 3.10+)
-            is_mps = sample.device.type == "mps"
-            is_npu = sample.device.type == "npu"
-            if isinstance(timestep, float):
-                dtype = torch.float32 if (is_mps or is_npu) else torch.float64
-            else:
-                dtype = torch.int32 if (is_mps or is_npu) else torch.int64
+            dtype = maybe_adjust_dtype_for_device(
+                torch.float64 if isinstance(timestep, float) else torch.int64, sample.device
+            )
             timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
         elif len(timesteps.shape) == 0:
             timesteps = timesteps[None].to(sample.device)
 
@@ -19,6 +19,7 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders.single_file_model import FromOriginalModelMixin
 from ...utils import logging
+from ...utils.torch_utils import maybe_adjust_dtype_for_device
 from ..attention import AttentionMixin
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
@@ -620,12 +621,9 @@ def forward(
         if not torch.is_tensor(timesteps):
             # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
             # This would be a good case for the `match` statement (Python 3.10+)
-            is_mps = sample.device.type == "mps"
-            is_npu = sample.device.type == "npu"
-            if isinstance(timestep, float):
-                dtype = torch.float32 if (is_mps or is_npu) else torch.float64
-            else:
-                dtype = torch.int32 if (is_mps or is_npu) else torch.int64
+            dtype = maybe_adjust_dtype_for_device(
+                torch.float64 if isinstance(timestep, float) else torch.int64, sample.device
+            )
             timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
         elif len(timesteps.shape) == 0:
             timesteps = timesteps[None].to(sample.device)
 
@@ -20,7 +20,7 @@
 
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import BaseOutput, logging
-from ...utils.torch_utils import apply_freeu
+from ...utils.torch_utils import apply_freeu, maybe_adjust_dtype_for_device
 from ..attention import AttentionMixin
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
@@ -1014,12 +1014,9 @@ def forward(
         if not torch.is_tensor(timesteps):
             # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
             # This would be a good case for the `match` statement (Python 3.10+)
-            is_mps = sample.device.type == "mps"
-            is_npu = sample.device.type == "npu"
-            if isinstance(timestep, float):
-                dtype = torch.float32 if (is_mps or is_npu) else torch.float64
-            else:
-                dtype = torch.int32 if (is_mps or is_npu) else torch.int64
+            dtype = maybe_adjust_dtype_for_device(
+                torch.float64 if isinstance(timestep, float) else torch.int64, sample.device
+            )
             timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
         elif len(timesteps.shape) == 0:
             timesteps = timesteps[None].to(sample.device)
 
@@ -19,6 +19,7 @@
 from torch import nn
 
 from ..utils import deprecate
+from ..utils.torch_utils import maybe_adjust_dtype_for_device
 from .activations import FP32SiLU, get_activation
 from .attention_processor import Attention
 
@@ -346,7 +347,7 @@ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos, output_type="np", flip_sin
 
     # Auto-detect appropriate dtype if not specified
     if dtype is None:
-        dtype = torch.float32 if pos.device.type == "mps" else torch.float64
+        dtype = maybe_adjust_dtype_for_device(torch.float64, pos.device)
 
     omega = torch.arange(embed_dim // 2, device=pos.device, dtype=dtype)
     omega /= embed_dim / 2.0
 
@@ -28,6 +28,7 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
 from ...utils import apply_lora_scale, logging
+from ...utils.torch_utils import maybe_adjust_dtype_for_device
 from ..attention import AttentionModuleMixin, FeedForward
 from ..attention_dispatch import dispatch_attention_fn
 from ..embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps, get_1d_rotary_pos_embed
@@ -41,9 +42,7 @@
 
 def apply_rotary_emb(hidden_states: torch.Tensor, freqs: torch.Tensor):
     # MPS / NPU backends do not support complex128 / float64; fall back to float32 on those devices.
-    is_mps = hidden_states.device.type == "mps"
-    is_npu = hidden_states.device.type == "npu"
-    rotary_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+    rotary_dtype = maybe_adjust_dtype_for_device(torch.float64, hidden_states.device)
     x_rotated = torch.view_as_complex(hidden_states.to(rotary_dtype).unflatten(3, (-1, 2)))
     x_out = torch.view_as_real(x_rotated * freqs).flatten(3, 4)
     return x_out.type_as(hidden_states)
@@ -341,9 +340,7 @@ def _build_freqs(self, device: torch.device) -> torch.Tensor:
         if not is_compiling and self._freqs_cache is not None and self._freqs_cache[0] == cache_key:
             return self._freqs_cache[1]
 
-        is_mps = device.type == "mps"
-        is_npu = device.type == "npu"
-        freqs_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+        freqs_dtype = maybe_adjust_dtype_for_device(torch.float64, device)
 
         h_dim = w_dim = 2 * (self.attention_head_dim // 6)
         t_dim = self.attention_head_dim - h_dim - w_dim
 
@@ -30,6 +30,7 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
 from ...utils import BaseOutput, apply_lora_scale, logging
+from ...utils.torch_utils import maybe_adjust_dtype_for_device
 from ..attention import AttentionModuleMixin, FeedForward
 from ..attention_dispatch import dispatch_attention_fn
 from ..embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps, get_1d_rotary_pos_embed
@@ -44,9 +45,7 @@
 # Copied from diffusers.models.transformers.transformer_anyflow.apply_rotary_emb
 def apply_rotary_emb(hidden_states: torch.Tensor, freqs: torch.Tensor):
     # MPS / NPU backends do not support complex128 / float64; fall back to float32 on those devices.
-    is_mps = hidden_states.device.type == "mps"
-    is_npu = hidden_states.device.type == "npu"
-    rotary_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+    rotary_dtype = maybe_adjust_dtype_for_device(torch.float64, hidden_states.device)
     x_rotated = torch.view_as_complex(hidden_states.to(rotary_dtype).unflatten(3, (-1, 2)))
     x_out = torch.view_as_real(x_rotated * freqs).flatten(3, 4)
     return x_out.type_as(hidden_states)
@@ -650,9 +649,7 @@ def _build_freqs(self, device: torch.device) -> torch.Tensor:
         if not is_compiling and self._freqs_cache is not None and self._freqs_cache[0] == cache_key:
             return self._freqs_cache[1]
 
-        is_mps = device.type == "mps"
-        is_npu = device.type == "npu"
-        freqs_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+        freqs_dtype = maybe_adjust_dtype_for_device(torch.float64, device)
 
         h_dim = w_dim = 2 * (self.attention_head_dim // 6)
         t_dim = self.attention_head_dim - h_dim - w_dim
 
@@ -9,7 +9,7 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
 from ...utils import apply_lora_scale, logging
-from ...utils.torch_utils import maybe_allow_in_graph
+from ...utils.torch_utils import maybe_adjust_dtype_for_device, maybe_allow_in_graph
 from ..attention import AttentionModuleMixin, FeedForward
 from ..attention_dispatch import dispatch_attention_fn
 from ..cache_utils import CacheMixin
@@ -276,8 +276,7 @@ def forward(self, ids: torch.Tensor) -> torch.Tensor:
         cos_out = []
         sin_out = []
         pos = ids.float()
-        is_mps = ids.device.type == "mps"
-        freqs_dtype = torch.float32 if is_mps else torch.float64
+        freqs_dtype = maybe_adjust_dtype_for_device(torch.float64, ids.device)
         for i in range(n_axes):
             cos, sin = get_1d_rotary_pos_embed(
                 self.axes_dim[i],
@@ -344,8 +343,7 @@ def forward(self, ids: torch.Tensor) -> torch.Tensor:
         cos_out = []
         sin_out = []
         pos = ids.float()
-        is_mps = ids.device.type == "mps"
-        freqs_dtype = torch.float32 if is_mps else torch.float64
+        freqs_dtype = maybe_adjust_dtype_for_device(torch.float64, ids.device)
         for i in range(n_axes):
             cos, sin = get_1d_rotary_pos_embed(
                 self.axes_dim[i],
 
@@ -25,7 +25,7 @@
     apply_lora_scale,
     logging,
 )
-from ...utils.torch_utils import maybe_allow_in_graph
+from ...utils.torch_utils import maybe_adjust_dtype_for_device, maybe_allow_in_graph
 from ..attention import AttentionModuleMixin, FeedForward
 from ..attention_dispatch import dispatch_attention_fn
 from ..normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
@@ -222,8 +222,7 @@ def forward(self, ids: torch.Tensor) -> torch.Tensor:
         cos_out = []
         sin_out = []
         pos = ids.float()
-        is_mps = ids.device.type == "mps"
-        freqs_dtype = torch.float32 if is_mps else torch.float64
+        freqs_dtype = maybe_adjust_dtype_for_device(torch.float64, ids.device)
         for i in range(n_axes):
             cos, sin = get_1d_rotary_pos_embed(
                 self.axes_dim[i],