 # - Unified Attention
 # - More dispatcher attention backends
 # - CFG/Data Parallel
-# - Tensor Parallel
 
 
 @dataclass
@@ -142,6 +141,63 @@ def setup(self, rank: int, world_size: int, device: torch.device, mesh: torch.di
         self._ulysses_local_rank = self._ulysses_mesh.get_local_rank()
 
 
+@dataclass
+class TensorParallelConfig:
+    """
+    Configuration for tensor parallelism.
+
+    Tensor parallelism shards weight matrices (column-wise and row-wise) across devices.
+    Each device computes a partial result; an AllReduce/AllGather at layer boundaries
+    reconstructs the full output. Uses ``torch.distributed.tensor.parallel.parallelize_module``
+    with ``ColwiseParallel`` / ``RowwiseParallel`` sharding styles.
+
+    On Neuron, use the ``_pre_shard_and_tp`` workaround from
+    ``transformer_flux2_neuron_tp`` to avoid the NRT consecutive-reduce-scatter bug
+    on large tensors (>= 5120x5120).
+
+    Args:
+        tp_degree (`int`, defaults to `1`):
+            Number of devices to shard across. Must be a divisor of the number of
+            attention heads (and FFN hidden dimensions) of the model being parallelized.
+        mesh (`torch.distributed.device_mesh.DeviceMesh`, *optional*):
+            A custom device mesh to use. If provided, ``tp_degree`` is inferred from
+            ``mesh.size()`` and the argument is ignored. Useful when combining TP with
+            other parallelism strategies (e.g. CP) that share the same mesh.
+    """
+
+    tp_degree: int = 1
+    mesh: torch.distributed.device_mesh.DeviceMesh | None = None
+
+    _rank: int = None
+    _world_size: int = None
+    _device: torch.device = None
+    _mesh: torch.distributed.device_mesh.DeviceMesh = None
+
+    def __post_init__(self):
+        if self.tp_degree < 1:
+            raise ValueError("`tp_degree` must be >= 1.")
+
+    def setup(
+        self,
+        rank: int,
+        world_size: int,
+        device: torch.device,
+        mesh: torch.distributed.device_mesh.DeviceMesh | None = None,
+    ):
+        self._rank = rank
+        self._world_size = world_size
+        self._device = device
+        if mesh is not None:
+            self._mesh = mesh
+        elif self.mesh is not None:
+            self._mesh = self.mesh
+        else:
+            from torch.distributed.device_mesh import init_device_mesh
+
+            device_type = str(device).split(":")[0]
+            self._mesh = init_device_mesh(device_type, (self.tp_degree,), mesh_dim_names=("tp",))
+
+
 @dataclass
 class ParallelConfig:
     """
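Not part of the diff, but for context: a minimal sketch of the `torch.distributed.tensor.parallel` API that the `TensorParallelConfig` docstring references, applied to a toy FFN. The module, tensor sizes, and world size of 2 are illustrative, not taken from this PR:

```python
# Minimal sketch (not this PR's wiring); run under `torchrun --nproc_per_node=2`.
import os

import torch
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, parallelize_module

torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))  # LOCAL_RANK is set by torchrun
mesh = init_device_mesh("cuda", (2,), mesh_dim_names=("tp",))  # same 1-D "tp" mesh the config builds

# Toy FFN: shard the up-projection column-wise and the down-projection row-wise,
# so the intermediate activation stays sharded and a single all-reduce at the
# row-wise layer reconstructs the full output.
model = nn.Sequential(nn.Linear(512, 2048), nn.GELU(), nn.Linear(2048, 512)).cuda()
parallelize_module(
    model,
    mesh,
    {"0": ColwiseParallel(), "2": RowwiseParallel()},  # keys are module names inside the Sequential
)

out = model(torch.randn(4, 512, device="cuda"))  # full-size output on every rank
```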
@@ -150,9 +206,12 @@ class ParallelConfig:
     Args:
         context_parallel_config (`ContextParallelConfig`, *optional*):
             Configuration for context parallelism.
+        tensor_parallel_config (`TensorParallelConfig`, *optional*):
+            Configuration for tensor parallelism.
     """
 
     context_parallel_config: ContextParallelConfig | None = None
+    tensor_parallel_config: TensorParallelConfig | None = None
 
     _rank: int = None
     _world_size: int = None
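The `mesh` argument exists so TP can share one global mesh with other strategies. A sketch of that pattern, assuming an 8-GPU 2x4 layout (the dim names and sizes are illustrative, and how the CP side consumes its slice is not shown in this hunk):

```python
from torch.distributed.device_mesh import init_device_mesh

# One global 2x4 mesh; 1-D sub-meshes can be sliced out of it by dim name.
world_mesh = init_device_mesh("cuda", (2, 4), mesh_dim_names=("cp", "tp"))

# tp_degree is taken from the mesh (world_mesh["tp"].size() == 4 here), per the docstring.
tp_config = TensorParallelConfig(mesh=world_mesh["tp"])
```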
@@ -173,6 +232,8 @@ def setup(
         self._mesh = mesh
         if self.context_parallel_config is not None:
             self.context_parallel_config.setup(rank, world_size, device, mesh)
+        if self.tensor_parallel_config is not None:
+            self.tensor_parallel_config.setup(rank, world_size, device, mesh)
 
 
 @dataclass(frozen=True)
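End to end, the delegation added above means a caller only touches the top-level config. A hedged sketch of that flow (the real call site for `setup()` is framework-internal; the process-group bootstrap here assumes a plain `torchrun` launch):

```python
import torch
import torch.distributed as dist

dist.init_process_group("nccl")  # under torchrun, env:// rendezvous
rank, world_size = dist.get_rank(), dist.get_world_size()
device = torch.device("cuda", rank % torch.cuda.device_count())

config = ParallelConfig(tensor_parallel_config=TensorParallelConfig(tp_degree=world_size))
config.setup(rank, world_size, device, mesh=None)  # fans out to tensor_parallel_config.setup()
```

With `mesh=None`, `TensorParallelConfig.setup` falls back to building its own 1-D `"tp"` mesh from `tp_degree`, as in the hunk above.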