11 changes: 11 additions & 0 deletions tests/models/testing_utils/parallelism.py
@@ -22,6 +22,7 @@
import torch.multiprocessing as mp

from diffusers.models._modeling_parallel import ContextParallelConfig
from diffusers.models.attention_dispatch import _AttentionBackendRegistry, AttentionBackendName

from ...testing_utils import (
is_context_parallel,
@@ -167,6 +168,11 @@ def test_context_parallel_inference(self, cp_type):
if not hasattr(self.model_class, "_cp_plan") or self.model_class._cp_plan is None:
pytest.skip("Model does not have a _cp_plan defined for context parallel inference.")

if cp_type == "ring_degree":
active_backend, _ = _AttentionBackendRegistry.get_active_backend()
if active_backend == AttentionBackendName.NATIVE:
pytest.skip("Ring attention is not supported with the native attention backend.")

world_size = 2
init_dict = self.get_init_dict()
inputs_dict = self.get_dummy_inputs()
@@ -209,6 +215,11 @@ def test_context_parallel_custom_mesh(self, cp_type, mesh_shape, mesh_dim_names)
if not hasattr(self.model_class, "_cp_plan") or self.model_class._cp_plan is None:
pytest.skip("Model does not have a _cp_plan defined for context parallel inference.")

if cp_type == "ring_degree":
active_backend, _ = _AttentionBackendRegistry.get_active_backend()
if active_backend == AttentionBackendName.NATIVE:
pytest.skip("Ring attention is not supported with the native attention backend.")

world_size = 2
init_dict = self.get_init_dict()
inputs_dict = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in self.get_dummy_inputs().items()}
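The same backend guard now appears verbatim in both tests above. A minimal sketch of how it could be factored into a shared helper (the helper name is hypothetical; the registry call and backend name are the ones used in the diff):

import pytest

from diffusers.models.attention_dispatch import _AttentionBackendRegistry, AttentionBackendName


def _skip_ring_on_native_backend(cp_type: str) -> None:
    # Ring attention requires a backend with ring support; the native
    # SDPA backend does not provide it, so skip instead of failing.
    if cp_type != "ring_degree":
        return
    active_backend, _ = _AttentionBackendRegistry.get_active_backend()
    if active_backend == AttentionBackendName.NATIVE:
        pytest.skip("Ring attention is not supported with the native attention backend.")

Each test could then open with _skip_ring_on_native_backend(cp_type) instead of repeating the four-line guard.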
35 changes: 25 additions & 10 deletions tests/models/transformers/test_models_transformer_qwenimage.py
@@ -12,9 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import warnings

import torch
import pytest

from diffusers import QwenImageTransformer2DModel
from diffusers.models.transformers.transformer_qwenimage import compute_text_seq_len_from_mask
@@ -77,8 +79,7 @@ def get_init_dict(self) -> dict[str, int | list[int]]:
"axes_dims_rope": (8, 4, 4),
}

def get_dummy_inputs(self) -> dict[str, torch.Tensor]:
batch_size = 1
def get_dummy_inputs(self, batch_size: int = 1) -> dict[str, torch.Tensor]:
num_latent_channels = embedding_dim = 16
height = width = 4
sequence_length = 8
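The rest of the body is collapsed in the diff. A hypothetical, self-contained sketch of how batch_size presumably flows into the returned tensors (names match the visible locals; shapes and dtypes are illustrative assumptions, not the file's actual code):

import torch

batch_size = 2
num_latent_channels = embedding_dim = 16
height = width = 4
sequence_length = 8

# Batch dimension first, as is usual for diffusers model inputs.
hidden_states = torch.randn(batch_size, height * width, num_latent_channels)
encoder_hidden_states = torch.randn(batch_size, sequence_length, embedding_dim)
encoder_hidden_states_mask = torch.ones(batch_size, sequence_length, dtype=torch.long)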
@@ -106,9 +107,10 @@ def get_dummy_inputs(self) -> dict[str, torch.Tensor]:


class TestQwenImageTransformer(QwenImageTransformerTesterConfig, ModelTesterMixin):
def test_infers_text_seq_len_from_mask(self):
@pytest.mark.parametrize("batch_size", [1, 2])
def test_infers_text_seq_len_from_mask(self, batch_size):
init_dict = self.get_init_dict()
inputs = self.get_dummy_inputs()
inputs = self.get_dummy_inputs(batch_size=batch_size)
model = self.model_class(**init_dict).to(torch_device)

encoder_hidden_states_mask = inputs["encoder_hidden_states_mask"].clone()
@@ -122,7 +124,7 @@ def test_infers_text_seq_len_from_mask(self):
assert isinstance(per_sample_len, torch.Tensor)
assert int(per_sample_len.max().item()) == 2
assert normalized_mask.dtype == torch.bool
assert normalized_mask.sum().item() == 2
assert normalized_mask.sum().item() == 2 * batch_size
assert rope_text_seq_len >= inputs["encoder_hidden_states"].shape[1]

inputs["encoder_hidden_states_mask"] = normalized_mask
@@ ... @@
)

assert int(per_sample_len2.max().item()) == 8
assert normalized_mask2.sum().item() == 5
assert normalized_mask2.sum().item() == 5 * batch_size

rope_text_seq_len_none, per_sample_len_none, normalized_mask_none = compute_text_seq_len_from_mask(
inputs["encoder_hidden_states"], None
@@ ... @@
assert per_sample_len_none is None
assert normalized_mask_none is None
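Taken together, the assertions above pin down the contract of compute_text_seq_len_from_mask. A self-contained restatement of the first case (illustrative; it mirrors the asserts in this test rather than the library internals):

import torch

from diffusers.models.transformers.transformer_qwenimage import compute_text_seq_len_from_mask

encoder_hidden_states = torch.randn(1, 8, 16)
mask = torch.zeros(1, 8, dtype=torch.long)
mask[:, :2] = 1  # only the first two text tokens are valid

rope_len, per_sample_len, normalized = compute_text_seq_len_from_mask(encoder_hidden_states, mask)
assert int(per_sample_len.max().item()) == 2  # longest valid length across the batch
assert normalized.dtype == torch.bool  # mask is normalized to bool
assert normalized.sum().item() == 2  # one True entry per valid token
assert rope_len >= encoder_hidden_states.shape[1]  # RoPE length covers the full padded sequence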

def test_non_contiguous_attention_mask(self):
@pytest.mark.parametrize("batch_size", [1, 2])
def test_non_contiguous_attention_mask(self, batch_size):
init_dict = self.get_init_dict()
inputs = self.get_dummy_inputs()
inputs = self.get_dummy_inputs(batch_size=batch_size)
model = self.model_class(**init_dict).to(torch_device)

encoder_hidden_states_mask = inputs["encoder_hidden_states_mask"].clone()
@@ ... @@

assert output.sample.shape[1] == inputs["hidden_states"].shape[1]
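For reference, "non-contiguous" here means the mask has a hole between valid tokens, so the valid positions are not a single prefix. A hypothetical mask of that shape (illustrative values only, not the test's actual data):

import torch

mask = torch.ones(1, 8, dtype=torch.long)
mask[:, 2:4] = 0  # positions 2-3 masked out, later tokens still valid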

def test_txt_seq_lens_deprecation(self):
@pytest.mark.parametrize("batch_size", [1, 2])
def test_txt_seq_lens_deprecation(self, batch_size):
init_dict = self.get_init_dict()
inputs = self.get_dummy_inputs()
inputs = self.get_dummy_inputs(batch_size=batch_size)
model = self.model_class(**init_dict).to(torch_device)

txt_seq_lens = [inputs["encoder_hidden_states"].shape[1]]
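The assertion body is collapsed in the diff. Given the warnings import added at the top of the file, a plausible sketch of how such a deprecation check typically proceeds (hypothetical, not the file's actual code):

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    model(**inputs, txt_seq_lens=txt_seq_lens)
assert any("txt_seq_lens" in str(w.message) for w in caught)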
@@ -276,6 +280,17 @@ class TestQwenImageTransformerAttention(QwenImageTransformerTesterConfig, Attent
class TestQwenImageTransformerContextParallel(QwenImageTransformerTesterConfig, ContextParallelTesterMixin):
"""Context Parallel inference tests for QwenImage Transformer."""

@pytest.mark.parametrize(
"batch_size",
[
1,
pytest.param(2, marks=pytest.mark.xfail(reason="Context parallel does not support batch_size > 1")),
],
)
def test_context_parallel_batch_size(self, batch_size):
self.get_dummy_inputs = functools.partial(self.get_dummy_inputs, batch_size=batch_size)
self.test_context_parallel_inference("ulysses_degree")
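The functools.partial line rebinds get_dummy_inputs on the instance so the inherited test_context_parallel_inference picks up the batched inputs without any change to the mixin. The trick in isolation (a standalone illustration, not project code):

import functools


class Base:
    def get_dummy_inputs(self, batch_size: int = 1):
        return {"batch_size": batch_size}

    def run(self):
        # Inherited code resolves the method through self, so an
        # instance attribute shadows the class method.
        return self.get_dummy_inputs()


obj = Base()
obj.get_dummy_inputs = functools.partial(obj.get_dummy_inputs, batch_size=2)
assert obj.run() == {"batch_size": 2}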


class TestQwenImageTransformerLoRA(QwenImageTransformerTesterConfig, LoraTesterMixin):
"""LoRA adapter tests for QwenImage Transformer."""