huggingface · zhtmike · May 15, 2026 · May 19, 2026 · May 20, 2026 · May 20, 2026
diff --git a/src/diffusers/models/attention_dispatch.py b/src/diffusers/models/attention_dispatch.py
@@ -2200,6 +2200,12 @@ def forward(
         query, key, value = (_all_to_all_single(x, group) for x in (query, key, value))
         query, key, value = (x.flatten(0, 1).permute(1, 0, 2, 3).contiguous() for x in (query, key, value))
 
+        if attn_mask is not None and attn_mask.shape[-1] == S_KV_LOCAL:
+            # All-gather a local mask so its layout matches the QKV layout after all-to-all.
+            mask_list = [torch.empty_like(attn_mask) for _ in range(world_size)]
+            dist.all_gather(mask_list, attn_mask, group=group)
+            attn_mask = torch.cat(mask_list, dim=-1)
+
         out = forward_op(
             ctx,
             query,
@@ -2399,6 +2405,8 @@ def forward(
         ctx.backward_op = backward_op
         ctx._parallel_config = _parallel_config
 
+        _, S_KV_LOCAL, _, _ = key.shape
+
         metadata = ulysses_anything_metadata(query)
         query_wait = all_to_all_single_any_qkv_async(query, group, **metadata)
         key_wait = all_to_all_single_any_qkv_async(key, group, **metadata)
@@ -2408,6 +2416,19 @@ def forward(
         key = key_wait()  # type: torch.Tensor
         value = value_wait()  # type: torch.Tensor
 
+        if attn_mask is not None and attn_mask.shape[-1] == S_KV_LOCAL:
+            # All-gather a local mask to match the post-all-to-all global sequence.
+            # The "anything" path allows unequal local sizes, so pad every shard to
+            # the maximum size for all_gather, then strip each shard's own padding.
+            mask_local_sizes = gather_size_by_comm(attn_mask.shape[-1], group)
+            max_local = max(mask_local_sizes)
+            if attn_mask.shape[-1] < max_local:
+                attn_mask = F.pad(attn_mask, (0, max_local - attn_mask.shape[-1]))
+            mask_list = [torch.empty_like(attn_mask) for _ in range(dist.get_world_size(group=group))]
+            dist.all_gather(mask_list, attn_mask, group=group)
+            # Trim per shard before concatenating: padding sits between shards, not only at the tail.
+            attn_mask = torch.cat([m[..., :n] for m, n in zip(mask_list, mask_local_sizes)], dim=-1)
+
         out = forward_op(
             ctx,
             query,

diff --git a/src/diffusers/models/transformers/transformer_qwenimage.py b/src/diffusers/models/transformers/transformer_qwenimage.py
@@ -491,12 +491,24 @@ def __call__(
         hidden_states: torch.FloatTensor,  # Image stream
         encoder_hidden_states: torch.FloatTensor = None,  # Text stream
         encoder_hidden_states_mask: torch.FloatTensor = None,
-        attention_mask: torch.FloatTensor | None = None,
+        attention_mask: None = None,
         image_rotary_emb: torch.Tensor | None = None,
     ) -> torch.FloatTensor:
         if encoder_hidden_states is None:
             raise ValueError("QwenDoubleStreamAttnProcessor2_0 requires encoder_hidden_states (text stream)")
 
+        if attention_mask is not None:
+            raise ValueError(
+                "QwenDoubleStreamAttnProcessor2_0 does not accept an external attention_mask. "
+                "Pass encoder_hidden_states_mask to let the processor build the joint mask."
+            )
+
+        if encoder_hidden_states_mask is not None:
+            seq_img = hidden_states.shape[1]
+            image_mask = torch.ones((hidden_states.shape[0], seq_img), dtype=torch.bool, device=hidden_states.device)
+            attention_mask = torch.cat([encoder_hidden_states_mask, image_mask], dim=1)
+            attention_mask = attention_mask[:, None, None, :]
+
         seq_txt = encoder_hidden_states.shape[1]
 
         # Compute QKV for image stream (sample projections)
@@ -770,6 +782,7 @@ class QwenImageTransformer2DModel(
         },
         "transformer_blocks.*": {
             "modulate_index": ContextParallelInput(split_dim=1, expected_dims=2, split_output=False),
+            "encoder_hidden_states_mask": ContextParallelInput(split_dim=1, expected_dims=2, split_output=False),
         },
         "pos_embed": {
             0: ContextParallelInput(split_dim=0, expected_dims=2, split_output=True),
@@ -909,38 +922,27 @@ def forward(
 
         image_rotary_emb = self.pos_embed(img_shapes, max_txt_seq_len=text_seq_len, device=hidden_states.device)
 
-        # Construct joint attention mask once to avoid reconstructing in every block
-        # This eliminates 60 GPU syncs during training while maintaining torch.compile compatibility
-        block_attention_kwargs = attention_kwargs.copy() if attention_kwargs is not None else {}
-        if encoder_hidden_states_mask is not None:
-            # Build joint mask: [text_mask, all_ones_for_image]
-            batch_size, image_seq_len = hidden_states.shape[:2]
-            image_mask = torch.ones((batch_size, image_seq_len), dtype=torch.bool, device=hidden_states.device)
-            joint_attention_mask = torch.cat([encoder_hidden_states_mask, image_mask], dim=1)
-            joint_attention_mask = joint_attention_mask[:, None, None, :]
-            block_attention_kwargs["attention_mask"] = joint_attention_mask
-
         for index_block, block in enumerate(self.transformer_blocks):
             if torch.is_grad_enabled() and self.gradient_checkpointing:
                 encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
                     block,
                     hidden_states,
                     encoder_hidden_states,
-                    None,  # Don't pass encoder_hidden_states_mask (using attention_mask instead)
+                    encoder_hidden_states_mask,
                     temb,
                     image_rotary_emb,
-                    block_attention_kwargs,
+                    attention_kwargs,
                     modulate_index,
                 )
 
             else:
                 encoder_hidden_states, hidden_states = block(
                     hidden_states=hidden_states,
                     encoder_hidden_states=encoder_hidden_states,
-                    encoder_hidden_states_mask=None,  # Don't pass (using attention_mask instead)
+                    encoder_hidden_states_mask=encoder_hidden_states_mask,
                     temb=temb,
                     image_rotary_emb=image_rotary_emb,
-                    joint_attention_kwargs=block_attention_kwargs,
+                    joint_attention_kwargs=attention_kwargs,
                     modulate_index=modulate_index,
                 )