Remove the BIGMASK attention option

xadupre · xadupre · commit 9c83ab38876c · 2025-12-05T12:35:26.000+01:00
diff --git a/_unittests/ut_tasks/try_export.py b/_unittests/ut_tasks/try_export.py
@@ -52,7 +52,6 @@ def test_qwen25_vli_visual(self):
         .. code-block:: bash
 
             NEVERTEST=1 \\
-            QWEN25ATTENTION=BIGMASK \\
             PRETRAINED=1 \\
             TESTDEVICE=cuda \\
             TESTDTYPE=float16 \\
@@ -164,9 +163,11 @@ def _config_reduction(config, task):
         if qwen25_attention:
             attention_options = [qwen25_attention]
         elif device == "cuda" and dtype in ("float16", "bfloat16"):
-            attention_options = ["PACKED", "BIGMASK"]
+            attention_options = [
+                "PACKED",
+            ]
         else:
-            attention_options = ["LOOPMHA", "LOOPA24", "BIGMASK"]
+            attention_options = ["LOOPMHA", "LOOPA24"]
 
         # fake_inputs = make_fake_with_dynamic_dimensions(inputs, dynamic_shapes)[0]
         for attention in attention_options:
diff --git a/onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen2_5.py b/onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen2_5.py
@@ -200,6 +200,39 @@ def qwen_sdpa_attention(
         scaling: float = 0,
         num_heads: int = 16,
     ) -> torch.Tensor:
+        """
+        The loop can be replaced with the following code,
+        but it runs out of memory for big inputs.
+
+        .. code-block:: python
+
+            # make square mask
+            indices = torch.arange(
+                cu_seqlens.max(), dtype=cu_seqlens.dtype, device=cu_seqlens.device
+            )
+            dot = (cu_seqlens.unsqueeze(1) <= indices.unsqueeze(0)).to(
+                cu_seqlens.dtype
+            )
+            dot = dot.sum(dim=0)
+            mask = dot.unsqueeze(1) - dot.unsqueeze(0)
+            bool_mask = mask == 0
+            bool_mask = bool_mask.unsqueeze(0).unsqueeze(0)
+
+            torch._check(bool_mask.shape[2] == key_states.shape[2])
+            torch._check(bool_mask.shape[3] == key_states.shape[2])
+
+            attn_output, _ = attention_interface(
+                self,
+                query_states,
+                key_states,
+                value_states,
+                attention_mask=bool_mask,
+                scaling=self.scaling,
+                dropout=0.0 if not self.training else self.attention_dropout,
+                is_causal=False,
+                **kwargs,
+            )
+        """
         lengths = cu_seqlens[1:] - cu_seqlens[:-1]
         splits = [
             torch.split(tensor, lengths.tolist(), dim=2)