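Fix default Qwen2.5-VL attention strategy: no implicit "PACKED" default for QWEN25ATTENTION, and the version selector returns the ONNX element type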

xadupre · xadupre · commit a8036a990d8b · 2025-12-05T12:25:32.000+01:00
diff --git a/_unittests/ut_torch_onnx/test_sbs.py b/_unittests/ut_torch_onnx/test_sbs.py
@@ -682,7 +682,7 @@ def test_sbs_with_loops(self):
             PLUGS_Qwen25,
         )
         from onnx_diagnostic.torch_export_patches.patches._patch_transformers_qwen2_5 import (
-            qwen_sdpa_attention_loopmha_versatile,
+            qwen_sdpa_attention_versatile,
         )
 
         class Model(torch.nn.Module):
@@ -693,9 +693,7 @@ def forward(self, query, key, value, seq_lens):
                 qs = query * mask
                 ks = key * mask
                 vs = value * mask
-                attn_output = qwen_sdpa_attention_loopmha_versatile(
-                    qs, ks, vs, seq_lens, 0.11, 16
-                )
+                attn_output = qwen_sdpa_attention_versatile(qs, ks, vs, seq_lens, 0.11, 16)
                 red = attn_output.mean(dim=-1, keepdim=True)
                 return attn_output - red
 
diff --git a/onnx_diagnostic/export/onnx_plug.py b/onnx_diagnostic/export/onnx_plug.py
@@ -150,20 +150,18 @@ def forward(self, x):
         def qwen_version_selector(opset: int, *args: torch.Tensor) -> Tuple[str, torch.dtype]:
             first_tensor = next(a for a in args if a is not None)
             dtype = first_tensor.dtype
-            strategy = patched_Qwen2_5_VLVisionAttention.STRATEGY_FOR_ATTENTION()
-            if strategy is not None:
-                return strategy, dtype
+            itype = torch_dtype_to_onnx_dtype(dtype)
             if dtype == torch.float32:
                 if opset >= 24:
-                    return "LOOPA24", dtype
-                return "LOOPMHA", dtype
+                    return "LOOPA24", itype
+                return "LOOPMHA", itype
             if dtype == torch.float16:
                 if first_tensor.is_cuda:
-                    return "PACKED", dtype
-                return "LOOPMHA", dtype
+                    return "PACKED", itype
+                return "LOOPMHA", itype
             raise AssertionError(
-                f"Unable to handle type {torch.dtype} on "
-                f"device {torch.device} with opset={opset}"
+                f"Unable to handle type {torch.dtype} (itype={itype}) "
+                f"on device {torch.device} with opset={opset}"
             )
 
         qwen_sdpa_attention_versatile = EagerDirectReplacementWithOnnx(
@@ -338,6 +336,8 @@ def _register(self):
                 input_args.append(f"int {p}={val}")
             elif isinstance(val, float):
                 input_args.append(f"float {p}={val}")
+            elif isinstance(val, str):
+                input_args.append(f"str {p}={val}")
             else:
                 raise NotImplementedError(
                     f"kwargs {p!r} has a default value of unsupported type {type(val)}"
@@ -445,7 +445,7 @@ def converter(
             *args,
             **kwargs,
         ) -> Any:
-            has_devices = [a for a in args if g.has_device(a)]
+            has_devices = [a for a in args if isinstance(a, str) and g.has_device(a)]
             assert (
                 has_devices
             ), f"Missing device for any of the inputs {args}{g.get_debug_msg()}"
diff --git a/onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen2_5.py b/onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen2_5.py
@@ -5,6 +5,7 @@
 import torch
 import torch.nn.functional as F
 from ...export.onnx_plug import EagerDirectReplacementWithOnnx
+from ...helpers.torch_helper import torch_dtype_to_onnx_dtype
 from .patch_helper import _is_torchdynamo_exporting
 from ._patch_transformers_attention import patched_sdpa_attention_forward
 
@@ -225,18 +226,20 @@ def qwen_version_selector(opset: int, *args: torch.Tensor) -> Tuple[str, torch.d
         first_tensor = next(a for a in args if a is not None)
         dtype = first_tensor.dtype
         strategy = patched_Qwen2_5_VLVisionAttention.STRATEGY_FOR_ATTENTION()
+        itype = torch_dtype_to_onnx_dtype(dtype)
         if strategy is not None:
-            return strategy, dtype
+            return strategy, itype
         if dtype == torch.float32:
             if opset >= 24:
-                return "LOOPA24", dtype
-            return "LOOPMHA", dtype
+                return "LOOPA24", itype
+            return "LOOPMHA", itype
         if dtype == torch.float16:
             if first_tensor.is_cuda:
-                return "PACKED", dtype
-            return "LOOPMHA", dtype
+                return "PACKED", itype
+            return "LOOPMHA", itype
         raise AssertionError(
-            f"Unable to handle type {torch.dtype} on device {torch.device} with opset={opset}"
+            f"Unable to handle type {torch.dtype} (itype={itype}) "
+            f"on device {torch.device} with opset={opset}"
         )
 
     qwen_sdpa_attention_versatile = EagerDirectReplacementWithOnnx(
@@ -558,9 +561,7 @@ class patched_Qwen2_5_VLVisionAttention:
         _PATCHED_CLASS_ = (
             transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLVisionAttention
         )
-        STRATEGY_FOR_ATTENTION = lambda: os.environ.get(  # noqa: E731
-            "QWEN25ATTENTION", "PACKED"
-        )
+        STRATEGY_FOR_ATTENTION = lambda: os.environ.get("QWEN25ATTENTION", None)  # noqa: E731
 
         def forward(
             self,