final fixes

xadupre · xadupre · commit f4dc9d193744 · 2025-12-08T12:00:39.000Z
diff --git a/_scripts/export_qwen25_vl_visual.py b/_scripts/export_qwen25_vl_visual.py
@@ -18,6 +18,17 @@
 .. code-block:: bash
 
     python export_qwen25_vl_visual.py -m Qwen/Qwen2.5-VL-7B-Instruct --device cpu --dtype float32 --exporter onnx-dynamo --pretrained --second-input
+
+Attention
++++++++++
+
+The attention is implemented either with ``MultiHeadAttention`` in a loop or with ``PackedMultiHeadAttention``.
+The choice is made based on the device. It is possible to override this by setting
+the environment variable ``QWEN25ATTENTION`` to:
+
+* ``PACKED``: PackedMultiHeadAttention
+* ``LOOPMHA``: Loop over MultiHeadAttention
+* ``LOOPA24``: Loop over Attention(24), needs opset 23 or 24.
 """
 
 import os
@@ -145,11 +156,16 @@ def _config_reduction(config, task):
     )
 
     prefix = simplify_model_id_for_a_filename(model_id)
+    if "QWEN25ATTENTION" in os.environ:
+        prefix = f"{prefix}.{os.environ['QWEN25ATTENTION']}"
     filename = f"model.{prefix}.visual.{device}.{dtype}.{exporter}.onnx"
     print(f"-- export in {filename!r}")
     stat_file = filename.replace(".onnx", ".stats")
     begin = time.perf_counter()
 
+    if exporter == "onnx-dynamo" and device == "cuda" and "QWEN25ATTENTION" not in os.environ:
+        os.environ["QWEN25ATTENTION"] = "PACKED"
+
     export_inputs = inputs
     with torch_export_patches(
         patch_torch=False,
@@ -189,23 +205,25 @@ def fprint(s):
             f.write(f"{s}\n")
 
         fprint(f"-- export duration: {duration}")
-        fprint("-- checking discrepancies")
         providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
         if device == "cpu":
             providers = providers[1:]
+        fprint(f"-- checking discrepancies with providers={providers!r}")
         sess = onnxruntime.InferenceSession(filename, providers=providers)
 
-        fprint(f"-- inputs {string_type(inputs, with_shape=True)}")
-        fprint(f"-- expected {string_type(expected, with_shape=True)}")
+        fprint(f"-- inputs {string_type(inputs, with_shape=True, with_device=True)}")
+        fprint(f"-- expected {string_type(expected, with_shape=True, with_device=True)}")
         feeds = {k: v.detach().cpu().numpy() for k, v in inputs.items()}
         small = sess.run(None, feeds)
         diff = max_diff(expected, small[0], hist=[0.1])
         fprint(f"-- discrepancies={diff}")
 
         if second_input:
             fprint("")
-            fprint(f"-- inputs {string_type(big_inputs, with_shape=True)}")
-            fprint(f"-- expected {string_type(expected_big, with_shape=True)}")
+            fprint(f"-- inputs {string_type(big_inputs, with_shape=True, with_device=True)}")
+            fprint(
+                f"-- expected {string_type(expected_big, with_shape=True, with_device=True)}"
+            )
             feeds = {k: v.detach().cpu().numpy() for k, v in big_inputs.items()}
             big = sess.run(None, feeds)
             diff = max_diff(expected_big, big[0], hist=[0.1])
diff --git a/_scripts/investigate.ipynb b/_scripts/investigate.ipynb
@@ -214,33 +214,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "id": "e0e65bcd",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "tensor_type {\n",
-      "  elem_type: 1\n",
-      "}\n",
-      " ['ByteSize', 'Clear', 'ClearExtension', 'ClearField', 'CopyFrom', 'DESCRIPTOR', 'DiscardUnknownFields', 'FindInitializationErrors', 'FromString', 'HasExtension', 'HasField', 'IsInitialized', 'ListFields', 'Map', 'MergeFrom', 'MergeFromString', 'Opaque', 'Optional', 'ParseFromString', 'Sequence', 'SerializePartialToString', 'SerializeToString', 'SetInParent', 'SparseTensor', 'Tensor', 'UnknownFields', 'WhichOneof', '_CheckCalledFromGeneratedFile', '_ListFieldsItemKey', '_SetListener', '__class__', '__contains__', '__deepcopy__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__unicode__', 'denotation', 'map_type', 'opaque_type', 'optional_type', 'sequence_type', 'sparse_tensor_type', 'tensor_type']\n"
-     ]
-    },
-    {
-     "ename": "AttributeError",
-     "evalue": "copy_from",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
-      "\u001b[31mAttributeError\u001b[39m                            Traceback (most recent call last)",
-      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 11\u001b[39m\n\u001b[32m      7\u001b[39m             g.input[-\u001b[32m1\u001b[39m].type.copy_from(onnx.TypeProto())\n\u001b[32m      8\u001b[39m             g.output[-\u001b[32m1\u001b[39m].type.copy_from(onnx.TypeProto())\n\u001b[32m---> \u001b[39m\u001b[32m11\u001b[39m \u001b[43mremove_inplace_body_last_input_output_type_for_loop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[43m)\u001b[49m\n",
-      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 7\u001b[39m, in \u001b[36mremove_inplace_body_last_input_output_type_for_loop\u001b[39m\u001b[34m(filename)\u001b[39m\n\u001b[32m      5\u001b[39m g = node.attribute[\u001b[32m0\u001b[39m].g\n\u001b[32m      6\u001b[39m \u001b[38;5;28mprint\u001b[39m(g.input[-\u001b[32m1\u001b[39m].type, \u001b[38;5;28mdir\u001b[39m(g.input[-\u001b[32m1\u001b[39m].type))\n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m \u001b[43mg\u001b[49m\u001b[43m.\u001b[49m\u001b[43minput\u001b[49m\u001b[43m[\u001b[49m\u001b[43m-\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtype\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcopy_from\u001b[49m(onnx.TypeProto())\n\u001b[32m      8\u001b[39m g.output[-\u001b[32m1\u001b[39m].type.copy_from(onnx.TypeProto())\n",
-      "\u001b[31mAttributeError\u001b[39m: copy_from"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "def remove_inplace_body_last_input_output_type_for_loop(filename: str):\n",
     "    model = onnx.load(filename, load_external_data=False)\n",
@@ -249,6 +226,7 @@
     "            g = node.attribute[0].g\n",
     "            g.input[-1].type.CopyFrom(onnx.TypeProto())\n",
     "            g.output[-1].type.CopyFrom(onnx.TypeProto())\n",
+    "    onnx.save(model, filename, save_as_external_data=False)\n",
     "\n",
     "\n",
     "remove_inplace_body_last_input_output_type_for_loop(model_name)"
diff --git a/onnx_diagnostic/export/api.py b/onnx_diagnostic/export/api.py
@@ -64,6 +64,7 @@ def to_onnx(
     exporter_kwargs: Optional[Dict[str, Any]] = None,
     save_ep: Optional[str] = None,
     optimize: bool = True,
+    optimizer_for_ort: bool = True,
     use_control_flow_dispatcher: bool = False,
     onnx_plugs: Optional[List[EagerDirectReplacementWithOnnx]] = None,
     inline: bool = True,
@@ -88,6 +89,7 @@ def to_onnx(
     :param exporter_kwargs: additional parameters sent to the exporter
     :param save_ep: saves the exported program
     :param optimize: optimizes the model
+    :param optimizer_for_ort: optimizes the model for onnxruntime
     :param use_control_flow_dispatcher: use the dispatcher created to supported
         custom loops (see :func:`onnx_diagnostic.export.control_flow_onnx.loop_for_onnx`)
     :param onnx_plugs: the code was modified to replace some parts with onnx translation
@@ -126,8 +128,10 @@ def to_onnx(
         options = None
         if exporter_kwargs is not None:
             options = exporter_kwargs.pop("options", None)
-        if options is None:
-            options = OptimizationOptions(patterns="default+onnxruntime")
+        if options is None and optimize:
+            options = OptimizationOptions(
+                patterns="default+onnxruntime" if optimizer_for_ort else "default"
+            )
         main_dispatcher = (
             get_main_dispatcher(use_control_flow_dispatcher, onnx_plugs)
             if onnx_plugs or use_control_flow_dispatcher
@@ -161,6 +165,9 @@ def to_onnx(
         assert (
             not output_dynamic_shapes
         ), f"output_dynamic_shapes not supported for exporter={exporter!r}"
+        assert (
+            optimize
+        ), f"torch.onnx.export always optimizes the model but optimize={optimize}"
         custom_translation_table = {}
         if onnx_plugs:
             for plug in onnx_plugs:
@@ -180,7 +187,7 @@ def to_onnx(
             custom_translation_table=custom_translation_table,
             **(exporter_kwargs or {}),
         )
-        if not inline and optimize:
+        if not inline and optimize and optimizer_for_ort:
             ort_fusions.optimize_for_ort(epo.model)
 
         if onnx_plugs:
@@ -207,7 +214,7 @@ def to_onnx(
                 common_passes.InlinePass()(epo.model)
                 common_passes.RemoveUnusedOpsetsPass()(epo.model)
 
-        if inline and optimize:
+        if inline and optimize and optimizer_for_ort:
             ort_fusions.optimize_for_ort(epo.model)
         if filename:
             epo.save(filename, external_data=True)
@@ -232,6 +239,10 @@ def to_onnx(
             f"Only a specified set of inputs is supported for exporter={exporter!r}, "
             f"but it is {list(kwargs)}"  # type: ignore[arg-type]
         )
+        assert optimizer_for_ort and optimize, (
+            f"ModelBuilder only produces model optimized for onnxruntime but "
+            f"optimizer_for_ort={optimizer_for_ort} and optimize={optimize}"
+        )
         flat_inputs = flatten_object(kwargs, drop_keys=True)
         first = flat_inputs[0]
         first_float = [