sdpython
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎_dump_test_26/sbs_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.sh‎
Lines changed: 7 additions & 0 deletions b/‎_dump_test_26/sbs_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.sh‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎_dump_test_26/test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.graph.ep.graph‎
Lines changed: 2622 additions & 0 deletions b/‎_dump_test_26/test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.graph.ep.graph‎
Lines changed: 2622 additions & 0 deletions
diff --git a/‎_keep_dump_test/sbs_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.sh‎
Lines changed: 7 additions & 0 deletions b/‎_keep_dump_test/sbs_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.sh‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎_keep_dump_test/test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.graph.ep.graph‎
Lines changed: 2552 additions & 0 deletions b/‎_keep_dump_test/test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.graph.ep.graph‎
Lines changed: 2552 additions & 0 deletions
diff --git a/‎_scripts/qwen25_vl_visual.py‎ ‎_scripts/export_qwen25_vl_visual.py‎_scripts/qwen25_vl_visual.py renamed to _scripts/export_qwen25_vl_visual.py
Lines changed: 74 additions & 19 deletions b/‎_scripts/qwen25_vl_visual.py‎ ‎_scripts/export_qwen25_vl_visual.py‎_scripts/qwen25_vl_visual.py renamed to _scripts/export_qwen25_vl_visual.py
Lines changed: 74 additions & 19 deletions
@@ -21,6 +21,7 @@
 *.sqlitest
 *.svg
 *.onnx.stats
+*.stats
 CodeLlama*
 _tools/benchenv**
 _tools/repos**
 
@@ -0,0 +1,7 @@
+
+clear&&python -m onnx_diagnostic sbs \
+    -i qwen25_vli_visual.inputs.pt \
+    -e test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.graph.ep.pt2 \
+    -m test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.onnx \
+    -o test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.xlsx \
+    -v 1 --atol 0.1 --rtol 1000
@@ -0,0 +1,7 @@
+
+clear&&python -m onnx_diagnostic sbs \
+    -i qwen25_vli_visual.inputs.pt \
+    -e test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.graph.ep.pt2 \
+    -m test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.onnx \
+    -o test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.xlsx \
+    -v 1 --atol 0.1 --rtol 1000
@@ -1,8 +1,43 @@
+"""
+export visual embedding of Qwen/Qwen2.5-VL-7B-Instruct
+======================================================
+
+requirements
+++++++++++++
+
+git+https://github.com/sdpython/experimental-experiment.git
+huggingface_hub>=1.2.1
+onnx-diagnostic>=0.8.4
+onnxruntime>=1.23
+torch>=2.9  # weekly is better
+transformers>=4.57
+
+example
++++++++
+
+.. code-block:: bash
+
+    python export_qwen25_vl_visual.py -m Qwen/Qwen2.5-VL-7B-Instruct --device cpu --dtype float32 --exporter custom --pretrained --second-input
+"""
+
 import os
 import sys
+import time
 from argparse import ArgumentParser, BooleanOptionalAction
 
 
+def remove_inplace_body_last_input_output_type_for_loop(filename: str):
+    import onnx
+
+    model = onnx.load(filename, load_external_data=False)
+    for node in model.graph.node:
+        if node.op_type == "Loop":
+            g = node.attribute[0].g
+            g.input[-1].type.CopyFrom(onnx.TypeProto())
+            g.output[-1].type.CopyFrom(onnx.TypeProto())
+    onnx.save(model, filename, save_as_external_data=False)
+
+
 def main(
     model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct",
     device: str = "cpu",
@@ -107,6 +142,8 @@ def _config_reduction(config, task):
 
     filename = f"qwen25_vli_visual.{device}.{dtype}.{exporter}.onnx"
     print(f"-- export in {filename!r}")
+    stat_file = filename.replace(".onnx", ".stats")
+    begin = time.perf_counter()
 
     export_inputs = inputs
     with torch_export_patches(
@@ -131,25 +168,43 @@ def _config_reduction(config, task):
             optimize=True,
             onnx_plugs=PLUGS,
         )
-
-    print("-- checking discrepancies")
-    providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
-    if device == "cpu":
-        providers = providers[1:]
-    sess = onnxruntime.InferenceSession(filename, providers=providers)
-
-    print(f"-- inputs {string_type(inputs, with_shape=True)}")
-    feeds = {k: v.detach().cpu().numpy() for k, v in inputs.items()}
-    small = sess.run(None, feeds)
-    diff = max_diff(expected, small[0], hist=[0.1])
-    print(f"-- discrepancies={diff}")
-
-    if second_input:
-        print(f"-- inputs {string_type(big_inputs, with_shape=True)}")
-        feeds = {k: v.detach().cpu().numpy() for k, v in big_inputs.items()}
-        big = sess.run(None, feeds)
-        diff = max_diff(expected_big, big[0], hist=[0.1])
-        print(f"-- discrepancies={diff}")
+    duration = time.perf_counter() - begin
+
+    if exporter == "onnx-dynamo":
+        # onnx-dynamo fails at producing function body with sequences as input / output.
+        # They are replaced by tensor type one step in the model.
+        print("-- remove_body_last_input_output_for_loop")
+        remove_inplace_body_last_input_output_type_for_loop(filename)
+        print("-- done.")
+
+    with open(stat_file, "w") as f:
+
+        def fprint(s):
+            print(s)
+            f.write(f"{s}\n")
+
+        fprint(f"-- export duration: {duration}")
+        fprint("-- checking discrepancies")
+        providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+        if device == "cpu":
+            providers = providers[1:]
+        sess = onnxruntime.InferenceSession(filename, providers=providers)
+
+        fprint(f"-- inputs {string_type(inputs, with_shape=True)}")
+        fprint(f"-- expected {string_type(expected, with_shape=True)}")
+        feeds = {k: v.detach().cpu().numpy() for k, v in inputs.items()}
+        small = sess.run(None, feeds)
+        diff = max_diff(expected, small[0], hist=[0.1])
+        fprint(f"-- discrepancies={diff}")
+
+        if second_input:
+            fprint("")
+            fprint(f"-- inputs {string_type(big_inputs, with_shape=True)}")
+            fprint(f"-- expected {string_type(expected_big, with_shape=True)}")
+            feeds = {k: v.detach().cpu().numpy() for k, v in big_inputs.items()}
+            big = sess.run(None, feeds)
+            diff = max_diff(expected_big, big[0], hist=[0.1])
+            fprint(f"-- discrepancies={diff}")
 
 
 def get_parser() -> ArgumentParser: