Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
*.sqlitest
*.svg
*.onnx.stats
*.stats
CodeLlama*
_tools/benchenv**
_tools/repos**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

# Clears the terminal, then runs the `sbs` sub-command of onnx_diagnostic
# (presumably a "side-by-side" comparison -- confirm against the tool's help):
#   -i  saved inputs (.pt)
#   -e  exported program (.pt2)
#   -m  ONNX model
#   -o  output spreadsheet (.xlsx)
#   -v  verbosity level; --atol / --rtol are the absolute / relative tolerances
clear&&python -m onnx_diagnostic sbs \
-i qwen25_vli_visual.inputs.pt \
-e test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.graph.ep.pt2 \
-m test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.onnx \
-o test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.xlsx \
-v 1 --atol 0.1 --rtol 1000
2,622 changes: 2,622 additions & 0 deletions _dump_test_26/test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.graph.ep.graph

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

# Clears the terminal, then runs the `sbs` sub-command of onnx_diagnostic
# (presumably a "side-by-side" comparison -- confirm against the tool's help):
#   -i  saved inputs (.pt)
#   -e  exported program (.pt2)
#   -m  ONNX model
#   -o  output spreadsheet (.xlsx)
#   -v  verbosity level; --atol / --rtol are the absolute / relative tolerances
clear&&python -m onnx_diagnostic sbs \
-i qwen25_vli_visual.inputs.pt \
-e test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.graph.ep.pt2 \
-m test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.onnx \
-o test_qwen25_vli_visual.cpu.float32.LOOPMHA.custom.xlsx \
-v 1 --atol 0.1 --rtol 1000

Large diffs are not rendered by default.

116 changes: 97 additions & 19 deletions _scripts/qwen25_vl_visual.py → _scripts/export_qwen25_vl_visual.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,58 @@
"""
Export visual embedding of Qwen/Qwen2.5-VL-7B-Instruct
======================================================

requirements
++++++++++++

git+https://github.com/sdpython/experimental-experiment.git
huggingface_hub>=1.2.1
onnx-diagnostic>=0.8.4
onnxruntime>=1.23
torch>=2.9 # weekly is better
transformers>=4.57

Examples
++++++++

.. code-block:: bash

python export_qwen25_vl_visual.py -m Qwen/Qwen2.5-VL-7B-Instruct --device cpu --dtype float32 --exporter onnx-dynamo --pretrained --second-input

Attention
+++++++++

The attention is implemented either with ``MultiHeadAttention`` in a loop or with ``PackedMultiHeadAttention``.
The choice is made based on the device. It is possible to override this by setting
the environment variable ``QWEN25ATTENTION`` to:

* ``PACKED``: PackedMultiHeadAttention
* ``LOOPMHA``: Loop over MultiHeadAttention
* ``LOOPA24``: Loop over Attention(24), needs opset 23 or 24.
"""

import os
import sys
import time
from argparse import ArgumentParser, BooleanOptionalAction


def remove_inplace_body_last_input_output_type_for_loop(filename: str):
    """
    Resets the declared type of the last input and the last output
    of every ``Loop`` body found among the top-level nodes of a model.

    The model is read from *filename* without loading external data,
    modified in memory, then written back to the same path
    (``save_as_external_data=False``).

    :param filename: path of the ONNX file, overwritten in place
    """
    import onnx

    proto = onnx.load(filename, load_external_data=False)
    for candidate in proto.graph.node:
        if candidate.op_type != "Loop":
            continue
        # The subgraph is taken from the first attribute of the Loop node.
        body = candidate.attribute[0].g
        # Copying an empty TypeProto erases whatever type was declared.
        body.input[-1].type.CopyFrom(onnx.TypeProto())
        body.output[-1].type.CopyFrom(onnx.TypeProto())
    onnx.save(proto, filename, save_as_external_data=False)


def simplify_model_id_for_a_filename(model_id: str) -> str:
    """
    Turns a model id into a string usable inside a filename:
    the id is lowercased and every ``/`` becomes ``.``.

    :param model_id: model id, e.g. ``"Qwen/Qwen2.5-VL-7B-Instruct"``
    :return: simplified id, e.g. ``"qwen.qwen2.5-vl-7b-instruct"``
    """
    lowered = model_id.lower()
    pieces = lowered.split("/")
    return ".".join(pieces)


def main(
model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct",
device: str = "cpu",
Expand Down Expand Up @@ -105,8 +155,16 @@ def _config_reduction(config, task):
grid_thw={}, # {0: "n_images"}, # TODO: fix
)

filename = f"qwen25_vli_visual.{device}.{dtype}.{exporter}.onnx"
prefix = simplify_model_id_for_a_filename(model_id)
if "QWEN25ATTENTION" in os.environ:
prefix = f"{prefix}.{os.environ['QWEN25ATTENTION']}"
filename = f"model.{prefix}.visual.{device}.{dtype}.{exporter}.onnx"
print(f"-- export in {filename!r}")
stat_file = filename.replace(".onnx", ".stats")
begin = time.perf_counter()

if exporter == "onnx-dynamo" and device == "cuda" and "QWEN25ATTENTION" not in os.environ:
os.environ["QWEN25ATTENTION"] = "PACKED"

export_inputs = inputs
with torch_export_patches(
Expand All @@ -131,25 +189,45 @@ def _config_reduction(config, task):
optimize=True,
onnx_plugs=PLUGS,
)
duration = time.perf_counter() - begin

if exporter == "onnx-dynamo":
# onnx-dynamo fails at producing function body with sequences as input / output.
# They are replaced by tensor type one step in the model.
print("-- remove_body_last_input_output_for_loop")
remove_inplace_body_last_input_output_type_for_loop(filename)
print("-- done.")

with open(stat_file, "w") as f:

def fprint(s):
print(s)
f.write(f"{s}\n")

fprint(f"-- export duration: {duration}")
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
if device == "cpu":
providers = providers[1:]
fprint(f"-- checking discrepancies with providers={providers!r}")
sess = onnxruntime.InferenceSession(filename, providers=providers)

fprint(f"-- inputs {string_type(inputs, with_shape=True, with_device=True)}")
fprint(f"-- expected {string_type(expected, with_shape=True, with_device=True)}")
feeds = {k: v.detach().cpu().numpy() for k, v in inputs.items()}
small = sess.run(None, feeds)
diff = max_diff(expected, small[0], hist=[0.1])
fprint(f"-- discrepancies={diff}")

print("-- checking discrepancies")
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
if device == "cpu":
providers = providers[1:]
sess = onnxruntime.InferenceSession(filename, providers=providers)

print(f"-- inputs {string_type(inputs, with_shape=True)}")
feeds = {k: v.detach().cpu().numpy() for k, v in inputs.items()}
small = sess.run(None, feeds)
diff = max_diff(expected, small[0], hist=[0.1])
print(f"-- discrepancies={diff}")

if second_input:
print(f"-- inputs {string_type(big_inputs, with_shape=True)}")
feeds = {k: v.detach().cpu().numpy() for k, v in big_inputs.items()}
big = sess.run(None, feeds)
diff = max_diff(expected_big, big[0], hist=[0.1])
print(f"-- discrepancies={diff}")
if second_input:
fprint("")
fprint(f"-- inputs {string_type(big_inputs, with_shape=True, with_device=True)}")
fprint(
f"-- expected {string_type(expected_big, with_shape=True, with_device=True)}"
)
feeds = {k: v.detach().cpu().numpy() for k, v in big_inputs.items()}
big = sess.run(None, feeds)
diff = max_diff(expected_big, big[0], hist=[0.1])
fprint(f"-- discrepancies={diff}")


def get_parser() -> ArgumentParser:
Expand Down
Loading
Loading