Skip to content

Commit f4dc9d1

Browse files
committed
final fixes
1 parent 782ecfb commit f4dc9d1

3 files changed

Lines changed: 41 additions & 34 deletions

File tree

_scripts/export_qwen25_vl_visual.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,17 @@
1818
.. code-block:: bash
1919
2020
python export_qwen25_vl_visual.py -m Qwen/Qwen2.5-VL-7B-Instruct --device cpu --dtype float32 --exporter onnx-dynamo --pretrained --second-input
21+
22+
Attention
23+
+++++++++
24+
25+
The attention is implemented either with ``MultiHeadAttention`` in a loop or with ``PackedMultiHeadAttention``.
26+
The choice is made based on the device. It is possible to override this by setting
27+
the environment variable ``QWEN25ATTENTION`` to one of:
28+
29+
* ``PACKED``: PackedMultiHeadAttention
30+
* ``LOOPMHA``: Loop over MultiHeadAttention
31+
* ``LOOPA24``: Loop over Attention(24), needs opset 23 or 24.
2132
"""
2233

2334
import os
@@ -145,11 +156,16 @@ def _config_reduction(config, task):
145156
)
146157

147158
prefix = simplify_model_id_for_a_filename(model_id)
159+
if "QWEN25ATTENTION" in os.environ:
160+
prefix = f"{prefix}.{os.environ['QWEN25ATTENTION']}"
148161
filename = f"model.{prefix}.visual.{device}.{dtype}.{exporter}.onnx"
149162
print(f"-- export in {filename!r}")
150163
stat_file = filename.replace(".onnx", ".stats")
151164
begin = time.perf_counter()
152165

166+
if exporter == "onnx-dynamo" and device == "cuda" and "QWEN25ATTENTION" not in os.environ:
167+
os.environ["QWEN25ATTENTION"] = "PACKED"
168+
153169
export_inputs = inputs
154170
with torch_export_patches(
155171
patch_torch=False,
@@ -189,23 +205,25 @@ def fprint(s):
189205
f.write(f"{s}\n")
190206

191207
fprint(f"-- export duration: {duration}")
192-
fprint("-- checking discrepancies")
193208
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
194209
if device == "cpu":
195210
providers = providers[1:]
211+
fprint(f"-- checking discrepancies with providers={providers!r}")
196212
sess = onnxruntime.InferenceSession(filename, providers=providers)
197213

198-
fprint(f"-- inputs {string_type(inputs, with_shape=True)}")
199-
fprint(f"-- expected {string_type(expected, with_shape=True)}")
214+
fprint(f"-- inputs {string_type(inputs, with_shape=True, with_device=True)}")
215+
fprint(f"-- expected {string_type(expected, with_shape=True, with_device=True)}")
200216
feeds = {k: v.detach().cpu().numpy() for k, v in inputs.items()}
201217
small = sess.run(None, feeds)
202218
diff = max_diff(expected, small[0], hist=[0.1])
203219
fprint(f"-- discrepancies={diff}")
204220

205221
if second_input:
206222
fprint("")
207-
fprint(f"-- inputs {string_type(big_inputs, with_shape=True)}")
208-
fprint(f"-- expected {string_type(expected_big, with_shape=True)}")
223+
fprint(f"-- inputs {string_type(big_inputs, with_shape=True, with_device=True)}")
224+
fprint(
225+
f"-- expected {string_type(expected_big, with_shape=True, with_device=True)}"
226+
)
209227
feeds = {k: v.detach().cpu().numpy() for k, v in big_inputs.items()}
210228
big = sess.run(None, feeds)
211229
diff = max_diff(expected_big, big[0], hist=[0.1])

_scripts/investigate.ipynb

Lines changed: 3 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -214,33 +214,10 @@
214214
},
215215
{
216216
"cell_type": "code",
217-
"execution_count": null,
217+
"execution_count": 12,
218218
"id": "e0e65bcd",
219219
"metadata": {},
220-
"outputs": [
221-
{
222-
"name": "stdout",
223-
"output_type": "stream",
224-
"text": [
225-
"tensor_type {\n",
226-
" elem_type: 1\n",
227-
"}\n",
228-
" ['ByteSize', 'Clear', 'ClearExtension', 'ClearField', 'CopyFrom', 'DESCRIPTOR', 'DiscardUnknownFields', 'FindInitializationErrors', 'FromString', 'HasExtension', 'HasField', 'IsInitialized', 'ListFields', 'Map', 'MergeFrom', 'MergeFromString', 'Opaque', 'Optional', 'ParseFromString', 'Sequence', 'SerializePartialToString', 'SerializeToString', 'SetInParent', 'SparseTensor', 'Tensor', 'UnknownFields', 'WhichOneof', '_CheckCalledFromGeneratedFile', '_ListFieldsItemKey', '_SetListener', '__class__', '__contains__', '__deepcopy__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__unicode__', 'denotation', 'map_type', 'opaque_type', 'optional_type', 'sequence_type', 'sparse_tensor_type', 'tensor_type']\n"
229-
]
230-
},
231-
{
232-
"ename": "AttributeError",
233-
"evalue": "copy_from",
234-
"output_type": "error",
235-
"traceback": [
236-
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
237-
"\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
238-
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 11\u001b[39m\n\u001b[32m 7\u001b[39m g.input[-\u001b[32m1\u001b[39m].type.copy_from(onnx.TypeProto())\n\u001b[32m 8\u001b[39m g.output[-\u001b[32m1\u001b[39m].type.copy_from(onnx.TypeProto())\n\u001b[32m---> \u001b[39m\u001b[32m11\u001b[39m \u001b[43mremove_inplace_body_last_input_output_type_for_loop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[43m)\u001b[49m\n",
239-
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 7\u001b[39m, in \u001b[36mremove_inplace_body_last_input_output_type_for_loop\u001b[39m\u001b[34m(filename)\u001b[39m\n\u001b[32m 5\u001b[39m g = node.attribute[\u001b[32m0\u001b[39m].g\n\u001b[32m 6\u001b[39m \u001b[38;5;28mprint\u001b[39m(g.input[-\u001b[32m1\u001b[39m].type, \u001b[38;5;28mdir\u001b[39m(g.input[-\u001b[32m1\u001b[39m].type))\n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m \u001b[43mg\u001b[49m\u001b[43m.\u001b[49m\u001b[43minput\u001b[49m\u001b[43m[\u001b[49m\u001b[43m-\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtype\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcopy_from\u001b[49m(onnx.TypeProto())\n\u001b[32m 8\u001b[39m g.output[-\u001b[32m1\u001b[39m].type.copy_from(onnx.TypeProto())\n",
240-
"\u001b[31mAttributeError\u001b[39m: copy_from"
241-
]
242-
}
243-
],
220+
"outputs": [],
244221
"source": [
245222
"def remove_inplace_body_last_input_output_type_for_loop(filename: str):\n",
246223
" model = onnx.load(filename, load_external_data=False)\n",
@@ -249,6 +226,7 @@
249226
" g = node.attribute[0].g\n",
250227
" g.input[-1].type.CopyFrom(onnx.TypeProto())\n",
251228
" g.output[-1].type.CopyFrom(onnx.TypeProto())\n",
229+
" onnx.save(model, filename, save_as_external_data=False)\n",
252230
"\n",
253231
"\n",
254232
"remove_inplace_body_last_input_output_type_for_loop(model_name)"

onnx_diagnostic/export/api.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ def to_onnx(
6464
exporter_kwargs: Optional[Dict[str, Any]] = None,
6565
save_ep: Optional[str] = None,
6666
optimize: bool = True,
67+
optimizer_for_ort: bool = True,
6768
use_control_flow_dispatcher: bool = False,
6869
onnx_plugs: Optional[List[EagerDirectReplacementWithOnnx]] = None,
6970
inline: bool = True,
@@ -88,6 +89,7 @@ def to_onnx(
8889
:param exporter_kwargs: additional parameters sent to the exporter
8990
:param save_ep: saves the exported program
9091
:param optimize: optimizes the model
92+
:param optimizer_for_ort: optimizes the model for onnxruntime
9193
:param use_control_flow_dispatcher: use the dispatcher created to support
9294
custom loops (see :func:`onnx_diagnostic.export.control_flow_onnx.loop_for_onnx`)
9395
:param onnx_plugs: the code was modified to replace some parts with onnx translation
@@ -126,8 +128,10 @@ def to_onnx(
126128
options = None
127129
if exporter_kwargs is not None:
128130
options = exporter_kwargs.pop("options", None)
129-
if options is None:
130-
options = OptimizationOptions(patterns="default+onnxruntime")
131+
if options is None and optimize:
132+
options = OptimizationOptions(
133+
patterns="default+onnxruntime" if optimizer_for_ort else "default"
134+
)
131135
main_dispatcher = (
132136
get_main_dispatcher(use_control_flow_dispatcher, onnx_plugs)
133137
if onnx_plugs or use_control_flow_dispatcher
@@ -161,6 +165,9 @@ def to_onnx(
161165
assert (
162166
not output_dynamic_shapes
163167
), f"output_dynamic_shapes not supported for exporter={exporter!r}"
168+
assert (
169+
optimize
170+
), f"torch.onnx.export always optimizes the model but optimize={optimize}"
164171
custom_translation_table = {}
165172
if onnx_plugs:
166173
for plug in onnx_plugs:
@@ -180,7 +187,7 @@ def to_onnx(
180187
custom_translation_table=custom_translation_table,
181188
**(exporter_kwargs or {}),
182189
)
183-
if not inline and optimize:
190+
if not inline and optimize and optimizer_for_ort:
184191
ort_fusions.optimize_for_ort(epo.model)
185192

186193
if onnx_plugs:
@@ -207,7 +214,7 @@ def to_onnx(
207214
common_passes.InlinePass()(epo.model)
208215
common_passes.RemoveUnusedOpsetsPass()(epo.model)
209216

210-
if inline and optimize:
217+
if inline and optimize and optimizer_for_ort:
211218
ort_fusions.optimize_for_ort(epo.model)
212219
if filename:
213220
epo.save(filename, external_data=True)
@@ -232,6 +239,10 @@ def to_onnx(
232239
f"Only a specified set of inputs is supported for exporter={exporter!r}, "
233240
f"but it is {list(kwargs)}" # type: ignore[arg-type]
234241
)
242+
assert optimizer_for_ort and optimize, (
243+
f"ModelBuilder only produces model optimized for onnxruntime but "
244+
f"optimizer_for_ort={optimizer_for_ort} and optimize={optimize}"
245+
)
235246
flat_inputs = flatten_object(kwargs, drop_keys=True)
236247
first = flat_inputs[0]
237248
first_float = [

0 commit comments

Comments
 (0)