More tests about patches

xadupre · xadupre · commit f2805e3513b5 · 2026-02-14T12:51:53.000+01:00
diff --git a/_unittests/ut_helpers/test_args_helper.py b/_unittests/ut_helpers/test_args_helper.py
@@ -1,6 +1,10 @@
 import unittest
 from onnx_diagnostic.ext_test_case import ExtTestCase
-from onnx_diagnostic.helpers.args_helper import get_parsed_args, check_cuda_availability
+from onnx_diagnostic.helpers.args_helper import (
+    get_parsed_args,
+    check_cuda_availability,
+    process_outputname,
+)
 
 
 class TestHelpers(ExtTestCase):
@@ -52,6 +56,10 @@ def test_args_expose(self):
         self.assertEqual(args.repeat, 10)
         self.assertEqual(args.warmup, 5)
 
+    def test_process_outputname(self):
+        self.assertEqual("ggg.g", process_outputname("ggg.g", "hhh.h"))
+        self.assertEqual("hhh.ggg.h", process_outputname("+.ggg", "hhh.h"))
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/_unittests/ut_helpers/test_onnx_helper.py b/_unittests/ut_helpers/test_onnx_helper.py
@@ -860,6 +860,66 @@ def test_make_model_with_local_functions_2(self):
 
         check_model(new_model)
 
+    @hide_stdout()
+    def test_make_model_with_local_functions_3(self):
+        model = oh.make_model(
+            oh.make_graph(
+                [
+                    oh.make_node("Unsqueeze", ["X", "zero"], ["xu1"]),
+                    oh.make_node("Unsqueeze", ["xu1", "un"], ["xu2"]),
+                    oh.make_node("Reshape", ["xu2", "shape1"], ["xm1"]),
+                    oh.make_node("Reshape", ["Y", "shape2"], ["xm2c"]),
+                    oh.make_node("Cast", ["xm2c"], ["xm2"], to=1),
+                    oh.make_node("MatMul", ["xm1", "xm2"], ["xm"]),
+                    oh.make_node("Reshape", ["xm", "shape3"], ["Z"]),
+                ],
+                "dummy",
+                [oh.make_tensor_value_info("X", TFLOAT, [320, 1280])],
+                [oh.make_tensor_value_info("Z", TFLOAT, [3, 5, 320, 640])],
+                [
+                    onh.from_array(
+                        np.random.rand(3, 5, 1280, 640).astype(np.float32), name="Y"
+                    ),
+                    onh.from_array(np.array([0], dtype=np.int64), name="zero"),
+                    onh.from_array(np.array([1], dtype=np.int64), name="un"),
+                    onh.from_array(np.array([1, 320, 1280], dtype=np.int64), name="shape1"),
+                    onh.from_array(np.array([15, 1280, 640], dtype=np.int64), name="shape2"),
+                    onh.from_array(np.array([3, 5, 320, 640], dtype=np.int64), name="shape3"),
+                ],
+            ),
+            opset_imports=[oh.make_opsetid("", 18)],
+            ir_version=9,
+        )
+        for i_node in range(len(model.graph.node) - 1):
+            if i_node == 2:
+                continue
+            node = model.graph.node[i_node]
+            meta = node.metadata_props.add()
+            meta.key = f"source[{i_node}]"
+            meta.value = "LLL"
+        new_model = make_model_with_local_functions(
+            model, "^LLL$", metadata_key_prefix="source[", verbose=1
+        )
+        check_model(model)
+        self.assertEqual(len(new_model.functions), 2)
+        p = pretty_onnx(new_model)
+        self.assertIn("LLL0[local_function]", p)
+        self.assertIn("LLL1[local_function]", p)
+
+        self.assertEqual(["X", "shape1", "un", "zero"], new_model.functions[0].input)
+        self.assertEqual(["xm1"], new_model.functions[0].output)
+        self.assertEqual("LLL0", new_model.functions[0].name)
+        self.assertEqual("local_function", new_model.functions[0].domain)
+        self.assertEqual(len(new_model.functions[0].node), 3)
+
+        self.assertEqual(["Y", "shape2"], new_model.functions[1].input)
+        self.assertEqual(["xm2c"], new_model.functions[1].output)
+        self.assertEqual("LLL1", new_model.functions[1].name)
+        self.assertEqual("local_function", new_model.functions[1].domain)
+        self.assertEqual(len(new_model.functions[1].node), 1)
+
+        check_model(new_model)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/_unittests/ut_torch_export_patches/test_patch_transformers.py b/_unittests/ut_torch_export_patches/test_patch_transformers.py
@@ -19,6 +19,7 @@
 from onnx_diagnostic.torch_models.hghub.hub_api import get_cached_configuration
 from onnx_diagnostic.torch_export_patches import torch_export_patches
 from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str
+from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs
 from onnx_diagnostic.torch_export_patches.patches.patch_transformers import (
     patch_qwen2_5,
     patch_funnel,
@@ -392,6 +393,20 @@ def forward(self, q, k, cos, sin):
             rtol=1,
         )
 
+    @requires_transformers("4.55")
+    @requires_onnxscript("0.6.2")
+    @unittest.skipIf(not patch_qwen2_5, "Qwen25 not part of this transformers")
+    def test_qwen_function_proto(self):
+        from onnx_diagnostic.torch_export_patches.patches._patch_transformers_qwen2_5 import (
+            LoopAttention23,
+            LoopMHAAttention,
+            PackedAttention,
+        )
+
+        LoopMHAAttention.to_function_proto()
+        LoopAttention23.to_function_proto()
+        PackedAttention.to_function_proto()
+
     @requires_transformers("4.55")
     @unittest.skipIf(not patch_qwen2_5, "Qwen25 not part of this transformers")
     def test_patched_qwen2_5_vl_rot_pos_emb(self):
@@ -874,6 +889,166 @@ def test_model_funnel(self):
         got = patched.relative_positional_attention(**inputs)
         self.assertEqualArray(expected, got)
 
+    def test_cache_dependant_input_preparation_exporting(self):
+        from onnx_diagnostic.torch_export_patches.patches._patch_transformers_generation_mixin import (  # noqa: E501
+            patched_GenerationMixin as GenerationMixin,
+        )
+
+        with self.subTest(case="case1"):
+            input_ids = torch.randint(0, 16, (2, 8), dtype=torch.int64)[:, :0]
+            inputs_embeds = torch.rand((2, 8), dtype=torch.float32)
+            cache_position = torch.arange(0, 8, dtype=torch.int64)
+            eager1, eager2 = GenerationMixin()._cache_dependant_input_preparation(
+                input_ids, inputs_embeds, cache_position
+            )
+            export1, export2 = GenerationMixin()._cache_dependant_input_preparation_exporting(
+                input_ids, inputs_embeds, cache_position
+            )
+            torch.testing.assert_close(eager1, export1)
+            torch.testing.assert_close(eager2, export2)
+
+        with self.subTest(case="case2"):
+            input_ids = torch.randint(0, 16, (2, 8), dtype=torch.int64)
+            inputs_embeds = torch.rand((2, 8), dtype=torch.float32)
+            cache_position = torch.arange(0, 8, dtype=torch.int64)
+            eager1, eager2 = GenerationMixin()._cache_dependant_input_preparation(
+                input_ids, inputs_embeds, cache_position
+            )
+            export1, export2 = GenerationMixin()._cache_dependant_input_preparation_exporting(
+                input_ids, inputs_embeds, cache_position
+            )
+            torch.testing.assert_close(eager1, export1)
+            torch.testing.assert_close(eager2, export2)
+
+        with self.subTest(case="case3"):
+            input_ids = torch.randint(0, 16, (2, 12), dtype=torch.int64)
+            inputs_embeds = None
+            cache_position = torch.arange(0, 8, dtype=torch.int64)
+            eager1, eager2 = GenerationMixin()._cache_dependant_input_preparation(
+                input_ids, inputs_embeds, cache_position
+            )
+            export1, export2 = GenerationMixin()._cache_dependant_input_preparation_exporting(
+                input_ids, inputs_embeds, cache_position
+            )
+            torch.testing.assert_close(eager1, export1)
+            torch.testing.assert_close(eager2, export2)
+
+        with self.subTest(case="case4"):
+            input_ids = torch.randint(0, 16, (2, 8), dtype=torch.int64)
+            inputs_embeds = None
+            cache_position = torch.arange(0, 8, dtype=torch.int64)
+            eager1, eager2 = GenerationMixin()._cache_dependant_input_preparation(
+                input_ids, inputs_embeds, cache_position
+            )
+            export1, export2 = GenerationMixin()._cache_dependant_input_preparation_exporting(
+                input_ids, inputs_embeds, cache_position
+            )
+            torch.testing.assert_close(eager1, export1)
+            torch.testing.assert_close(eager2, export2)
+
+    def test_prepare_inputs_for_generation_decoder_llm(self):
+        data = get_untrained_model_with_inputs(
+            "hf-internal-testing/tiny-random-LlamaForCausalLM"
+        )
+        model = data["model"]
+        config = model.config
+        torch_device = "cpu"
+
+        with torch_export_patches(patch_transformers=True):
+            with self.subTest(case="case1"):
+                self.assertTrue("GenerationMixin" in str(model.prepare_inputs_for_generation))
+
+            input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]]).to(torch_device)
+            cache_position = torch.arange(input_ids.shape[1], device=input_ids.device)
+
+            with self.subTest(case="case2"):
+                input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]]).to(torch_device)
+                model_inputs = model.prepare_inputs_for_generation(
+                    input_ids, cache_position=cache_position
+                )
+                self.assertTrue(torch.all(model_inputs["input_ids"] == input_ids))
+
+            with self.subTest(case="case3"):
+                attention_mask = torch.tensor([[1, 1, 1], [1, 1, 1]]).to(torch_device)
+                model_inputs = model.prepare_inputs_for_generation(
+                    input_ids, attention_mask=attention_mask, cache_position=cache_position
+                )
+                self.assertTrue(torch.all(model_inputs["attention_mask"] == attention_mask))
+                self.assertTrue(model_inputs["position_ids"].shape == input_ids.shape)
+
+            with self.subTest(case="case4"):
+                self.assertFalse("use_cache" in model_inputs)
+                model_inputs = model.prepare_inputs_for_generation(
+                    input_ids, use_cache=True, foo="bar", cache_position=cache_position
+                )
+                self.assertTrue(model_inputs["use_cache"] is True)
+                self.assertTrue(model_inputs["foo"] == "bar")
+
+            with self.subTest(case="case5"):
+                init_input_ids = input_ids[:, :2]
+                dynamic_cache = transformers.cache_utils.DynamicCache(config=config)
+                dynamic_cache = model(
+                    init_input_ids, past_key_values=dynamic_cache
+                ).past_key_values
+                with self.assertRaises((AttributeError, TypeError)):
+                    model_inputs = model.prepare_inputs_for_generation(
+                        input_ids, past_key_values=dynamic_cache
+                    )
+
+            with self.subTest(case="case6"):
+                cache_position = torch.arange(input_ids.shape[-1], dtype=torch.long).to(
+                    torch_device
+                )
+                cache_position = cache_position[dynamic_cache.get_seq_length() :]
+                model_inputs = model.prepare_inputs_for_generation(
+                    input_ids,
+                    past_key_values=dynamic_cache,
+                    cache_position=cache_position,
+                    attention_mask=attention_mask,
+                )
+                self.assertTrue("past_key_values" in model_inputs)
+                self.assertTrue(torch.all(model_inputs["cache_position"] == cache_position))
+                self.assertTrue(
+                    model_inputs["input_ids"].shape[-1] == 1
+                )  # 1 = 3 fed tokens - 2 tokens in the cache
+                self.assertTrue(model_inputs["position_ids"].shape[-1] == 1)
+                self.assertTrue(
+                    model_inputs["attention_mask"].shape[-1] == 3
+                )  # we still need the full attention mask!
+
+            with self.subTest(case="case6.2"):
+                max_cache_len = 10
+                batch_size = 2
+                query_length = input_ids.shape[-1] - init_input_ids.shape[-1]
+                static_cache = transformers.cache_utils.StaticCache(
+                    config=config, max_cache_len=max_cache_len
+                )
+                static_cache = model(
+                    init_input_ids, past_key_values=static_cache
+                ).past_key_values
+                model_inputs = model.prepare_inputs_for_generation(
+                    input_ids,
+                    past_key_values=static_cache,
+                    cache_position=cache_position,
+                    attention_mask=attention_mask,
+                )
+                self.assertTrue("past_key_values" in model_inputs)
+                self.assertTrue(
+                    list(model_inputs["attention_mask"].shape)
+                    == [batch_size, 1, query_length, max_cache_len]
+                )
+
+            with self.subTest(case="case7"):
+                init_inputs_embeds = model.get_input_embeddings()(init_input_ids)
+                model_inputs = model.prepare_inputs_for_generation(
+                    input_ids,
+                    past_key_values=dynamic_cache,
+                    inputs_embeds=init_inputs_embeds,
+                    cache_position=cache_position,
+                )
+                self.assertTrue(model_inputs["input_ids"] is not None)
+                self.assertTrue(model_inputs["inputs_embeds"] is None)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/onnx_diagnostic/_command_lines_parser.py b/onnx_diagnostic/_command_lines_parser.py
@@ -47,6 +47,7 @@ def get_parser_dot() -> ArgumentParser:
 
 def _cmd_dot(argv: List[Any]):
     import subprocess
+    from .helpers.args_helper import process_outputname
     from .helpers.dot_helper import to_dot
 
     parser = get_parser_dot()
@@ -58,15 +59,17 @@ def _cmd_dot(argv: List[Any]):
         print("-- converts into dot")
     dot = to_dot(onx)
     if args.output:
+        outname = process_outputname(args.output, args.input)
         if args.verbose:
-            print(f"-- saves into {args.output}")
-        with open(args.output, "w") as f:
+            print(f"-- saves into {outname!r}")
+        with open(outname, "w") as f:
             f.write(dot)
     else:
         print(dot)
     if args.run:
         assert args.output, "Cannot run dot without an output file."
-        cmds = ["dot", f"-T{args.run}", args.output, "-o", f"{args.output}.{args.run}"]
+        outname = process_outputname(args.output, args.input)
+        cmds = ["dot", f"-T{args.run}", outname, "-o", f"{outname}.{args.run}"]
         if args.verbose:
             print(f"-- run {' '.join(cmds)}")
         p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@@ -1553,10 +1556,11 @@ def _cmd_optimize(argv: List[Any]):
     parser = get_parser_optimize()
     args = parser.parse_args(argv[1:])
 
+    from .helpers.args_helper import process_outputname
     from .helpers.optim_helper import optimize_model
 
     output = (
-        args.output
+        process_outputname(args.output, args.input)
         if args.output
         else f"{os.path.splitext(args.input)[0]}.o-{args.algorithm}.onnx"
     )
@@ -1586,10 +1590,21 @@ def get_parser_partition() -> ArgumentParser:
             The regular may match the following values,
             'model.layers.0.forward', 'model.layers.1.forward', ...
             A local function will be created for each distinct layer.
+
+            Example:
+
+                python -m onnx_diagnostic partition \\
+                        model.onnx +.part -v 1 -r "model.layers.0.s.*"
             """),
     )
     parser.add_argument("input", help="input model")
-    parser.add_argument("output", help="output model")
+    parser.add_argument(
+        "output",
+        help=textwrap.dedent("""
+            output model, an expression like '+.part'
+            inserts '.part' just before the extension
+            """).strip("\n"),
+    )
     parser.add_argument(
         "-r",
         "--regex",
@@ -1619,6 +1634,7 @@ def get_parser_partition() -> ArgumentParser:
 
 
 def _cmd_partition(argv: List[Any]):
+    from .helpers.args_helper import process_outputname
     from .helpers.onnx_helper import make_model_with_local_functions
 
     parser = get_parser_partition()
@@ -1635,9 +1651,10 @@ def _cmd_partition(argv: List[Any]):
         metadata_key_prefix=tuple(args.meta_prefix.split(",")),
         verbose=args.verbose,
     )
+    outname = process_outputname(args.output, args.input)
     if args.verbose:
-        print(f"-- save into {args.output!r}")
-    onnx.save(onx2, args.output)
+        print(f"-- save into {outname!r}")
+    onnx.save(onx2, outname)
     if args.verbose:
         print("-- done")
 
diff --git a/onnx_diagnostic/helpers/args_helper.py b/onnx_diagnostic/helpers/args_helper.py
@@ -1,3 +1,4 @@
+import os
 import subprocess
 from argparse import ArgumentParser, Namespace
 from typing import Dict, List, Optional, Tuple, Union
@@ -131,3 +132,14 @@ def get_parsed_args(
     if update:
         res.__dict__.update(update)
     return res
+
+
+def process_outputname(output_name: str, input_name: str) -> str:
+    """
+    Returns *output_name* unchanged unless it starts with ``'+'``: what follows
+    ``'+'`` is then inserted between the root and the extension of *input_name*.
+    """
+    if not output_name.startswith("+"):
+        return output_name
+    name, ext = os.path.splitext(input_name)
+    return f"{name}{output_name[1:]}{ext}"
diff --git a/onnx_diagnostic/helpers/onnx_helper.py b/onnx_diagnostic/helpers/onnx_helper.py
diff --git a/onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py b/onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py