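Improve serialization of HybridCache and tighten related cache/attention helpers

- _flatten_key_value_cache: accept "HybridCache" alongside "DynamicCache" in the class-name check.
- make_hybrid_cache: infer max_cache_len from the key/value shapes only when it is None, assert those dimensions are plain ints in that case, and default sliding_window to max_cache_len after both branches instead of deriving it from the shapes.
- patched_sdpa_attention_forward: add torch._check calls stating that every dimension of query, key and value is positive before calling scaled_dot_product_attention.
- Tests: replace `raise unittest.SkipTest(...)` with `self.skipTest(...)`, make the skip messages explicit, and skip case6 / case6.2 when has_transformers("5.2.99") is true.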

xadupre · xadupre · commit de5958e38acb · 2026-02-23T13:30:49.000+01:00
diff --git a/_unittests/ut_tasks/test_tasks_image_text_to_text.py b/_unittests/ut_tasks/test_tasks_image_text_to_text.py
@@ -61,7 +61,9 @@ def test_image_text_to_text_tiny_gemma3(self):
     def test_image_text_to_text_gemma3_4b_it(self):
         make_hybrid_cache = get_make_hybrid_cache()
         if make_hybrid_cache is None:
-            raise unittest.SkipTest("not implemented yet for transformers>=5")
+            raise unittest.SkipTest(
+                "not implemented yet for transformers>=5 (make_hybrid_cache is None)"
+            )
         mid = "google/gemma-3-4b-it"
         data = get_untrained_model_with_inputs(
             mid,
diff --git a/_unittests/ut_torch_export_patches/test_patch_transformers.py b/_unittests/ut_torch_export_patches/test_patch_transformers.py
@@ -586,7 +586,7 @@ def forward(
         for exporter in ("custom", "onnx-dynamo"):
             # onnx-dynamo needs OpOverload(op='aten.sym_storage_offset' (transformers>=5.0?)
             if exporter == "onnx-dynamo" and not has_onnxscript("0.5.7"):
-                raise unittest.SkipTest("needs onnxscript>=0.5.7")
+                self.skipTest("needs onnxscript>=0.5.7")
             filename = self.get_dump_file(
                 f"test_patched_qwen2_5_vl_vision_attention_forward.{exporter}.onnx"
             )
@@ -640,7 +640,7 @@ def test_qwen2_5_vl_vision_attention_iteration(self):
         )
         for exporter in ("custom", "onnx-dynamo"):
             if exporter == "onnx-dynamo" and aten_sym_storage_offset is None:
-                raise unittest.SkipTest("update onnxscript to make this test run")
+                self.skipTest("update onnxscript to make this test run")
             # onnx-dynamo needs OpOverload(op='aten.sym_storage_offset' (transformers>=5.0?)
             filename = self.get_dump_file(
                 f"test_qwen2_5_vl_vision_attention_iteration.{exporter}.onnx"
@@ -909,7 +909,7 @@ def test_cache_dependant_input_preparation_exporting(self):
             torch.testing.assert_close(eager2, export2)
 
         with self.subTest(case="case2"):
-            raise unittest.SkipTest("torch 2.10+ has probably a bug here.")
+            self.skipTest("torch 2.10+ has probably a bug here.")
             input_ids = torch.randint(0, 16, (2, 8), dtype=torch.int64)
             inputs_embeds = torch.rand((2, 8), dtype=torch.float32)
             cache_position = torch.arange(0, 8, dtype=torch.int64)
@@ -995,15 +995,17 @@ def test_prepare_inputs_for_generation_decoder_llm(self):
 
             with self.subTest(case="case5"):
                 if not has_transformers("4.57"):
-                    raise unittest.SkipTest("transformers 4.57+.")
+                    self.skipTest("This test only works with transformers>=4.57, <5.3.")
                 if has_transformers("5.2.99"):
-                    raise unittest.SkipTest("transformers 5.2+.")
+                    self.skipTest("This test is no longer valid with transformers>=5.3.")
                 with self.assertRaises((AttributeError, TypeError)):
                     model_inputs = model.prepare_inputs_for_generation(
                         input_ids, past_key_values=dynamic_cache
                     )
 
             with self.subTest(case="case6"):
+                if has_transformers("5.2.99"):
+                    self.skipTest("This test is no longer valid with transformers>=5.3.")
                 cache_position = torch.arange(input_ids.shape[-1], dtype=torch.long).to(
                     torch_device
                 )
@@ -1025,6 +1027,8 @@ def test_prepare_inputs_for_generation_decoder_llm(self):
                 )  # we still need the full attention mask!
 
             with self.subTest(case="case6.2"):
+                if has_transformers("5.2.99"):
+                    self.skipTest("This test is no longer valid with transformers>=5.3.")
                 max_cache_len = 10
                 batch_size = 2
                 query_length = input_ids.shape[-1] - init_input_ids.shape[-1]
@@ -1048,7 +1052,7 @@ def test_prepare_inputs_for_generation_decoder_llm(self):
 
             with self.subTest(case="case7"):
                 if not has_transformers("4.57"):
-                    raise unittest.SkipTest("transformers 4.57+.")
+                    self.skipTest("This test only works with transformers>=4.57.")
                 init_inputs_embeds = model.get_input_embeddings()(init_input_ids)
                 model_inputs = model.prepare_inputs_for_generation(
                     input_ids,
diff --git a/onnx_diagnostic/helpers/cache_helper.py b/onnx_diagnostic/helpers/cache_helper.py
@@ -742,16 +742,22 @@ def make_hybrid_cache(
                 not max_batch_size and not max_cache_len
             ), "key_value_pairs is not empty, do not specify max_cache_len and max_batch_size"
             max_batch_size = key_value_pairs[0][0].shape[0]
+            assert max_cache_len is not None or all(
+                isinstance(kv[0].shape[2], int) for kv in key_value_pairs
+            ), (
+                f"Cannot determine max_cache_len with "
+                f"shapes={[kv[0].shape for kv in key_value_pairs]}"
+            )
             sets_of_dim = set(kv[0].shape[2] for kv in key_value_pairs)
             if len(sets_of_dim) == 1:
-                max_cache_len = sets_of_dim.pop()
-                sliding_window = max_cache_len
+                if max_cache_len is None:
+                    max_cache_len = sets_of_dim.pop()
             else:
                 assert (
                     len(sets_of_dim) == 2
                 ), f"Not implemented for more than 2 dimensions {sets_of_dim}"
-                max_cache_len = max(sets_of_dim)
-                sliding_window = min(sets_of_dim)
+                if max_cache_len is None:
+                    max_cache_len = max(sets_of_dim)
                 layer_types = [
                     "full_attention" if i == max_cache_len else "sliding_attention"
                     for i in [kv[0].shape[2] for kv in key_value_pairs]
@@ -760,8 +766,8 @@ def make_hybrid_cache(
             assert (
                 max_batch_size and max_cache_len
             ), "key_value_pairs is empty, max_batch_size and max_cache_len are required"
-            if sliding_window is None:
-                sliding_window = max_cache_len
+        if sliding_window is None:
+            sliding_window = max_cache_len
         _max_cache_len = max_cache_len
         _sliding_window = sliding_window
 
diff --git a/onnx_diagnostic/torch_export_patches/patches/_patch_transformers_attention.py b/onnx_diagnostic/torch_export_patches/patches/_patch_transformers_attention.py
@@ -139,6 +139,18 @@ def patched_sdpa_attention_forward(
         if is_causal is None and attention_mask is not None:
             is_causal = False
         if is_causal is not None:
+            torch._check(query.shape[0] > 0)
+            torch._check(query.shape[1] > 0)
+            torch._check(query.shape[2] > 0)
+            torch._check(query.shape[3] > 0)
+            torch._check(key.shape[0] > 0)
+            torch._check(key.shape[1] > 0)
+            torch._check(key.shape[2] > 0)
+            torch._check(key.shape[3] > 0)
+            torch._check(value.shape[0] > 0)
+            torch._check(value.shape[1] > 0)
+            torch._check(value.shape[2] > 0)
+            torch._check(value.shape[3] > 0)
             return (
                 torch.nn.functional.scaled_dot_product_attention(
                     query,
diff --git a/onnx_diagnostic/torch_export_patches/serialization/transformers_impl.py b/onnx_diagnostic/torch_export_patches/serialization/transformers_impl.py
@@ -61,7 +61,7 @@ def _flatten_key_value_cache(cache: Cache) -> Tuple[List[Any], torch.utils._pytr
     flat = list(itertools.chain.from_iterable(zip(ca.key_cache, ca.value_cache)))
     unique = set(ca.cls_layers) if ca.cls_layers else None
     if (
-        cache.__class__.__name__ != "DynamicCache"
+        cache.__class__.__name__ not in ("DynamicCache", "HybridCache")
         or unique is None
         or (len(unique) == 1 and unique.pop().__name__ == "DynamicLayer")
     ):