Commit 2307425 (parent: 807930e)

fix(attention): Another fix attempt for dynamo issues with torch compile and fa4.

2 files changed: 26 additions & 11 deletions

src/modalities/models/model_factory.py
Lines changed: 21 additions & 7 deletions
@@ -63,13 +63,18 @@ class ModelFactory:
     """Model factory class to create models."""

     @staticmethod
-    def _requires_graph_break_friendly_compile(module: nn.Module) -> bool:
+    def _requires_eager_execution(module: nn.Module) -> bool:
         if isinstance(module, GPT2Block):
             return module.attn.attention_impl == AttentionImplementation.DAO_FLASH_V4

         attention_impl = getattr(module, "attention_impl", None)
         return attention_impl == AttentionImplementation.DAO_FLASH_V4

+    # TODO remove?
+    # @staticmethod
+    # def _requires_graph_break_friendly_compile(module: nn.Module) -> bool:
+    #     return ModelFactory._requires_eager_execution(module)
+
     @staticmethod
     def _is_model_on_meta_device(model: nn.Module) -> bool:
         """
@@ -410,15 +415,24 @@ def get_parent_module_and_child_name(child_module: nn.Module, model: nn.Module)

         for _, module in model.named_modules():
             if isinstance(module, block_types):
-                options = {"trace.enabled": True} if debug else {}
-                compiled_fullgraph = fullgraph
-                if compiled_fullgraph and ModelFactory._requires_graph_break_friendly_compile(module):
-                    compiled_fullgraph = False
+                if ModelFactory._requires_eager_execution(module):
                     logger.warning(
-                        "Disabling `fullgraph=True` for `%s` because FlashAttention-4 currently graph-breaks under "
-                        "torch.compile when tracing into flash_attn.cute internals.",
+                        "Skipping `torch.compile` for `%s` because FlashAttention-4 currently graph-breaks under "
+                        "TorchDynamo when tracing into flash_attn.cute internals.",
                         module.__class__.__name__,
                     )
+                    continue
+
+                options = {"trace.enabled": True} if debug else {}
+                compiled_fullgraph = fullgraph
+                # TODO remove?
+                # if compiled_fullgraph and ModelFactory._requires_graph_break_friendly_compile(module):
+                #     compiled_fullgraph = False
+                #     logger.warning(
+                #         "Disabling `fullgraph=True` for `%s` because FlashAttention-4 currently graph-breaks under "
+                #         "torch.compile when tracing into flash_attn.cute internals.",
+                #         module.__class__.__name__,
+                #     )

                 compiled_module = torch.compile(module, fullgraph=compiled_fullgraph, options=options)
                 parent_module, child_name = get_parent_module_and_child_name(child_module=module, model=model)
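
In effect, the compile loop now leaves FA4 blocks fully eager instead of compiling them with `fullgraph=False` as the previous attempt did. Below is a minimal, self-contained sketch of the resulting behavior; `DummyBlock` and the string markers are illustrative stand-ins, not the repository's actual types:

```python
import torch
import torch.nn as nn

# Illustrative stand-in for AttentionImplementation.DAO_FLASH_V4.
DAO_FLASH_V4 = "dao_flash_v4"


class DummyBlock(nn.Module):
    """Toy transformer block carrying the attention_impl marker."""

    def __init__(self, attention_impl: str) -> None:
        super().__init__()
        self.attention_impl = attention_impl
        self.linear = nn.Linear(4, 4)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x)


def requires_eager_execution(module: nn.Module) -> bool:
    # Mirrors the patched check: FA4 blocks are skipped entirely rather
    # than compiled with fullgraph=False.
    return getattr(module, "attention_impl", None) == DAO_FLASH_V4


blocks = [DummyBlock(DAO_FLASH_V4), DummyBlock("pytorch_flash_attention")]
compiled = [
    block if requires_eager_execution(block) else torch.compile(block, fullgraph=True)
    for block in blocks
]
# compiled[0] is the untouched eager module; compiled[1] is an OptimizedModule.
```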

tests/test_torch_compile.py
Lines changed: 5 additions & 4 deletions
@@ -106,18 +106,19 @@ def test_get_compiled_model_empty_block_names(gpt2_model: GPT2LLM) -> None:


 @pytest.mark.skipif(not is_flash_attn_v4_available(), reason="FA4 not installed")
-def test_get_compiled_model_disables_fullgraph_for_fa4(monkeypatch: MonkeyPatch, gpt2_model: GPT2LLM) -> None:
-    recorded_fullgraph_values: list[bool] = []
+def test_get_compiled_model_skips_compile_for_fa4(monkeypatch: MonkeyPatch, gpt2_model: GPT2LLM) -> None:
+    compile_call_count = 0

     for block in gpt2_model.transformer.h.values():
         block.attn.attention_impl = AttentionImplementation.DAO_FLASH_V4

     def fake_compile(module: nn.Module, fullgraph: bool, options: dict[str, object]) -> nn.Module:
-        recorded_fullgraph_values.append(fullgraph)
+        nonlocal compile_call_count
+        compile_call_count += 1
         return module

     monkeypatch.setattr(torch, "compile", fake_compile)

     ModelFactory.get_compiled_model(gpt2_model, ["GPT2Block"], fullgraph=True)

-    assert recorded_fullgraph_values == [False] * len(gpt2_model.transformer.h)
+    assert compile_call_count == 0
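
A possible refinement, not part of this commit: instead of skipping compilation of the whole block, the FA4 kernel call alone could be excluded from tracing with PyTorch's `torch.compiler.disable`, letting the rest of the block compile with `fullgraph=False`. A hedged sketch; `fa4_attention` is a hypothetical wrapper, not the repository's API:

```python
import torch
import torch.nn.functional as F


@torch.compiler.disable
def fa4_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in for the flash_attn.cute FA4 kernel call.
    # The decorator keeps TorchDynamo from tracing into this function,
    # producing a clean graph break instead of a Dynamo error, so the
    # enclosing block could still be compiled (with fullgraph=False).
    return F.scaled_dot_product_attention(q, k, v)
```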
