[tests] fix anyflow tests (#13855)

sayakpaul · dg845 · web-flow · commit c33abfcc8d40 · 2026-06-04T14:41:31.000+05:30
* fix anyflow tests
* [tests] fix anyflow tests layerwise casting (#13863)

  Fix AnyFlow FAR causal transformer training layerwise / mixed precision tests

---------

Co-authored-by: dg845 <58458699+dg845@users.noreply.github.com>
diff --git a/src/diffusers/models/transformers/transformer_anyflow_far.py b/src/diffusers/models/transformers/transformer_anyflow_far.py
@@ -111,6 +111,7 @@ def __call__(
 
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states
+        target_dtype = hidden_states.dtype  # Effective compute dtype
 
         query = attn.to_q(hidden_states)
         key = attn.to_k(encoder_hidden_states)
@@ -121,6 +122,11 @@ def __call__(
         if attn.norm_k is not None:
             key = attn.norm_k(key)
 
+        # norm_q and norm_k upcast query and key to FP32 due to the use of RMSNorm, so cast them back to the effective
+        # compute dtype.
+        query = query.to(target_dtype)
+        key = key.to(target_dtype)
+
         # Layout (B, H, L, D) is required by KV-cache slicing and rotary application.
         query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
         key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
diff --git a/tests/models/transformers/test_models_transformer_anyflow_far.py b/tests/models/transformers/test_models_transformer_anyflow_far.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
 import pytest
 import torch
 
@@ -46,7 +44,7 @@ def model_class(self):
 
     @property
     def output_shape(self) -> tuple[int, ...]:
-        return (1, 2, 4, 16, 16)
+        return (4, 4, 16, 16)
 
     @property
     def input_shape(self) -> tuple[int, ...]:
@@ -137,15 +135,12 @@ def test_gradient_checkpointing_is_applied(self):
     # GPU-only (`torch.nn.attention.flex_attention` raises NotImplementedError on CPU). The
     # bidi transformer test file covers training on the SDPA path; FAR training correctness
     # is exercised end-to-end on H200 via the pipeline replay (L2=0 against NVlabs/AnyFlow).
-    @unittest.skipIf(torch_device == "cpu", "FlexAttention has no CPU backward kernel.")
     def test_training(self):
         super().test_training()
 
-    @unittest.skipIf(torch_device == "cpu", "FlexAttention has no CPU backward kernel.")
     def test_training_with_ema(self):
         super().test_training_with_ema()
 
-    @unittest.skipIf(torch_device == "cpu", "FlexAttention has no CPU backward kernel.")
     def test_gradient_checkpointing_equivalence(self, loss_tolerance=1e-5, param_grad_tol=5e-5, skip=None):
         super().test_gradient_checkpointing_equivalence(loss_tolerance, param_grad_tol, skip)
 
@@ -186,7 +181,7 @@ def test_compile_works_with_aot(self, tmp_path):
         super().test_compile_works_with_aot(tmp_path)
 
 
-class AnyFlowCausalAttnProcessorTest(unittest.TestCase):
+class TestAnyFlowCausalAttnProcessor:
     """Stand-alone smoke tests for the FAR causal attention processor.
 
     These cover behaviors not reached by the generated model mixins:
@@ -196,7 +191,7 @@ class AnyFlowCausalAttnProcessorTest(unittest.TestCase):
 
     def test_default_backend_is_flex(self):
         processor = AnyFlowCausalAttnProcessor()
-        self.assertEqual(processor._attention_backend, "flex")
+        assert processor._attention_backend == "flex"
 
     def test_unsupported_backend_raises(self):
         processor = AnyFlowCausalAttnProcessor()
@@ -217,10 +212,10 @@ def to_v(self, x):
 
             to_out = [lambda x: x, lambda x: x]
 
-        with self.assertRaises(ValueError):
+        with pytest.raises(ValueError):
             processor(_DummyAttn(), torch.zeros(1, 4, 4))
 
     def test_output_dataclass_exposed(self):
         # Downstream type-checking + autodoc rely on these attributes existing.
-        self.assertTrue(hasattr(AnyFlowFARTransformerOutput, "sample"))
-        self.assertTrue(hasattr(AnyFlowFARTransformerOutput, "kv_cache"))
+        assert hasattr(AnyFlowFARTransformerOutput, "sample")
+        assert hasattr(AnyFlowFARTransformerOutput, "kv_cache")