huggingface
diff --git a/tests/models/transformers/test_models_transformer_lumina.py b/tests/models/transformers/test_models_transformer_lumina.py
Lines changed: 45 additions & 61 deletions
diff --git a/tests/models/transformers/test_models_transformer_lumina2.py b/tests/models/transformers/test_models_transformer_lumina2.py
Lines changed: 42 additions & 34 deletions
diff --git a/tests/models/transformers/test_models_transformer_mochi.py b/tests/models/transformers/test_models_transformer_mochi.py
Lines changed: 48 additions & 34 deletions
@@ -13,85 +13,45 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
 import torch
 
 from diffusers import LuminaNextDiT2DModel
+from diffusers.utils.torch_utils import randn_tensor
 
-from ...testing_utils import (
-    enable_full_determinism,
-    torch_device,
+from ...testing_utils import enable_full_determinism, torch_device
+from ..testing_utils import (
+    BaseModelTesterConfig,
+    ModelTesterMixin,
+    TrainingTesterMixin,
 )
-from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class LuminaNextDiT2DModelTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = LuminaNextDiT2DModel
-    main_input_name = "hidden_states"
-    uses_custom_attn_processor = True
-
+class LuminaNextDiTTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self):
-        """
-        Args:
-            None
-        Returns:
-            Dict: Dictionary of dummy input tensors
-        """
-        batch_size = 2  # N
-        num_channels = 4  # C
-        height = width = 16  # H, W
-        embedding_dim = 32  # D
-        sequence_length = 16  # L
-
-        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
-        timestep = torch.rand(size=(batch_size,)).to(torch_device)
-        encoder_mask = torch.randn(size=(batch_size, sequence_length)).to(torch_device)
-        image_rotary_emb = torch.randn((384, 384, 4)).to(torch_device)
+    def model_class(self):
+        return LuminaNextDiT2DModel
 
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "timestep": timestep,
-            "encoder_mask": encoder_mask,
-            "image_rotary_emb": image_rotary_emb,
-            "cross_attention_kwargs": {},
-        }
+    @property
+    def main_input_name(self) -> str:
+        """Return the name of the model's main input, ``"hidden_states"``."""
+        return "hidden_states"
 
     @property
-    def input_shape(self):
-        """
-        Args:
-            None
-        Returns:
-            Tuple: (int, int, int)
-        """
+    def output_shape(self) -> tuple:
         return (4, 16, 16)
 
     @property
-    def output_shape(self):
-        """
-        Args:
-            None
-        Returns:
-            Tuple: (int, int, int)
-        """
+    def input_shape(self) -> tuple:
         return (4, 16, 16)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        """
-        Args:
-            None
+    @property
+    def generator(self):
+        """Return a new CPU ``torch.Generator`` seeded with 0.
+
+        A fresh generator is constructed on every access, so each use of
+        ``self.generator`` starts from the same seeded state.
+        """
+        return torch.Generator("cpu").manual_seed(0)
 
-        Returns:
-            Tuple: (Dict, Dict)
-        """
-        init_dict = {
+    def get_init_dict(self) -> dict:
+        return {
             "sample_size": 16,
             "patch_size": 2,
             "in_channels": 4,
@@ -108,5 +68,29 @@ def prepare_init_args_and_inputs_for_common(self):
             "scaling_factor": 1.0,
         }
 
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
+    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
+        """Build the dictionary of dummy inputs used by this tester config.
+
+        Args:
+            batch_size: Leading dimension of ``hidden_states``,
+                ``encoder_hidden_states``, ``timestep`` and ``encoder_mask``.
+
+        Returns:
+            A dict with the keys ``hidden_states``, ``encoder_hidden_states``,
+            ``timestep``, ``encoder_mask``, ``image_rotary_emb`` and
+            ``cross_attention_kwargs``.
+        """
+        # NOTE(review): ``cross_attention_kwargs`` below is a plain empty dict, so
+        # not every value is a tensor even though the return annotation says
+        # ``dict[str, torch.Tensor]``.
+        num_channels = 4
+        height = width = 16
+        embedding_dim = 32
+        sequence_length = 16
+
+        # Each tensor is requested with its own ``self.generator`` access.
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, num_channels, height, width), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
+            ),
+            # Sampled with the generator first, then moved to the test device.
+            "timestep": torch.rand(size=(batch_size,), generator=self.generator).to(torch_device),
+            "encoder_mask": randn_tensor((batch_size, sequence_length), generator=self.generator, device=torch_device),
+            # Fixed shape; does not depend on ``batch_size``.
+            "image_rotary_emb": randn_tensor((384, 384, 4), generator=self.generator, device=torch_device),
+            "cross_attention_kwargs": {},
+        }
+
+
+class TestLuminaNextDiT(LuminaNextDiTTesterConfig, ModelTesterMixin):
+    """Combine the LuminaNextDiT tester config with ``ModelTesterMixin``; defines nothing of its own."""
+
+    pass
+
+
+class TestLuminaNextDiTTraining(LuminaNextDiTTesterConfig, TrainingTesterMixin):
+    """Combine the LuminaNextDiT tester config with ``TrainingTesterMixin``; defines nothing of its own."""
+
+    pass
@@ -13,57 +13,45 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
 import torch
 
 from diffusers import Lumina2Transformer2DModel
+from diffusers.utils.torch_utils import randn_tensor
 
-from ...testing_utils import (
-    enable_full_determinism,
-    torch_device,
+from ...testing_utils import enable_full_determinism, torch_device
+from ..testing_utils import (
+    BaseModelTesterConfig,
+    ModelTesterMixin,
+    TrainingTesterMixin,
 )
-from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class Lumina2Transformer2DModelTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = Lumina2Transformer2DModel
-    main_input_name = "hidden_states"
-    uses_custom_attn_processor = True
-
+class Lumina2TransformerTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self):
-        batch_size = 2  # N
-        num_channels = 4  # C
-        height = width = 16  # H, W
-        embedding_dim = 32  # D
-        sequence_length = 16  # L
-
-        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
-        timestep = torch.rand(size=(batch_size,)).to(torch_device)
-        attention_mask = torch.ones(size=(batch_size, sequence_length), dtype=torch.bool).to(torch_device)
+    def model_class(self):
+        return Lumina2Transformer2DModel
 
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "timestep": timestep,
-            "encoder_attention_mask": attention_mask,
-        }
+    @property
+    def main_input_name(self) -> str:
+        """Return the name of the model's main input, ``"hidden_states"``."""
+        return "hidden_states"
 
     @property
-    def input_shape(self):
+    def output_shape(self) -> tuple:
         return (4, 16, 16)
 
     @property
-    def output_shape(self):
+    def input_shape(self) -> tuple:
         return (4, 16, 16)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    @property
+    def generator(self):
+        """Return a new CPU ``torch.Generator`` seeded with 0.
+
+        A fresh generator is constructed on every access, so each use of
+        ``self.generator`` starts from the same seeded state.
+        """
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict:
+        return {
             "sample_size": 16,
             "patch_size": 2,
             "in_channels": 4,
@@ -81,9 +69,29 @@ def prepare_init_args_and_inputs_for_common(self):
             "cap_feat_dim": 32,
         }
 
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
+    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
+        """Build the dictionary of dummy inputs used by this tester config.
+
+        Args:
+            batch_size: Leading dimension of every returned tensor.
+
+        Returns:
+            A dict with the keys ``hidden_states``, ``encoder_hidden_states``,
+            ``timestep`` and ``encoder_attention_mask``.
+        """
+        num_channels = 4
+        height = width = 16
+        embedding_dim = 32
+        sequence_length = 16
+
+        # Each random tensor is requested with its own ``self.generator`` access.
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, num_channels, height, width), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
+            ),
+            # Sampled with the generator first, then moved to the test device.
+            "timestep": torch.rand(size=(batch_size,), generator=self.generator).to(torch_device),
+            # All-True boolean mask created directly on the test device.
+            "encoder_attention_mask": torch.ones((batch_size, sequence_length), dtype=torch.bool, device=torch_device),
+        }
+
+
+class TestLumina2Transformer(Lumina2TransformerTesterConfig, ModelTesterMixin):
+    """Combine the Lumina2 tester config with ``ModelTesterMixin``; defines nothing of its own."""
+
+    pass
+
 
+class TestLumina2TransformerTraining(Lumina2TransformerTesterConfig, TrainingTesterMixin):
+    """Combine the Lumina2 tester config with ``TrainingTesterMixin``."""
+
     def test_gradient_checkpointing_is_applied(self):
+        # Delegates to the inherited test with a single expected class name.
+        # NOTE(review): presumably the names of classes expected to use gradient
+        # checkpointing; confirm against the parent implementation.
         expected_set = {"Lumina2Transformer2DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
@@ -13,58 +13,49 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
 import torch
 
 from diffusers import MochiTransformer3DModel
+from diffusers.utils.torch_utils import randn_tensor
 
 from ...testing_utils import enable_full_determinism, torch_device
-from ..test_modeling_common import ModelTesterMixin
+from ..testing_utils import (
+    BaseModelTesterConfig,
+    ModelTesterMixin,
+    TrainingTesterMixin,
+)
 
 
 enable_full_determinism()
 
 
-class MochiTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = MochiTransformer3DModel
-    main_input_name = "hidden_states"
-    uses_custom_attn_processor = True
-    # Overriding it because of the transformer size.
-    model_split_percents = [0.7, 0.6, 0.6]
-
+class MochiTransformerTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self):
-        batch_size = 2
-        num_channels = 4
-        num_frames = 2
-        height = 16
-        width = 16
-        embedding_dim = 16
-        sequence_length = 16
+    def model_class(self):
+        return MochiTransformer3DModel
 
-        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
-        encoder_attention_mask = torch.ones((batch_size, sequence_length)).bool().to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+    @property
+    def main_input_name(self) -> str:
+        """Return the name of the model's main input, ``"hidden_states"``."""
+        return "hidden_states"
 
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "timestep": timestep,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
+    @property
+    def model_split_percents(self) -> list:
+        """Return the split percentages ``[0.7, 0.6, 0.6]``.
+
+        Overridden because of the transformer size (rationale carried over from
+        the previous class-attribute definition of this value).
+        """
+        return [0.7, 0.6, 0.6]
 
     @property
-    def input_shape(self):
+    def output_shape(self) -> tuple:
         return (4, 2, 16, 16)
 
     @property
-    def output_shape(self):
+    def input_shape(self) -> tuple:
         return (4, 2, 16, 16)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    @property
+    def generator(self):
+        """Return a new CPU ``torch.Generator`` seeded with 0.
+
+        A fresh generator is constructed on every access, so each use of
+        ``self.generator`` starts from the same seeded state.
+        """
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict:
+        return {
             "patch_size": 2,
             "num_attention_heads": 2,
             "attention_head_dim": 8,
@@ -78,9 +69,32 @@ def prepare_init_args_and_inputs_for_common(self):
             "activation_fn": "swiglu",
             "max_sequence_length": 16,
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
+    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
+        """Build the dictionary of dummy inputs used by this tester config.
+
+        Args:
+            batch_size: Leading dimension of every returned tensor.
+
+        Returns:
+            A dict with the keys ``hidden_states``, ``encoder_hidden_states``,
+            ``timestep`` and ``encoder_attention_mask``.
+        """
+        num_channels = 4
+        num_frames = 2
+        height = 16
+        width = 16
+        embedding_dim = 16
+        sequence_length = 16
+
+        # Each random tensor is requested with its own ``self.generator`` access.
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, num_channels, num_frames, height, width), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
+            ),
+            # Integer timesteps drawn from [0, 1000), then moved to the test device.
+            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
+            # All-True boolean mask, created first and then moved to the test device.
+            "encoder_attention_mask": torch.ones((batch_size, sequence_length), dtype=torch.bool).to(torch_device),
+        }
+
+
+class TestMochiTransformer(MochiTransformerTesterConfig, ModelTesterMixin):
+    """Combine the Mochi tester config with ``ModelTesterMixin``; defines nothing of its own."""
+
+    pass
+
+
+class TestMochiTransformerTraining(MochiTransformerTesterConfig, TrainingTesterMixin):
+    """Combine the Mochi tester config with ``TrainingTesterMixin``."""
+
     def test_gradient_checkpointing_is_applied(self):
+        # Delegates to the inherited test with a single expected class name.
+        # NOTE(review): presumably the names of classes expected to use gradient
+        # checkpointing; confirm against the parent implementation.
         expected_set = {"MochiTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)