diff --git a/tests/models/transformers/test_models_transformer_chronoedit.py b/tests/models/transformers/test_models_transformer_chronoedit.py
new file mode 100644
index 000000000000..c13cb8fc0e6a
--- /dev/null
+++ b/tests/models/transformers/test_models_transformer_chronoedit.py
@@ -0,0 +1,106 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from diffusers import ChronoEditTransformer3DModel
+from diffusers.utils.torch_utils import randn_tensor
+
+from ...testing_utils import enable_full_determinism, torch_device
+from ..testing_utils import (
+    BaseModelTesterConfig,
+    ModelTesterMixin,
+    TorchCompileTesterMixin,
+    TrainingTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class ChronoEditTransformerTesterConfig(BaseModelTesterConfig):  # config shared by the Test* classes below
+    @property
+    def model_class(self):
+        return ChronoEditTransformer3DModel
+
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"  # key of the main tensor in get_dummy_inputs
+
+    @property
+    def output_shape(self) -> tuple:
+        return (16, 2, 8, 8)  # (channels, frames, height, width); batch axis omitted
+
+    @property
+    def input_shape(self) -> tuple:
+        return (16, 2, 8, 8)  # layout of "hidden_states" in get_dummy_inputs, batch axis omitted
+
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)  # fresh seed-0 CPU generator on each access
+
+    def get_init_dict(self) -> dict:  # constructor kwargs for the model under test
+        return {
+            "patch_size": (1, 2, 2),
+            "num_attention_heads": 2,
+            "attention_head_dim": 8,
+            "in_channels": 16,  # equals num_channels in get_dummy_inputs
+            "out_channels": 16,
+            "text_dim": 32,  # equals embedding_dim in get_dummy_inputs
+            "freq_dim": 16,
+            "ffn_dim": 32,
+            "num_layers": 2,
+            "cross_attn_norm": True,
+            "qk_norm": "rms_norm_across_heads",
+            "eps": 1e-06,
+            "image_dim": None,  # get_dummy_inputs likewise passes encoder_hidden_states_image=None
+            "added_kv_proj_dim": None,
+            "rope_max_seq_len": 64,
+            "pos_embed_seq_len": None,
+            "rope_temporal_skip_len": 8,
+        }
+
+    def get_dummy_inputs(self, batch_size: int = 1) -> dict[str, torch.Tensor]:
+        num_channels = 16
+        num_frames = 2
+        height = 8
+        width = 8
+        embedding_dim = 32
+        sequence_length = 12
+
+        return {  # every self.generator access below is a new seed-0 generator, so each draw restarts the stream
+            "hidden_states": randn_tensor(
+                (batch_size, num_channels, num_frames, height, width), generator=self.generator, device=torch_device
+            ),
+            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states_image": None,  # NOTE(review): None value; annotation says torch.Tensor
+        }
+
+
+class TestChronoEditTransformer(ChronoEditTransformerTesterConfig, ModelTesterMixin):
+    pass  # all tests are inherited from the mixin
+
+
+class TestChronoEditTransformerTraining(ChronoEditTransformerTesterConfig, TrainingTesterMixin):
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"ChronoEditTransformer3DModel"}  # class names forwarded to the mixin's check
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+
+class TestChronoEditTransformerCompile(ChronoEditTransformerTesterConfig, TorchCompileTesterMixin):
+    pass  # all tests are inherited from the mixin
diff --git a/tests/models/transformers/test_models_transformer_easyanimate.py b/tests/models/transformers/test_models_transformer_easyanimate.py
index d7b90a47d974..61c040a5230e 100644
--- a/tests/models/transformers/test_models_transformer_easyanimate.py
+++ b/tests/models/transformers/test_models_transformer_easyanimate.py
@@ -13,58 +13,46 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
 import torch
 
 from diffusers import EasyAnimateTransformer3DModel
+from diffusers.utils.torch_utils import randn_tensor
 
 from ...testing_utils import enable_full_determinism, torch_device
-from ..test_modeling_common import ModelTesterMixin
+from ..testing_utils import (
+    BaseModelTesterConfig,
+    ModelTesterMixin,
+    TorchCompileTesterMixin,
+    TrainingTesterMixin,
+)
 
 
 enable_full_determinism()
 
 
-class EasyAnimateTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = EasyAnimateTransformer3DModel
-    main_input_name = "hidden_states"
-    uses_custom_attn_processor = True
-
+class EasyAnimateTransformerTesterConfig(BaseModelTesterConfig):  # config shared by the Test* classes below
     @property
-    def dummy_input(self):
-        batch_size = 2
-        num_channels = 4
-        num_frames = 2
-        height = 16
-        width = 16
-        embedding_dim = 16
-        sequence_length = 16
+    def model_class(self):
+        return EasyAnimateTransformer3DModel
 
-        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
-
-        return {
-            "hidden_states": hidden_states,
-            "timestep": timestep,
-            "timestep_cond": None,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_hidden_states_t5": None,
-            "inpaint_latents": None,
-            "control_latents": None,
-        }
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"  # key of the main tensor in get_dummy_inputs
 
     @property
-    def input_shape(self):
+    def output_shape(self) -> tuple:  # (channels, frames, height, width); batch axis omitted
         return (4, 2, 16, 16)
 
     @property
-    def output_shape(self):
+    def input_shape(self) -> tuple:  # layout of "hidden_states" in get_dummy_inputs, batch axis omitted
         return (4, 2, 16, 16)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)  # fresh seed-0 CPU generator on each access
+
+    def get_init_dict(self) -> dict:  # constructor kwargs for the model under test
+        return {
             "attention_head_dim": 16,
             "num_attention_heads": 2,
             "in_channels": 4,
@@ -79,9 +67,39 @@ def prepare_init_args_and_inputs_for_common(self):
             "time_position_encoding_type": "3d_rope",
             "timestep_activation_fn": "silu",
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
+    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
+        num_channels = 4  # equals "in_channels" in get_init_dict
+        num_frames = 2
+        height = 16
+        width = 16
+        embedding_dim = 16
+        sequence_length = 16
+
+        return {  # every self.generator access below is a new seed-0 generator, so each draw restarts the stream
+            "hidden_states": randn_tensor(
+                (batch_size, num_channels, num_frames, height, width), generator=self.generator, device=torch_device
+            ),
+            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
+            "timestep_cond": None,  # NOTE(review): None entries do not match the dict[str, torch.Tensor] annotation
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states_t5": None,
+            "inpaint_latents": None,
+            "control_latents": None,
+        }
+
+
+class TestEasyAnimateTransformer(EasyAnimateTransformerTesterConfig, ModelTesterMixin):
+    pass  # all tests are inherited from the mixin
+
+
+class TestEasyAnimateTransformerTraining(EasyAnimateTransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"EasyAnimateTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+
+class TestEasyAnimateTransformerCompile(EasyAnimateTransformerTesterConfig, TorchCompileTesterMixin):
+    pass  # all tests are inherited from the mixin
diff --git a/tests/models/transformers/test_models_transformer_ovis_image.py b/tests/models/transformers/test_models_transformer_ovis_image.py
new file mode 100644
index 000000000000..c1f6e100a747
--- /dev/null
+++ b/tests/models/transformers/test_models_transformer_ovis_image.py
@@ -0,0 +1,102 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from diffusers import OvisImageTransformer2DModel
+from diffusers.utils.torch_utils import randn_tensor
+
+from ...testing_utils import enable_full_determinism, torch_device
+from ..testing_utils import (
+    BaseModelTesterConfig,
+    ModelTesterMixin,
+    TorchCompileTesterMixin,
+    TrainingTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class OvisImageTransformerTesterConfig(BaseModelTesterConfig):  # config shared by the Test* classes below
+    @property
+    def model_class(self):
+        return OvisImageTransformer2DModel
+
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"  # key of the main tensor in get_dummy_inputs
+
+    @property
+    def output_shape(self) -> tuple:
+        return (16, 4)  # (height * width, channels); batch axis omitted
+
+    @property
+    def input_shape(self) -> tuple:
+        return (16, 4)  # layout of "hidden_states" in get_dummy_inputs, batch axis omitted
+
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)  # fresh seed-0 CPU generator on each access
+
+    def get_init_dict(self) -> dict:  # constructor kwargs for the model under test
+        return {
+            "patch_size": 1,
+            "in_channels": 4,  # equals num_latent_channels in get_dummy_inputs
+            "out_channels": 4,
+            "num_layers": 1,
+            "num_single_layers": 1,
+            "attention_head_dim": 16,
+            "num_attention_heads": 2,
+            "joint_attention_dim": 32,  # equals embedding_dim in get_dummy_inputs
+            "axes_dims_rope": (4, 4, 8),  # entries sum to attention_head_dim (16)
+        }
+
+    def get_dummy_inputs(self, batch_size: int = 1) -> dict[str, torch.Tensor]:
+        num_latent_channels = 4
+        num_image_channels = 3  # last dim of img_ids/txt_ids (presumably one per axes_dims_rope entry; confirm)
+        height = width = 4
+        sequence_length = 48
+        embedding_dim = 32
+
+        return {  # every self.generator access below is a new seed-0 generator, so each draw restarts the stream
+            "hidden_states": randn_tensor(
+                (batch_size, height * width, num_latent_channels), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
+            ),
+            "img_ids": randn_tensor(  # no batch axis
+                (height * width, num_image_channels), generator=self.generator, device=torch_device
+            ),
+            "txt_ids": randn_tensor(  # no batch axis
+                (sequence_length, num_image_channels), generator=self.generator, device=torch_device
+            ),
+            "timestep": torch.tensor([1.0]).to(torch_device).expand(batch_size),  # constant 1.0 per sample
+        }
+
+
+class TestOvisImageTransformer(OvisImageTransformerTesterConfig, ModelTesterMixin):
+    pass  # all tests are inherited from the mixin
+
+
+class TestOvisImageTransformerTraining(OvisImageTransformerTesterConfig, TrainingTesterMixin):
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"OvisImageTransformer2DModel"}  # class names forwarded to the mixin's check
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+
+class TestOvisImageTransformerCompile(OvisImageTransformerTesterConfig, TorchCompileTesterMixin):
+    pass  # all tests are inherited from the mixin
diff --git a/tests/models/transformers/test_models_transformer_prx.py b/tests/models/transformers/test_models_transformer_prx.py
index 1387625d5ea0..086b9b2e0a65 100644
--- a/tests/models/transformers/test_models_transformer_prx.py
+++ b/tests/models/transformers/test_models_transformer_prx.py
@@ -13,71 +13,83 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
 import torch
 
 from diffusers.models.transformers.transformer_prx import PRXTransformer2DModel
+from diffusers.utils.torch_utils import randn_tensor
 
 from ...testing_utils import enable_full_determinism, torch_device
-from ..test_modeling_common import ModelTesterMixin
+from ..testing_utils import (
+    BaseModelTesterConfig,
+    ModelTesterMixin,
+    TorchCompileTesterMixin,
+    TrainingTesterMixin,
+)
 
 
 enable_full_determinism()
 
 
-class PRXTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = PRXTransformer2DModel
-    main_input_name = "hidden_states"
-    uses_custom_attn_processor = True
+class PRXTransformerTesterConfig(BaseModelTesterConfig):  # config shared by the Test* classes below
+    @property
+    def model_class(self):
+        return PRXTransformer2DModel
 
     @property
-    def dummy_input(self):
-        return self.prepare_dummy_input()
+    def main_input_name(self) -> str:
+        return "hidden_states"  # key of the main tensor in get_dummy_inputs
 
     @property
-    def input_shape(self):
+    def output_shape(self) -> tuple:  # (channels, height, width); batch axis omitted
         return (16, 16, 16)
 
     @property
-    def output_shape(self):
+    def input_shape(self) -> tuple:  # layout of "hidden_states" in get_dummy_inputs, batch axis omitted
         return (16, 16, 16)
 
-    def prepare_dummy_input(self, height=16, width=16):
-        batch_size = 1
-        num_latent_channels = 16
-        sequence_length = 16
-        embedding_dim = 1792
-
-        hidden_states = torch.randn((batch_size, num_latent_channels, height, width)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
-        timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size)
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)  # fresh seed-0 CPU generator on each access
 
+    def get_init_dict(self) -> dict:  # constructor kwargs for the model under test
         return {
-            "hidden_states": hidden_states,
-            "timestep": timestep,
-            "encoder_hidden_states": encoder_hidden_states,
-        }
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
             "in_channels": 16,
             "patch_size": 2,
             "context_in_dim": 1792,
             "hidden_size": 1792,
             "mlp_ratio": 3.5,
             "num_heads": 28,
-            "depth": 4,  # Smaller depth for testing
+            "depth": 4,
             "axes_dim": [32, 32],
             "theta": 10_000,
         }
-        inputs_dict = self.prepare_dummy_input()
-        return init_dict, inputs_dict
 
+    def get_dummy_inputs(self, batch_size: int = 1) -> dict[str, torch.Tensor]:
+        num_latent_channels = 16  # equals "in_channels" in get_init_dict
+        height = width = 16
+        sequence_length = 16
+        embedding_dim = 1792  # equals "context_in_dim" in get_init_dict
+
+        return {  # every self.generator access below is a new seed-0 generator, so each draw restarts the stream
+            "hidden_states": randn_tensor(
+                (batch_size, num_latent_channels, height, width), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
+            ),
+            "timestep": torch.tensor([1.0]).to(torch_device).expand(batch_size),  # constant 1.0 per sample
+        }
+
+
+class TestPRXTransformer(PRXTransformerTesterConfig, ModelTesterMixin):
+    pass  # all tests are inherited from the mixin
+
+
+class TestPRXTransformerTraining(PRXTransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"PRXTransformer2DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
 
 
-if __name__ == "__main__":
-    unittest.main()
+class TestPRXTransformerCompile(PRXTransformerTesterConfig, TorchCompileTesterMixin):
+    pass  # all tests are inherited from the mixin