9 changes: 1 addition & 8 deletions tests/models/albert/test_modeling_albert.py
@@ -17,11 +17,10 @@
 import unittest

 from packaging import version
-from parameterized import parameterized

 from transformers import AlbertConfig, AutoTokenizer, is_torch_available
 from transformers.models.auto import get_values
-from transformers.testing_utils import require_torch, require_torch_sdpa, slow, torch_device
+from transformers.testing_utils import require_torch, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
@@ -289,12 +288,6 @@ def setUp(self):
         self.model_tester = AlbertModelTester(self)
         self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)

-    @parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
-    @require_torch_sdpa
-    @unittest.skip("Albert requires `head_mask` which is currently not done in this test.")
-    def test_eager_matches_sdpa_inference(self):
-        pass
-
     def test_config(self):
         self.config_tester.run_common_tests()

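A note for readers of this diff: the `parameterized.expand` decorator being deleted across these files generates one test per argument tuple, which is why each removal also drops the matching `from parameterized import parameterized` import. A minimal standalone sketch of the mechanism (assuming only the `parameterized` package is installed; the class and method names here are illustrative, not from the repository):

import unittest

from parameterized import parameterized


class DtypeVariantsTest(unittest.TestCase):
    # Expands into three generated tests at class-definition time, roughly named
    # test_dtype_0_float16, test_dtype_1_bfloat16, test_dtype_2_float32.
    @parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
    def test_dtype(self, torch_dtype: str):
        self.assertIn(torch_dtype, {"float16", "bfloat16", "float32"})


if __name__ == "__main__":
    unittest.main()

Because the expansion wraps the decorated method, a skip applied to that method (as in the removed overrides above) should silence all generated variants at once.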
5 changes: 0 additions & 5 deletions tests/models/aya_vision/test_modeling_aya_vision.py
@@ -256,11 +256,6 @@ def test_model_outputs_equivalence(self, **kwargs):
     def test_sdpa_can_dispatch_non_composite_models(self):
         pass

-    @parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
-    @unittest.skip("Cohere2's eager attn/sdpa attn outputs are expected to be different")
-    def test_eager_matches_sdpa_inference(self):
-        pass
-
     @unittest.skip("Cohere2's eager attn/sdpa attn outputs are expected to be different")
     def test_eager_matches_sdpa_generate(self):
         pass
197 changes: 2 additions & 195 deletions tests/models/beit/test_modeling_beit.py
@@ -14,35 +14,28 @@
 # limitations under the License.
 """Testing suite for the PyTorch BEiT model."""

-import inspect
 import tempfile
 import unittest

 import numpy as np
 from datasets import load_dataset
 from packaging import version
-from parameterized import parameterized

 from transformers import BeitConfig
 from transformers.testing_utils import (
     require_torch,
     require_torch_multi_gpu,
-    require_torch_sdpa,
     require_vision,
     slow,
     torch_device,
 )
 from transformers.utils import (
     cached_property,
     is_torch_available,
-    is_torch_bf16_available_on_device,
-    is_torch_fp16_available_on_device,
     is_vision_available,
 )

 from ...test_backbone_common import BackboneTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor, sdpa_kernel
+from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin


@@ -119,6 +112,7 @@ def __init__(
         # in BeiT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
         num_patches = (image_size // patch_size) ** 2
         self.seq_length = num_patches + 1
+        self.mask_length = self.seq_length - 1
         self.num_masks = int(mask_ratio * self.seq_length)
         self.attn_implementation = attn_implementation

@@ -414,193 +408,6 @@ def test_model_from_pretrained(self):
         model = BeitModel.from_pretrained(model_name)
         self.assertIsNotNone(model)

@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@require_torch_sdpa
def test_eager_matches_sdpa_inference(self, torch_dtype: str):
# The common test modifies the num_hidden_layers to be 1. However, for Beit we want to
# avoid that because the num_hidden_layers is generally assumed to be 4. Also, the code
# related to attention masks in the original common tests is not required as the Beit
# model does not handle attention masks. Furthermore, some extra code like modifying
# the norm layers eps values for specialized configs and checking for the 'noise'
# has been omitted to simply the test.
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")

if not self.all_model_classes[0]._supports_sdpa:
self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")

if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device):
self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)")

if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device):
self.skipTest(
f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)"
)

# Not sure whether it's fine to put torch.XXX in a decorator if torch is not available so hacking it here instead.
if torch_dtype == "float16":
torch_dtype = torch.float16
elif torch_dtype == "bfloat16":
torch_dtype = torch.bfloat16
elif torch_dtype == "float32":
torch_dtype = torch.float32

atols = {
("cpu", False, torch.float32): 1e-6,
("cpu", False, torch.float16): 5e-3,
("cpu", False, torch.bfloat16): 1e-2,
("cpu", True, torch.float32): 1e-6,
("cpu", True, torch.float16): 5e-3,
("cpu", True, torch.bfloat16): 1e-2,
("cuda", False, torch.float32): 1e-6,
("cuda", False, torch.bfloat16): 1e-2,
("cuda", False, torch.float16): 5e-3,
("cuda", True, torch.float32): 1e-6,
("cuda", True, torch.bfloat16): 1e-2,
("cuda", True, torch.float16): 5e-3,
}
rtols = {
("cpu", False, torch.float32): 1e-4,
("cpu", False, torch.float16): 5e-3,
("cpu", False, torch.bfloat16): 1e-2,
("cpu", True, torch.float32): 1e-4,
("cpu", True, torch.float16): 5e-3,
("cpu", True, torch.bfloat16): 1e-2,
("cuda", False, torch.float32): 1e-4,
("cuda", False, torch.bfloat16): 1e-2,
("cuda", False, torch.float16): 5e-3,
("cuda", True, torch.float32): 1e-4,
("cuda", True, torch.bfloat16): 3e-2,
("cuda", True, torch.float16): 5e-3,
}

def get_mean_reldiff(failcase, x, ref, atol, rtol):
return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"

for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

config.rms_norm_eps = 1.0
config.layer_norm_eps = 1.0
config.norm_eps = 1.0
config.norm_epsilon = 1.0
config.layer_norm_epsilon = 1.0

model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype, use_mask_token=True)
model_sdpa = model_sdpa.eval().to(torch_device, dtype=torch_dtype)

model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch_dtype,
attn_implementation="eager",
use_mask_token=True,
)
model_eager = model_eager.eval().to(torch_device, dtype=torch_dtype)

# Another way to make sure norm layers have desired epsilon. (Some models don't set it from its config.)
for x in model_eager.modules():
if isinstance(x, (nn.LayerNorm, nn.GroupNorm)):
x.eps = 1.0
for x in model_sdpa.modules():
if isinstance(x, (nn.LayerNorm, nn.GroupNorm)):
x.eps = 1.0

# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 16 times the model,
# but it would be nicer to have an efficient way to use parameterized.expand
fail_cases = []
for padding_side in ["left", "right"]:
for use_mask in [False, True]:
for output_attentions in [True, False]:
can_output_attn = "output_attentions" in inspect.signature(model_sdpa.forward).parameters
if not (self.has_attentions and can_output_attn) and output_attentions:
continue
# TODO: if we can also check with `batch_size=1` without being flaky?
for batch_size in [7]:
dummy_input = inputs_dict[model.main_input_name]

if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]:
dummy_input = dummy_input.to(torch_dtype)

dummy_input = dummy_input[:batch_size]
for enable_kernels in [False, True]:
failcase = f"padding_side={padding_side}, use_mask={use_mask}, enable_kernels={enable_kernels}"
processed_inputs = {
model.main_input_name: dummy_input,
"output_hidden_states": True,
}

if (
self.has_attentions
and "output_attentions" in inspect.signature(model_sdpa.forward).parameters
):
processed_inputs["output_attentions"] = output_attentions

if "bool_masked_pos" in inspect.signature(model_eager.forward).parameters:
dummy_mask = torch.ones((self.model_tester.num_masks,))
mask_length = self.model_tester.seq_length - 1 - dummy_mask.size(0)
dummy_mask = torch.cat([dummy_mask, torch.zeros(mask_length)])
dummy_bool_masked_pos = dummy_mask.expand(batch_size, -1).bool()
processed_inputs["bool_masked_pos"] = dummy_bool_masked_pos.to(torch_device)

with torch.no_grad():
with sdpa_kernel(
enable_flash=enable_kernels,
enable_math=True,
enable_mem_efficient=enable_kernels,
):
prepared_inputs = self._prepare_for_class(processed_inputs, model_class)
outputs_eager = model_eager(**prepared_inputs)
outputs_sdpa = model_sdpa(**prepared_inputs)

logits_eager = outputs_eager.hidden_states[-1]
logits_sdpa = outputs_sdpa.hidden_states[-1]
if torch_device in ["cpu", "cuda"]:
atol = atols[torch_device, enable_kernels, torch_dtype]
rtol = rtols[torch_device, enable_kernels, torch_dtype]
elif torch_device == "xpu":
# As of PyTorch 2.5 XPU backend supports only torch.nn.attention.SDPBackend.MATH
# which is implemented on PyTorch level using aten operators and is
# device agnostic with respect to implementation of each aten operator.
atol = atols["cuda", False, torch_dtype]
rtol = rtols["cuda", False, torch_dtype]
else:
atol = 1e-7
rtol = 1e-4

# Masked tokens output slightly deviates - we don't mind that.
if use_mask:
_logits_sdpa = torch.zeros_like(input=logits_sdpa)
_logits_eager = torch.zeros_like(input=logits_eager)

_logits_sdpa[:-1] = logits_sdpa[:-1]
_logits_eager[:-1] = logits_eager[:-1]

if padding_side == "left":
_logits_sdpa[-1:, 2:] = logits_sdpa[-1:, 2:]
_logits_eager[-1:, 2:] = logits_eager[-1:, 2:]

elif padding_side == "right":
_logits_sdpa[-1:, 2:] = logits_sdpa[-1:, :-2]
_logits_eager[-1:, 2:] = logits_eager[-1:, :-2]

logits_sdpa = _logits_sdpa
logits_eager = _logits_eager

results = [
torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol)
for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager)
]
# If 80% batch elements have matched results, it's fine
if np.mean(results) < 0.8:
fail_cases.append(
get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol)
)

self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))


 # We will verify our results on an image of cute cats
 def prepare_img():
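For context on what the large deletion above removed: the Beit-specific override loaded the same weights twice, once with eager attention and once with SDPA, ran a forward pass, and required the final hidden states to agree within dtype- and device-dependent tolerances. A minimal standalone sketch of that pattern (not the common test that replaces it; the public checkpoint name is an assumption, the float32 tolerances mirror the tables above, and the `sdpa_kernel` usage assumes PyTorch >= 2.3):

import torch
from torch.nn.attention import SDPBackend, sdpa_kernel

from transformers import BeitModel

checkpoint = "microsoft/beit-base-patch16-224"  # assumed public BEiT checkpoint
model_eager = BeitModel.from_pretrained(checkpoint, attn_implementation="eager").eval()
model_sdpa = BeitModel.from_pretrained(checkpoint, attn_implementation="sdpa").eval()

pixel_values = torch.randn(2, 3, 224, 224)  # dummy image batch

with torch.no_grad():
    out_eager = model_eager(pixel_values, output_hidden_states=True)
    # Pin SDPA to the math backend, the analogue of enable_kernels=False above.
    with sdpa_kernel(SDPBackend.MATH):
        out_sdpa = model_sdpa(pixel_values, output_hidden_states=True)

# float32 CPU tolerances from the deleted tables: atol=1e-6, rtol=1e-4.
matches = torch.allclose(
    out_eager.hidden_states[-1], out_sdpa.hidden_states[-1], atol=1e-6, rtol=1e-4
)
print("eager matches sdpa:", matches)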
5 changes: 0 additions & 5 deletions tests/models/cohere2/test_modeling_cohere2.py
@@ -76,11 +76,6 @@ def test_model_outputs_equivalence(self, **kwargs):
     def test_sdpa_can_dispatch_non_composite_models(self):
         pass

-    @parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
-    @unittest.skip("Cohere2's eager attn/sdpa attn outputs are expected to be different")
-    def test_eager_matches_sdpa_inference(self):
-        pass
-
     @unittest.skip("Cohere2's eager attn/sdpa attn outputs are expected to be different")
     def test_eager_matches_sdpa_generate(self):
         pass
10 changes: 0 additions & 10 deletions tests/models/colpali/test_modeling_colpali.py
@@ -20,7 +20,6 @@

 import torch
 from datasets import load_dataset
-from parameterized import parameterized

 from tests.test_configuration_common import ConfigTester
 from tests.test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
@@ -32,7 +31,6 @@
 from transformers.models.colpali.processing_colpali import ColPaliProcessor
 from transformers.testing_utils import (
     require_torch,
-    require_torch_sdpa,
     require_vision,
     slow,
     torch_device,
@@ -271,14 +269,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass

-    @require_torch_sdpa
-    @slow
-    @parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
-    def test_eager_matches_sdpa_inference(self, torch_dtype: str):
-        self.skipTest(
-            "Due to custom causal mask, there is a slightly too big difference between eager and sdpa in bfloat16."
-        )
-
     @unittest.skip(
         reason="From PaliGemma: Some undefined behavior encountered with test versions of this model. Skip for now."
     )