huggingface · ArthurZucker · Oct 10, 2024 · Aug 29, 2024 · Aug 29, 2024 · Sep 2, 2024
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
@@ -234,6 +234,7 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
 * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
 * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel)
+* [OPT](https://huggingface.co/docs/transformers/en/model_doc/opt)
 * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)
 * [Phi](https://huggingface.co/docs/transformers/model_doc/phi#transformers.PhiModel)
 * [Phi3](https://huggingface.co/docs/transformers/model_doc/phi3#transformers.Phi3Model)
@@ -270,7 +271,6 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl#transformers.XLMRobertaXLModel)
 * [YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos#transformers.YolosModel)
 
-
 <Tip>
 
 FlashAttention can only be used for models with the `fp16` or `bf16` torch type, so make sure to cast your model to the appropriate type first. The memory-efficient attention backend is able to handle `fp32` models.

diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py
@@ -22,7 +22,10 @@
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from ...modeling_attn_mask_utils import (
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
 from ...modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
@@ -116,7 +119,7 @@ def __init__(
         self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
         self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
 
-    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int) -> torch.Tensor:
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
 
     def forward(
@@ -359,9 +362,106 @@ def forward(
         return attn_output, attn_weights_reshaped, past_key_value
 
 
+class OPTSdpaAttention(OPTAttention):
+    """
+    OPT sdpa attention module. This module inherits from `OPTAttention` as the weights of the module stays untouched.
+    The only required change would be on the forward pass where it needs to correctly call the public API of sdpa
+    attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        position_ids: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            logger.warning_once("""OPTModel is using SDPA attention, which currently does not support output_attentions=True.
+                                   failing back to eager attention. remove warning using attn_implementation="eager".""")
+
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                layer_head_mask=layer_head_mask,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                key_value_states=key_value_states,
+            )  # TODO after merge add position_ids=position_ids
+        is_cross_attention = key_value_states is not None
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states) * self.scaling
+        query_states = self._shape(query_states, -1, bsz)
+
+        # get key, value proj
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        # shape now is (bsz, num_heads, seq_len, head_dim), all are continuous
+
+        causal_mask = attention_mask
+        if attention_mask is not None:
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        is_causal = True if causal_mask is None and q_len > 1 else False
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.dropout if self.training else 0.0,
+            is_causal=is_causal,
+            # this model uses the scaling factor in the query projection for some reason, but not in Q@K^T
+            # so we need to scale to remove scaling in SDPA to have similar results with eager.
+            # Maybe needs a change in the model to remove scaling in query projection
+            scale=1.0,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, -1)
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+
 OPT_ATTENTION_CLASSES = {
     "eager": OPTAttention,
     "flash_attention_2": OptFlashAttention2,
+    "sdpa": OPTSdpaAttention,
 }
 
 
@@ -488,6 +588,7 @@ class OPTPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ["OPTDecoderLayer"]
     _supports_flash_attn_2 = True
+    _supports_sdpa = True
 
     def _init_weights(self, module):
         std = self.config.init_std
@@ -604,6 +705,7 @@ def __init__(self, config: OPTConfig):
 
         self.layers = nn.ModuleList([OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])
         self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self._use_sdpa = config._attn_implementation == "sdpa"
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -615,6 +717,49 @@ def get_input_embeddings(self):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
 
+    def _update_causal_mask(
+        self,
+        inputs_embeds: torch.Tensor,
+        input_shape: Tuple[int, int],
+        past_key_values_length: int,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+    ):
+        """
+        Updates the causal mask for the decoder.
+        """
+        batch_size, seq_length = input_shape
+        mask_seq_length = past_key_values_length + seq_length
+        if self._use_flash_attention_2:
+            # 2d mask is passed through the layers
+            causal_attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+            attention_mask = (
+                torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
+                if attention_mask is None
+                else attention_mask
+            )
+
+            return causal_attention_mask, attention_mask
+
+        if attention_mask is None:
+            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
+        elif attention_mask.shape[1] != mask_seq_length:
+            raise ValueError(
+                f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be "
+                f"{mask_seq_length} (sum of the lengths of current and past inputs)"
+            )
+
+        if self._use_sdpa:
+            causal_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask, input_shape, inputs_embeds, past_key_values_length
+            )
+        else:
+            causal_attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask, input_shape, inputs_embeds, past_key_values_length
+            )
+
+        return causal_attention_mask, attention_mask
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -696,32 +841,12 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-        batch_size, seq_length = input_shape
         past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
-        # required mask seq length can be calculated via length of past
-        mask_seq_length = past_key_values_length + seq_length
 
+        causal_attention_mask, attention_mask = self._update_causal_mask(
+            inputs_embeds, input_shape, past_key_values_length, attention_mask, output_attentions
+        )
         # embed positions
-        if self._use_flash_attention_2:
-            # 2d mask is passed through the layers
-            causal_attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-            attention_mask = (
-                torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
-                if attention_mask is None
-                else attention_mask
-            )
-        else:
-            # 4d mask is passed through the layers
-            if attention_mask is None:
-                attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
-            elif attention_mask.shape[1] != mask_seq_length:
-                raise ValueError(
-                    f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be "
-                    f"{mask_seq_length} (sum of the lengths of current and past inputs)"
-                )
-            causal_attention_mask = _prepare_4d_causal_attention_mask(
-                attention_mask, input_shape, inputs_embeds, past_key_values_length
-            )
 
         pos_embeds = self.embed_positions(attention_mask, past_key_values_length)
 

diff --git a/tests/models/opt/test_modeling_flax_opt.py b/tests/models/opt/test_modeling_flax_opt.py
@@ -70,6 +70,7 @@ def __init__(
         embed_dim=16,
         word_embed_proj_dim=16,
         initializer_range=0.02,
+        attn_implemetation="eager",
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -92,6 +93,7 @@ def __init__(
         self.word_embed_proj_dim = word_embed_proj_dim
         self.initializer_range = initializer_range
         self.is_encoder_decoder = False
+        self.attn_implementation = attn_implemetation
 
     def prepare_config_and_inputs(self):
         input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size), 3, self.vocab_size)
@@ -114,6 +116,7 @@ def prepare_config_and_inputs(self):
             word_embed_proj_dim=self.word_embed_proj_dim,
             initializer_range=self.initializer_range,
             use_cache=False,
+            attn_implementation=self.attn_implementation,
         )
         inputs_dict = prepare_opt_inputs_dict(config, input_ids)
         return config, inputs_dict

diff --git a/tests/models/opt/test_modeling_opt.py b/tests/models/opt/test_modeling_opt.py
@@ -21,7 +21,14 @@
 import timeout_decorator  # noqa
 
 from transformers import OPTConfig, is_torch_available
-from transformers.testing_utils import require_torch, require_torch_accelerator, require_torch_fp16, slow, torch_device
+from transformers.testing_utils import (
+    require_torch,
+    require_torch_accelerator,
+    require_torch_fp16,
+    require_torch_sdpa,
+    slow,
+    torch_device,
+)
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -83,6 +90,7 @@ def __init__(
         num_labels=3,
         word_embed_proj_dim=16,
         type_sequence_label_size=2,
+        attn_implementation="eager",
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -106,6 +114,7 @@ def __init__(
         self.type_sequence_label_size = type_sequence_label_size
         self.word_embed_proj_dim = word_embed_proj_dim
         self.is_encoder_decoder = False
+        self.attn_implementation = attn_implementation
 
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
@@ -135,6 +144,7 @@ def get_config(self):
             embed_dim=self.embed_dim,
             is_encoder_decoder=False,
             word_embed_proj_dim=self.word_embed_proj_dim,
+            attn_implementation=self.attn_implementation,
         )
 
     def get_pipeline_config(self):
@@ -322,6 +332,68 @@ def test_opt_sequence_classification_model_for_multi_label(self):
         result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
         self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
 
+    @require_torch_sdpa
+    @slow
+    def test_eager_matches_sdpa_generate(self):
+        """
+        Overwritting the common test as the test is flaky on tiny models
+        """
+        max_new_tokens = 30
+
+        tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-125M")
+
+        texts = [
+            "hi here's a longer context, getting longer and",
+            "Hello this is a very long sentence my friend, very long for real",
+            "Today I am in Paris and",
+        ]
+
+        model_sdpa = OPTForCausalLM.from_pretrained(
+            "facebook/opt-125M",
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            attn_implementation="sdpa",
+        ).to(torch_device)
+
+        self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
+
+        model_eager = OPTForCausalLM.from_pretrained(
+            "facebook/opt-125M",
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            attn_implementation="eager",
+        ).to(torch_device)
+
+        self.assertTrue(model_eager.config._attn_implementation == "eager")
+
+        for _, submodule in model_eager.named_modules():
+            if "SdpaAttention" in submodule.__class__.__name__:
+                raise ValueError("The eager model should not have SDPA attention layers")
+
+        has_sdpa = False
+        for _, submodule in model_sdpa.named_modules():
+            if "SdpaAttention" in submodule.__class__.__name__:
+                has_sdpa = True
+                break
+        if not has_sdpa:
+            raise ValueError("The SDPA model should have SDPA attention layers")
+
+        for padding_side in ["left", "right"]:
+            tokenizer.padding_side = padding_side
+            tokenizer.pad_token = tokenizer.eos_token
+
+            inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device)
+
+            res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
+            res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
+
+            with self.subTest(f"{padding_side}"):
+                torch.testing.assert_close(
+                    res_eager,
+                    res_sdpa,
+                    msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}",
+                )
+
     @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
     def test_model_parallelism(self):
         super().test_model_parallelism()