fix: gemma3-text export now works thanks to the

DWarez · DWarez · commit ed5abe1f8a4d · 2025-07-18T14:58:44.000+02:00
attention-mask vmap patch, as in huggingface#2319
diff --git a/optimum/exporters/onnx/base.py b/optimum/exporters/onnx/base.py
@@ -519,6 +519,7 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
             and self.PAD_ATTENTION_MASK_TO_PAST
             and self.use_cache_branch is not False
             and "attention_mask" in dummy_inputs
+            and not isinstance(dummy_inputs["attention_mask"], dict)
         ):
             # Obtain the past sequence length from the value instead of the key (Bloom).
             past_present_length = dummy_inputs["input_ids"].shape[1] + dummy_inputs["past_key_values"][0][1].shape[-2]
diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py
@@ -24,7 +24,7 @@
 import transformers
 from transformers.models.speecht5.modeling_speecht5 import SpeechT5EncoderWithSpeechPrenet
 
-from ...utils import is_transformers_version, logging
+from ...utils import is_torch_version, is_transformers_version, logging
 from ._traceable_cache import TraceableCache
 
 
@@ -40,6 +40,8 @@
     from transformers.cache_utils import DynamicCache, EncoderDecoderCache
     from transformers.integrations.sdpa_attention import repeat_kv, sdpa_attention_forward
     from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+if is_transformers_version(">=", "4.53"):
+    from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, _ignore_causal_mask_sdpa, prepare_padding_mask
 
 
 if TYPE_CHECKING:
@@ -218,6 +220,72 @@ def onnx_compatible_linalg_norm(x, ord=2, dim=None, keepdim=False, *, dtype=None
     return original_linal_norm(x, ord=ord, dim=dim, keepdim=keepdim, dtype=dtype, out=out)
 
 
+def sdpa_mask_without_vmap(
+    batch_size: int,
+    cache_position: torch.Tensor,
+    kv_length: int,
+    kv_offset: int = 0,
+    attention_mask: Optional[torch.Tensor] = None,
+    local_size: Optional[int] = None,
+    allow_is_causal_skip: bool = True,
+    allow_torch_fix: bool = True,
+    **kwargs,
+) -> Optional[torch.Tensor]:
+    q_length = cache_position.shape[0]
+    # Potentially pad the 2D mask, and slice it correctly
+    padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset)
+
+    #  Under specific conditions, we can avoid materializing the mask, instead relying on the `is_causal` argument
+    if allow_is_causal_skip and _ignore_causal_mask_sdpa(padding_mask, q_length, kv_length, local_size):
+        return None
+
+    # Similar to `kv_arange = torch.arange(start=kv_offset, end=kv_offset + kv_length, device=cache_position.device)`
+    # but without data-dependent slicing (i.e. torch.compile friendly)
+    kv_arange = torch.arange(kv_length, device=cache_position.device)
+    kv_arange += kv_offset
+    reshaped_cache_position = cache_position.view(-1, 1)
+
+    # This is a slightly hacky way to detect which pattern is in use, but every mask creation function forwards
+    # the config through kwargs anyway, so we can rely on it.
+    # Usually, the `mask_function` is the only entry point to define the pattern - we could loop over it,
+    # but this is more efficient
+    sliding_window = getattr(kwargs["config"], "sliding_window", None)
+    chunk_size = getattr(kwargs["config"], "attention_chunk_size", None)
+
+    if sliding_window is not None and chunk_size is not None:
+        raise ValueError("Cannot use both `sliding_window` and `attention_chunk_size`")
+
+    # Simplest and most efficient way to obtain a causal mask
+    causal_mask = kv_arange <= reshaped_cache_position
+    # If using sliding window, add the sliding mask
+    if sliding_window is not None:
+        sliding_mask_overlay = kv_arange > reshaped_cache_position - sliding_window
+        causal_mask *= sliding_mask_overlay
+    # If using chunk attention, add the chunked mask
+    elif chunk_size is not None:
+        chunked_mask_overlay = kv_arange // chunk_size == reshaped_cache_position // chunk_size
+        causal_mask *= chunked_mask_overlay
+
+    causal_mask = causal_mask[None, None, :, :].expand(batch_size, -1, -1, -1)
+    if padding_mask is not None:
+        causal_mask = causal_mask * padding_mask[:, None, None, :]
+
+    # Due to a bug in some older torch versions, we need to update the mask in case a query is not attending to any
+    # tokens (due to padding). See details in https://github.com/pytorch/pytorch/issues/110213
+    if is_torch_version("<", "2.5") and allow_torch_fix:
+        causal_mask |= torch.all(~causal_mask, dim=-1, keepdim=True)
+    return causal_mask
+
+
+def eager_mask_without_vmap(*args, **kwargs) -> Optional[torch.Tensor]:
+    kwargs.pop("allow_torch_fix", None)
+    kwargs.pop("allow_is_causal_skip", None)
+    dtype = kwargs.get("dtype", torch.float32)
+    mask = sdpa_mask_without_vmap(*args, **kwargs, allow_is_causal_skip=False, allow_torch_fix=False)  # type: ignore
+    mask = torch.where(mask, torch.tensor(0.0, device=mask.device, dtype=dtype), torch.finfo(dtype).min)  # type: ignore
+    return mask
+
+
 UNSUPPORTED_OPS_PATCHING_SPEC = [
     PatchingSpec(torch.Tensor, "unfold", onnx_compatible_unfold, torch.Tensor.unfold),
     PatchingSpec(torch.linalg, "norm", onnx_compatible_linalg_norm, original_linal_norm),
@@ -355,10 +423,20 @@ def __enter__(self):
         self.patch_ops()
         setattr(self._model, self.orig_forward_name, self.patched_forward)
 
+        if is_transformers_version(">=", "4.53"):
+            self.original_sdpa_mask = ALL_MASK_ATTENTION_FUNCTIONS["sdpa"]
+            self.original_eager_mask = ALL_MASK_ATTENTION_FUNCTIONS["eager"]
+            ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", sdpa_mask_without_vmap)
+            ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask_without_vmap)
+
     def __exit__(self, exc_type, exc_value, traceback):
         self.restore_ops()
         setattr(self._model, self.orig_forward_name, self.orig_forward)
 
+        if is_transformers_version(">=", "4.53"):
+            ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", self.original_sdpa_mask)
+            ALL_MASK_ATTENTION_FUNCTIONS.register("eager", self.original_eager_mask)
+
     def __call__(self, *args, **kwargs):
         if getattr(self._model, self.orig_forward_name) is self.orig_forward:
             logger.warning("Running the non-patched model")
@@ -368,14 +446,14 @@ def __call__(self, *args, **kwargs):
 class Seq2SeqModelPatcher(ModelPatcher):
     def __enter__(self):
         super().__enter__()
-        if is_transformers_version(">=", "4.48"):
+        if is_transformers_version(">=", "4.48") and is_transformers_version("<", "4.53"):
             # this is required when gpt2 is used as decoder in any
             # encoder-decoder model with cross attention blocks
             ALL_ATTENTION_FUNCTIONS["sdpa"] = patched_sdpa_attention_forward
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
-        if is_transformers_version(">=", "4.48"):
+        if is_transformers_version(">=", "4.48") and is_transformers_version("<", "4.53"):
             ALL_ATTENTION_FUNCTIONS["sdpa"] = sdpa_attention_forward
 
     def __init__(
diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py
@@ -1345,87 +1345,95 @@ def __init__(
         )
         self.sliding_window_size = getattr(normalized_config, "sliding_window", sequence_length)
 
-    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
-        if input_name in ["input_ids", "token_type_ids", "position_ids"]:
-            return super().generate(
-                input_name=input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype
-            )
-        if input_name == "attention_mask":
-            return {
-                "full_causal_mask": self._generate_full_causal_mask(framework, float_dtype),
-                "sliding_causal_mask": self._generate_sliding_causal_mask(framework, float_dtype),
-            }
-        # if input_name == "full_causal_mask":
-        #     return self._generate_full_causal_mask(framework, float_dtype)
-        # elif input_name == "sliding_causal_mask":
-        #     return self._generate_sliding_causal_mask(framework, float_dtype)
-        else:
-            raise ValueError(f"What happened? This is not supported and should not be here: {input_name}")
-
-    def _generate_full_causal_mask(self, framework: str = "pt", float_dtype: str = "float32"):
-        if framework == "pt":
-            mask = torch.triu(
-                torch.ones((self.sequence_length, self.sequence_length), dtype=DTYPE_MAPPER.pt(float_dtype)),
-                diagonal=1,
-            )
-            mask = mask.masked_fill(mask == 1, float("-inf"))
-            mask = mask.unsqueeze(0).expand(self.batch_size, -1, -1)
-            return mask
-        elif framework == "tf":
-            mask = tf.linalg.band_part(
-                tf.ones((self.sequence_length, self.sequence_length), dtype=DTYPE_MAPPER.tf(float_dtype)), -1, 0
-            )
-            mask = tf.where(mask == 0, float("-inf"), 0.0)
-            mask = tf.expand_dims(mask, 0)
-            mask = tf.tile(mask, [self.batch_size, 1, 1])
-            return mask
-        else:
-            mask = np.triu(
-                np.ones((self.sequence_length, self.sequence_length), dtype=DTYPE_MAPPER.np(float_dtype)), k=1
-            )
-            mask = np.where(mask == 1, float("-inf"), 0.0)
-            mask = np.expand_dims(mask, 0)
-            mask = np.tile(mask, (self.batch_size, 1, 1))
-            return mask
-
-    def _generate_sliding_causal_mask(self, framework: str = "pt", float_dtype: str = "fp32"):
-        if framework == "pt":
-            mask = torch.full(
-                (self.sequence_length, self.sequence_length), float("-inf"), dtype=DTYPE_MAPPER.pt(float_dtype)
-            )
-            for i in range(self.sequence_length):
-                start = max(0, i - self.sliding_window_size + 1)
-                mask[i, start : i + 1] = 0.0
-            mask = mask.unsqueeze(0).expand(self.batch_size, -1, -1)
-            return mask
-        elif framework == "tf":
-            mask = tf.fill((self.sequence_length, self.sequence_length), float("-inf"))
-            mask = tf.cast(mask, DTYPE_MAPPER.tf(float_dtype))
-
-            updates = []
-            indices = []
-            for i in range(self.sequence_length):
-                start = max(0, i - self.sliding_window_size + 1)
-                for j in range(start, i + 1):
-                    indices.append([i, j])
-                    updates.append(0.0)
-            if indices:
-                indices = tf.constant(indices)
-                updates = tf.constant(updates, dtype=DTYPE_MAPPER.tf(float_dtype))
-                mask = tf.tensor_scatter_nd_update(mask, indices, updates)
-            mask = tf.expand_dims(mask, 0)
-            mask = tf.tile(mask, [self.batch_size, 1, 1])
-            return mask
-        else:
-            mask = np.full(
-                (self.sequence_length, self.sequence_length), float("-inf"), dtype=DTYPE_MAPPER.np(float_dtype)
-            )
-            for i in range(self.sequence_length):
-                start = max(0, i - self.sliding_window_size + 1)
-                mask[i, start : i + 1] = 0.0
-            mask = np.expand_dims(mask, 0)
-            mask = np.tile(mask, (self.batch_size, 1, 1))
-            return mask
+    # def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+    #     if input_name in ["input_ids", "token_type_ids", "position_ids"]:
+    #         return super().generate(
+    #             input_name=input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype
+    #         )
+    #     if input_name == "attention_mask":
+    #         return {
+    #             "full_attention": self._generate_full_causal_mask(framework, float_dtype),
+    #             "sliding_attention": self._generate_sliding_causal_mask(framework, float_dtype),
+    #         }
+    #     # if input_name == "full_causal_mask":
+    #     #     return self._generate_full_causal_mask(framework, float_dtype)
+    #     # elif input_name == "sliding_causal_mask":
+    #     #     return self._generate_sliding_causal_mask(framework, float_dtype)
+    #     else:
+    #         raise ValueError(f"What happened? This is not supported and should not be here: {input_name}")
+
+    # def _generate_full_causal_mask(self, framework: str = "pt", float_dtype: str = "float32"):
+    #     if framework == "pt":
+    #         row_indices = torch.arange(self.sequence_length).view(-1, 1)
+    #         col_indices = torch.arange(self.sequence_length).view(1, -1)
+    #         causal_mask = row_indices >= col_indices
+    #         dtype = getattr(torch, float_dtype)
+    #         mask = torch.zeros((self.sequence_length, self.sequence_length), dtype=dtype)
+    #         mask[~causal_mask] = float("-inf")
+    #         mask = mask.unsqueeze(0).expand(self.batch_size, -1, -1)
+    #         return mask
+    #     elif framework == "tf":
+    #         row_indices, col_indices = tf.meshgrid(
+    #             tf.range(self.sequence_length), tf.range(self.sequence_length), indexing="ij"
+    #         )
+    #         causal_mask = row_indices >= col_indices
+    #         dtype = getattr(tf, float_dtype)
+    #         mask = tf.where(
+    #             causal_mask,
+    #             tf.zeros((self.sequence_length, self.sequence_length), dtype=dtype),
+    #             tf.fill((self.sequence_length, self.sequence_length), float("-inf")),
+    #         )
+    #         mask = tf.expand_dims(mask, 0)
+    #         mask = tf.tile(mask, [self.batch_size, 1, 1])
+    #         return mask
+
+    #     else:
+    #         row_indices = np.arange(self.sequence_length).reshape(-1, 1)
+    #         col_indices = np.arange(self.sequence_length).reshape(1, -1)
+    #         causal_mask = row_indices >= col_indices
+    #         dtype = getattr(np, float_dtype)
+    #         mask = np.full((self.sequence_length, self.sequence_length), float("-inf"), dtype=dtype)
+    #         mask[causal_mask] = 0.0
+    #         mask = np.expand_dims(mask, 0)
+    #         mask = np.repeat(mask, self.batch_size, axis=0)
+    #         return mask
+
+    # def _generate_sliding_causal_mask(self, window_size: int, framework: str = "pt", float_dtype: str = "float32"):
+    #     if framework == "pt":
+    #         row_indices = torch.arange(self.sequence_length).view(-1, 1)
+    #         col_indices = torch.arange(self.sequence_length).view(1, -1)
+    #         causal_mask = (row_indices >= col_indices) & (row_indices - col_indices < window_size)
+    #         dtype = getattr(torch, float_dtype)
+    #         mask = torch.zeros((self.sequence_length, self.sequence_length), dtype=dtype)
+    #         mask[~causal_mask] = float("-inf")
+    #         mask = mask.unsqueeze(0).expand(self.batch_size, -1, -1)
+    #         return mask
+    #     elif framework == "tf":
+    #         row_indices, col_indices = tf.meshgrid(
+    #             tf.range(self.sequence_length), tf.range(self.sequence_length), indexing="ij"
+    #         )
+    #         causal_condition = row_indices >= col_indices
+    #         window_condition = (row_indices - col_indices) < window_size
+    #         sliding_mask = causal_condition & window_condition
+    #         dtype = getattr(tf, float_dtype)
+    #         mask = tf.where(
+    #             sliding_mask,
+    #             tf.zeros((self.sequence_length, self.sequence_length), dtype=dtype),
+    #             tf.fill((self.sequence_length, self.sequence_length), float("-inf")),
+    #         )
+    #         mask = tf.expand_dims(mask, 0)
+    #         mask = tf.tile(mask, [self.batch_size, 1, 1])
+    #         return mask
+    #     else:
+    #         row_indices = np.arange(self.sequence_length).reshape(-1, 1)
+    #         col_indices = np.arange(self.sequence_length).reshape(1, -1)
+    #         causal_mask = (row_indices >= col_indices) & (row_indices - col_indices < window_size)
+    #         dtype = getattr(np, float_dtype)
+    #         mask = np.full((self.sequence_length, self.sequence_length), float("-inf"), dtype=dtype)
+    #         mask[causal_mask] = 0.0
+    #         mask = np.expand_dims(mask, 0)
+    #         mask = np.repeat(mask, self.batch_size, axis=0)
+    #         return mask
 
 
 class DummySpeechT5InputGenerator(DummyInputGenerator):
diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py
@@ -253,6 +253,7 @@ class NormalizedConfigManager:
         "electra": NormalizedTextConfig,
         "encoder-decoder": NormalizedEncoderDecoderConfig,
         "gemma": NormalizedTextConfigWithGQA,
+        "gemma3_text": NormalizedTextConfigWithGQA,
         "gpt2": GPT2LikeNormalizedTextConfig,
         "gpt_bigcode": GPTBigCodeNormalizedTextConfig,
         "gpt_neo": NormalizedTextConfig.with_args(num_attention_heads="num_heads"),
diff --git a/test.py b/test.py
@@ -0,0 +1,13 @@
+from optimum.onnxruntime import ORTModelForCausalLM
+
+
+model_name = "google/gemma-3-1b-it"
+
+onnx_model = ORTModelForCausalLM.from_pretrained(
+    model_name,
+    export=True,
+    trust_remote_code=True,
+)
+
+
+print("done")