huggingface · IlyasMoutawwakil · Jul 30, 2025 · Jul 24, 2025 · Jul 24, 2025 · Jul 24, 2025
diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml
@@ -27,6 +27,7 @@ jobs:
       matrix:
         python-version: [3.9]
         runs-on: [ubuntu-22.04]
+        transformers_version: [latest, 4.36.*, 4.45.*]
         test_file:
           [
             test_timm.py,
@@ -59,13 +60,26 @@ jobs:
           pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
           pip install .[tests,onnxruntime] diffusers
 
-      - name: Test with pytest (in series)
-        if: matrix.test_file == 'test_modeling.py'
+      - name: Install transformers ${{ matrix.transformers_version }}
         run: |
-          pytest tests/onnxruntime/test_modeling.py -m "run_in_series" --durations=0 -vvvv
+          if [ "${{ matrix.transformers_version }}" == '4.36.*' ]; then
+            pip install "transformers==4.36.*" "diffusers<0.32.0"
+          elif [ "${{ matrix.transformers_version }}" == '4.45.*' ]; then
+            pip install "transformers==4.45.*" "diffusers<0.33.0"
+          else
+            pip install transformers  # matrix value 'latest': no version pin
+          fi
 
       - name: Test with pytest (in parallel)
+        if: matrix.test_file != 'test_diffusion.py'
+        run: |
+          pytest tests/onnxruntime/${{ matrix.test_file }} --durations=0 -vvvv -n auto
+        env:
+          HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+
+      - name: Test with pytest (in series)
+        if: matrix.test_file == 'test_diffusion.py'
         run: |
-          pytest tests/onnxruntime/${{ matrix.test_file }} -m "not run_in_series" --durations=0 -vvvv -n auto
+          pytest tests/onnxruntime/${{ matrix.test_file }} --durations=0 -vvvv
         env:
           HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
diff --git a/.github/workflows/test_onnxruntime_slow.yml b/.github/workflows/test_onnxruntime_slow.yml
@@ -36,16 +36,15 @@ jobs:
         python-version: [3.9]
         transformers-version: [latest]
         runs-on: [ubuntu-22.04, windows-2022]
-        include:
-          - {python-version: 3.9, transformers-version: 4.36.*, runs-on: ubuntu-22.04}
-          - {python-version: 3.9, transformers-version: 4.45.*, runs-on: ubuntu-22.04}
 
     runs-on: ${{ matrix.runs-on }}
 
     steps:
       - name: Free Disk Space (Ubuntu)
         if: matrix.runs-on == 'ubuntu-22.04'
         uses: jlumbroso/free-disk-space@main
+        with:
+          swap-storage: false
 
       - name: Free Disk Space (macOS)
         if: matrix.runs-on == 'macos-15'
@@ -69,22 +68,12 @@ jobs:
           pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
           pip install .[tests,onnxruntime] diffusers
 
-      - name: Install transformers ${{ matrix.transformers-version }}
-        if: ${{ matrix.transformers-version == '4.36.*' }}
-        run: |
-          pip install "transformers==${{ matrix.transformers-version }}" "diffusers<0.32.0"
-
-      - name: Install transformers ${{ matrix.transformers-version }}
-        if: ${{ matrix.transformers-version == '4.45.*' }}
-        run: |
-          pip install "transformers==${{ matrix.transformers-version }}" "diffusers<0.33.0"
-
       - name: Test with pytest (in series)
         run: |
           pytest tests/onnxruntime -m "run_in_series" --durations=0 -vvvv
         env:
           RUN_SLOW: 1
-  
+
       - name: Test with pytest (in parallel)
         run: |
           pytest tests/onnxruntime -m "not run_in_series" --durations=0 -vvvv

diff --git a/optimum/exporters/onnx/config.py b/optimum/exporters/onnx/config.py
@@ -94,13 +94,14 @@ def __init__(
     def inputs(self) -> Dict[str, Dict[int, str]]:
         if self.use_past_in_inputs:
             common_inputs = {"input_ids": {0: "batch_size", 1: "sequence_length"}}
+            common_inputs["attention_mask"] = {0: "batch_size", 1: "past_sequence_length + sequence_length"}
             self.add_past_key_values(common_inputs, direction="inputs")
-            common_inputs["attention_mask"] = {0: "batch_size", 1: "past_sequence_length + 1"}
         else:
             common_inputs = {
                 "input_ids": {0: "batch_size", 1: "sequence_length"},
                 "attention_mask": {0: "batch_size", 1: "sequence_length"},
             }
+
         return common_inputs
 
     @property

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
@@ -92,9 +92,7 @@
 from .constants import ONNX_DECODER_MERGED_NAME, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME
 from .model_patcher import (
     CLIPModelPatcher,
-    FalconModelPatcher,
     MgpstrModelPatcher,
-    MistralModelPatcher,
     MusicgenModelPatcher,
     Qwen3MoeModelPatcher,
     SAMModelPatcher,
@@ -409,20 +407,12 @@ class GPTNeoXOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
 
 
-# OPT does not take position_ids as input for transfomers < v4.46, needs it for transformers >= v4.46
-if is_transformers_version(">=", "4.46.0"):
-
-    @register_tasks_manager_onnx("opt", *COMMON_TEXT_GENERATION_TASKS + ["text-classification", "question-answering"])
-    class OPTOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
-        DEFAULT_ONNX_OPSET = 14  # uses SDPA in Transformers, hence opset>=14.
-        NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
-
-else:
-
-    @register_tasks_manager_onnx("opt", *COMMON_TEXT_GENERATION_TASKS + ["text-classification", "question-answering"])
-    class OPTOnnxConfig(TextDecoderOnnxConfig):
-        DEFAULT_ONNX_OPSET = 14  # uses SDPA in Transformers, hence opset>=14.
-        NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
+@register_tasks_manager_onnx("opt", *COMMON_TEXT_GENERATION_TASKS + ["text-classification", "question-answering"])
+class OPTOnnxConfig(
+    TextDecoderWithPositionIdsOnnxConfig if is_transformers_version(">=", "4.46.0") else TextDecoderOnnxConfig
+):
+    DEFAULT_ONNX_OPSET = 14  # uses SDPA in Transformers, hence opset>=14.
+    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
 
 
 @register_tasks_manager_onnx("llama", *COMMON_TEXT_GENERATION_TASKS + ["text-classification"])
@@ -477,7 +467,6 @@ class GemmaOnnxConfig(LlamaOnnxConfig):
 @register_tasks_manager_onnx("granite", *COMMON_TEXT_GENERATION_TASKS)
 class GraniteOnnxConfig(LlamaOnnxConfig):
     MIN_TRANSFORMERS_VERSION = version.parse("4.45.0")
-    MIN_TORCH_VERSION = version.parse("2.5.0")
 
 
 @register_tasks_manager_onnx("phi", *COMMON_TEXT_GENERATION_TASKS + ["text-classification"])
@@ -502,17 +491,11 @@ class InternLM2OnnxConfig(LlamaOnnxConfig):
 
 @register_tasks_manager_onnx("mistral", *COMMON_TEXT_GENERATION_TASKS + ["text-classification"])
 class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
-    # This is because of the patching of torch.triu in AttentionMaskConverter, that exists from transformers>=4.35
-    MIN_TRANSFORMERS_VERSION = version.parse("4.35.0")
-
     # The ONNX export of this architecture needs the Trilu operator support, available since opset 14
     DEFAULT_ONNX_OPSET = 14
-    DUMMY_INPUT_GENERATOR_CLASSES = (
-        MistralDummyPastKeyValuesGenerator,
-    ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
     DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator)
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True)
-    _MODEL_PATCHER = MistralModelPatcher
 
 
 @register_tasks_manager_onnx("mpt", *COMMON_TEXT_GENERATION_TASKS + ["text-classification", "token-classification"])
@@ -556,9 +539,7 @@ def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], dire
     "gpt_bigcode", *COMMON_TEXT_GENERATION_TASKS + ["text-classification", "token-classification"]
 )
 class GPTBigCodeOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
-    DUMMY_INPUT_GENERATOR_CLASSES = (
-        GPTBigCodeDummyPastKeyValuesGenerator,
-    ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GPTBigCodeDummyPastKeyValuesGenerator)
     DEFAULT_ONNX_OPSET = 14  # GPT BigCode now uses F.scaled_dot_product_attention by default for torch>=2.1.1.
     DUMMY_PKV_GENERATOR_CLASS = GPTBigCodeDummyPastKeyValuesGenerator
     NORMALIZED_CONFIG_CLASS = NormalizedConfigManager.get_normalized_config_class("gpt_bigcode")
@@ -571,36 +552,29 @@ def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], dire
             decoder_sequence_name = "past_sequence_length"
             name = "past_key_values"
         else:
-            decoder_sequence_name = "past_sequence_length + 1"
+            decoder_sequence_name = "past_sequence_length + sequence_length"
             name = "present"
 
         for i in range(self._normalized_config.num_layers):
-            # No dim for `n_head` when using multi-query attention
-            inputs_or_outputs[f"{name}.{i}.key_value"] = {
-                0: "batch_size",
-                1: decoder_sequence_name,
-            }
+            if self._normalized_config.multi_query:
+                # No dim for `n_head` when using multi-query attention
+                inputs_or_outputs[f"{name}.{i}.key_value"] = {0: "batch_size", 1: decoder_sequence_name}
+            else:
+                inputs_or_outputs[f"{name}.{i}.key_value"] = {0: "batch_size", 2: decoder_sequence_name}
 
     def flatten_past_key_values(self, flattened_output, name, idx, t):
         flattened_output[f"{name}.{idx}.key_value"] = t
 
 
 @register_tasks_manager_onnx("falcon", *COMMON_TEXT_GENERATION_TASKS + ["question-answering", "token-classification"])
-class FalconOnnxConfig(TextDecoderOnnxConfig):
-    # This is due to the cache refactoring for Falcon in 4.36
-    MIN_TRANSFORMERS_VERSION = version.parse("4.35.99")
+class FalconOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
+    MIN_TRANSFORMERS_VERSION = version.parse("4.36.0")
 
-    DUMMY_INPUT_GENERATOR_CLASSES = (
-        FalconDummyPastKeyValuesGenerator,
-    ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, FalconDummyPastKeyValuesGenerator)
     DEFAULT_ONNX_OPSET = 14  # Falcon uses aten::triu that requires opset>=14, and F.scaled_dot_product_attention
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
     DUMMY_PKV_GENERATOR_CLASS = FalconDummyPastKeyValuesGenerator
 
-    # we need to set output_attentions=True in the model input to avoid calling
-    # torch.nn.functional.scaled_dot_product_attention that is not supported by the ONNX export
-    _MODEL_PATCHER = FalconModelPatcher
-
     def __init__(
         self,
         config: "PretrainedConfig",
@@ -634,10 +608,8 @@ def __init__(
     def inputs(self) -> Dict[str, Dict[int, str]]:
         common_inputs = super().inputs
 
-        if not self.legacy and not self._config.alibi and self.task in ["text-generation", "feature-extraction"]:
-            # When alibi is used, position_ids are not used in Falcon.
-            # Reference: https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/models/falcon/modeling_falcon.py#L1116
-            common_inputs["position_ids"] = {0: "batch_size", 1: "sequence_length"}
+        if self._config.alibi:
+            common_inputs.pop("position_ids", None)
 
         return common_inputs
 
@@ -836,7 +808,6 @@ def flatten_past_key_values(self, flattened_output, name, idx, t):
 )
 class BartOnnxConfig(M2M100OnnxConfig):
     DEFAULT_ONNX_OPSET = 14  # Bart now uses F.scaled_dot_product_attention by default for torch>=2.1.1.
-    MIN_TORCH_VERSION = version.parse("2.1.2")
 
 
 @register_tasks_manager_onnx(
@@ -868,7 +839,7 @@ class BigBirdPegasusOnnxConfig(BartOnnxConfig):
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
         inputs = super().inputs
-        if self._config.attention_type == "block_sparse":
+        if self._config.attention_type == "block_sparse" and self.task != "text-generation":
             # BigBirdPegasusEncoder creates its own attention_mask internally
             # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py#L1875
             inputs.pop("attention_mask", None)
@@ -888,7 +859,6 @@ class MarianOnnxConfig(BartOnnxConfig):
 @register_tasks_manager_onnx("vit", *["feature-extraction", "image-classification", "masked-im"])
 class ViTOnnxConfig(VisionOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
-    MIN_TORCH_VERSION = version.parse("1.11")
     DEFAULT_ONNX_OPSET = 14  # now uses F.scaled_dot_product_attention by default for torch>=2.1.1.
 
     @property
@@ -1574,7 +1544,6 @@ class OwlViTOnnxConfig(CLIPOnnxConfig):
     # Sets the absolute tolerance to when validating the exported ONNX model against the
     # reference model.
     ATOL_FOR_VALIDATION = 1e-4
-    MIN_TORCH_VERSION = version.parse("2.1")
 
     # needs einsum operator support, available since opset 12
     DEFAULT_ONNX_OPSET = 12
@@ -1646,7 +1615,6 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
     "layoutlmv3", *["feature-extraction", "question-answering", "text-classification", "token-classification"]
 )
 class LayoutLMv3OnnxConfig(TextAndVisionOnnxConfig):
-    MIN_TORCH_VERSION = version.parse("1.12")
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(
         allow_new=True,
         MAX_2D_POSITION_EMBEDDINGS="max_2d_position_embeddings",
@@ -2570,8 +2538,6 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
 @register_tasks_manager_onnx("sam", *["feature-extraction"])
 class SamOnnxConfig(OnnxConfig):
     MIN_TRANSFORMERS_VERSION = version.parse("4.29.0.dev0")
-    # Since ransformers 4.32.0, SAM uses repeat_interleave op that is broken in PyTorch 2.0.1: https://github.com/pytorch/pytorch/issues/100429
-    MIN_TORCH_VERSION = version.parse("2.0.99")
     NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator, DummyPointsGenerator, DummyVisionEmbeddingsGenerator)
     DEFAULT_ONNX_OPSET = 13  # Opset 12 for repeat_interleave falls back on the opset 9 implem, that raises Unsupported: ONNX export of repeat_interleave in opset 9.