Merged
Changes from 1 commit
14 changes: 1 addition & 13 deletions src/transformers/activations.py
@@ -16,7 +16,6 @@
 from collections import OrderedDict
 
 import torch
-from packaging import version
 from torch import Tensor, nn
 
 from .utils import logging
@@ -34,14 +33,6 @@ class PytorchGELUTanh(nn.Module):
     match due to rounding errors.
     """
 
-    def __init__(self):
-        super().__init__()
-        if version.parse(torch.__version__) < version.parse("1.12.0"):
-            raise ImportError(
-                f"You are using torch=={torch.__version__}, but torch>=1.12.0 is required to use "
-                "PytorchGELUTanh. Please upgrade torch."
-            )
-
     def forward(self, input: Tensor) -> Tensor:
         return nn.functional.gelu(input, approximate="tanh")
 
@@ -145,10 +136,7 @@ class MishActivation(nn.Module):
 
     def __init__(self):
        super().__init__()
-        if version.parse(torch.__version__) < version.parse("1.9.0"):
-            self.act = self._mish_python
-        else:
-            self.act = nn.functional.mish
+        self.act = nn.functional.mish
 
     def _mish_python(self, input: Tensor) -> Tensor:
         return input * torch.tanh(nn.functional.softplus(input))
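
The two activation changes above drop version-gated fallbacks and call the native torch ops directly. A quick sanity check of the ops these classes now rely on (illustration only, not part of the PR):

import torch
from torch import nn

x = torch.randn(8)

# PytorchGELUTanh forwards to the tanh approximation of GELU; it is close to,
# but not bit-identical with, the exact GELU (hence the "rounding errors" note in the docstring).
print((nn.functional.gelu(x, approximate="tanh") - nn.functional.gelu(x)).abs().max())

# MishActivation now always uses nn.functional.mish; the dropped _mish_python fallback
# computed the same thing by hand.
print(torch.allclose(nn.functional.mish(x), x * torch.tanh(nn.functional.softplus(x)), atol=1e-6))
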
1 change: 0 additions & 1 deletion src/transformers/modeling_utils.py
@@ -1496,7 +1496,6 @@ def create_extended_attention_mask_for_decoder(input_shape, attention_mask, devi
        seq_ids = torch.arange(seq_length, device=device)
        causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
        # in case past_key_values are used we need to add a prefix ones mask to the causal mask
-        # causal and attention masks must have same type with pytorch version < 1.3
        causal_mask = causal_mask.to(attention_mask.dtype)
 
        if causal_mask.shape[1] < attention_mask.shape[1]:
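
Only the stale comment is removed here; the mask construction itself is unchanged. For reference, a small standalone run of the same expression (illustration only) shows the lower-triangular causal mask it builds:

import torch

batch_size, seq_length = 1, 4
seq_ids = torch.arange(seq_length)
causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
print(causal_mask[0].long())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 1]])
causal_mask = causal_mask.to(torch.float32)  # the dtype cast stays; it is just no longer tied to torch < 1.3
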
1 change: 0 additions & 1 deletion src/transformers/models/blip/modeling_blip_text.py
@@ -633,7 +633,6 @@ def get_extended_attention_mask(
                seq_ids = torch.arange(seq_length, device=device)
                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
                # in case past_key_values are used we need to add a prefix ones mask to the causal mask
-                # causal and attention masks must have same type with pytorch version < 1.3
                causal_mask = causal_mask.to(attention_mask.dtype)
 
                if causal_mask.shape[1] < attention_mask.shape[1]:
3 changes: 0 additions & 3 deletions src/transformers/models/code_llama/tokenization_code_llama_fast.py
@@ -20,11 +20,8 @@
 
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import is_sentencepiece_available, logging
-from ...utils.versions import require_version
 
 
-require_version("tokenizers>=0.13.3")
-
 if is_sentencepiece_available():
     from .tokenization_code_llama import CodeLlamaTokenizer
 else:
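
This and the following tokenizer files drop the `require_version("tokenizers>=0.13.3")` guard; the minimum is presumably enforced through the package requirements instead. As a rough sketch of what such a guard does (assumed behavior; the real helper lives in src/transformers/utils/versions.py and handles more operators):

import importlib.metadata

from packaging import version


def require_version_sketch(requirement: str) -> None:
    # hypothetical minimal re-implementation for illustration; only handles ">=" requirements
    pkg, wanted = requirement.split(">=")
    installed = version.parse(importlib.metadata.version(pkg))
    if installed < version.parse(wanted):
        raise ImportError(f"{pkg}>={wanted} is required, but {installed} is installed")


require_version_sketch("tokenizers>=0.13.3")
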
3 changes: 0 additions & 3 deletions src/transformers/models/cohere/tokenization_cohere_fast.py
@@ -23,11 +23,8 @@
 from ...tokenization_utils_base import BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import logging
-from ...utils.versions import require_version
 
 
-require_version("tokenizers>=0.13.3")
-
 logger = logging.get_logger(__name__)
 VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
 
3 changes: 0 additions & 3 deletions src/transformers/models/gemma/tokenization_gemma_fast.py
@@ -20,11 +20,8 @@
 
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import is_sentencepiece_available, logging
-from ...utils.versions import require_version
 
 
-require_version("tokenizers>=0.13.3")
-
 if is_sentencepiece_available():
     from .tokenization_gemma import GemmaTokenizer
 else:
4 changes: 1 addition & 3 deletions src/transformers/models/gpt_neo/modeling_gpt_neo.py
@@ -42,7 +42,6 @@
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_torch_flex_attn_available,
-    is_torch_fx_available,
     logging,
 )
 from .configuration_gpt_neo import GPTNeoConfig
@@ -60,8 +59,7 @@
 
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
-if is_torch_fx_available():
-    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
+_prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
 
 
 logger = logging.get_logger(__name__)
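
`torch.fx.wrap` is now applied unconditionally, since every torch version the library still supports ships FX. A hedged, self-contained illustration of what wrapping buys (a toy function, not the actual `_prepare_4d_causal_attention_mask`):

import torch
import torch.fx


def build_causal_mask(length):
    ids = torch.arange(length)
    return ids[None, :] <= ids[:, None]


# registered at module level so symbolic tracing records a single call_function node
# instead of tracing through the data-dependent torch.arange
build_causal_mask = torch.fx.wrap(build_causal_mask)


class Toy(torch.nn.Module):
    def forward(self, scores):
        mask = build_causal_mask(scores.shape[-1])
        return scores.masked_fill(~mask, float("-inf"))


print(torch.fx.symbolic_trace(Toy()).graph)  # build_causal_mask appears as one leaf node
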
3 changes: 0 additions & 3 deletions src/transformers/models/llama/tokenization_llama_fast.py
@@ -20,11 +20,8 @@
 
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import is_sentencepiece_available, logging
-from ...utils.versions import require_version
 
 
-require_version("tokenizers>=0.13.3")
-
 if is_sentencepiece_available():
     from .tokenization_llama import LlamaTokenizer
 else:
4 changes: 1 addition & 3 deletions src/transformers/models/phimoe/modeling_phimoe.py
@@ -42,7 +42,6 @@
     replace_return_docstrings,
 )
 from ...utils.deprecation import deprecate_kwarg
-from ...utils.import_utils import is_torch_fx_available
 from .configuration_phimoe import PhimoeConfig
 
 
@@ -51,8 +50,7 @@
 
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
-if is_torch_fx_available():
-    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
+_prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
 
 
 logger = logging.get_logger(__name__)
2 changes: 1 addition & 1 deletion src/transformers/models/vilt/modeling_vilt.py
@@ -171,7 +171,7 @@ def visual_embed(self, pixel_values, pixel_mask, max_image_length=200):
        select = torch.cat(select, dim=0)
        x = x[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)
        x_mask = x_mask[select[:, 0], select[:, 1]].view(batch_size, -1)
-        # `patch_index` should be on the same device as `select` (for torch>=1.13), which is ensured at definition time.
+        # `patch_index` should be on the same device as `select`, which is ensured at definition time.
        patch_index = patch_index[select[:, 0], select[:, 1]].view(batch_size, -1, 2)
        pos_embed = pos_embed[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)
 
2 changes: 0 additions & 2 deletions src/transformers/optimization.py
@@ -25,7 +25,6 @@
 from .trainer_pt_utils import LayerWiseDummyOptimizer, LayerWiseDummyScheduler
 from .trainer_utils import SchedulerType
 from .utils import logging
-from .utils.versions import require_version
 
 
 logger = logging.get_logger(__name__)
@@ -701,7 +700,6 @@ def __init__(
        relative_step=True,
        warmup_init=False,
    ):
-        require_version("torch>=1.5.0")  # add_ with alpha
        if lr is not None and relative_step:
            raise ValueError("Cannot combine manual `lr` and `relative_step=True` options")
        if warmup_init and not relative_step:
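
The dropped guard in `Adafactor.__init__` only protected the `Tensor.add_(..., alpha=...)` signature, which every supported torch provides. For illustration:

import torch

param = torch.zeros(3)
update = torch.ones(3)
param.add_(update, alpha=-0.1)  # in-place param += alpha * update; the call the old torch>=1.5 guard protected
print(param)  # tensor([-0.1000, -0.1000, -0.1000])
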
9 changes: 0 additions & 9 deletions src/transformers/testing_utils.py
@@ -138,7 +138,6 @@
     is_tokenizers_available,
     is_torch_available,
     is_torch_bf16_available_on_device,
-    is_torch_bf16_cpu_available,
     is_torch_bf16_gpu_available,
     is_torch_deterministic,
     is_torch_fp16_available_on_device,
@@ -1073,14 +1072,6 @@ def require_torch_bf16_gpu(test_case):
     )(test_case)
 
 
-def require_torch_bf16_cpu(test_case):
-    """Decorator marking a test that requires torch>=1.10, using CPU."""
-    return unittest.skipUnless(
-        is_torch_bf16_cpu_available(),
-        "test requires torch>=1.10, using CPU",
-    )(test_case)
-
-
 def require_deterministic_for_xpu(test_case):
     if is_torch_xpu_available():
         return unittest.skipUnless(is_torch_deterministic(), "test requires torch to use deterministic algorithms")(
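
The removed `require_torch_bf16_cpu` decorator becomes redundant once `is_torch_bf16_cpu_available()` reduces to plain torch availability. A hedged sketch of the shared `unittest.skipUnless` pattern these helpers follow (the decorator name below is hypothetical):

import unittest

from transformers.utils import is_torch_available


def require_torch_sketch(test_case):
    """Skip the wrapped test when torch is unavailable, mirroring what the removed decorator now amounts to."""
    return unittest.skipUnless(is_torch_available(), "test requires torch")(test_case)
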
10 changes: 5 additions & 5 deletions src/transformers/trainer.py
@@ -164,7 +164,7 @@
     is_sagemaker_dp_enabled,
     is_sagemaker_mp_enabled,
     is_schedulefree_available,
-    is_torch_compile_available,
+    is_torch_available,
     is_torch_hpu_available,
     is_torch_mlu_available,
     is_torch_mps_available,
@@ -257,7 +257,7 @@
 
 def _is_peft_model(model):
     if is_peft_available():
-        classes_to_check = (PeftModel,) if is_peft_available() else ()
+        classes_to_check = (PeftModel,)
         # Here we also check if the model is an instance of `PeftMixedModel` introduced in peft>=0.7.0: https://github.com/huggingface/transformers/pull/28321
         if version.parse(importlib.metadata.version("peft")) >= version.parse("0.7.0"):
             from peft import PeftMixedModel
@@ -798,7 +798,7 @@ def __init__(
        self._memory_tracker.stop_and_update_metrics()
 
        # torch.compile
-        if args.torch_compile and not is_torch_compile_available():
+        if args.torch_compile and not is_torch_available():
            raise RuntimeError("Using torch.compile requires PyTorch 2.0 or higher.")
 
        self.is_fsdp_xla_v2_enabled = args.fsdp_config.get("xla_fsdp_v2", False)
@@ -1987,7 +1987,7 @@ def _wrap_model(self, model, training=True, dataloader=None):
        if self.accelerator.unwrap_model(model) is not model:
            return model
 
-        # Mixed precision training with apex (torch < 1.6)
+        # Mixed precision training with apex
        if self.use_apex and training:
            model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level)
 
@@ -3740,7 +3740,7 @@ def training_step(
                torch.musa.empty_cache()
            elif is_torch_npu_available():
                torch.npu.empty_cache()
-            elif is_torch_mps_available(min_version="2.0"):
+            elif is_torch_mps_available():
                torch.mps.empty_cache()
            elif is_torch_hpu_available():
                logger.warning(
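
The runtime version probes that remain in `_is_peft_model` target optional third-party packages rather than torch itself. The pattern, shown standalone (sketch only, requires `peft` to be installed):

import importlib.metadata

from packaging import version

if version.parse(importlib.metadata.version("peft")) >= version.parse("0.7.0"):
    from peft import PeftMixedModel  # only present in newer peft releases

    print(PeftMixedModel)
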
4 changes: 1 addition & 3 deletions src/transformers/training_args.py
@@ -44,7 +44,6 @@
     is_sagemaker_dp_enabled,
     is_sagemaker_mp_enabled,
     is_torch_available,
-    is_torch_bf16_cpu_available,
     is_torch_bf16_gpu_available,
     is_torch_hpu_available,
     is_torch_mlu_available,
@@ -1165,7 +1164,6 @@ class TrainingArguments:
            "help": (
                "Number of batches loaded in advance by each worker. "
                "2 means there will be a total of 2 * num_workers batches prefetched across all workers. "
-                "Default is 2 for PyTorch < 2.0.0 and otherwise None."
            )
        },
    )
@@ -1697,7 +1695,7 @@ def __post_init__(self):
            self.half_precision_backend = self.fp16_backend
 
        if self.bf16 or self.bf16_full_eval:
-            if self.use_cpu and not is_torch_bf16_cpu_available() and not is_torch_xla_available():
+            if self.use_cpu and not is_torch_available() and not is_torch_xla_available():
                # cpu
                raise ValueError("Your setup doesn't support bf16/(cpu, tpu, neuroncore). You need torch>=1.10")
            elif not self.use_cpu:
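
The bf16 branch in `__post_init__` now only requires that torch is importable, because CPU bf16 autocast is present on every supported version. A minimal check of that capability (assumes a torch >= 2.1 install):

import torch

with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    out = torch.randn(4, 4) @ torch.randn(4, 4)
print(out.dtype)  # torch.bfloat16 (matmul runs under bf16 autocast on CPU)
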
9 changes: 0 additions & 9 deletions src/transformers/utils/fx.py
@@ -61,10 +61,7 @@
 )
 from .import_utils import (
     ENV_VARS_TRUE_VALUES,
-    TORCH_FX_REQUIRED_VERSION,
-    get_torch_version,
     is_peft_available,
-    is_torch_fx_available,
 )
 
 
@@ -891,12 +888,6 @@ class HFTracer(Tracer):
     def __init__(self, autowrap_modules=(math,), autowrap_functions=()):
         super().__init__(autowrap_modules=autowrap_modules, autowrap_functions=autowrap_functions)
 
-        if not is_torch_fx_available():
-            raise ImportError(
-                f"Found an incompatible version of torch. Found version {get_torch_version()}, but only version "
-                f"{TORCH_FX_REQUIRED_VERSION} is supported."
-            )
-
     def _generate_dummy_input(
         self, model: "PreTrainedModel", input_name: str, shape: list[int], input_names: list[str]
     ) -> dict[str, torch.Tensor]:
51 changes: 10 additions & 41 deletions src/transformers/utils/import_utils.py
@@ -221,6 +221,10 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
 _torch_available = False
 if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
     _torch_available, _torch_version = _is_package_available("torch", return_version=True)
+    if _torch_available:
+        _torch_available = version.parse(_torch_version) >= version.parse("2.1.0")
+        if not _torch_available:
+            logger.warning(f"Disabling PyTorch because PyTorch >= 2.1 is required but found {_torch_version}")
 else:
     logger.info("Disabling PyTorch because USE_TF is set")
     _torch_available = False
@@ -309,15 +313,6 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
     _jax_version = _flax_version = "N/A"
 
 
-_torch_fx_available = False
-if _torch_available:
-    torch_version = version.parse(_torch_version)
-    _torch_fx_available = (torch_version.major, torch_version.minor) >= (
-        TORCH_FX_REQUIRED_VERSION.major,
-        TORCH_FX_REQUIRED_VERSION.minor,
-    )
-
-
 _torch_xla_available = False
 if USE_TORCH_XLA in ENV_VARS_TRUE_VALUES:
     _torch_xla_available, _torch_xla_version = _is_package_available("torch_xla", return_version=True)
@@ -521,19 +516,8 @@ def is_torch_bf16_gpu_available():
     return torch.cuda.is_available() and torch.cuda.is_bf16_supported()
 
 
-def is_torch_bf16_cpu_available():
-    if not is_torch_available():
-        return False
-
-    import torch
-
-    try:
-        # multiple levels of AttributeError depending on the pytorch version so do them all in one check
-        _ = torch.cpu.amp.autocast
-    except AttributeError:
-        return False
-
-    return True
+def is_torch_bf16_cpu_available() -> bool:
+    return is_torch_available()
 
 
 def is_torch_bf16_available():
@@ -609,20 +593,15 @@ def is_torch_tf32_available():
 
     import torch
 
-    if not torch.cuda.is_available() or torch.version.cuda is None:
+    if not torch.cuda.is_available():
         return False
     if torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8:
         return False
-    if int(torch.version.cuda.split(".")[0]) < 11:
-        return False
-    if version.parse(version.parse(torch.__version__).base_version) < version.parse("1.7"):
-        return False
 
     return True
 
 
 def is_torch_fx_available():
-    return _torch_fx_available
+    return is_torch_available()
 
 
 def is_peft_available():
@@ -827,21 +806,11 @@ def is_habana_gaudi1():
 
 
 def is_torchdynamo_available():
-    if not is_torch_available():
-        return False
-
-    return True
+    return is_torch_available()
 
 
 def is_torch_compile_available():
-    if not is_torch_available():
-        return False
-
-    import torch
-
-    # We don't do any version check here to support nighlies marked as 1.14. Ultimately needs to check version against
-    # 2.0 but let's do it later.
-    return hasattr(torch, "compile")
+    return is_torch_available()
 
 
 def is_torchdynamo_compiling():
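
The import_utils.py changes are the heart of the PR: `_torch_available` is now gated on torch >= 2.1, and the remaining capability helpers collapse to that single check. A standalone sketch of the same gate, for quickly probing an environment (not the library's code):

import importlib.metadata

from packaging import version

TORCH_FLOOR = version.parse("2.1.0")

try:
    torch_version = version.parse(importlib.metadata.version("torch"))
    torch_ok = torch_version >= TORCH_FLOOR
except importlib.metadata.PackageNotFoundError:
    torch_version, torch_ok = None, False

print(f"torch={torch_version}, meets the new >=2.1 floor: {torch_ok}")
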
1 change: 0 additions & 1 deletion tests/fsdp/test_fsdp.py
@@ -323,7 +323,6 @@ def test_fsdp_cpu_offloading(self):
 
     @require_torch_multi_accelerator
     @slow
-    @require_fsdp
     @require_fsdp_v2_version
     @require_accelerate_fsdp2
     def test_accelerate_fsdp2_integration(self):
1 change: 0 additions & 1 deletion tests/models/bert/test_modeling_bert.py
@@ -510,7 +510,6 @@ def test_model_as_decoder(self):
         self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
 
     def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
         (
             config,
             input_ids,
1 change: 0 additions & 1 deletion (file path not captured in this view)
@@ -273,7 +273,6 @@ def test_decoder_model_past_with_large_inputs(self):
         self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
 
     def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
         (
             config,
             input_ids,