Merged
12 changes: 6 additions & 6 deletions src/transformers/cache_utils.py
@@ -212,10 +212,10 @@ class QuantizedCacheConfig(CacheConfig):
        Size of the quantization group, should be a divisor of the model's hidden dimension.
        Defaults to 64.
    residual_length (`Optional[int]`, *optional*, defaults to 128):
-       Length of the residual cache which will always be stored in original presicion.
+       Length of the residual cache which will always be stored in original precision.
        Defaults to 128.
    compute_dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
-       The defualt dtype used for computations in the model. Keys and Values will be cast to this dtype after dequantization.
+       The default dtype used for computations in the model. Keys and Values will be cast to this dtype after dequantization.
    device (`str`, *optional*, defaults to `"cpu"`):
        Device on which to perform computations, should be same as the model's device.
    """
@@ -1074,7 +1074,7 @@ class StaticCache(Cache):
    dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
        The default `dtype` to use when initializing the layer.
    layer_device_map(`Dict[int, Union[str, torch.device, int]]]`, `optional`):
-       Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between differents gpus.
+       Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between different gpus.
        You can know which layers mapped to which device by checking the associated device_map: `model.hf_device_map`.


@@ -1267,7 +1267,7 @@ class SlidingWindowCache(StaticCache):
    dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
        The default `dtype` to use when initializing the layer.
    layer_device_map(`Dict[int, Union[str, torch.device, int]]]`, `optional`):
-       Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between differents gpus.
+       Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between different gpus.
        You can know which layers mapped to which device by checking the associated device_map: `model.hf_device_map`.

    Example:
@@ -1579,7 +1579,7 @@ class HybridCache(Cache):
    dtype (torch.dtype, *optional*, defaults to `torch.float32`):
        The default `dtype` to use when initializing the layer.
    layer_device_map(`Dict[int, Union[str, torch.device, int]]]`, `optional`):
-       Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between differents gpus.
+       Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between different gpus.
        You can know which layers mapped to which device by checking the associated device_map: `model.hf_device_map`.

    Example:
@@ -1929,7 +1929,7 @@ class OffloadedStaticCache(StaticCache):
    offload_device (`Union[str, torch.device]`, *optional*, defaults to `cpu`):
        The device to offload to. Defaults to CPU.
    layer_device_map (`Dict[int, Union[str, torch.device, int]]`, *optional*):
-       Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between differents gpus.
+       Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between different gpus.
        You can know which layers mapped to which device by checking the associated device_map: `model.hf_device_map`.

    Attributes:
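All six `cache_utils.py` changes are docstring typo fixes; the behavior is untouched. For context, here is a minimal sketch of the two documented features, `QuantizedCacheConfig` and the `layer_device_map` argument of a manually built `StaticCache`. The checkpoint name, the single-GPU mapping, and the exact `StaticCache` keyword names are illustrative assumptions (they have varied across transformers versions), not something this diff specifies:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantizedCacheConfig, StaticCache

# Hypothetical checkpoint; any decoder-only model works the same way.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Hello", return_tensors="pt").to(model.device)

# The QuantizedCacheConfig fields documented above: the last `residual_length`
# tokens stay in original precision, dequantized keys/values are cast back to
# `compute_dtype`, and `q_group_size` must divide the model's hidden dimension.
cache_config = QuantizedCacheConfig(
    backend="quanto",
    nbits=4,
    q_group_size=64,
    residual_length=128,
    compute_dtype=torch.float16,
    device="cpu",
)
out = model.generate(
    **inputs, max_new_tokens=20,
    cache_implementation="quantized", cache_config=cache_config,
)

# For a model split across GPUs, a manually initialized StaticCache needs a
# layer -> device mapping; `model.hf_device_map` shows where each layer lives.
# The all-on-one-GPU mapping below is an illustrative assumption.
layer_device_map = {i: "cuda:0" for i in range(model.config.num_hidden_layers)}
past_key_values = StaticCache(
    config=model.config,
    max_batch_size=1,
    max_cache_len=1024,
    dtype=torch.float16,
    layer_device_map=layer_device_map,
)
```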
6 changes: 3 additions & 3 deletions src/transformers/tokenization_utils_base.py
@@ -1184,7 +1184,7 @@ def _set_model_specific_special_tokens(self, special_tokens: List[str]):
"""
Adds new special tokens to the "SPECIAL_TOKENS_ATTRIBUTES" list which will be part
of "self.special_tokens" and saved as a special token in tokenizer's config.
This allows us to dynamically add new model-type specific tokens after initilizing the tokenizer.
This allows us to dynamically add new model-type specific tokens after initializing the tokenizer.
For example: if the model tokenizers is multimodal, we can support special image or audio tokens.
"""
self.SPECIAL_TOKENS_ATTRIBUTES = self.SPECIAL_TOKENS_ATTRIBUTES + list(special_tokens.keys())
@@ -1199,7 +1199,7 @@ def _set_model_specific_special_tokens(self, special_tokens: List[str]):
    add_special_tokens (`bool`, *optional*, defaults to `True`):
        Whether or not to add special tokens when encoding the sequences. This will use the underlying
        `PretrainedTokenizerBase.build_inputs_with_special_tokens` function, which defines which tokens are
-       automatically added to the input ids. This is usefull if you want to add `bos` or `eos` tokens
+       automatically added to the input ids. This is useful if you want to add `bos` or `eos` tokens
        automatically.
    padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
        Activates and controls padding. Accepts the following values:
@@ -2474,7 +2474,7 @@ def save_pretrained(
        # no typefields, this way old fast and slow can load it
        tokenizer_config = self.convert_added_tokens(tokenizer_config, add_type_field=True, save=True)

-       # Process added tokens seperatly: allows previous versions to ignore it!
+       # Process added tokens separately: allows previous versions to ignore it!
        added_tokens = {}
        for key, value in self.added_tokens_decoder.items():
            added_tokens[key] = value.__getstate__()
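The three `tokenization_utils_base.py` changes are likewise docstring and comment-only. For orientation, a small sketch of the two behaviors those strings describe: `add_special_tokens` at encode time, and the separately serialized added tokens written by `save_pretrained`. The checkpoint and output directory are arbitrary assumptions, and which specials get inserted (e.g. `bos`/`eos`) depends on the model:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # arbitrary example checkpoint

# add_special_tokens=True (the default) routes through
# build_inputs_with_special_tokens, which decides which tokens are added
# around the raw ids; False returns the ids with nothing added.
with_special = tokenizer("Hello world", add_special_tokens=True)
without_special = tokenizer("Hello world", add_special_tokens=False)
print(with_special["input_ids"], without_special["input_ids"])

# save_pretrained writes added tokens separately from the rest of the config
# (the comment fixed above), so older library versions can load the tokenizer
# and simply ignore entries they do not understand.
tokenizer.add_tokens(["<my_new_token>"])
saved_files = tokenizer.save_pretrained("./tokenizer-out")
print(saved_files)
```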