Commit 58c2b2b

ArthurZucker authored and Avishai Elmakies committed
bump tokenizers, fix added tokens fast (huggingface#32535)
* update based on tokenizers release
* update
* nits
* update
* revert re addition
* don't break that yet
* fmt
* revert unwanted
* update tokenizers version
* update dep table
* update
* update in conversion script as well
* some fix
* revert
* fully revert
* fix training
* remove set trace
* fixup
* update
* update
1 parent 37da2d6 commit 58c2b2b

File tree

4 files changed: +24 -46 lines changed


setup.py

Lines changed: 1 addition & 1 deletion
@@ -181,7 +181,7 @@
     "timeout-decorator",
     "tiktoken",
     "timm<=0.9.16",
-    "tokenizers>=0.19,<0.20",
+    "tokenizers>=0.20,<0.21",
     "torch",
     "torchaudio",
     "torchvision",

src/transformers/convert_slow_tokenizer.py

Lines changed: 6 additions & 27 deletions
@@ -609,33 +609,12 @@ def tokenizer(self, proto):
             for id, p in enumerate(proto.pieces)
             if p.type in [3, 4]
         ]
-        tokens_to_add = [
-            AddedToken(token, normalized=False, special=special)
-            for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
-        ]
-
-        if len(tokens_to_add) > 0:
-            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
-            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
-            # individual tokens would repeatedly rebuild a trie, which can be slow.
-            is_last_special = None
-            tokens = []
-            for token in tokens_to_add:
-                is_special = token.special
-                if is_last_special is None or is_last_special == is_special:
-                    tokens.append(token)
-                else:
-                    if is_last_special:
-                        tokenizer.add_special_tokens(tokens)
-                    else:
-                        tokenizer.add_tokens(tokens)
-                    tokens = [token]
-                is_last_special = is_special
-            if tokens:
-                if is_last_special:
-                    tokenizer.add_special_tokens(tokens)
-                else:
-                    tokenizer.add_tokens(tokens)
+        tokenizer.add_tokens(
+            [
+                AddedToken(token, normalized=False, special=special)
+                for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
+            ]
+        )

         return tokenizer
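The batching workaround is dropped because, with the tokenizers release this commit pins (>=0.20), add_tokens() honors the special flag carried by each AddedToken, so special and user-defined sentencepiece pieces can be added in one call and the underlying trie is rebuilt only once. A standalone sketch of that behavior (the word-level vocab and token names here are hypothetical, not taken from the diff):

    # Sketch assuming tokenizers>=0.20: one add_tokens() call can mix special
    # and non-special AddedTokens instead of routing them through separate
    # add_special_tokens()/add_tokens() batches.
    from tokenizers import AddedToken, Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    tokenizer = Tokenizer(WordLevel({"hello": 0, "[UNK]": 1}, unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer.add_tokens(
        [
            AddedToken("<bos>", normalized=False, special=True),       # special token
            AddedToken("<extra_0>", normalized=False, special=False),  # user-defined token
        ]
    )
    print(tokenizer.encode("<bos> hello <extra_0>").tokens)
    # expected: ['<bos>', 'hello', '<extra_0>']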

src/transformers/dependency_versions_table.py

Lines changed: 1 addition & 1 deletion
@@ -86,7 +86,7 @@
     "timeout-decorator": "timeout-decorator",
     "tiktoken": "tiktoken",
     "timm": "timm<=0.9.16",
-    "tokenizers": "tokenizers>=0.19,<0.20",
+    "tokenizers": "tokenizers>=0.20,<0.21",
     "torch": "torch",
     "torchaudio": "torchaudio",
     "torchvision": "torchvision",

src/transformers/tokenization_utils_fast.py

Lines changed: 16 additions & 17 deletions
@@ -175,15 +175,8 @@ def __init__(self, *args, **kwargs):

         # We call this after having initialized the backend tokenizer because we update it.
         super().__init__(**kwargs)
-
-        # Set the splitting mode for special tokens for the tokenizer to be used throughout the class.
         self._tokenizer.encode_special_tokens = self.split_special_tokens

-        # The following logic will be replace with a single add_tokens once a fix is pushed to tokenizers
-        # allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens
-        # uses the information stored in `added_tokens_decoder`.
-        # this is costly for fast tokenizers as we re-compute the regex again. But not all tokens are added tokens
-        # Use hash to speed up the very slow operation `token not in added_tokens_decoder`.
         added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder}
         tokens_to_add = [
             token
@@ -197,10 +190,6 @@ def __init__(self, *args, **kwargs):
         ]

         if len(tokens_to_add) > 0:
-            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
-            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
-            # individual tokens would repeatedly rebuild a trie, which can be slow.
-            is_last_special = None
             tokens = []
             special_tokens = self.all_special_tokens
             for token in tokens_to_add:
@@ -209,14 +198,13 @@ def __init__(self, *args, **kwargs):
                     if isinstance(token, AddedToken)
                     else str(token) in special_tokens
                 )
-                if is_last_special is None or is_last_special == is_special:
-                    tokens.append(token)
+                if isinstance(token, str):
+                    token = AddedToken(token, special=is_special)
                 else:
-                    self._add_tokens(tokens, special_tokens=is_last_special)
-                    tokens = [token]
-                is_last_special = is_special
+                    token.special = is_special
+                tokens.append(token)
             if tokens:
-                self._add_tokens(tokens, special_tokens=is_last_special)
+                self.add_tokens(tokens)

     @property
     def is_fast(self) -> bool:
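In __init__, the batched _add_tokens() calls are likewise collapsed: every missing token is normalized to an AddedToken whose special flag is set explicitly, and the whole list is registered through a single add_tokens() call. A rough standalone illustration of that normalization step (the tokens_to_add and special_tokens values are hypothetical stand-ins, not the class internals):

    # Rough sketch of the normalization now done in __init__ (hypothetical inputs);
    # it relies on tokenizers.AddedToken exposing a writable `special` attribute,
    # just as the diff itself does.
    from tokenizers import AddedToken

    special_tokens = ["<bos>", "<eos>"]                       # stand-in for self.all_special_tokens
    tokens_to_add = ["<bos>", AddedToken("<extra_0>"), "hi"]  # stand-in for missing added tokens

    tokens = []
    for token in tokens_to_add:
        is_special = (
            (token.special or str(token) in special_tokens)
            if isinstance(token, AddedToken)
            else str(token) in special_tokens
        )
        if isinstance(token, str):
            token = AddedToken(token, special=is_special)
        else:
            token.special = is_special
        tokens.append(token)
    # a fast tokenizer would now register these with a single add_tokens(tokens) call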
@@ -849,6 +837,13 @@ def train_new_from_iterator(
                     if special_tokens_map is not None:
                         tokens = [special_tokens_map.get(token, token) for token in tokens]
                     post_processor["special_tokens"][key]["tokens"] = tokens
+                    for token in tokens:
+                        token_id = tokenizer.token_to_id(token)
+                        if token_id is None:
+                            raise ValueError(
+                                "Attempted to set a token in the post processor that does not exist in the mapping"
+                            )
+
                     post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]

             for special_token in ["cls", "sep"]:
@@ -857,6 +852,10 @@ def train_new_from_iterator(
                     if special_tokens_map is not None and token in special_tokens_map:
                         token = special_tokens_map[token]
                     token_id = tokenizer.token_to_id(token)
+                    if token_id is None:
+                        raise ValueError(
+                            "Attempted to set a token in the post processor that does not exist in the mapping"
+                        )
                     post_processor[special_token] = [token, token_id]

             trained_tokenizer_json["post_processor"] = post_processor
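The two new guards in train_new_from_iterator turn a silent None id into an explicit error when the post processor references a token that the freshly trained vocabulary does not contain. A condensed sketch of the guard (the tokenizer and the missing token below are hypothetical; the error message matches the diff):

    # Sketch of the new validation: token_to_id() returns None for unknown
    # tokens, which previously leaked into the post processor; now it raises.
    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel

    tokenizer = Tokenizer(WordLevel({"hello": 0}, unk_token="hello"))
    token = "[CLS]"  # hypothetical special token missing from the trained vocab

    token_id = tokenizer.token_to_id(token)
    if token_id is None:
        raise ValueError(
            "Attempted to set a token in the post processor that does not exist in the mapping"
        )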
