Commit 58c2b2b

ArthurZucker authored and Avishai Elmakies committed
bump tokenizers, fix added tokens fast (huggingface#32535)
* update based on tokenizers release
* update
* nits
* update
* revert re addition
* don't break that yet
* fmt
* revert unwanted
* update tokenizers version
* update dep table
* update
* update in conversion script as well
* some fix
* revert
* fully revert
* fix training
* remove set trace
* fixup
* update
* update
1 parent 37da2d6 commit 58c2b2b

File tree

4 files changed: +24 -46 lines changed


setup.py

Lines changed: 1 addition & 1 deletion
@@ -181,7 +181,7 @@
     "timeout-decorator",
     "tiktoken",
     "timm<=0.9.16",
-    "tokenizers>=0.19,<0.20",
+    "tokenizers>=0.20,<0.21",
     "torch",
     "torchaudio",
     "torchvision",

src/transformers/convert_slow_tokenizer.py

Lines changed: 6 additions & 27 deletions
@@ -609,33 +609,12 @@ def tokenizer(self, proto):
             for id, p in enumerate(proto.pieces)
             if p.type in [3, 4]
         ]
-        tokens_to_add = [
-            AddedToken(token, normalized=False, special=special)
-            for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
-        ]
-
-        if len(tokens_to_add) > 0:
-            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
-            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
-            # individual tokens would repeatedly rebuild a trie, which can be slow.
-            is_last_special = None
-            tokens = []
-            for token in tokens_to_add:
-                is_special = token.special
-                if is_last_special is None or is_last_special == is_special:
-                    tokens.append(token)
-                else:
-                    if is_last_special:
-                        tokenizer.add_special_tokens(tokens)
-                    else:
-                        tokenizer.add_tokens(tokens)
-                    tokens = [token]
-                is_last_special = is_special
-            if tokens:
-                if is_last_special:
-                    tokenizer.add_special_tokens(tokens)
-                else:
-                    tokenizer.add_tokens(tokens)
+        tokenizer.add_tokens(
+            [
+                AddedToken(token, normalized=False, special=special)
+                for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
+            ]
+        )

         return tokenizer
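The batching workaround is dropped because, with the tokenizers release this commit pins (>=0.20), add_tokens() honors the special flag carried by each AddedToken, so special and user-defined sentencepiece pieces can be added in one call and the underlying trie is rebuilt only once. A standalone sketch of that behavior (the word-level vocab and token names here are hypothetical, not taken from the diff):

    # Sketch assuming tokenizers>=0.20: one add_tokens() call can mix special
    # and non-special AddedTokens instead of routing them through separate
    # add_special_tokens()/add_tokens() batches.
    from tokenizers import AddedToken, Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    tokenizer = Tokenizer(WordLevel({"hello": 0, "[UNK]": 1}, unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer.add_tokens(
        [
            AddedToken("<bos>", normalized=False, special=True),       # special token
            AddedToken("<extra_0>", normalized=False, special=False),  # user-defined token
        ]
    )
    print(tokenizer.encode("<bos> hello <extra_0>").tokens)
    # expected: ['<bos>', 'hello', '<extra_0>']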

src/transformers/dependency_versions_table.py

Lines changed: 1 addition & 1 deletion
@@ -86,7 +86,7 @@
     "timeout-decorator": "timeout-decorator",
     "tiktoken": "tiktoken",
     "timm": "timm<=0.9.16",
-    "tokenizers": "tokenizers>=0.19,<0.20",
+    "tokenizers": "tokenizers>=0.20,<0.21",
     "torch": "torch",
     "torchaudio": "torchaudio",
     "torchvision": "torchvision",

src/transformers/tokenization_utils_fast.py

Lines changed: 16 additions & 17 deletions
@@ -175,15 +175,8 @@ def __init__(self, *args, **kwargs):

         # We call this after having initialized the backend tokenizer because we update it.
         super().__init__(**kwargs)
-
-        # Set the splitting mode for special tokens for the tokenizer to be used throughout the class.
         self._tokenizer.encode_special_tokens = self.split_special_tokens

-        # The following logic will be replace with a single add_tokens once a fix is pushed to tokenizers
-        # allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens
-        # uses the information stored in `added_tokens_decoder`.
-        # this is costly for fast tokenizers as we re-compute the regex again. But not all tokens are added tokens
-        # Use hash to speed up the very slow operation `token not in added_tokens_decoder`.
         added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder}
         tokens_to_add = [
             token
@@ -197,10 +190,6 @@ def __init__(self, *args, **kwargs):
         ]

         if len(tokens_to_add) > 0:
-            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
-            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
-            # individual tokens would repeatedly rebuild a trie, which can be slow.
-            is_last_special = None
             tokens = []
             special_tokens = self.all_special_tokens
             for token in tokens_to_add:
@@ -209,14 +198,13 @@ def __init__(self, *args, **kwargs):
                     if isinstance(token, AddedToken)
                     else str(token) in special_tokens
                 )
-                if is_last_special is None or is_last_special == is_special:
-                    tokens.append(token)
+                if isinstance(token, str):
+                    token = AddedToken(token, special=is_special)
                 else:
-                    self._add_tokens(tokens, special_tokens=is_last_special)
-                    tokens = [token]
-                is_last_special = is_special
+                    token.special = is_special
+                tokens.append(token)
             if tokens:
-                self._add_tokens(tokens, special_tokens=is_last_special)
+                self.add_tokens(tokens)

     @property
     def is_fast(self) -> bool:
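In __init__, the batched _add_tokens() calls are likewise collapsed: every missing token is normalized to an AddedToken whose special flag is set explicitly, and the whole list is registered through a single add_tokens() call. A rough standalone illustration of that normalization step (the tokens_to_add and special_tokens values are hypothetical stand-ins, not the class internals):

    # Rough sketch of the normalization now done in __init__ (hypothetical inputs);
    # it relies on tokenizers.AddedToken exposing a writable `special` attribute,
    # just as the diff itself does.
    from tokenizers import AddedToken

    special_tokens = ["<bos>", "<eos>"]                       # stand-in for self.all_special_tokens
    tokens_to_add = ["<bos>", AddedToken("<extra_0>"), "hi"]  # stand-in for missing added tokens

    tokens = []
    for token in tokens_to_add:
        is_special = (
            (token.special or str(token) in special_tokens)
            if isinstance(token, AddedToken)
            else str(token) in special_tokens
        )
        if isinstance(token, str):
            token = AddedToken(token, special=is_special)
        else:
            token.special = is_special
        tokens.append(token)
    # a fast tokenizer would now register these with a single add_tokens(tokens) call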
@@ -849,6 +837,13 @@ def train_new_from_iterator(
                     if special_tokens_map is not None:
                         tokens = [special_tokens_map.get(token, token) for token in tokens]
                     post_processor["special_tokens"][key]["tokens"] = tokens
+                    for token in tokens:
+                        token_id = tokenizer.token_to_id(token)
+                        if token_id is None:
+                            raise ValueError(
+                                "Attempted to set a token in the post processor that does not exist in the mapping"
+                            )
+
                     post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]

             for special_token in ["cls", "sep"]:
@@ -857,6 +852,10 @@ def train_new_from_iterator(
                     if special_tokens_map is not None and token in special_tokens_map:
                         token = special_tokens_map[token]
                     token_id = tokenizer.token_to_id(token)
+                    if token_id is None:
+                        raise ValueError(
+                            "Attempted to set a token in the post processor that does not exist in the mapping"
+                        )
                     post_processor[special_token] = [token, token_id]

             trained_tokenizer_json["post_processor"] = post_processor
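The two new guards in train_new_from_iterator turn a silent None id into an explicit error when the post processor references a token that the freshly trained vocabulary does not contain. A condensed sketch of the guard (the tokenizer and the missing token below are hypothetical; the error message matches the diff):

    # Sketch of the new validation: token_to_id() returns None for unknown
    # tokens, which previously leaked into the post processor; now it raises.
    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel

    tokenizer = Tokenizer(WordLevel({"hello": 0}, unk_token="hello"))
    token = "[CLS]"  # hypothetical special token missing from the trained vocab

    token_id = tokenizer.token_to_id(token)
    if token_id is None:
        raise ValueError(
            "Attempted to set a token in the post processor that does not exist in the mapping"
        )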
