Commit 9daee2e

ArthurZucker and ita.zaporozhets@huggingface.co authored
use TokenizersBackend (#42894)
* use `TokenizersBackend`
* fixes
* prioritize mapping
* only use mapping for some models
* fix fallback
* undo debug thing
* add case to `TokenizersBackend` init
* add default bos/eos token to tok backend
* set bos/eos
* fix more models
* mistral, idefics
* fix stopping criteria test
* try stopping criteria fix
* rebase
* update tokenizer model for stopping criteria test
* fix tuple mapping for ministral
* ignore `tokenizer_class` as it is always wrong
* up
* try to fix idefics
* fix unispeech and maybe others: fall back if conversion to the save class was not possible
* nits
* fixup
* TIL that it was ALSO saved in config.json...
* arf
* fall back to tokenizer config if no config.json
* people who map to Llama probably don't even want Llama either
* processors to load the tokenizers backend
* auto fix order
* try a different order
* mistral fix for weird chars
* reorder
* random fix attempt for tests that only fail locally, so hard to check here
* try an older commit
* fix mistral
* map unispeech
* try something out
* update
* nits
* try to be a little more restrictive
* token type ids for tokenizers should be explicit... let's see which tests fail and add to the specific classes
* nit
* idefics 1-2 are actually the only ones that should force-map to Llama
* small fixes
* fix layout
* fixup
* fix some tests
* 1 nit
* aria fix
* style
* canine
* fixup
* very small test
* style
* update to `TokenizersBackend`

---------

Co-authored-by: [email protected] <[email protected]>
Co-authored-by: [email protected] <[email protected]>
Co-authored-by: [email protected] <[email protected]>
Co-authored-by: [email protected] <[email protected]>
Co-authored-by: [email protected] <[email protected]>
Co-authored-by: itazap <[email protected]>
Co-authored-by: Ita Zaporozhets <[email protected]>
Co-authored-by: [email protected] <[email protected]>
Co-authored-by: [email protected] <[email protected]>
1 parent 69ec61f commit 9daee2e

29 files changed: +249 −243 lines changed
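Before the per-file diffs, a minimal sketch (not part of the commit) of what the change means for callers: the loading API is unchanged, only the base class of the concrete tokenizer moves to `TokenizersBackend`. The checkpoint id is illustrative, and the import path is an assumption mirrored from the Parakeet diff further down.

```python
# Minimal sketch, not from the diff: loading is unchanged, but the concrete
# tokenizer class is now expected to be a TokenizersBackend subclass.
# Checkpoint id is illustrative; the module path may differ in the released package.
from transformers import AutoTokenizer
from transformers.tokenization_utils_tokenizers import TokenizersBackend

tok = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
print(isinstance(tok, TokenizersBackend))   # expected: True after this change
print(tok("Hello there!")["input_ids"])     # encoding behaves as before
```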

docs/source/en/model_doc/parakeet.md

Lines changed: 2 additions & 2 deletions

@@ -188,9 +188,9 @@ outputs = model(**inputs)
 outputs.loss.backward()
 ```
 
-## ParakeetTokenizerFast
+## ParakeetTokenizer
 
-[[autodoc]] ParakeetTokenizerFast
+[[autodoc]] ParakeetTokenizer
 
 ## ParakeetFeatureExtractor
 

src/transformers/integrations/accelerate.py

Lines changed: 1 addition & 1 deletion

@@ -293,7 +293,7 @@ def _get_device_map(
     # especially if the model uses WeightConverter (because there will be some uncontrollable cpu memory spikes during
     # the conversions before we resave the weights). In those cases, it's better to offload to disk a bit more
     # if we were in-between, as otherwise we blow-up cpu memory
-    if max_memory is None:
+    if max_memory is None and "cpu" in inferred_max_memory:
        inferred_max_memory["cpu"] *= 0.90

    if hf_quantizer is not None:
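The one-line guard matters when the inferred max-memory map has no `"cpu"` entry (for example, a model placed entirely on accelerators): the unconditional `*= 0.90` would raise a `KeyError` on a plain dict. A standalone sketch of the guarded behaviour (names mirror the diff, but this is not the real `_get_device_map`):

```python
# Standalone sketch of the guarded CPU-budget scaling; not the real _get_device_map.
def scale_cpu_budget(inferred_max_memory: dict, max_memory=None) -> dict:
    # Only shrink the CPU budget when the caller did not pass max_memory
    # and a "cpu" entry actually exists.
    if max_memory is None and "cpu" in inferred_max_memory:
        inferred_max_memory["cpu"] *= 0.90
    return inferred_max_memory

print(scale_cpu_budget({"cpu": 16_000_000_000, 0: 24_000_000_000}))  # cpu budget scaled down
print(scale_cpu_budget({0: 24_000_000_000}))                         # no "cpu" key: left untouched
```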

src/transformers/models/auto/tokenization_auto.py

Lines changed: 126 additions & 155 deletions
Large diffs are not rendered by default.

src/transformers/models/blenderbot/tokenization_blenderbot.py

Lines changed: 6 additions & 7 deletions

@@ -160,13 +160,6 @@ def __init__(
 
         self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
         self._tokenizer.decoder = decoders.ByteLevel()
-        self._tokenizer.post_processor = processors.RobertaProcessing(
-            sep=(str(eos_token), self._vocab.get(str(eos_token), 2)),
-            cls=(str(bos_token), self._vocab.get(str(bos_token), 0)),
-            add_prefix_space=add_prefix_space,
-            trim_offsets=True,
-        )
-
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
@@ -178,6 +171,12 @@ def __init__(
             add_prefix_space=add_prefix_space,
             **kwargs,
         )
+        self._tokenizer.post_processor = processors.RobertaProcessing(
+            sep=(str(eos_token), self.eos_token_id),
+            cls=(str(bos_token), self.bos_token_id),
+            add_prefix_space=add_prefix_space,
+            trim_offsets=True,
+        )
 
 
 __all__ = ["BlenderbotTokenizer"]

src/transformers/models/canine/tokenization_canine.py

Lines changed: 2 additions & 0 deletions

@@ -67,6 +67,8 @@ class CanineTokenizer(PreTrainedTokenizer):
             The maximum sentence length the model accepts.
     """
 
+    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
+
     def __init__(
         self,
         bos_token=chr(CLS),
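`model_input_names` controls which tensors the tokenizer returns by default, so declaring it explicitly is what brings `token_type_ids` back for CANINE. A quick check, assuming a transformers build that includes this change (CANINE is character-based, so no vocab file is needed):

```python
# Quick check, assuming a transformers build with this change applied.
from transformers import CanineTokenizer

tok = CanineTokenizer()
enc = tok("clara", "and wally")
print(sorted(enc.keys()))  # expected: ['attention_mask', 'input_ids', 'token_type_ids']
```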

src/transformers/models/code_llama/tokenization_code_llama.py

Lines changed: 1 addition & 1 deletion

@@ -158,7 +158,7 @@ def __init__(
                 unk_token=str(unk_token),
             )
         )
-        prepend_scheme = "first" if self.add_prefix_space else "none"
+        prepend_scheme = "first" if self.add_prefix_space else "never"
        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
            replacement="▁", prepend_scheme=prepend_scheme, split=False
        )
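The `tokenizers` Metaspace pre-tokenizer accepts `"always"`, `"never"` or `"first"` as `prepend_scheme`; `"none"` is not an accepted value, which is what this one-word fix addresses. A hedged sketch of the corrected call in isolation (not CodeLlama's full setup):

```python
# Hedged sketch of the corrected Metaspace call in isolation.
from tokenizers import pre_tokenizers

pre = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="never", split=False)
print(pre.pre_tokenize_str("hello world"))  # with "never", no leading "▁" is prepended
```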

src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py

Lines changed: 1 addition & 0 deletions

@@ -160,6 +160,7 @@ class LayoutLMv2Tokenizer(TokenizersBackend):
 
     vocab_files_names = VOCAB_FILES_NAMES
     model = models.WordPiece
+    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
 
     def __init__(
         self,

src/transformers/models/nougat/tokenization_nougat.py

Lines changed: 11 additions & 16 deletions

@@ -441,31 +441,26 @@ def __init__(
         )
         self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)
 
-        # Set up post processor with bos and eos tokens
-        bos_token_id = self._vocab.get(str(bos_token), 0)
-        eos_token_id = self._vocab.get(str(eos_token), 2)
-        pad_token_id = self._vocab.get(str(pad_token), 1)
+        super().__init__(
+            errors=errors,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            **kwargs,
+        )
         self._tokenizer.post_processor = processors.TemplateProcessing(
             single=f"{bos_token}:0 $A:0 {eos_token}:0",
             pair="$A:0 $B:1",
             special_tokens=[
-                (str(eos_token), eos_token_id),
-                (str(bos_token), bos_token_id),
+                (str(eos_token), self.eos_token_id),
+                (str(bos_token), self.bos_token_id),
             ],
         )
 
         # Enable truncation and padding
         self._tokenizer.enable_truncation(max_length=4096)
-        self._tokenizer.enable_padding(length=4096, pad_id=pad_token_id, pad_token=str(pad_token))
-
-        super().__init__(
-            errors=errors,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            pad_token=pad_token,
-            **kwargs,
-        )
+        self._tokenizer.enable_padding(length=4096, pad_id=self.pad_token_id, pad_token=str(pad_token))
 
     def remove_hallucinated_references(self, text: str) -> str:
         """

src/transformers/models/parakeet/convert_nemo_to_hf.py

Lines changed: 2 additions & 2 deletions

@@ -30,7 +30,7 @@
     ParakeetFeatureExtractor,
     ParakeetForCTC,
     ParakeetProcessor,
-    ParakeetTokenizerFast,
+    ParakeetTokenizer,
 )
 from transformers.convert_slow_tokenizer import ParakeetConverter
 from transformers.utils.hub import cached_file
@@ -151,7 +151,7 @@ def extract_nemo_archive(nemo_file_path: str, extract_dir: str) -> dict[str, str
 
 def write_processor(nemo_config: dict, model_files, output_dir, push_to_repo_id=None):
     tokenizer_converted = ParakeetConverter(model_files["tokenizer_model_file"]).converted()
-    tokenizer_converted_fast = ParakeetTokenizerFast(
+    tokenizer_converted_fast = ParakeetTokenizer(
         tokenizer_object=tokenizer_converted,
         clean_up_tokenization_spaces=False,
     )

src/transformers/models/parakeet/tokenization_parakeet_fast.py renamed to src/transformers/models/parakeet/tokenization_parakeet.py

Lines changed: 3 additions & 3 deletions

@@ -16,10 +16,10 @@
 import itertools
 from typing import Optional, Union
 
-from ...tokenization_utils_tokenizers import PreTrainedTokenizerFast
+from ...tokenization_utils_tokenizers import TokenizersBackend
 
 
-class ParakeetTokenizerFast(PreTrainedTokenizerFast):
+class ParakeetTokenizer(TokenizersBackend):
     """
     Inherits all methods from [`PreTrainedTokenizerFast`]. Users should refer to this superclass for more information regarding those methods,
     except for `_decode` which is overridden to adapt it to CTC decoding:
@@ -51,4 +51,4 @@ def _decode(
     )
 
 
-__all__ = ["ParakeetTokenizerFast"]
+__all__ = ["ParakeetTokenizer"]
