1 file changed: +14 -1 lines changed
Original file line number | Diff line number | Diff line change
2020
2121from transformers import AutoTokenizer , GPT2Tokenizer , GPT2TokenizerFast
2222from transformers .models .gpt2 .tokenization_gpt2 import VOCAB_FILES_NAMES
23- from transformers .testing_utils import require_jinja , require_tokenizers
23+ from transformers .testing_utils import require_jinja , require_tokenizers , require_tiktoken
2424
2525from ...test_tokenization_common import TokenizerTesterMixin
2626
@@ -299,6 +299,19 @@ def test_tokenization_for_chat(self):
299299 for tokenized_chat , expected_tokens in zip (tokenized_chats , expected_tokens ):
300300 self .assertListEqual (tokenized_chat , expected_tokens )
301301
@require_tiktoken
def test_tokenization_tiktoken(self):
    """Check that a tokenizer converted from tiktoken agrees with the reference GPT-2 fast tokenizer.

    The tiktoken encoding registered for the "gpt2" model is converted with
    `convert_tiktoken_to_fast` into `self.tmpdirname`, reloaded as a
    `GPT2TokenizerFast`, and compared against the tokenizer loaded from the
    "openai-community/gpt2" checkpoint on both the encode and decode paths.
    """
    # Imported locally because tiktoken is an optional dependency guarded by
    # the `require_tiktoken` decorator.
    from tiktoken import encoding_name_for_model

    from transformers.integrations.tiktoken import convert_tiktoken_to_fast

    encoding = encoding_name_for_model("gpt2")
    convert_tiktoken_to_fast(encoding, self.tmpdirname)

    tiktoken_fast_tokenizer = GPT2TokenizerFast.from_pretrained(self.tmpdirname)
    rust_tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
    sequence = "lower newer"

    reference_ids = rust_tokenizer.encode(sequence)
    converted_ids = tiktoken_fast_tokenizer.encode(sequence)
    reference_text = rust_tokenizer.decode(reference_ids)

    # Encode path: the converted tokenizer must produce the same ids itself.
    # Previously only the reference tokenizer ever encoded, so a broken
    # conversion of merges / pre-tokenization would have gone unnoticed.
    self.assertListEqual(reference_ids, converted_ids)

    # Decode path: ids produced by the reference tokenizer must map back to
    # the same text through the converted vocabulary (the original check).
    self.assertEqual(reference_text, tiktoken_fast_tokenizer.decode(reference_ids))

    # Full round trip through the converted tokenizer only.
    self.assertEqual(reference_text, tiktoken_fast_tokenizer.decode(converted_ids))
314+
302315
303316@require_tokenizers
304317class OPTTokenizationTest (unittest .TestCase ):
You can’t perform that action at this time.
0 commit comments