|
9 | 9 | Test script for PyTorch Tokenizers Python bindings
|
10 | 10 | """
|
11 | 11 |
|
| 12 | +import os |
12 | 13 | import sys
|
13 | 14 | import unittest
|
14 | 15 |
|
@@ -91,3 +92,38 @@ def test_version(self):
|
91 | 92 | """Test that version is available"""
|
92 | 93 | self.assertTrue(hasattr(pytorch_tokenizers, '__version__'))
|
93 | 94 | self.assertEqual(pytorch_tokenizers.__version__, "0.1.0")
|
| 95 | + |
def test_hf_tokenizer_encode_decode(self):
    """Exercise HFTokenizer end to end using resources/test_hf_tokenizer.json.

    Loads the JSON tokenizer stored next to this test file, encodes
    'Hello world!', decodes every resulting id, and finally checks the
    vocabulary size and the BOS/EOS ids.
    """
    # The fixture is resolved relative to this file, not the working directory.
    json_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "resources",
        "test_hf_tokenizer.json",
    )

    # A freshly constructed tokenizer must report itself as not loaded.
    tokenizer = pytorch_tokenizers.HFTokenizer()
    self.assertFalse(tokenizer.is_loaded())

    tokenizer.load(json_path)
    self.assertTrue(tokenizer.is_loaded())

    # Positional arguments after the text: bos=1, eos=0.
    token_ids = tokenizer.encode("Hello world!", 1, 0)
    self.assertIsInstance(token_ids, list)
    self.assertGreater(len(token_ids), 0)

    # Each id is decoded on its own; only the result type is checked.
    for tok in token_ids:
        self.assertIsInstance(tokenizer.decode(tok), str)

    self.assertGreater(tokenizer.vocab_size(), 0)

    # Both special-token ids are fetched first, then type-checked in order.
    for special_id in (tokenizer.bos_tok(), tokenizer.eos_tok()):
        self.assertIsInstance(special_id, int)
0 commit comments