Skip to content

Commit 0166966

Browse files
committed
Address comments
1 parent 74792a0 commit 0166966

File tree

2 files changed

+37
-1
lines changed

2 files changed

+37
-1
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
#
1414
# Minimum CMake required to configure this project.
cmake_minimum_required(VERSION 3.18)

# Build all C++ targets with the C++17 standard.
set(CMAKE_CXX_STANDARD 17)
# Treat subproject minimum-version declarations as if they were at least 3.5,
# avoiding hard errors from dependencies that declare very old minimums.
set(CMAKE_POLICY_VERSION_MINIMUM 3.5)

project(Tokenizers)

# Unit tests are opt-in; enable with -DTOKENIZERS_BUILD_TEST=ON.
option(TOKENIZERS_BUILD_TEST "Build tests" OFF)

test/test_python_bindings.py

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
Test script for PyTorch Tokenizers Python bindings
1010
"""
1111

12+
import os
1213
import sys
1314
import unittest
1415

@@ -91,3 +92,38 @@ def test_version(self):
9192
"""Test that version is available"""
9293
self.assertTrue(hasattr(pytorch_tokenizers, '__version__'))
9394
self.assertEqual(pytorch_tokenizers.__version__, "0.1.0")
95+
96+
def test_hf_tokenizer_encode_decode(self):
    """Test HFTokenizer with test_hf_tokenizer.json to encode/decode 'Hello world!'"""
    # Resolve the tokenizer JSON fixture relative to this test module.
    here = os.path.dirname(os.path.abspath(__file__))
    json_path = os.path.join(here, "resources", "test_hf_tokenizer.json")

    # A freshly constructed tokenizer starts out unloaded.
    tok = pytorch_tokenizers.HFTokenizer()
    self.assertFalse(tok.is_loaded())

    # Loading the JSON definition flips the loaded flag.
    tok.load(json_path)
    self.assertTrue(tok.is_loaded())

    # Round-trip: encoding a sample phrase yields a non-empty token-id list.
    sample = "Hello world!"
    ids = tok.encode(sample, 1, 0)  # bos=1, eos=0
    self.assertIsInstance(ids, list)
    self.assertGreater(len(ids), 0)

    # Each produced id must decode back to some string.
    for tid in ids:
        piece = tok.decode(tid)
        self.assertIsInstance(piece, str)

    # The loaded vocabulary is non-empty.
    self.assertGreater(tok.vocab_size(), 0)

    # Special-token accessors return integer ids.
    bos = tok.bos_tok()
    eos = tok.eos_tok()
    self.assertIsInstance(bos, int)
    self.assertIsInstance(eos, int)

0 commit comments

Comments (0)