Commit 8b44cb0

Add unit tests for sentencepiece tokenizer
1 parent 6bca95c commit 8b44cb0

5 files changed: 93 additions & 7 deletions

.lintrunner.toml

Lines changed: 1 addition & 0 deletions

@@ -123,6 +123,7 @@ code = 'NEWLINE'
 include_patterns = ['**']
 exclude_patterns = [
     'third-party/**',
+    'test/resources/*.model',
 ]
 command = [
     'python',

The new exclusion keeps the binary test model out of the NEWLINE linter's scope.

CMakeLists.txt

Lines changed: 18 additions & 5 deletions

@@ -15,22 +15,35 @@
 # It should also be cmake-lint clean.
 #
 cmake_minimum_required(VERSION 3.24)
+set(CMAKE_CXX_STANDARD 17)
 
 project(Tokenizers)
 
+option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
+
 set(ABSL_ENABLE_INSTALL ON)
 set(ABSL_PROPAGATE_CXX_STD ON)
-set(_pic_flag
-    ${CMAKE_POSITION_INDEPENDENT_CODE})
+set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 add_subdirectory(third-party/sentencepiece)
 set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
 
 add_library(tokenizers STATIC src/sentencepiece.cpp)
 
 # Using abseil from sentencepiece/third_party
-target_include_directories(
-    tokenizers PUBLIC third-party/sentencepiece/src
-    third-party/sentencepiece include)
+target_include_directories(tokenizers PUBLIC third-party/sentencepiece/src
+                                             third-party/sentencepiece include)
 
 target_link_libraries(tokenizers PUBLIC sentencepiece-static)
+
+# Build test
+if(TOKENIZERS_BUILD_TEST)
+  find_package(GTest REQUIRED)
+  set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/test/resources)
+  add_executable(sentencepiece_test test/test_sentencepiece.cpp)
+  target_include_directories(
+    sentencepiece_test PUBLIC third-party/sentencepiece/src
+                              third-party/sentencepiece include)
+  target_link_libraries(sentencepiece_test PUBLIC tokenizers GTest::GTest
+                                                  GTest::Main)
+endif()
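For reference, a plausible way to try the new option locally (assuming GTest is installed where find_package can see it) is to configure with cmake -B build -DTOKENIZERS_BUILD_TEST=ON and then build with cmake --build build. Note that set(ENV{RESOURCES_PATH} ...) only sets the variable in CMake's own environment during configuration, so when running the resulting sentencepiece_test binary directly the variable likely needs to be exported in the shell, e.g. RESOURCES_PATH=$PWD/test/resources ./build/sentencepiece_test (the binary's location varies by generator).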

src/sentencepiece.cpp

Lines changed: 2 additions & 2 deletions

@@ -38,8 +38,8 @@ Error SPTokenizer::load(const std::string &tokenizer_path) {
   const auto status = _processor->Load(tokenizer_path);
   if (!status.ok()) {
     fprintf(stderr,
-            "couldn't load %s\n. If this tokenizer artifact is for llama3, "
-            "please pass `-l 3`.",
+            "couldn't load %s\n. It is likely that the tokenizer artifact is "
+            "broken or of a different format.",
             tokenizer_path.c_str());
     return Error::LoadFailure;
   }
test/resources/test_sentencepiece.model

488 KB. Binary file not shown.

test/test_sentencepiece.cpp

Lines changed: 72 additions & 0 deletions

@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "sentencepiece.h"
+#include "gtest/gtest.h"
+
+namespace tokenizers {
+
+TEST(SPTokenizerTest, TestEncodeWithoutLoad) {
+  SPTokenizer tokenizer;
+  std::string text = "Hello world!";
+  auto result = tokenizer.encode(text, /*bos*/ 0, /*eos*/ 1);
+  EXPECT_EQ(result.error(), Error::Uninitialized);
+}
+
+TEST(SPTokenizerTest, TestDecodeWithoutLoad) {
+  SPTokenizer tokenizer;
+  auto result = tokenizer.decode(0, 0);
+  EXPECT_EQ(result.error(), Error::Uninitialized);
+}
+
+TEST(SPTokenizerTest, TestLoad) {
+  SPTokenizer tokenizer;
+  auto resources = std::getenv("RESOURCES_PATH");
+  auto path = resources + std::string("/test_sentencepiece.model");
+  auto error = tokenizer.load(path);
+  EXPECT_EQ(error, Error::Ok);
+}
+
+TEST(SPTokenizerTest, TestLoadInvalidPath) {
+  SPTokenizer tokenizer;
+  auto error = tokenizer.load("invalid_path");
+  EXPECT_EQ(error, Error::LoadFailure);
+}
+
+TEST(SPTokenizerTest, TestEncode) {
+  SPTokenizer tokenizer;
+  auto resources = std::getenv("RESOURCES_PATH");
+  auto path = resources + std::string("/test_sentencepiece.model");
+  auto error = tokenizer.load(path);
+  EXPECT_EQ(error, Error::Ok);
+  std::string text = "Hello world!";
+  auto result = tokenizer.encode(text, /*bos*/ 1, /*eos*/ 0);
+  EXPECT_TRUE(result.ok());
+  EXPECT_EQ(result.get().size(), 4);
+  EXPECT_EQ(result.get()[0], 1);
+  EXPECT_EQ(result.get()[1], 15043);
+  EXPECT_EQ(result.get()[2], 3186);
+  EXPECT_EQ(result.get()[3], 29991);
+}
+
+TEST(SPTokenizerTest, TestDecode) {
+  SPTokenizer tokenizer;
+  auto resources = std::getenv("RESOURCES_PATH");
+  auto path = resources + std::string("/test_sentencepiece.model");
+  auto error = tokenizer.load(path);
+  EXPECT_EQ(error, Error::Ok);
+  std::vector<uint64_t> tokens = {1, 15043, 3186, 29991};
+  std::vector<std::string> expected = {"", "Hello", " world", "!"};
+  for (auto i = 0; i < 3; ++i) {
+    auto result = tokenizer.decode(tokens[i], tokens[i + 1]);
+    EXPECT_TRUE(result.ok());
+    EXPECT_EQ(result.get(), expected[i + 1]);
+  }
+}
+
+} // namespace tokenizers
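For orientation, below is a minimal usage sketch (not part of this commit) inferred from the API these tests exercise: load() returns an Error, while encode() and decode() return result objects with ok(), error(), and get(). The model path is an assumption; adjust it to your checkout.

#include <cstdint>
#include <cstdio>
#include <string>

#include "sentencepiece.h"

int main() {
  tokenizers::SPTokenizer tokenizer;
  // Assumed model location; any SentencePiece model should work here.
  if (tokenizer.load("test/resources/test_sentencepiece.model") !=
      tokenizers::Error::Ok) {
    fprintf(stderr, "couldn't load tokenizer\n");
    return 1;
  }
  auto encoded = tokenizer.encode("Hello world!", /*bos*/ 1, /*eos*/ 0);
  if (!encoded.ok()) {
    return 1;
  }
  // decode() maps (previous token, current token) to a text piece, as in
  // TestDecode above, so walk the token stream pairwise.
  for (size_t i = 1; i < encoded.get().size(); ++i) {
    auto piece = tokenizer.decode(encoded.get()[i - 1], encoded.get()[i]);
    if (piece.ok()) {
      printf("%s", piece.get().c_str());
    }
  }
  printf("\n");
  return 0;
}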
