Commit 15d4cd0

tokenizer
1 parent c8311e6 commit 15d4cd0

9 files changed: +125 -79 lines changed

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -186,6 +186,10 @@ option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "Build the Flat Tensor extension"
   OFF
 )
 
+option(EXECUTORCH_BUILD_EXTENSION_LLM "Build the LLM extension"
+  OFF
+)
+
 option(EXECUTORCH_BUILD_EXTENSION_MODULE "Build the Module extension" OFF)
 
 option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension"
@@ -717,6 +721,10 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor/serialize)
 endif()
 
+if(EXECUTORCH_BUILD_EXTENSION_LLM)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizer)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_MODULE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
 endif()

build/cmake_deps.toml

Lines changed: 16 additions & 0 deletions
@@ -402,6 +402,22 @@ deps = [
   "xnnpack_backend",
 ]
 
+[targets.extension_llm_tokenizer]
+buck_targets = [
+  "//extension/llm/tokenizer:bpe_tokenizer",
+  "//extension/llm/tokenizer:tiktoken",
+]
+filters = [
+  ".cpp$",
+]
+excludes = [
+  "^codegen",
+]
+deps = [
+  "executorch",
+  "executorch_core",
+]
+
 [targets.llama_runner]
 buck_targets = [
   "//examples/models/llama/runner:runner",

extension/llm/tokenizer/CMakeLists.txt

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Please this file formatted by running:
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+
+cmake_minimum_required(VERSION 3.19)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+endif()
+
+set(ABSL_ENABLE_INSTALL ON)
+set(ABSL_PROPAGATE_CXX_STD ON)
+set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+add_subdirectory(
+  ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
+)
+add_subdirectory(
+  ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/re2
+  ${CMAKE_CURRENT_BINARY_DIR}/re2
+)
+set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
+
+list(TRANSFORM _extension_llm_tokenizer__srcs PREPEND "${EXECUTORCH_ROOT}/")
+add_library(extension_llm_tokenizer ${_extension_llm_tokenizer__srcs})
+target_include_directories(
+  extension_llm_tokenizer PUBLIC ${EXECUTORCH_ROOT}/..
+                                 ${_common_include_directories}
+)
+
+target_link_libraries(extension_llm_tokenizer re2::re2)
+target_compile_options(
+  extension_llm_tokenizer PUBLIC ${_common_compile_options}
+)
+
+# Install libraries
+install(
+  TARGETS extension_llm_tokenizer
+  DESTINATION lib
+  INCLUDES
+  DESTINATION ${_common_include_directories}
+)
+
+target_include_directories(
+  extension_llm_tokenizer
+  PRIVATE ${CMAKE_INSTALL_PREFIX}/include
+          ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp
+)
+
+if(BUILD_TESTING)
+  add_subdirectory(test)
+endif()
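
Note: the sketch below shows what a minimal standalone consumer of the new extension_llm_tokenizer library could look like. It is illustrative only and makes assumptions not stated in this commit: that BPETokenizer lives in the executorch::extension::llm namespace (as Tiktoken does in the tests further down), that Error is executorch::runtime::Error, and that the tokenizer model path comes from argv.

// Hypothetical consumer of extension_llm_tokenizer; link against the library
// built by the CMakeLists.txt above (it already pulls in re2::re2).
#include <cstdio>

#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#include <executorch/runtime/platform/runtime.h>

int main(int argc, char** argv) {
  if (argc < 2) {
    std::fprintf(stderr, "usage: %s <tokenizer.bin>\n", argv[0]);
    return 1;
  }
  executorch::runtime::runtime_init();

  // Assumed namespace, mirroring ::executorch::extension::llm::Tiktoken.
  executorch::extension::llm::BPETokenizer tokenizer;
  if (tokenizer.load(argv[1]) != executorch::runtime::Error::Ok) {
    std::fprintf(stderr, "failed to load %s\n", argv[1]);
    return 1;
  }
  std::printf(
      "vocab_size=%llu bos=%llu eos=%llu\n",
      static_cast<unsigned long long>(tokenizer.vocab_size()),
      static_cast<unsigned long long>(tokenizer.bos_tok()),
      static_cast<unsigned long long>(tokenizer.eos_tok()));
  return 0;
}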

extension/llm/tokenizer/test/CMakeLists.txt

Lines changed: 13 additions & 27 deletions
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+#
 # This file should be formatted with
 # ~~~
 # cmake-format -i CMakeLists.txt
@@ -12,39 +13,24 @@
 #
 
 cmake_minimum_required(VERSION 3.19)
-project(tokenizer_test)
-
-# Use C++17 for test.
-set(CMAKE_CXX_STANDARD 17)
 
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..)
 
 include(${EXECUTORCH_ROOT}/build/Test.cmake)
 
-set(_tokenizer_test_srcs
-    test_tiktoken.cpp test_bpe_tokenizer.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/../tiktoken.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/../bpe_tokenizer.cpp
+set(test_env
+    "TEST_BPE_TOKENIZER=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_bpe_tokenizer.bin"
+    "TEST_TIKTOKEN_INVALID_BASE64=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_base64.model"
+    "TEST_TIKTOKEN_INVALID_RANK=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_rank.model"
+    "TEST_TIKTOKEN_NO_SPACE=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_tiktoken_no_space.model"
+    "TEST_TIKTOKEN_TOKENIZER=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_tiktoken_tokenizer.model"
 )
 
-set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/resources)
-set(ABSL_ENABLE_INSTALL ON)
-set(ABSL_PROPAGATE_CXX_STD ON)
-set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-add_subdirectory(
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
-)
-add_subdirectory(
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/re2
-  ${CMAKE_CURRENT_BINARY_DIR}/re2
-)
-set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
+set(_test_srcs test_bpe_tokenizer.cpp test_tiktoken.cpp)
 
-et_cxx_test(tokenizer_test SOURCES ${_tokenizer_test_srcs} EXTRA_LIBS re2::re2)
-target_include_directories(
-  tokenizer_test
-  PRIVATE ${CMAKE_INSTALL_PREFIX}/include
-          ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
+et_cxx_test(
+  extension_llm_tokenizer_test SOURCES ${_test_srcs} EXTRA_LIBS
+  extension_llm_tokenizer
 )
+
+set_property(TEST extension_llm_tokenizer_test PROPERTY ENVIRONMENT ${test_env})
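
Note: the test resources are now handed to the C++ tests as environment variables (set above via set_property(... ENVIRONMENT ...) and in targets.bzl below), and the tests read them with std::getenv. A hedged sketch of a guarded lookup follows; require_env is a hypothetical helper, not part of this commit, but it avoids the undefined behavior of constructing a std::string from the nullptr that std::getenv returns when a variable is unset.

// Hypothetical helper for tests that consume TEST_BPE_TOKENIZER,
// TEST_TIKTOKEN_TOKENIZER, etc. from the environment.
#include <cstdlib>
#include <stdexcept>
#include <string>

static std::string require_env(const char* name) {
  const char* value = std::getenv(name);  // nullptr when the variable is unset
  if (value == nullptr) {
    throw std::runtime_error(std::string("missing environment variable: ") + name);
  }
  return value;
}

// Possible use in a fixture: modelPath_ = require_env("TEST_BPE_TOKENIZER");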

extension/llm/tokenizer/test/targets.bzl

Lines changed: 9 additions & 10 deletions
@@ -7,6 +7,14 @@ def define_common_targets():
     TARGETS and BUCK files that call this function.
     """
 
+    test_env = {
+        "TEST_BPE_TOKENIZER": "$(location //executorch/extension/llm/tokenizer/test/resources/test_bpe_tokenizer.bin)",
+        "TEST_TIKTOKEN_INVALID_BASE64": "$(location //executorch/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_base64.model)",
+        "TEST_TIKTOKEN_INVALID_RANK": "$(location //executorch/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_rank.model)",
+        "TEST_TIKTOKEN_NO_SPACE": "$(location //executorch/extension/llm/tokenizer/test/resources/test_tiktoken_no_space.model)",
+        "TEST_TIKTOKEN_TOKENIZER": "$(location //executorch/extension/llm/tokenizer/test/resources/test_tiktoken_tokenizer.model)",
+    }
+
     runtime.python_test(
         name = "test_tokenizer_py",
         srcs = [
@@ -25,9 +33,7 @@ def define_common_targets():
         deps = [
             "//executorch/extension/llm/tokenizer:bpe_tokenizer",
         ],
-        env = {
-            "RESOURCES_PATH": "$(location :resources)/resources",
-        },
+        env = test_env,
     )
 
     runtime.cxx_test(
@@ -45,10 +51,3 @@ def define_common_targets():
             "re2",
         ],
     )
-
-    runtime.filegroup(
-        name = "resources",
-        srcs = native.glob([
-            "resources/**",
-        ]),
-    )

extension/llm/tokenizer/test/test_bpe_tokenizer.cpp

Lines changed: 3 additions & 13 deletions
@@ -6,13 +6,9 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#ifdef EXECUTORCH_FB_BUCK
-#include <TestResourceUtils/TestResourceUtils.h>
-#endif
 #include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
 #include <executorch/runtime/platform/runtime.h>
 #include <gtest/gtest.h>
-#include <vector>
 
 using namespace ::testing;
 
@@ -26,13 +22,7 @@ class TokenizerExtensionTest : public Test {
   void SetUp() override {
     executorch::runtime::runtime_init();
     tokenizer_ = std::make_unique<BPETokenizer>();
-#ifdef EXECUTORCH_FB_BUCK
-    modelPath_ = facebook::xplat::testing::getPathForTestResource(
-        "resources/test_bpe_tokenizer.bin");
-#else
-    modelPath_ =
-        std::getenv("RESOURCES_PATH") + std::string("/test_bpe_tokenizer.bin");
-#endif
+    modelPath_ = std::getenv("TEST_BPE_TOKENIZER");
   }
 
   std::unique_ptr<Tokenizer> tokenizer_;
@@ -50,15 +40,15 @@ TEST_F(TokenizerExtensionTest, DecodeWithoutLoadFails) {
 }
 
 TEST_F(TokenizerExtensionTest, DecodeOutOfRangeFails) {
-  Error res = tokenizer_->load(modelPath_.c_str());
+  Error res = tokenizer_->load(modelPath_);
   EXPECT_EQ(res, Error::Ok);
   auto result = tokenizer_->decode(0, 64000);
   // The vocab size is 32000, and token 64000 is out of vocab range.
   EXPECT_EQ(result.error(), Error::NotSupported);
 }
 
 TEST_F(TokenizerExtensionTest, TokenizerMetadataIsExpected) {
-  Error res = tokenizer_->load(modelPath_.c_str());
+  Error res = tokenizer_->load(modelPath_);
   EXPECT_EQ(res, Error::Ok);
   // test_bpe_tokenizer.bin has vocab_size 0, bos_id 0, eos_id 0 recorded.
   EXPECT_EQ(tokenizer_->vocab_size(), 0);
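
Note: the decode(prev, cur) call exercised above returns the text piece for the current token wrapped in a Result, and out-of-range ids surface as Error::NotSupported. Below is a hedged sketch of driving that interface over a whole token stream; decode_all is a hypothetical helper, not part of this commit, and it assumes the Tokenizer base class lives in executorch::extension::llm with Error in executorch::runtime.

// Hypothetical pairwise-decode loop over the Tokenizer interface used above.
#include <cstdint>
#include <string>
#include <vector>

#include <executorch/extension/llm/tokenizer/tokenizer.h>  // assumed base-class header

std::string decode_all(
    ::executorch::extension::llm::Tokenizer& tokenizer,
    const std::vector<uint64_t>& tokens) {
  std::string text;
  uint64_t prev = tokenizer.bos_tok();
  for (uint64_t token : tokens) {
    auto piece = tokenizer.decode(prev, token);  // Result<std::string>
    if (piece.error() != ::executorch::runtime::Error::Ok) {
      break;  // e.g. Error::NotSupported for an out-of-vocab id, as tested above
    }
    text += piece.get();
    prev = token;
  }
  return text;
}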

extension/llm/tokenizer/test/test_tiktoken.cpp

Lines changed: 13 additions & 28 deletions
@@ -6,14 +6,10 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#ifdef EXECUTORCH_FB_BUCK
-#include <TestResourceUtils/TestResourceUtils.h>
-#endif
 #include <executorch/extension/llm/tokenizer/tiktoken.h>
 #include <executorch/runtime/platform/runtime.h>
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include <vector>
 
 using namespace ::testing;
 using ::executorch::extension::llm::Tiktoken;
@@ -49,15 +45,6 @@ static inline std::unique_ptr<std::vector<std::string>> _get_special_tokens() {
   }
   return special_tokens;
 }
-
-static inline std::string _get_resource_path(const std::string& name) {
-#ifdef EXECUTORCH_FB_BUCK
-  return facebook::xplat::testing::getPathForTestResource("resources/" + name);
-#else
-  return std::getenv("RESOURCES_PATH") + std::string("/") + name;
-#endif
-}
-
 } // namespace
 
 class TiktokenExtensionTest : public Test {
@@ -66,7 +53,7 @@ class TiktokenExtensionTest : public Test {
     executorch::runtime::runtime_init();
     tokenizer_ = std::make_unique<Tiktoken>(
        _get_special_tokens(), kBOSTokenIndex, kEOSTokenIndex);
-    modelPath_ = _get_resource_path("test_tiktoken_tokenizer.model");
+    modelPath_ = std::getenv("TEST_TIKTOKEN_TOKENIZER");
   }
 
   std::unique_ptr<Tokenizer> tokenizer_;
@@ -84,15 +71,15 @@ TEST_F(TiktokenExtensionTest, DecodeWithoutLoadFails) {
 }
 
 TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
-  Error res = tokenizer_->load(modelPath_.c_str());
+  Error res = tokenizer_->load(modelPath_);
   EXPECT_EQ(res, Error::Ok);
   EXPECT_EQ(tokenizer_->vocab_size(), 128256);
   EXPECT_EQ(tokenizer_->bos_tok(), 128000);
   EXPECT_EQ(tokenizer_->eos_tok(), 128001);
 }
 
 TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
-  Error res = tokenizer_->load(modelPath_.c_str());
+  Error res = tokenizer_->load(modelPath_);
   EXPECT_EQ(res, Error::Ok);
   Result<std::vector<uint64_t>> out = tokenizer_->encode("hello world", 1, 0);
   EXPECT_EQ(out.error(), Error::Ok);
@@ -103,7 +90,7 @@ TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
 }
 
 TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
-  Error res = tokenizer_->load(modelPath_.c_str());
+  Error res = tokenizer_->load(modelPath_);
   EXPECT_EQ(res, Error::Ok);
   std::vector<std::string> expected = {"<|begin_of_text|>", "hello", " world"};
   std::vector<uint64_t> tokens = {128000, 15339, 1917};
@@ -115,7 +102,7 @@ TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
 }
 
 TEST_F(TiktokenExtensionTest, TokenizerDecodeOutOfRangeFails) {
-  Error res = tokenizer_->load(modelPath_.c_str());
+  Error res = tokenizer_->load(modelPath_);
   EXPECT_EQ(res, Error::Ok);
   // The vocab size is 128256, addes 256 just so the token is out of vocab
   // range.
@@ -160,31 +147,29 @@ TEST_F(TiktokenExtensionTest, LoadWithInvalidPath) {
 }
 
 TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithInvalidRank) {
-  auto invalidModelPath =
-      _get_resource_path("test_tiktoken_invalid_rank.model");
-  Error res = tokenizer_->load(invalidModelPath.c_str());
+  auto invalidModelPath = std::getenv("TEST_TIKTOKEN_INVALID_RANK");
+  Error res = tokenizer_->load(invalidModelPath);
 
   EXPECT_EQ(res, Error::InvalidArgument);
 }
 
 TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithInvalidBase64) {
-  auto invalidModelPath =
-      _get_resource_path("test_tiktoken_invalid_base64.model");
-  Error res = tokenizer_->load(invalidModelPath.c_str());
+  auto invalidModelPath = std::getenv("TEST_TIKTOKEN_INVALID_BASE64");
+  Error res = tokenizer_->load(invalidModelPath);
 
   EXPECT_EQ(res, Error::InvalidArgument);
 }
 
 TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithNoSpace) {
-  auto invalidModelPath = _get_resource_path("test_tiktoken_no_space.model");
-  Error res = tokenizer_->load(invalidModelPath.c_str());
+  auto invalidModelPath = std::getenv("TEST_TIKTOKEN_NO_SPACE");
+  Error res = tokenizer_->load(invalidModelPath);
 
   EXPECT_EQ(res, Error::InvalidArgument);
 }
 
 TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithBPEFile) {
-  auto invalidModelPath = _get_resource_path("test_bpe_tokenizer.bin");
-  Error res = tokenizer_->load(invalidModelPath.c_str());
+  auto invalidModelPath = std::getenv("TEST_BPE_TOKENIZER");
+  Error res = tokenizer_->load(invalidModelPath);
 
   EXPECT_EQ(res, Error::InvalidArgument);
 }
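
Note: a hedged sketch of the encode path exercised by TokenizerEncodeCorrectly follows. encode_or_empty is a hypothetical helper, not part of this commit; the trailing 1 and 0 arguments are taken to be the bos/eos counts used by the test, and the example ids in the comment come from the decode expectations shown above.

// Hypothetical wrapper around the encode() call shown in the tests above.
#include <cstdint>
#include <string>
#include <vector>

#include <executorch/extension/llm/tokenizer/tokenizer.h>  // assumed base-class header

std::vector<uint64_t> encode_or_empty(
    ::executorch::extension::llm::Tokenizer& tokenizer, const std::string& text) {
  auto out = tokenizer.encode(text, /*bos=*/1, /*eos=*/0);
  if (out.error() != ::executorch::runtime::Error::Ok) {
    return {};
  }
  // With the test_tiktoken_tokenizer.model resource above, "hello world"
  // should come back as {128000, 15339, 1917} (bos, "hello", " world").
  return out.get();
}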

runtime/executor/test/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ add_custom_command(
     "ModuleAdd,ModuleAddHalf,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleLinear,ModuleMultipleEntry,ModuleSimpleTrain"
     --outdir "${CMAKE_BINARY_DIR}" 2> /dev/null
   COMMAND
-    python3 -m test.models.export_program --modules
+    python -m test.models.export_program --modules
     "ModuleLinear" --external-constants
     --outdir "${CMAKE_BINARY_DIR}" 2> /dev/null
   COMMAND

test/run_oss_cpp_tests.sh

Lines changed: 1 addition & 0 deletions
@@ -41,6 +41,7 @@ build_executorch() {
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
