Skip to content

Commit 41b88a2

Browse files
committed
tokenizer
1 parent c8311e6 commit 41b88a2

File tree

10 files changed

+176
-80
lines changed

10 files changed

+176
-80
lines changed

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,10 @@ option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "Build the Flat Tensor extension"
186186
OFF
187187
)
188188

189+
option(EXECUTORCH_BUILD_EXTENSION_LLM "Build the LLM extension"
190+
OFF
191+
)
192+
189193
option(EXECUTORCH_BUILD_EXTENSION_MODULE "Build the Module extension" OFF)
190194

191195
option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension"
@@ -717,6 +721,10 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
717721
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor/serialize)
718722
endif()
719723

724+
if(EXECUTORCH_BUILD_EXTENSION_LLM)
725+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizer)
726+
endif()
727+
720728
if(EXECUTORCH_BUILD_EXTENSION_MODULE)
721729
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
722730
endif()

build/cmake_deps.toml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,22 @@ deps = [
402402
"xnnpack_backend",
403403
]
404404

405+
[targets.extension_llm_tokenizer]
406+
buck_targets = [
407+
"//extension/llm/tokenizer:bpe_tokenizer",
408+
"//extension/llm/tokenizer:tiktoken",
409+
]
410+
filters = [
411+
".cpp$",
412+
]
413+
excludes = [
414+
"^codegen",
415+
]
416+
deps = [
417+
"executorch",
418+
"executorch_core",
419+
]
420+
405421
[targets.llama_runner]
406422
buck_targets = [
407423
"//examples/models/llama/runner:runner",
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# Please keep this file formatted by running:
8+
# ~~~
9+
# cmake-format -i CMakeLists.txt
10+
# ~~~
11+
12+
cmake_minimum_required(VERSION 3.19)
13+
14+
# Source root directory for executorch.
15+
if(NOT EXECUTORCH_ROOT)
16+
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
17+
endif()
18+
19+
set(ABSL_ENABLE_INSTALL ON)
20+
set(ABSL_PROPAGATE_CXX_STD ON)
21+
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
22+
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
23+
add_subdirectory(
24+
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp
25+
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
26+
)
27+
add_subdirectory(
28+
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/re2
29+
${CMAKE_CURRENT_BINARY_DIR}/re2
30+
)
31+
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
32+
33+
list(TRANSFORM _extension_llm_tokenizer__srcs PREPEND "${EXECUTORCH_ROOT}/")
34+
add_library(extension_llm_tokenizer ${_extension_llm_tokenizer__srcs})
35+
target_include_directories(extension_llm_tokenizer PUBLIC
36+
${EXECUTORCH_ROOT}/..
37+
${_common_include_directories})
38+
39+
target_link_libraries(extension_llm_tokenizer re2::re2)
40+
target_compile_options(extension_llm_tokenizer PUBLIC ${_common_compile_options})
41+
42+
# Install libraries
43+
install(
44+
TARGETS extension_llm_tokenizer
45+
DESTINATION lib
46+
INCLUDES
47+
DESTINATION ${_common_include_directories}
48+
)
49+
50+
target_include_directories(
51+
extension_llm_tokenizer
52+
PRIVATE ${CMAKE_INSTALL_PREFIX}/include
53+
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp
54+
)
55+
56+
if(BUILD_TESTING)
57+
add_subdirectory(test)
58+
endif()

extension/llm/tokenizer/test/CMakeLists.txt

Lines changed: 34 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
# @generated by test/utils/generate_gtest_cmakelists.py
8+
#
79
# This file should be formatted with
810
# ~~~
911
# cmake-format -i CMakeLists.txt
@@ -12,39 +14,42 @@
1214
#
1315

1416
cmake_minimum_required(VERSION 3.19)
15-
project(tokenizer_test)
16-
17-
# Use C++17 for test.
18-
set(CMAKE_CXX_STANDARD 17)
1917

2018
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..)
2119

2220
include(${EXECUTORCH_ROOT}/build/Test.cmake)
2321

24-
set(_tokenizer_test_srcs
25-
test_tiktoken.cpp test_bpe_tokenizer.cpp
26-
${CMAKE_CURRENT_SOURCE_DIR}/../tiktoken.cpp
27-
${CMAKE_CURRENT_SOURCE_DIR}/../bpe_tokenizer.cpp
28-
)
29-
30-
set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/resources)
31-
set(ABSL_ENABLE_INSTALL ON)
32-
set(ABSL_PROPAGATE_CXX_STD ON)
33-
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
34-
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
35-
add_subdirectory(
36-
${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
37-
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
38-
)
39-
add_subdirectory(
40-
${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/re2
41-
${CMAKE_CURRENT_BINARY_DIR}/re2
22+
set(test_env
23+
"TEST_BPE_TOKENIZER=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_bpe_tokenizer.bin"
24+
"TEST_TIKTOKEN_INVALID_BASE64=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_base64.model"
25+
"TEST_TIKTOKEN_INVALID_RANK=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_rank.model"
26+
"TEST_TIKTOKEN_NO_SPACE=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_tiktoken_no_space.model"
27+
"TEST_TIKTOKEN_TOKENIZER=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_tiktoken_tokenizer.model"
4228
)
43-
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
4429

45-
et_cxx_test(tokenizer_test SOURCES ${_tokenizer_test_srcs} EXTRA_LIBS re2::re2)
46-
target_include_directories(
47-
tokenizer_test
48-
PRIVATE ${CMAKE_INSTALL_PREFIX}/include
49-
${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
50-
)
30+
# set(ABSL_ENABLE_INSTALL ON)
31+
# set(ABSL_PROPAGATE_CXX_STD ON)
32+
# set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
33+
# set(CMAKE_POSITION_INDEPENDENT_CODE ON)
34+
# add_subdirectory(
35+
# ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
36+
# ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
37+
# )
38+
# add_subdirectory(
39+
# ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/re2
40+
# ${CMAKE_CURRENT_BINARY_DIR}/re2
41+
# )
42+
# set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
43+
44+
set(_test_srcs test_bpe_tokenizer.cpp test_tiktoken.cpp)
45+
46+
et_cxx_test(extension_llm_tokenizer_test SOURCES ${_test_srcs} EXTRA_LIBS extension_llm_tokenizer)
47+
48+
set_property(TEST extension_llm_tokenizer_test PROPERTY ENVIRONMENT ${test_env})
49+
50+
# target_include_directories(extension_llm_tokenizer PUBLIC)
51+
# target_include_directories(
52+
# extension_llm_tokenizer_test
53+
# PRIVATE ${CMAKE_INSTALL_PREFIX}/include
54+
# ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
55+
# )

extension/llm/tokenizer/test/targets.bzl

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ def define_common_targets():
77
TARGETS and BUCK files that call this function.
88
"""
99

10+
test_env = {  # Env vars pointing the tests at their resource files.
11+
"TEST_BPE_TOKENIZER": "$(location //executorch/extension/llm/tokenizer/test/resources/test_bpe_tokenizer.bin)",
12+
"TEST_TIKTOKEN_INVALID_BASE64": "$(location //executorch/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_base64.model)",
13+
"TEST_TIKTOKEN_INVALID_RANK": "$(location //executorch/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_rank.model)",
14+
"TEST_TIKTOKEN_NO_SPACE": "$(location //executorch/extension/llm/tokenizer/test/resources/test_tiktoken_no_space.model)",
15+
"TEST_TIKTOKEN_TOKENIZER": "$(location //executorch/extension/llm/tokenizer/test/resources/test_tiktoken_tokenizer.model)",
16+
}
17+
1018
runtime.python_test(
1119
name = "test_tokenizer_py",
1220
srcs = [
@@ -25,9 +33,7 @@ def define_common_targets():
2533
deps = [
2634
"//executorch/extension/llm/tokenizer:bpe_tokenizer",
2735
],
28-
env = {
29-
"RESOURCES_PATH": "$(location :resources)/resources",
30-
},
36+
env = test_env,
3137
)
3238

3339
runtime.cxx_test(
@@ -45,10 +51,3 @@ def define_common_targets():
4551
"re2",
4652
],
4753
)
48-
49-
runtime.filegroup(
50-
name = "resources",
51-
srcs = native.glob([
52-
"resources/**",
53-
]),
54-
)

extension/llm/tokenizer/test/test_bpe_tokenizer.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9-
#ifdef EXECUTORCH_FB_BUCK
10-
#include <TestResourceUtils/TestResourceUtils.h>
11-
#endif
9+
// #ifdef EXECUTORCH_FB_BUCK
10+
// #include <TestResourceUtils/TestResourceUtils.h>
11+
// #endif
1212
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
1313
#include <executorch/runtime/platform/runtime.h>
1414
#include <gtest/gtest.h>
@@ -26,13 +26,15 @@ class TokenizerExtensionTest : public Test {
2626
void SetUp() override {
2727
executorch::runtime::runtime_init();
2828
tokenizer_ = std::make_unique<BPETokenizer>();
29-
#ifdef EXECUTORCH_FB_BUCK
30-
modelPath_ = facebook::xplat::testing::getPathForTestResource(
31-
"resources/test_bpe_tokenizer.bin");
32-
#else
33-
modelPath_ =
34-
std::getenv("RESOURCES_PATH") + std::string("/test_bpe_tokenizer.bin");
35-
#endif
29+
modelPath_ = std::getenv("TEST_BPE_TOKENIZER");
30+
// #ifdef EXECUTORCH_FB_BUCK
31+
// modelPath_ = facebook::xplat::testing::getPathForTestResource(
32+
// "resources/test_bpe_tokenizer.bin");
33+
// #else
34+
// modelPath_ =
35+
// std::getenv("RESOURCES_PATH") +
36+
// std::string("/test_bpe_tokenizer.bin");
37+
// #endif
3638
}
3739

3840
std::unique_ptr<Tokenizer> tokenizer_;
@@ -50,15 +52,15 @@ TEST_F(TokenizerExtensionTest, DecodeWithoutLoadFails) {
5052
}
5153

5254
TEST_F(TokenizerExtensionTest, DecodeOutOfRangeFails) {
53-
Error res = tokenizer_->load(modelPath_.c_str());
55+
Error res = tokenizer_->load(modelPath_);
5456
EXPECT_EQ(res, Error::Ok);
5557
auto result = tokenizer_->decode(0, 64000);
5658
// The vocab size is 32000, and token 64000 is out of vocab range.
5759
EXPECT_EQ(result.error(), Error::NotSupported);
5860
}
5961

6062
TEST_F(TokenizerExtensionTest, TokenizerMetadataIsExpected) {
61-
Error res = tokenizer_->load(modelPath_.c_str());
63+
Error res = tokenizer_->load(modelPath_);
6264
EXPECT_EQ(res, Error::Ok);
6365
// test_bpe_tokenizer.bin has vocab_size 0, bos_id 0, eos_id 0 recorded.
6466
EXPECT_EQ(tokenizer_->vocab_size(), 0);

0 commit comments

Comments
 (0)