From b1a9127d29a3e571f5c3598056fe7b95aac14f62 Mon Sep 17 00:00:00 2001 From: lucylq Date: Fri, 14 Feb 2025 15:39:34 -0800 Subject: [PATCH] Refactor tokenizer test and add to cmake (#8450) Summary: Refactor tokenizer test to use env instead of resource path. Add to cmake tests. Test Plan: ## Internal ``` buck2 test fbsource//xplat/executorch/extension/llm/tokenizer/test:test_tiktoken buck2 test fbsource//xplat/executorch/extension/llm/tokenizer/test:test_bpe_tokenizer buck2 test fbcode//executorch/extension/llm/tokenizer/test:test_tiktoken buck2 test fbcode//executorch/extension/llm/tokenizer/test:test_bpe_tokenizer ``` ## OSS build et ``` ./install_executorch.sh ``` build test ``` CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" cmake . \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ -DEXECUTORCH_USE_CPP_CODE_COVERAGE=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ -DEXECUTORCH_BUILD_TESTS=ON \ -Bcmake-out cmake --build cmake-out -j9 --target install ``` test ``` cd cmake-out ctest -R tokenizer Test project /data/users/lfq/executorch/cmake-out Start 54: extension_llm_tokenizer_test 1/1 Test #54: extension_llm_tokenizer_test ..... 
Passed 3.66 sec 100% tests passed, 0 tests failed out of 1 Total Test time (real) =--sanitized-- Differential Revision: D69642007 Pulled By: lucylq --- CMakeLists.txt | 8 +++ build/Utils.cmake | 3 + build/cmake_deps.toml | 16 +++++ extension/llm/tokenizer/CMakeLists.txt | 61 +++++++++++++++++++ extension/llm/tokenizer/test/CMakeLists.txt | 35 +++-------- extension/llm/tokenizer/test/targets.bzl | 7 +++ .../llm/tokenizer/test/test_bpe_tokenizer.cpp | 13 +--- .../llm/tokenizer/test/test_tiktoken.cpp | 46 ++++++-------- test/run_oss_cpp_tests.sh | 2 - 9 files changed, 122 insertions(+), 69 deletions(-) create mode 100644 extension/llm/tokenizer/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index ee5611fdb13..69232b85a53 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,6 +186,10 @@ option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "Build the Flat Tensor extension" OFF ) +option(EXECUTORCH_BUILD_EXTENSION_LLM "Build the LLM extension" + OFF +) + option(EXECUTORCH_BUILD_EXTENSION_MODULE "Build the Module extension" OFF) option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension" @@ -718,6 +722,10 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor/serialize) endif() +if(EXECUTORCH_BUILD_EXTENSION_LLM) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizer) +endif() + if(EXECUTORCH_BUILD_EXTENSION_MODULE) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module) endif() diff --git a/build/Utils.cmake b/build/Utils.cmake index dca3f189ec4..3d4e9c76005 100644 --- a/build/Utils.cmake +++ b/build/Utils.cmake @@ -70,6 +70,9 @@ function(executorch_print_configuration_summary) message(STATUS " EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR : " "${EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR}" ) + message(STATUS " EXECUTORCH_BUILD_EXTENSION_LLM : " + "${EXECUTORCH_BUILD_EXTENSION_LLM}" + ) message(STATUS " EXECUTORCH_BUILD_EXTENSION_MODULE : " 
"${EXECUTORCH_BUILD_EXTENSION_MODULE}" ) diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 287db04045e..c44fcf92ea6 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -402,6 +402,22 @@ deps = [ "xnnpack_backend", ] +[targets.extension_llm_tokenizer] +buck_targets = [ + "//extension/llm/tokenizer:bpe_tokenizer", + "//extension/llm/tokenizer:tiktoken", +] +filters = [ + ".cpp$", +] +excludes = [ + "^codegen", +] +deps = [ + "executorch", + "executorch_core", +] + [targets.llama_runner] buck_targets = [ "//examples/models/llama/runner:runner", diff --git a/extension/llm/tokenizer/CMakeLists.txt b/extension/llm/tokenizer/CMakeLists.txt new file mode 100644 index 00000000000..8745da6780a --- /dev/null +++ b/extension/llm/tokenizer/CMakeLists.txt @@ -0,0 +1,61 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Please keep this file formatted by running: +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ + +cmake_minimum_required(VERSION 3.19) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) +endif() + +set(ABSL_ENABLE_INSTALL ON) +set(ABSL_PROPAGATE_CXX_STD ON) +set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp + ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp +) +add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/re2 + ${CMAKE_CURRENT_BINARY_DIR}/re2 +) +set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) + +list(TRANSFORM _extension_llm_tokenizer__srcs PREPEND "${EXECUTORCH_ROOT}/") +add_library(extension_llm_tokenizer ${_extension_llm_tokenizer__srcs}) +target_include_directories( + extension_llm_tokenizer PUBLIC ${EXECUTORCH_ROOT}/.. 
+ ${_common_include_directories} +) + +target_link_libraries(extension_llm_tokenizer re2::re2) +target_compile_options( + extension_llm_tokenizer PUBLIC ${_common_compile_options} +) + +# Install libraries +install( + TARGETS extension_llm_tokenizer + DESTINATION lib + INCLUDES + DESTINATION ${_common_include_directories} +) + +target_include_directories( + extension_llm_tokenizer + PRIVATE ${CMAKE_INSTALL_PREFIX}/include + ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp +) + +if(BUILD_TESTING) + add_subdirectory(test) +endif() diff --git a/extension/llm/tokenizer/test/CMakeLists.txt b/extension/llm/tokenizer/test/CMakeLists.txt index b631e7cf5b3..ffc37f9e46f 100644 --- a/extension/llm/tokenizer/test/CMakeLists.txt +++ b/extension/llm/tokenizer/test/CMakeLists.txt @@ -12,39 +12,18 @@ # cmake_minimum_required(VERSION 3.19) -project(tokenizer_test) - -# Use C++17 for test. -set(CMAKE_CXX_STANDARD 17) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) include(${EXECUTORCH_ROOT}/build/Test.cmake) -set(_tokenizer_test_srcs - test_tiktoken.cpp test_bpe_tokenizer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../tiktoken.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../bpe_tokenizer.cpp -) +set(test_env "RESOURCES_PATH=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources") -set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/resources) -set(ABSL_ENABLE_INSTALL ON) -set(ABSL_PROPAGATE_CXX_STD ON) -set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp - ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp -) -add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/re2 - ${CMAKE_CURRENT_BINARY_DIR}/re2 -) -set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) +set(_test_srcs test_bpe_tokenizer.cpp test_tiktoken.cpp) -et_cxx_test(tokenizer_test SOURCES ${_tokenizer_test_srcs} EXTRA_LIBS re2::re2) -target_include_directories( - tokenizer_test - PRIVATE 
${CMAKE_INSTALL_PREFIX}/include - ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp +et_cxx_test( + extension_llm_tokenizer_test SOURCES ${_test_srcs} EXTRA_LIBS + extension_llm_tokenizer ) + +set_property(TEST extension_llm_tokenizer_test PROPERTY ENVIRONMENT ${test_env}) diff --git a/extension/llm/tokenizer/test/targets.bzl b/extension/llm/tokenizer/test/targets.bzl index 42f60b44ed3..2c314a98230 100644 --- a/extension/llm/tokenizer/test/targets.bzl +++ b/extension/llm/tokenizer/test/targets.bzl @@ -1,3 +1,8 @@ +load( + "@fbsource//tools/build_defs:default_platform_defs.bzl", + "ANDROID", + "CXX", +) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def define_common_targets(): @@ -28,6 +33,7 @@ def define_common_targets(): env = { "RESOURCES_PATH": "$(location :resources)/resources", }, + platforms = [CXX, ANDROID], # Cannot bundle resources on Apple platform. ) runtime.cxx_test( @@ -41,6 +47,7 @@ def define_common_targets(): env = { "RESOURCES_PATH": "$(location :resources)/resources", }, + platforms = [CXX, ANDROID], # Cannot bundle resources on Apple platform. external_deps = [ "re2", ], diff --git a/extension/llm/tokenizer/test/test_bpe_tokenizer.cpp b/extension/llm/tokenizer/test/test_bpe_tokenizer.cpp index d207578de1e..2d208cb6973 100644 --- a/extension/llm/tokenizer/test/test_bpe_tokenizer.cpp +++ b/extension/llm/tokenizer/test/test_bpe_tokenizer.cpp @@ -6,13 +6,9 @@ * LICENSE file in the root directory of this source tree. 
*/ -#ifdef EXECUTORCH_FB_BUCK -#include -#endif #include #include #include -#include using namespace ::testing; @@ -26,13 +22,8 @@ class TokenizerExtensionTest : public Test { void SetUp() override { executorch::runtime::runtime_init(); tokenizer_ = std::make_unique(); -#ifdef EXECUTORCH_FB_BUCK - modelPath_ = facebook::xplat::testing::getPathForTestResource( - "resources/test_bpe_tokenizer.bin"); -#else modelPath_ = std::getenv("RESOURCES_PATH") + std::string("/test_bpe_tokenizer.bin"); -#endif } std::unique_ptr tokenizer_; @@ -50,7 +41,7 @@ TEST_F(TokenizerExtensionTest, DecodeWithoutLoadFails) { } TEST_F(TokenizerExtensionTest, DecodeOutOfRangeFails) { - Error res = tokenizer_->load(modelPath_.c_str()); + Error res = tokenizer_->load(modelPath_); EXPECT_EQ(res, Error::Ok); auto result = tokenizer_->decode(0, 64000); // The vocab size is 32000, and token 64000 is out of vocab range. @@ -58,7 +49,7 @@ TEST_F(TokenizerExtensionTest, DecodeOutOfRangeFails) { } TEST_F(TokenizerExtensionTest, TokenizerMetadataIsExpected) { - Error res = tokenizer_->load(modelPath_.c_str()); + Error res = tokenizer_->load(modelPath_); EXPECT_EQ(res, Error::Ok); // test_bpe_tokenizer.bin has vocab_size 0, bos_id 0, eos_id 0 recorded. EXPECT_EQ(tokenizer_->vocab_size(), 0); diff --git a/extension/llm/tokenizer/test/test_tiktoken.cpp b/extension/llm/tokenizer/test/test_tiktoken.cpp index 3132170683c..ed9c7ba2875 100644 --- a/extension/llm/tokenizer/test/test_tiktoken.cpp +++ b/extension/llm/tokenizer/test/test_tiktoken.cpp @@ -6,14 +6,10 @@ * LICENSE file in the root directory of this source tree. 
*/ -#ifdef EXECUTORCH_FB_BUCK -#include -#endif #include #include #include #include -#include using namespace ::testing; using ::executorch::extension::llm::Tiktoken; @@ -49,15 +45,6 @@ static inline std::unique_ptr> _get_special_tokens() { } return special_tokens; } - -static inline std::string _get_resource_path(const std::string& name) { -#ifdef EXECUTORCH_FB_BUCK - return facebook::xplat::testing::getPathForTestResource("resources/" + name); -#else - return std::getenv("RESOURCES_PATH") + std::string("/") + name; -#endif -} - } // namespace class TiktokenExtensionTest : public Test { @@ -66,7 +53,8 @@ class TiktokenExtensionTest : public Test { executorch::runtime::runtime_init(); tokenizer_ = std::make_unique( _get_special_tokens(), kBOSTokenIndex, kEOSTokenIndex); - modelPath_ = _get_resource_path("test_tiktoken_tokenizer.model"); + modelPath_ = std::getenv("RESOURCES_PATH") + + std::string("/test_tiktoken_tokenizer.model"); } std::unique_ptr tokenizer_; @@ -84,7 +72,7 @@ TEST_F(TiktokenExtensionTest, DecodeWithoutLoadFails) { } TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) { - Error res = tokenizer_->load(modelPath_.c_str()); + Error res = tokenizer_->load(modelPath_); EXPECT_EQ(res, Error::Ok); EXPECT_EQ(tokenizer_->vocab_size(), 128256); EXPECT_EQ(tokenizer_->bos_tok(), 128000); @@ -92,7 +80,7 @@ TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) { } TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) { - Error res = tokenizer_->load(modelPath_.c_str()); + Error res = tokenizer_->load(modelPath_); EXPECT_EQ(res, Error::Ok); Result> out = tokenizer_->encode("hello world", 1, 0); EXPECT_EQ(out.error(), Error::Ok); @@ -103,7 +91,7 @@ TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) { } TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) { - Error res = tokenizer_->load(modelPath_.c_str()); + Error res = tokenizer_->load(modelPath_); EXPECT_EQ(res, Error::Ok); std::vector expected = {"<|begin_of_text|>", "hello", " 
world"}; std::vector tokens = {128000, 15339, 1917}; @@ -115,7 +103,7 @@ TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) { } TEST_F(TiktokenExtensionTest, TokenizerDecodeOutOfRangeFails) { - Error res = tokenizer_->load(modelPath_.c_str()); + Error res = tokenizer_->load(modelPath_); EXPECT_EQ(res, Error::Ok); // The vocab size is 128256, addes 256 just so the token is out of vocab // range. @@ -160,31 +148,33 @@ TEST_F(TiktokenExtensionTest, LoadWithInvalidPath) { } TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithInvalidRank) { - auto invalidModelPath = - _get_resource_path("test_tiktoken_invalid_rank.model"); - Error res = tokenizer_->load(invalidModelPath.c_str()); + auto invalidModelPath = std::getenv("RESOURCES_PATH") + + std::string("/test_tiktoken_invalid_rank.model"); + Error res = tokenizer_->load(invalidModelPath); EXPECT_EQ(res, Error::InvalidArgument); } TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithInvalidBase64) { - auto invalidModelPath = - _get_resource_path("test_tiktoken_invalid_base64.model"); - Error res = tokenizer_->load(invalidModelPath.c_str()); + auto invalidModelPath = std::getenv("RESOURCES_PATH") + + std::string("/test_tiktoken_invalid_base64.model"); + Error res = tokenizer_->load(invalidModelPath); EXPECT_EQ(res, Error::InvalidArgument); } TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithNoSpace) { - auto invalidModelPath = _get_resource_path("test_tiktoken_no_space.model"); - Error res = tokenizer_->load(invalidModelPath.c_str()); + auto invalidModelPath = std::getenv("RESOURCES_PATH") + + std::string("/test_tiktoken_no_space.model"); + Error res = tokenizer_->load(invalidModelPath); EXPECT_EQ(res, Error::InvalidArgument); } TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithBPEFile) { - auto invalidModelPath = _get_resource_path("test_bpe_tokenizer.bin"); - Error res = tokenizer_->load(invalidModelPath.c_str()); + auto invalidModelPath = + std::getenv("RESOURCES_PATH") + std::string("/test_bpe_tokenizer.bin"); + Error 
res = tokenizer_->load(invalidModelPath); EXPECT_EQ(res, Error::InvalidArgument); } diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh index 012fded59ca..df7955c4d41 100755 --- a/test/run_oss_cpp_tests.sh +++ b/test/run_oss_cpp_tests.sh @@ -57,8 +57,6 @@ build_and_run_test() { if [[ "$test_dir" =~ .*examples/models/llama/tokenizer.* ]]; then RESOURCES_PATH=$(realpath examples/models/llama/tokenizer/test/resources) - elif [[ "$test_dir" =~ .*extension/llm/tokenizer.* ]]; then - RESOURCES_PATH=$(realpath extension/llm/tokenizer/test/resources) fi export RESOURCES_PATH