Commit 8b44cb0

Add unit tests for sentencepiece tokenizer
1 parent 6bca95c commit 8b44cb0

5 files changed: 93 additions & 7 deletions

.lintrunner.toml

Lines changed: 1 addition & 0 deletions

@@ -123,6 +123,7 @@ code = 'NEWLINE'
 include_patterns = ['**']
 exclude_patterns = [
     'third-party/**',
+    'test/resources/*.model',
 ]
 command = [
     'python',

The new exclusion keeps the binary test model out of the NEWLINE linter's scope.

CMakeLists.txt

Lines changed: 18 additions & 5 deletions

@@ -15,22 +15,35 @@
 # It should also be cmake-lint clean.
 #
 cmake_minimum_required(VERSION 3.24)
+set(CMAKE_CXX_STANDARD 17)
 
 project(Tokenizers)
 
+option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
+
 set(ABSL_ENABLE_INSTALL ON)
 set(ABSL_PROPAGATE_CXX_STD ON)
-set(_pic_flag
-    ${CMAKE_POSITION_INDEPENDENT_CODE})
+set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 add_subdirectory(third-party/sentencepiece)
 set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
 
 add_library(tokenizers STATIC src/sentencepiece.cpp)
 
 # Using abseil from sentencepiece/third_party
-target_include_directories(
-    tokenizers PUBLIC third-party/sentencepiece/src
-    third-party/sentencepiece include)
+target_include_directories(tokenizers PUBLIC third-party/sentencepiece/src
+                                             third-party/sentencepiece include)
 
 target_link_libraries(tokenizers PUBLIC sentencepiece-static)
+
+# Build test
+if(TOKENIZERS_BUILD_TEST)
+  find_package(GTest REQUIRED)
+  set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/test/resources)
+  add_executable(sentencepiece_test test/test_sentencepiece.cpp)
+  target_include_directories(
+    sentencepiece_test PUBLIC third-party/sentencepiece/src
+                              third-party/sentencepiece include)
+  target_link_libraries(sentencepiece_test PUBLIC tokenizers GTest::GTest
+                                                  GTest::Main)
+endif()
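For reference, a plausible way to try the new option locally (assuming GTest is installed where find_package can see it) is to configure with cmake -B build -DTOKENIZERS_BUILD_TEST=ON and then build with cmake --build build. Note that set(ENV{RESOURCES_PATH} ...) only sets the variable in CMake's own environment during configuration, so when running the resulting sentencepiece_test binary directly the variable likely needs to be exported in the shell, e.g. RESOURCES_PATH=$PWD/test/resources ./build/sentencepiece_test (the binary's location varies by generator).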

src/sentencepiece.cpp

Lines changed: 2 additions & 2 deletions

@@ -38,8 +38,8 @@ Error SPTokenizer::load(const std::string &tokenizer_path) {
   const auto status = _processor->Load(tokenizer_path);
   if (!status.ok()) {
     fprintf(stderr,
-            "couldn't load %s\n. If this tokenizer artifact is for llama3, "
-            "please pass `-l 3`.",
+            "couldn't load %s\n. It is likely that the tokenizer artifact is "
+            "broken or of a different format.",
             tokenizer_path.c_str());
     return Error::LoadFailure;
   }
test/resources/test_sentencepiece.model

488 KB. Binary file not shown.

test/test_sentencepiece.cpp

Lines changed: 72 additions & 0 deletions

@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "sentencepiece.h"
+#include "gtest/gtest.h"
+
+namespace tokenizers {
+
+TEST(SPTokenizerTest, TestEncodeWithoutLoad) {
+  SPTokenizer tokenizer;
+  std::string text = "Hello world!";
+  auto result = tokenizer.encode(text, /*bos*/ 0, /*eos*/ 1);
+  EXPECT_EQ(result.error(), Error::Uninitialized);
+}
+
+TEST(SPTokenizerTest, TestDecodeWithoutLoad) {
+  SPTokenizer tokenizer;
+  auto result = tokenizer.decode(0, 0);
+  EXPECT_EQ(result.error(), Error::Uninitialized);
+}
+
+TEST(SPTokenizerTest, TestLoad) {
+  SPTokenizer tokenizer;
+  auto resources = std::getenv("RESOURCES_PATH");
+  auto path = resources + std::string("/test_sentencepiece.model");
+  auto error = tokenizer.load(path);
+  EXPECT_EQ(error, Error::Ok);
+}
+
+TEST(SPTokenizerTest, TestLoadInvalidPath) {
+  SPTokenizer tokenizer;
+  auto error = tokenizer.load("invalid_path");
+  EXPECT_EQ(error, Error::LoadFailure);
+}
+
+TEST(SPTokenizerTest, TestEncode) {
+  SPTokenizer tokenizer;
+  auto resources = std::getenv("RESOURCES_PATH");
+  auto path = resources + std::string("/test_sentencepiece.model");
+  auto error = tokenizer.load(path);
+  EXPECT_EQ(error, Error::Ok);
+  std::string text = "Hello world!";
+  auto result = tokenizer.encode(text, /*bos*/ 1, /*eos*/ 0);
+  EXPECT_TRUE(result.ok());
+  EXPECT_EQ(result.get().size(), 4);
+  EXPECT_EQ(result.get()[0], 1);
+  EXPECT_EQ(result.get()[1], 15043);
+  EXPECT_EQ(result.get()[2], 3186);
+  EXPECT_EQ(result.get()[3], 29991);
+}
+
+TEST(SPTokenizerTest, TestDecode) {
+  SPTokenizer tokenizer;
+  auto resources = std::getenv("RESOURCES_PATH");
+  auto path = resources + std::string("/test_sentencepiece.model");
+  auto error = tokenizer.load(path);
+  EXPECT_EQ(error, Error::Ok);
+  std::vector<uint64_t> tokens = {1, 15043, 3186, 29991};
+  std::vector<std::string> expected = {"", "Hello", " world", "!"};
+  for (auto i = 0; i < 3; ++i) {
+    auto result = tokenizer.decode(tokens[i], tokens[i + 1]);
+    EXPECT_TRUE(result.ok());
+    EXPECT_EQ(result.get(), expected[i + 1]);
+  }
+}
+
+} // namespace tokenizers
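For orientation, below is a minimal usage sketch (not part of this commit) inferred from the API these tests exercise: load() returns an Error, while encode() and decode() return result objects with ok(), error(), and get(). The model path is an assumption; adjust it to your checkout.

#include <cstdint>
#include <cstdio>
#include <string>

#include "sentencepiece.h"

int main() {
  tokenizers::SPTokenizer tokenizer;
  // Assumed model location; any SentencePiece model should work here.
  if (tokenizer.load("test/resources/test_sentencepiece.model") !=
      tokenizers::Error::Ok) {
    fprintf(stderr, "couldn't load tokenizer\n");
    return 1;
  }
  auto encoded = tokenizer.encode("Hello world!", /*bos*/ 1, /*eos*/ 0);
  if (!encoded.ok()) {
    return 1;
  }
  // decode() maps (previous token, current token) to a text piece, as in
  // TestDecode above, so walk the token stream pairwise.
  for (size_t i = 1; i < encoded.get().size(); ++i) {
    auto piece = tokenizer.decode(encoded.get()[i - 1], encoded.get()[i]);
    if (piece.ok()) {
      printf("%s", piece.get().c_str());
    }
  }
  printf("\n");
  return 0;
}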
