Add python bindings #98

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account


Open · wants to merge 10 commits into main

11 changes: 9 additions & 2 deletions .github/workflows/pull.yml
@@ -14,7 +14,7 @@ concurrency:
 jobs:
   unittest-linux:
     name: unittest-linux
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     strategy:
       fail-fast: false
     with:
@@ -26,4 +26,11 @@ jobs:
       set -ex
       cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test
       cmake --build build/test -j9 --config Debug
-      cd build/test && ctest
+      pushd build/test && ctest && popd
+
+      # Install tokenizers
+      pip install . -v
+      pip install pytest blobfile
+
+      # Run tests
+      pytest
9 changes: 8 additions & 1 deletion .github/workflows/trunk.yml
@@ -33,4 +33,11 @@ jobs:
       set -ex
       cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test
       cmake --build build/test -j9 --config Debug
-      cd build/test && ctest
+      pushd build/test && ctest && popd
+
+      # Install tokenizers
+      ${CONDA_RUN} pip install . -v
+      ${CONDA_RUN} pip install pytest blobfile
+
+      # Run tests
+      ${CONDA_RUN} pytest
51 changes: 42 additions & 9 deletions CMakeLists.txt
@@ -13,11 +13,12 @@
 #
 cmake_minimum_required(VERSION 3.18)
 set(CMAKE_CXX_STANDARD 17)
-
+set(CMAKE_POLICY_VERSION_MINIMUM 3.5)
 project(Tokenizers)
 
 option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
 option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)
+option(TOKENIZERS_BUILD_PYTHON "Build Python bindings" OFF)
 option(SUPPORT_REGEX_LOOKAHEAD
   "Support regex lookahead patterns (requires PCRE2)" OFF
 )
@@ -121,17 +122,49 @@ if(TOKENIZERS_BUILD_TOOLS)
   add_subdirectory(examples/tokenize_tool)
 endif()
 
+# Build Python bindings
+if(TOKENIZERS_BUILD_PYTHON)
+  include(FetchContent)
+  FetchContent_Declare(
+    pybind11
+    GIT_REPOSITORY https://github.com/pybind/pybind11.git
+    GIT_TAG v2.13.6
+  )
+  FetchContent_MakeAvailable(pybind11)
+
+  # Create the Python extension module
+  pybind11_add_module(pytorch_tokenizers_cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/python_bindings.cpp
+  )
+
+  # Link with the tokenizers library
+  target_link_libraries(pytorch_tokenizers_cpp PRIVATE tokenizers)
+
+  # Set properties for the Python extension
+  target_compile_definitions(pytorch_tokenizers_cpp PRIVATE VERSION_INFO=${PROJECT_VERSION})
+
+  # Set the output name and let setuptools control the output directory
+  set_target_properties(pytorch_tokenizers_cpp PROPERTIES
+    OUTPUT_NAME "pytorch_tokenizers_cpp"
+  )
+
+  # Don't install the Python extension here - let setuptools handle it.
+  # The setup.py will copy the built extension to the appropriate location.
+endif()
+
 # Installation rules
 include(GNUInstallDirs)
 
-# Install the library and its dependencies
-install(
-  TARGETS tokenizers re2 sentencepiece-static
-  EXPORT tokenizers-targets
-  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-)
+if(NOT TOKENIZERS_BUILD_PYTHON)
+  # Install the library and its dependencies
+  install(
+    TARGETS tokenizers re2 sentencepiece-static
+    EXPORT tokenizers-targets
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  )
+endif()
 
 # Install header files
 install(
21 changes: 21 additions & 0 deletions include/pytorch/tokenizers/tiktoken.h
@@ -46,6 +46,27 @@ class Tiktoken : public detail::BPETokenizerBase {
     }
   }
 
+  explicit Tiktoken(
+      std::string pattern,
+      const std::vector<std::string>& special_tokens,
+      size_t bos_token_index,
+      size_t eos_token_index)
+      : Tiktoken(
+            pattern,
+            std::make_unique<std::vector<std::string>>(special_tokens),
+            bos_token_index,
+            eos_token_index) {}
+
+  explicit Tiktoken(
+      const std::vector<std::string>& special_tokens,
+      size_t bos_token_index,
+      size_t eos_token_index)
+      : Tiktoken(
+            _get_default_patern(),
+            std::make_unique<std::vector<std::string>>(special_tokens),
+            bos_token_index,
+            eos_token_index) {}
+
   explicit Tiktoken(
       std::unique_ptr<std::vector<std::string>> special_tokens,
       size_t bos_token_index,
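
The two new constructors delegate to the existing unique_ptr overload, so callers can pass special tokens by const reference; the second overload also falls back to the default split pattern via the pre-existing _get_default_patern() helper. A minimal usage sketch, assuming the library's tokenizers namespace and its existing load/encode API (which this diff does not touch); the token strings are hypothetical:

```cpp
#include <pytorch/tokenizers/tiktoken.h>

#include <string>
#include <vector>

int main() {
  // Hypothetical special tokens; the indices passed below mark which
  // entries act as BOS and EOS.
  std::vector<std::string> special_tokens = {
      "<|begin_of_text|>", "<|end_of_text|>"};

  // New overload: copies the vector and uses the default split pattern.
  tokenizers::Tiktoken tok(
      special_tokens, /*bos_token_index=*/0, /*eos_token_index=*/1);

  // Loading and encoding are unchanged by this PR (illustrative only):
  //   tok.load("/path/to/tokenizer.model");
  //   auto ids = tok.encode("hello world", /*bos=*/1, /*eos=*/0);
  return 0;
}
```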
26 changes: 19 additions & 7 deletions pyproject.toml
@@ -4,6 +4,8 @@ requires = [
   "pip>=23", # For building the pip package.
   "setuptools>=63", # For building the pip package contents.
   "wheel", # For building the pip package archive.
+  "pytest", # For running tests.
+  "pybind11", # For building the pybind11 C++ extension.
 ]
 build-backend = "setuptools.build_meta"
@@ -64,12 +66,22 @@ Changelog = "https://github.com/pytorch/executorch/releases"
 [tool.setuptools.exclude-package-data]
 "*" = ["*.pyc"]
 
-[tool.usort]
-# Do not try to put "first-party" imports in their own section.
-first_party_detection = false
+[tool.pytest.ini_options]
+testpaths = ["test"]
+python_files = ["test_*.py", "*_test.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
 
 [tool.black]
 # Emit syntax compatible with older versions of python instead of only the range
 # specified by `requires-python`. TODO: Remove this once we support these older
 # versions of python and can expand the `requires-python` range.
-target-version = ["py38", "py39", "py310", "py311", "py312"]
+target-version = ['py38', 'py39', 'py310', 'py311', 'py312']
+include = '\.pyi?$'
+extend-exclude = '''
+/(
+  # directories
+  \.eggs
+  | \.git
+  | build
+  | dist
+  | third-party
+)/
+'''
54 changes: 54 additions & 0 deletions pytest.ini
@@ -0,0 +1,54 @@
+[pytest]
+# Pytest configuration for PyTorch Tokenizers
+
+# Test discovery
+testpaths = test
+python_files = test_*.py *_test.py
+python_classes = Test*
+python_functions = test_*
+
+# Output options with explicit ignores
+addopts =
+    # show summary of all tests that did not pass
+    -rEfX
+    # Make tracebacks shorter
+    --tb=native
+    # capture only Python print and C++ py::print, but not C output (low-level Python errors)
+    --capture=sys
+    # don't suppress warnings, but don't shove them all to the end either
+    -p no:warnings
+    # Ignore directories that are not part of this package's test suite
+    # (CI installs the package and runs pytest; see .github/workflows/pull.yml)
+    --ignore=third-party
+    --ignore=build
+    --ignore=cmake
+    --ignore=examples
+    --ignore=pytorch_tokenizers.egg-info
+
+# Directories to ignore during test collection
+norecursedirs =
+    build*
+    third-party*
+    cmake*
+    examples*
+    .git*
+    __pycache__*
+    *.egg-info*
+    *third-party*
+
+# Markers
+markers =
+    slow: marks tests as slow (deselect with '-m "not slow"')
+    integration: marks tests as integration tests
+    unit: marks tests as unit tests
+
+# Minimum version
+minversion = 6.0
+
+# Test timeout (in seconds)
+timeout = 300
+
+# Filter warnings
+filterwarnings =
+    ignore::DeprecationWarning
+    ignore::PendingDeprecationWarning
47 changes: 30 additions & 17 deletions pytorch_tokenizers/__init__.py
@@ -3,25 +3,38 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+# @lint-ignore-every LICENSELINT
 
-from typing import Optional
+"""
+PyTorch Tokenizers - Fast tokenizers for PyTorch
+
+This package provides Python bindings for fast C++ tokenizer implementations
+including HuggingFace, TikToken, Llama2C, and SentencePiece tokenizers.
+"""
 
-from .hf_tokenizer import HuggingFaceTokenizer
-from .llama2c import Llama2cTokenizer
-from .tiktoken import TiktokenTokenizer
+try:
+    from .pytorch_tokenizers_cpp import (
+        Error,
+        TokenIndex,
+        Tokenizer,
+        HFTokenizer,
+        Tiktoken,
+        Llama2cTokenizer,
+        SPTokenizer,
+    )
+except ImportError as e:
+    raise ImportError(
+        f"Failed to import C++ tokenizer bindings: {e}. "
+        "Make sure the package was built correctly with pybind11."
+    ) from e
 
-__all__ = ["TiktokenTokenizer", "Llama2cTokenizer", "HuggingFaceTokenizer"]
+__version__ = "0.1.0"
 
-
-def get_tokenizer(tokenizer_path: str, tokenizer_config_path: Optional[str] = None):
-    if tokenizer_path.endswith(".json"):
-        tokenizer = HuggingFaceTokenizer(tokenizer_path, tokenizer_config_path)
-    else:
-        try:
-            tokenizer = Llama2cTokenizer(model_path=str(tokenizer_path))
-        except Exception:
-            print("Using Tiktokenizer")
-            tokenizer = TiktokenTokenizer(model_path=str(tokenizer_path))
-    return tokenizer
+__all__ = [
+    "Error",
+    "TokenIndex",
+    "Tokenizer",
+    "HFTokenizer",
+    "Tiktoken",
+    "Llama2cTokenizer",
+    "SPTokenizer",
+]
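
The new __init__.py imports these names from the pytorch_tokenizers_cpp extension that the CMake target builds from src/python_bindings.cpp, a file this diff does not show. For context, a rough sketch of how those names could be bound with pybind11; the class and header names follow the existing C++ library, but the exact constructor, method, and Error enumerator bindings in the PR may differ:

```cpp
// Hypothetical sketch only; not the PR's actual src/python_bindings.cpp.
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <pytorch/tokenizers/error.h>
#include <pytorch/tokenizers/hf_tokenizer.h>
#include <pytorch/tokenizers/llama2c_tokenizer.h>
#include <pytorch/tokenizers/sentencepiece.h>
#include <pytorch/tokenizers/tiktoken.h>

#define STRINGIFY(x) #x
#define MACRO_STRINGIFY(x) STRINGIFY(x)

namespace py = pybind11;

PYBIND11_MODULE(pytorch_tokenizers_cpp, m) {
  m.doc() = "C++ tokenizer bindings for pytorch_tokenizers";

  // Error codes returned by the C++ API (enumerator list abridged).
  py::enum_<tokenizers::Error>(m, "Error")
      .value("Ok", tokenizers::Error::Ok);

  // Abstract base class; concrete tokenizers register against it so that
  // Python sees the inheritance relationship.
  py::class_<tokenizers::Tokenizer>(m, "Tokenizer");

  py::class_<tokenizers::HFTokenizer, tokenizers::Tokenizer>(m, "HFTokenizer");
  py::class_<tokenizers::Tiktoken, tokenizers::Tokenizer>(m, "Tiktoken");
  py::class_<tokenizers::Llama2cTokenizer, tokenizers::Tokenizer>(
      m, "Llama2cTokenizer");
  py::class_<tokenizers::SPTokenizer, tokenizers::Tokenizer>(m, "SPTokenizer");

  // Constructor, TokenIndex, and method bindings (load/encode/decode) elided.

  // VERSION_INFO is injected by CMake via target_compile_definitions.
#ifdef VERSION_INFO
  m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO);
#endif
}
```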