Add python bindings #98

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account


Open · wants to merge 10 commits into main

11 changes: 9 additions & 2 deletions .github/workflows/pull.yml
@@ -14,7 +14,7 @@ concurrency:
 jobs:
   unittest-linux:
     name: unittest-linux
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     strategy:
       fail-fast: false
     with:
@@ -26,4 +26,11 @@ jobs:
       set -ex
       cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test
       cmake --build build/test -j9 --config Debug
-      cd build/test && ctest
+      pushd build/test && ctest && popd
+
+      # Install tokenizers
+      pip install . -v
+      pip install pytest blobfile
+
+      # Run tests
+      pytest
9 changes: 8 additions & 1 deletion .github/workflows/trunk.yml
@@ -33,4 +33,11 @@ jobs:
       set -ex
       cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test
       cmake --build build/test -j9 --config Debug
-      cd build/test && ctest
+      pushd build/test && ctest && popd
+
+      # Install tokenizers
+      ${CONDA_RUN} pip install . -v
+      ${CONDA_RUN} pip install pytest blobfile
+
+      # Run tests
+      ${CONDA_RUN} pytest
51 changes: 42 additions & 9 deletions CMakeLists.txt
@@ -13,11 +13,12 @@
 #
 cmake_minimum_required(VERSION 3.18)
 set(CMAKE_CXX_STANDARD 17)
-
+set(CMAKE_POLICY_VERSION_MINIMUM 3.5)
 project(Tokenizers)
 
 option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
 option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)
+option(TOKENIZERS_BUILD_PYTHON "Build Python bindings" OFF)
 option(SUPPORT_REGEX_LOOKAHEAD
   "Support regex lookahead patterns (requires PCRE2)" OFF
 )
@@ -121,17 +122,49 @@ if(TOKENIZERS_BUILD_TOOLS)
   add_subdirectory(examples/tokenize_tool)
 endif()
 
+# Build Python bindings
+if(TOKENIZERS_BUILD_PYTHON)
+  include(FetchContent)
+  FetchContent_Declare(
+    pybind11
+    GIT_REPOSITORY https://github.com/pybind/pybind11.git
+    GIT_TAG v2.13.6
+  )
+  FetchContent_MakeAvailable(pybind11)
+
+  # Create the Python extension module
+  pybind11_add_module(pytorch_tokenizers_cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/python_bindings.cpp
+  )
+
+  # Link with the tokenizers library
+  target_link_libraries(pytorch_tokenizers_cpp PRIVATE tokenizers)
+
+  # Set properties for the Python extension
+  target_compile_definitions(pytorch_tokenizers_cpp PRIVATE VERSION_INFO=${PROJECT_VERSION})
+
+  # Set the output name and let setuptools control the output directory
+  set_target_properties(pytorch_tokenizers_cpp PROPERTIES
+    OUTPUT_NAME "pytorch_tokenizers_cpp"
+  )
+
+  # Don't install the Python extension here - let setuptools handle it.
+  # The setup.py will copy the built extension to the appropriate location.
+endif()
+
 # Installation rules
 include(GNUInstallDirs)
 
-# Install the library and its dependencies
-install(
-  TARGETS tokenizers re2 sentencepiece-static
-  EXPORT tokenizers-targets
-  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-)
+if(NOT TOKENIZERS_BUILD_PYTHON)
+  # Install the library and its dependencies
+  install(
+    TARGETS tokenizers re2 sentencepiece-static
+    EXPORT tokenizers-targets
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  )
+endif()
 
 # Install header files
 install(
21 changes: 21 additions & 0 deletions include/pytorch/tokenizers/tiktoken.h
@@ -46,6 +46,27 @@ class Tiktoken : public detail::BPETokenizerBase {
     }
   }
 
+  explicit Tiktoken(
+      std::string pattern,
+      const std::vector<std::string>& special_tokens,
+      size_t bos_token_index,
+      size_t eos_token_index)
+      : Tiktoken(
+            pattern,
+            std::make_unique<std::vector<std::string>>(special_tokens),
+            bos_token_index,
+            eos_token_index) {}
+
+  explicit Tiktoken(
+      const std::vector<std::string>& special_tokens,
+      size_t bos_token_index,
+      size_t eos_token_index)
+      : Tiktoken(
+            _get_default_patern(),
+            std::make_unique<std::vector<std::string>>(special_tokens),
+            bos_token_index,
+            eos_token_index) {}
+
   explicit Tiktoken(
       std::unique_ptr<std::vector<std::string>> special_tokens,
       size_t bos_token_index,
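
The two new constructors delegate to the existing unique_ptr overload, so callers can pass special tokens by const reference; the second overload also falls back to the default split pattern via the pre-existing _get_default_patern() helper. A minimal usage sketch, assuming the library's tokenizers namespace and its existing load/encode API (which this diff does not touch); the token strings are hypothetical:

```cpp
#include <pytorch/tokenizers/tiktoken.h>

#include <string>
#include <vector>

int main() {
  // Hypothetical special tokens; the indices passed below mark which
  // entries act as BOS and EOS.
  std::vector<std::string> special_tokens = {
      "<|begin_of_text|>", "<|end_of_text|>"};

  // New overload: copies the vector and uses the default split pattern.
  tokenizers::Tiktoken tok(
      special_tokens, /*bos_token_index=*/0, /*eos_token_index=*/1);

  // Loading and encoding are unchanged by this PR (illustrative only):
  //   tok.load("/path/to/tokenizer.model");
  //   auto ids = tok.encode("hello world", /*bos=*/1, /*eos=*/0);
  return 0;
}
```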
26 changes: 19 additions & 7 deletions pyproject.toml
@@ -4,6 +4,8 @@ requires = [
   "pip>=23", # For building the pip package.
   "setuptools>=63", # For building the pip package contents.
   "wheel", # For building the pip package archive.
+  "pytest", # For running tests.
+  "pybind11", # For building the pybind11 C++ extension.
 ]
 build-backend = "setuptools.build_meta"
@@ -64,12 +66,22 @@ Changelog = "https://github.com/pytorch/executorch/releases"
 [tool.setuptools.exclude-package-data]
 "*" = ["*.pyc"]
 
-[tool.usort]
-# Do not try to put "first-party" imports in their own section.
-first_party_detection = false
+[tool.pytest.ini_options]
+testpaths = ["test"]
+python_files = ["test_*.py", "*_test.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
 
 [tool.black]
 # Emit syntax compatible with older versions of python instead of only the range
 # specified by `requires-python`. TODO: Remove this once we support these older
 # versions of python and can expand the `requires-python` range.
-target-version = ["py38", "py39", "py310", "py311", "py312"]
+target-version = ['py38', 'py39', 'py310', 'py311', 'py312']
+include = '\.pyi?$'
+extend-exclude = '''
+/(
+  # directories
+  \.eggs
+  | \.git
+  | build
+  | dist
+  | third-party
+)/
+'''
54 changes: 54 additions & 0 deletions pytest.ini
@@ -0,0 +1,54 @@
+[pytest]
+# Pytest configuration for PyTorch Tokenizers
+
+# Test discovery
+testpaths = test
+python_files = test_*.py *_test.py
+python_classes = Test*
+python_functions = test_*
+
+# Output options with explicit ignores
+addopts =
+    # show summary of all tests that did not pass
+    -rEfX
+    # Make tracebacks shorter
+    --tb=native
+    # capture only Python print and C++ py::print, but not C output (low-level Python errors)
+    --capture=sys
+    # don't suppress warnings, but don't shove them all to the end either
+    -p no:warnings
+    # Ignore directories that are not part of this package's test suite
+    # (CI installs the package and runs pytest; see .github/workflows/pull.yml)
+    --ignore=third-party
+    --ignore=build
+    --ignore=cmake
+    --ignore=examples
+    --ignore=pytorch_tokenizers.egg-info
+
+# Directories to ignore during test collection
+norecursedirs =
+    build*
+    third-party*
+    cmake*
+    examples*
+    .git*
+    __pycache__*
+    *.egg-info*
+    *third-party*
+
+# Markers
+markers =
+    slow: marks tests as slow (deselect with '-m "not slow"')
+    integration: marks tests as integration tests
+    unit: marks tests as unit tests
+
+# Minimum version
+minversion = 6.0
+
+# Test timeout (in seconds)
+timeout = 300
+
+# Filter warnings
+filterwarnings =
+    ignore::DeprecationWarning
+    ignore::PendingDeprecationWarning
47 changes: 30 additions & 17 deletions pytorch_tokenizers/__init__.py
@@ -3,25 +3,38 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+# @lint-ignore-every LICENSELINT
 
-from typing import Optional
+"""
+PyTorch Tokenizers - Fast tokenizers for PyTorch
+
+This package provides Python bindings for fast C++ tokenizer implementations
+including HuggingFace, TikToken, Llama2C, and SentencePiece tokenizers.
+"""
 
-from .hf_tokenizer import HuggingFaceTokenizer
-from .llama2c import Llama2cTokenizer
-from .tiktoken import TiktokenTokenizer
+try:
+    from .pytorch_tokenizers_cpp import (
+        Error,
+        TokenIndex,
+        Tokenizer,
+        HFTokenizer,
+        Tiktoken,
+        Llama2cTokenizer,
+        SPTokenizer,
+    )
+except ImportError as e:
+    raise ImportError(
+        f"Failed to import C++ tokenizer bindings: {e}. "
+        "Make sure the package was built correctly with pybind11."
+    ) from e
 
-__all__ = ["TiktokenTokenizer", "Llama2cTokenizer", "HuggingFaceTokenizer"]
+__version__ = "0.1.0"
 
-
-def get_tokenizer(tokenizer_path: str, tokenizer_config_path: Optional[str] = None):
-    if tokenizer_path.endswith(".json"):
-        tokenizer = HuggingFaceTokenizer(tokenizer_path, tokenizer_config_path)
-    else:
-        try:
-            tokenizer = Llama2cTokenizer(model_path=str(tokenizer_path))
-        except Exception:
-            print("Using Tiktokenizer")
-            tokenizer = TiktokenTokenizer(model_path=str(tokenizer_path))
-    return tokenizer
+__all__ = [
+    "Error",
+    "TokenIndex",
+    "Tokenizer",
+    "HFTokenizer",
+    "Tiktoken",
+    "Llama2cTokenizer",
+    "SPTokenizer",
+]
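
The new __init__.py imports these names from the pytorch_tokenizers_cpp extension that the CMake target builds from src/python_bindings.cpp, a file this diff does not show. For context, a rough sketch of how those names could be bound with pybind11; the class and header names follow the existing C++ library, but the exact constructor, method, and Error enumerator bindings in the PR may differ:

```cpp
// Hypothetical sketch only; not the PR's actual src/python_bindings.cpp.
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <pytorch/tokenizers/error.h>
#include <pytorch/tokenizers/hf_tokenizer.h>
#include <pytorch/tokenizers/llama2c_tokenizer.h>
#include <pytorch/tokenizers/sentencepiece.h>
#include <pytorch/tokenizers/tiktoken.h>

#define STRINGIFY(x) #x
#define MACRO_STRINGIFY(x) STRINGIFY(x)

namespace py = pybind11;

PYBIND11_MODULE(pytorch_tokenizers_cpp, m) {
  m.doc() = "C++ tokenizer bindings for pytorch_tokenizers";

  // Error codes returned by the C++ API (enumerator list abridged).
  py::enum_<tokenizers::Error>(m, "Error")
      .value("Ok", tokenizers::Error::Ok);

  // Abstract base class; concrete tokenizers register against it so that
  // Python sees the inheritance relationship.
  py::class_<tokenizers::Tokenizer>(m, "Tokenizer");

  py::class_<tokenizers::HFTokenizer, tokenizers::Tokenizer>(m, "HFTokenizer");
  py::class_<tokenizers::Tiktoken, tokenizers::Tokenizer>(m, "Tiktoken");
  py::class_<tokenizers::Llama2cTokenizer, tokenizers::Tokenizer>(
      m, "Llama2cTokenizer");
  py::class_<tokenizers::SPTokenizer, tokenizers::Tokenizer>(m, "SPTokenizer");

  // Constructor, TokenIndex, and method bindings (load/encode/decode) elided.

  // VERSION_INFO is injected by CMake via target_compile_definitions.
#ifdef VERSION_INFO
  m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO);
#endif
}
```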