
Commit c376a3e

larryliu0820 authored and facebook-github-bot committed
Add python bindings (#98)
Summary:
This pull request introduces Python bindings for the PyTorch Tokenizers library. It adds a Python-bindings option to the build system, integrates `pybind11`, and updates the Python package for distribution. It also extends the tokenizer classes and adds test configuration for the new bindings.

### Python Bindings Integration

* **Added a Python bindings option in `CMakeLists.txt`**: introduced the `TOKENIZERS_BUILD_PYTHON` option and the logic to build the bindings with `pybind11`, creating the `pytorch_tokenizers_cpp` extension module and linking it against the tokenizers library.
* **New `src/python_bindings.cpp` file**: implemented the bindings with `pybind11`, covering the `Tokenizer`, `HFTokenizer`, `Tiktoken`, `Llama2cTokenizer`, and `SPTokenizer` classes.

### Python Package Updates

* **Updated `setup.py` for Python bindings**: added support for building the extension module with CMake and `pybind11`, including a custom `CMakeBuild` class that drives the build (a sketch of this pattern appears below).
* **Modified `pytorch_tokenizers/__init__.py`**: exposed the new C++ tokenizer bindings, removed older Python implementations, and added error handling for failed imports.

### Testing Enhancements

* **Added a `pytest.ini` configuration**: set up test discovery rules, ignored directories, and markers for the different test types.
* **Defined Python tests in `targets.bzl`**: introduced a `targets.bzl` target for testing the Python bindings (`test_python_bindings.py`).

### Tokenizer Class Changes

* **Added constructors to the `Tiktoken` class**: new overloads take `const std::vector<std::string>&` so pybind11 can bind them as `__init__`; pybind11 cannot pass arguments as `std::unique_ptr<std::vector<std::string>>`.

### Build System Changes

* **Added a Bazel target for the Python bindings**: defined a `targets.bzl` target for building the bindings, with dependencies on the tokenizer modules and `pybind11`.

Reviewed By: jackzhxng

Differential Revision: D78053854

Pulled By: larryliu0820
1 parent 23359bd commit c376a3e
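The `setup.py` diff itself is not among the hunks shown on this page, so the following is a minimal sketch of the CMake-driven `build_ext` pattern the summary describes, not the PR's actual code; the `CMakeExtension` helper and all argument values here are illustrative assumptions:

```python
import os
import subprocess

from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext


class CMakeExtension(Extension):
    """Marker extension: compilation is delegated entirely to CMake."""

    def __init__(self, name: str, sourcedir: str = "") -> None:
        super().__init__(name, sources=[])  # no sources for setuptools to compile
        self.sourcedir = os.path.abspath(sourcedir)


class CMakeBuild(build_ext):
    """Configure and build the pybind11 module with CMake, placing the
    resulting shared library where setuptools expects the extension."""

    def build_extension(self, ext: CMakeExtension) -> None:
        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
        cmake_args = [
            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}",
            "-DCMAKE_BUILD_TYPE=Release",
            "-DTOKENIZERS_BUILD_PYTHON=ON",  # the option this PR adds to CMakeLists.txt
        ]
        os.makedirs(self.build_temp, exist_ok=True)
        subprocess.check_call(["cmake", ext.sourcedir, *cmake_args], cwd=self.build_temp)
        subprocess.check_call(["cmake", "--build", "."], cwd=self.build_temp)


setup(
    ext_modules=[CMakeExtension("pytorch_tokenizers.pytorch_tokenizers_cpp")],
    cmdclass={"build_ext": CMakeBuild},
)
```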

File tree

17 files changed: +780 −37 lines


.github/workflows/pull.yml

Lines changed: 9 additions & 2 deletions
```diff
@@ -14,7 +14,7 @@ concurrency:
 jobs:
   unittest-linux:
     name: unittest-linux
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     strategy:
       fail-fast: false
     with:
@@ -26,4 +26,11 @@ jobs:
         set -ex
         cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test
         cmake --build build/test -j9 --config Debug
-        cd build/test && ctest
+        pushd build/test && ctest && popd
+
+        # Install tokenizers
+        pip install . -v
+        pip install pytest blobfile
+
+        # Run tests
+        pytest
```

.github/workflows/trunk.yml

Lines changed: 8 additions & 1 deletion
```diff
@@ -33,4 +33,11 @@ jobs:
         set -ex
         cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test
         cmake --build build/test -j9 --config Debug
-        cd build/test && ctest
+        pushd build/test && ctest && popd
+
+        # Install tokenizers
+        ${CONDA_RUN} pip install . -v
+        ${CONDA_RUN} pip install pytest blobfile
+
+        # Run tests
+        ${CONDA_RUN} pytest
```

CMakeLists.txt

Lines changed: 42 additions & 9 deletions
```diff
@@ -13,11 +13,12 @@
 #
 cmake_minimum_required(VERSION 3.18)
 set(CMAKE_CXX_STANDARD 17)
-
+set(CMAKE_POLICY_VERSION_MINIMUM 3.5)
 project(Tokenizers)
 
 option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
 option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)
+option(TOKENIZERS_BUILD_PYTHON "Build Python bindings" OFF)
 option(SUPPORT_REGEX_LOOKAHEAD
   "Support regex lookahead patterns (requires PCRE2)" OFF
 )
@@ -122,17 +123,49 @@ if(TOKENIZERS_BUILD_TOOLS)
   add_subdirectory(examples/tokenize_tool)
 endif()
 
+# Build Python bindings
+if(TOKENIZERS_BUILD_PYTHON)
+  include(FetchContent)
+  FetchContent_Declare(
+    pybind11
+    GIT_REPOSITORY https://github.com/pybind/pybind11.git
+    GIT_TAG v2.13.6
+  )
+  FetchContent_MakeAvailable(pybind11)
+
+  # Create the Python extension module
+  pybind11_add_module(pytorch_tokenizers_cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/python_bindings.cpp
+  )
+
+  # Link with the tokenizers library
+  target_link_libraries(pytorch_tokenizers_cpp PRIVATE tokenizers)
+
+  # Set properties for the Python extension
+  target_compile_definitions(pytorch_tokenizers_cpp PRIVATE VERSION_INFO=${PROJECT_VERSION})
+
+  # Set the output name and let setuptools control the output directory
+  set_target_properties(pytorch_tokenizers_cpp PROPERTIES
+    OUTPUT_NAME "pytorch_tokenizers_cpp"
+  )
+
+  # Don't install the Python extension here - let setuptools handle it
+  # The setup.py will copy the built extension to the appropriate location
+endif()
+
 # Installation rules
 include(GNUInstallDirs)
 
-# Install the library and its dependencies
-install(
-  TARGETS tokenizers re2 sentencepiece-static
-  EXPORT tokenizers-targets
-  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-)
+if(NOT TOKENIZERS_BUILD_PYTHON)
+  # Install the library and its dependencies
+  install(
+    TARGETS tokenizers re2 sentencepiece-static
+    EXPORT tokenizers-targets
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  )
+endif()
 
 # Install header files
 install(
```
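With `TOKENIZERS_BUILD_PYTHON=ON`, the build produces the `pytorch_tokenizers_cpp` extension inside the `pytorch_tokenizers` package (the `__init__.py` diff below imports it as a submodule). A quick smoke test, not part of the PR, that could be run after `pip install . -v`:

```python
# Post-install smoke test (not part of this PR): check that the extension
# module built and exposes the classes the commit message says are bound
# in src/python_bindings.cpp.
from pytorch_tokenizers import pytorch_tokenizers_cpp as cpp

for name in ("Tokenizer", "HFTokenizer", "Tiktoken", "Llama2cTokenizer", "SPTokenizer"):
    assert hasattr(cpp, name), f"missing binding: {name}"
print("all bindings present")
```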

include/pytorch/tokenizers/tiktoken.h

Lines changed: 21 additions & 0 deletions
```diff
@@ -46,6 +46,27 @@ class Tiktoken : public detail::BPETokenizerBase {
     }
   }
 
+  explicit Tiktoken(
+      std::string pattern,
+      const std::vector<std::string>& special_tokens,
+      size_t bos_token_index,
+      size_t eos_token_index)
+      : Tiktoken(
+            pattern,
+            std::make_unique<std::vector<std::string>>(special_tokens),
+            bos_token_index,
+            eos_token_index) {}
+
+  explicit Tiktoken(
+      const std::vector<std::string>& special_tokens,
+      size_t bos_token_index,
+      size_t eos_token_index)
+      : Tiktoken(
+            _get_default_patern(),
+            std::make_unique<std::vector<std::string>>(special_tokens),
+            bos_token_index,
+            eos_token_index) {}
+
   explicit Tiktoken(
       std::unique_ptr<std::vector<std::string>> special_tokens,
       size_t bos_token_index,
```
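From Python these overloads matter because pybind11 converts a plain `list[str]` to `const std::vector<std::string>&` but cannot produce a `std::unique_ptr` argument. A hedged usage sketch follows; the constructor arguments match the header above, while the `load`/`encode` method names are assumptions based on the library's C++ `Tokenizer` interface, since `src/python_bindings.cpp` is not shown on this page:

```python
from pytorch_tokenizers import CppTiktoken

# pybind11 converts the Python list to const std::vector<std::string>&,
# selecting one of the new overloads above.
special_tokens = ["<|begin_of_text|>", "<|end_of_text|>"]
tok = CppTiktoken(special_tokens, 0, 1)  # bos_token_index=0, eos_token_index=1

# Assumed methods, mirroring the C++ Tokenizer base class:
tok.load("path/to/tiktoken.model")      # placeholder path
ids = tok.encode("hello world", 1, 0)   # bos=1, eos=0, per the C++ signature
```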

pyproject.toml

Lines changed: 19 additions & 7 deletions
```diff
@@ -4,6 +4,8 @@ requires = [
   "pip>=23", # For building the pip package.
   "setuptools>=63", # For building the pip package contents.
   "wheel", # For building the pip package archive.
+  "pytest", # For running tests.
+  "pybind11", # For building the pybind11 C++ extension.
 ]
 build-backend = "setuptools.build_meta"
 
@@ -64,12 +66,22 @@ Changelog = "https://github.com/pytorch/executorch/releases"
 [tool.setuptools.exclude-package-data]
 "*" = ["*.pyc"]
 
-[tool.usort]
-# Do not try to put "first-party" imports in their own section.
-first_party_detection = false
+[tool.pytest.ini_options]
+testpaths = ["test"]
+python_files = ["test_*.py", "*_test.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
 
 [tool.black]
-# Emit syntax compatible with older versions of python instead of only the range
-# specified by `requires-python`. TODO: Remove this once we support these older
-# versions of python and can expand the `requires-python` range.
-target-version = ["py38", "py39", "py310", "py311", "py312"]
+target-version = ['py38', 'py39', 'py310', 'py311', 'py312']
+include = '\.pyi?$'
+extend-exclude = '''
+/(
+  # directories
+  \.eggs
+  | \.git
+  | build
+  | dist
+  | third-party
+)/
+'''
```

pytest.ini

Lines changed: 54 additions & 0 deletions
```diff
@@ -0,0 +1,54 @@
+[pytest]
+# Pytest configuration for PyTorch Tokenizers
+
+# Test discovery
+testpaths = test
+python_files = test_*.py *_test.py
+python_classes = Test*
+python_functions = test_*
+
+# Output options with explicit ignores
+addopts =
+    # show summary of all tests that did not pass
+    -rEfX
+    # Make tracebacks shorter
+    --tb=native
+    # capture only Python print and C++ py::print, but not C output (low-level Python errors)
+    --capture=sys
+    # don't suppress warnings, but don't shove them all to the end either
+    -p no:warnings
+    # Ignore backends/arm tests you need to run examples/arm/setup.sh to install some tool to make them work
+    # For GitHub testing this is setup/executed in the unittest-arm job see .github/workflows/pull.yml for more info.
+    --ignore=third-party
+    --ignore=build
+    --ignore=cmake
+    --ignore=examples
+    --ignore=pytorch_tokenizers.egg-info
+
+# Directories to ignore during test collection
+norecursedirs =
+    build*
+    third-party*
+    cmake*
+    examples*
+    .git*
+    __pycache__*
+    *.egg-info*
+    *third-party*
+
+# Markers
+markers =
+    slow: marks tests as slow (deselect with '-m "not slow"')
+    integration: marks tests as integration tests
+    unit: marks tests as unit tests
+
+# Minimum version
+minversion = 6.0
+
+# Test timeout (in seconds)
+timeout = 300
+
+# Filter warnings
+filterwarnings =
+    ignore::DeprecationWarning
+    ignore::PendingDeprecationWarning
```
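The markers above let the suite be sliced from the command line, e.g. `pytest -m "not slow"`. The PR adds `test_python_bindings.py`, whose contents are not shown on this page; a hypothetical sketch of how a test might use these markers:

```python
# Hypothetical test shape; the real test/test_python_bindings.py added by
# this PR is not included in the hunks shown here.
import pytest

from pytorch_tokenizers import CppTiktoken


@pytest.mark.unit
def test_tiktoken_constructor_accepts_list():
    # Exercises the new const-reference constructor overload.
    tok = CppTiktoken(["<|bos|>", "<|eos|>"], 0, 1)
    assert tok is not None


@pytest.mark.slow
def test_encode_large_input():
    pytest.skip("placeholder for an expensive case; deselect with -m 'not slow'")
```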

pytorch_tokenizers/TARGETS

Lines changed: 1 addition & 1 deletion
```diff
@@ -4,7 +4,7 @@
 load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
 load(":targets.bzl", "define_common_targets")
 
-oncall("executorch")
+oncall("ai_infra_mobile_platform")
 
 define_common_targets()
```

pytorch_tokenizers/__init__.py

Lines changed: 38 additions & 2 deletions
```diff
@@ -3,16 +3,38 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-# @lint-ignore-every LICENSELINT
 
+"""
+PyTorch Tokenizers - Fast tokenizers for PyTorch
+
+This package provides Python bindings for fast C++ tokenizer implementations
+including HuggingFace, TikToken, Llama2C, and SentencePiece tokenizers.
+"""
+# @lint-ignore-every LICENSELINT
 
 from typing import Optional
 
 from .hf_tokenizer import HuggingFaceTokenizer
 from .llama2c import Llama2cTokenizer
 from .tiktoken import TiktokenTokenizer
 
-__all__ = ["TiktokenTokenizer", "Llama2cTokenizer", "HuggingFaceTokenizer"]
+__version__ = "0.1.0"
+
+try:
+    from .pytorch_tokenizers_cpp import (  # @manual=//pytorch/tokenizers:pytorch_tokenizers_cpp
+        Error,
+        HFTokenizer as CppHFTokenizer,
+        Llama2cTokenizer as CppLlama2cTokenizer,
+        SPTokenizer as CppSPTokenizer,
+        Tiktoken as CppTiktoken,
+        TokenIndex,
+        Tokenizer,
+    )
+except ImportError as e:
+    raise ImportError(
+        f"Failed to import C++ tokenizer bindings: {e}. "
+        "Make sure the package was built correctly with pybind11."
+    ) from e
 
 
 def get_tokenizer(tokenizer_path: str, tokenizer_config_path: Optional[str] = None):
@@ -25,3 +47,17 @@ def get_tokenizer(tokenizer_path: str, tokenizer_config_path: Optional[str] = None):
         print("Using Tiktokenizer")
         tokenizer = TiktokenTokenizer(model_path=str(tokenizer_path))
     return tokenizer
+
+
+__all__ = [
+    "CppHFTokenizer",
+    "CppLlama2cTokenizer",
+    "CppSPTokenizer",
+    "CppTiktoken",
+    "Error",
+    "HFTokenizer",
+    "Llama2cTokenizer",
+    "TiktokenTokenizer",
+    "TokenIndex",
+    "Tokenizer",
+]
```
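After this change, importing the package itself verifies the native build, and both the Python wrappers and the C++ classes are available at the top level. A brief usage sketch (paths are placeholders):

```python
# The import now validates the build: __init__.py re-raises a clearer
# ImportError if pytorch_tokenizers_cpp was not compiled into the wheel.
import pytorch_tokenizers as pt

print(pt.__version__)  # "0.1.0"

# Pure-Python wrapper, dispatched by file type; prints "Using Tiktokenizer"
# for non-JSON models per the function body above.
tok = pt.get_tokenizer("path/to/tokenizer.model")  # placeholder path

# The C++ classes are exported alongside it:
cpp_tok = pt.CppTiktoken(["<|bos|>", "<|eos|>"], 0, 1)
```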

pytorch_tokenizers/targets.bzl

Lines changed: 3 additions & 5 deletions
```diff
@@ -11,19 +11,17 @@ def define_common_targets():
         srcs = [
             "__init__.py",
             "constants.py",
+            "hf_tokenizer.py",
             "llama2c.py",
             "tiktoken.py",
-            "hf_tokenizer.py",
         ],
         base_module = "pytorch_tokenizers",
         visibility = ["PUBLIC"],
         _is_external_target = True,
-        external_deps = [
-            "sentencepiece-py",
-        ],
         deps = [
-            "fbsource//third-party/pypi/blobfile:blobfile",
+            "fbsource//third-party/pypi/sentencepiece:sentencepiece",
             "fbsource//third-party/pypi/tiktoken:tiktoken",
             "fbsource//third-party/pypi/tokenizers:tokenizers",
+            "//pytorch/tokenizers:pytorch_tokenizers_cpp",  # @manual
         ],
     )
```

pytorch_tokenizers/tiktoken.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -12,7 +12,6 @@
     AbstractSet,
     cast,
     Collection,
-    Dict,
     Iterator,
     List,
     Literal,
```
