Skip to content

Commit de18c1f

Browse files
committed
Add python bindings
1 parent cf543d0 commit de18c1f

File tree

12 files changed

+687
-33
lines changed

12 files changed

+687
-33
lines changed

CMakeLists.txt

Lines changed: 33 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -18,6 +18,7 @@ project(Tokenizers)
1818

1919
option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
2020
option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)
21+
option(TOKENIZERS_BUILD_PYTHON "Build Python bindings" OFF)
2122
option(SUPPORT_REGEX_LOOKAHEAD
2223
"Support regex lookahead patterns (requires PCRE2)" OFF
2324
)
@@ -121,6 +122,38 @@ if(TOKENIZERS_BUILD_TOOLS)
121122
add_subdirectory(examples/tokenize_tool)
122123
endif()
123124

125+
# Build Python bindings (pybind11 extension module consumed by the
# pytorch_tokenizers Python package).
if(TOKENIZERS_BUILD_PYTHON)
  include(FetchContent)
  FetchContent_Declare(
    pybind11
    GIT_REPOSITORY https://github.com/pybind/pybind11.git
    GIT_TAG v2.13.6
  )
  FetchContent_MakeAvailable(pybind11)

  # Create the Python extension module. The target name is also the default
  # OUTPUT_NAME, so no extra set_target_properties() is needed.
  pybind11_add_module(pytorch_tokenizers_cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/src/python_bindings.cpp
  )

  # Link against the core tokenizers library.
  target_link_libraries(pytorch_tokenizers_cpp PRIVATE tokenizers)

  # Expose the project version to the bindings.
  target_compile_definitions(pytorch_tokenizers_cpp
                             PRIVATE VERSION_INFO=${PROJECT_VERSION})

  # GNUInstallDirs must be included before CMAKE_INSTALL_LIBDIR /
  # CMAKE_INSTALL_BINDIR are referenced; the top-level include() only happens
  # after this block, so pull it in here as well (re-inclusion is harmless).
  include(GNUInstallDirs)

  # Install the Python extension.
  install(TARGETS pytorch_tokenizers_cpp
          LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
          RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
156+
124157
# Installation rules
125158
include(GNUInstallDirs)
126159

include/pytorch/tokenizers/tiktoken.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,27 @@ class Tiktoken : public detail::BPETokenizerBase {
4646
}
4747
}
4848

49+
explicit Tiktoken(
50+
std::string pattern,
51+
const std::vector<std::string>& special_tokens,
52+
size_t bos_token_index,
53+
size_t eos_token_index)
54+
: Tiktoken(
55+
pattern,
56+
std::make_unique<std::vector<std::string>>(special_tokens),
57+
bos_token_index,
58+
eos_token_index) {}
59+
60+
explicit Tiktoken(
61+
const std::vector<std::string>& special_tokens,
62+
size_t bos_token_index,
63+
size_t eos_token_index)
64+
: Tiktoken(
65+
_get_default_patern(),
66+
std::make_unique<std::vector<std::string>>(special_tokens),
67+
bos_token_index,
68+
eos_token_index) {}
69+
4970
explicit Tiktoken(
5071
std::unique_ptr<std::vector<std::string>> special_tokens,
5172
size_t bos_token_index,

pyproject.toml

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -64,12 +64,22 @@ Changelog = "https://github.com/pytorch/executorch/releases"
6464
[tool.setuptools.exclude-package-data]
6565
"*" = ["*.pyc"]
6666

67-
[tool.usort]
68-
# Do not try to put "first-party" imports in their own section.
69-
first_party_detection = false
67+
[tool.pytest.ini_options]
68+
testpaths = ["test"]
69+
python_files = ["test_*.py", "*_test.py"]
70+
python_classes = ["Test*"]
71+
python_functions = ["test_*"]
7072

7173
[tool.black]
72-
# Emit syntax compatible with older versions of python instead of only the range
73-
# specified by `requires-python`. TODO: Remove this once we support these older
74-
# versions of python and can expand the `requires-python` range.
75-
target-version = ["py38", "py39", "py310", "py311", "py312"]
74+
target-version = ['py38', 'py39', 'py310', 'py311', 'py312']
75+
include = '\.pyi?$'
76+
extend-exclude = '''
77+
/(
78+
# directories
79+
\.eggs
80+
| \.git
81+
| build
82+
| dist
83+
| third-party
84+
)/
85+
'''

pytest.ini

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
[pytest]
2+
# Pytest configuration for PyTorch Tokenizers
3+
4+
# Test discovery - be very explicit about what to test
5+
testpaths = test
6+
python_files = test_*.py *_test.py
7+
python_classes = Test*
8+
python_functions = test_*
9+
10+
# Add the project root to Python path
11+
pythonpath = .
12+
13+
# Output options with explicit ignores
14+
addopts =
15+
-v
16+
--tb=short
17+
--strict-markers
18+
--disable-warnings
19+
--ignore=third-party
20+
--ignore=build
21+
--ignore=cmake
22+
--ignore=examples
23+
--ignore=pytorch_tokenizers.egg-info
24+
25+
# Directories to ignore during test collection
26+
norecursedirs =
27+
build*
28+
third-party*
29+
cmake*
30+
examples*
31+
.git*
32+
__pycache__*
33+
*.egg-info*
34+
*third-party*
35+
36+
# Markers
37+
markers =
38+
slow: marks tests as slow (deselect with '-m "not slow"')
39+
integration: marks tests as integration tests
40+
unit: marks tests as unit tests
41+
42+
# Minimum version
43+
minversion = 6.0
44+
45+
# Test timeout (in seconds)
46+
timeout = 300
47+
48+
# Filter warnings
49+
filterwarnings =
50+
ignore::DeprecationWarning
51+
ignore::PendingDeprecationWarning

pytorch_tokenizers/__init__.py

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,38 @@
33
#
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
6-
# @lint-ignore-every LICENSELINT
76

7+
"""
8+
PyTorch Tokenizers - Fast tokenizers for PyTorch
89
9-
from typing import Optional
10+
This package provides Python bindings for fast C++ tokenizer implementations
11+
including HuggingFace, TikToken, Llama2C, and SentencePiece tokenizers.
12+
"""
1013

11-
from .hf_tokenizer import HuggingFaceTokenizer
12-
from .llama2c import Llama2cTokenizer
13-
from .tiktoken import TiktokenTokenizer
14+
try:
15+
from .pytorch_tokenizers_cpp import (
16+
Error,
17+
TokenIndex,
18+
Tokenizer,
19+
HFTokenizer,
20+
Tiktoken,
21+
Llama2cTokenizer,
22+
SPTokenizer,
23+
)
24+
except ImportError as e:
25+
raise ImportError(
26+
f"Failed to import C++ tokenizer bindings: {e}. "
27+
"Make sure the package was built correctly with pybind11."
28+
) from e
1429

15-
__all__ = ["TiktokenTokenizer", "Llama2cTokenizer", "HuggingFaceTokenizer"]
30+
__version__ = "0.1.0"
1631

17-
18-
def get_tokenizer(tokenizer_path: str, tokenizer_config_path: Optional[str] = None):
19-
if tokenizer_path.endswith(".json"):
20-
tokenizer = HuggingFaceTokenizer(tokenizer_path, tokenizer_config_path)
21-
else:
22-
try:
23-
tokenizer = Llama2cTokenizer(model_path=str(tokenizer_path))
24-
except Exception:
25-
print("Using Tiktokenizer")
26-
tokenizer = TiktokenTokenizer(model_path=str(tokenizer_path))
27-
return tokenizer
32+
__all__ = [
33+
"Error",
34+
"TokenIndex",
35+
"Tokenizer",
36+
"HFTokenizer",
37+
"Tiktoken",
38+
"Llama2cTokenizer",
39+
"SPTokenizer",
40+
]

requirements.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Build dependencies
2+
pybind11>=2.6.0
3+
setuptools>=45
4+
wheel
5+
cmake>=3.18
6+
ninja
7+
8+
# Optional dependencies for development
9+
pytest>=6.0
10+
numpy>=1.19.0

setup.py

Lines changed: 138 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,151 @@
55
# LICENSE file in the root directory of this source tree.
66
# @lint-ignore-every LICENSELINT
77
# type: ignore[syntax]
8-
from setuptools import find_packages, setup
98

9+
import os
10+
import re
11+
import subprocess
12+
import sys
13+
import shutil
14+
from pathlib import Path
15+
16+
from setuptools import find_packages, setup, Extension
17+
from setuptools.command.build_ext import build_ext
18+
19+
# Read the README file
1020
with open("README.md", "r") as f:
1121
long_description = f.read()
1222

23+
class CMakeExtension(Extension):
    """A setuptools Extension whose build is delegated to CMake.

    ``sources`` is intentionally empty: :class:`CMakeBuild` drives the
    compilation instead of setuptools' own compiler machinery.
    """

    def __init__(self, name, sourcedir=""):
        # Use the cooperative form rather than calling Extension.__init__
        # directly; behavior is identical for this single-inheritance case.
        super().__init__(name, sources=[])
        # Resolve the CMake source dir up front so later cwd changes
        # (e.g. building inside build_temp) cannot break it.
        self.sourcedir = os.path.abspath(sourcedir)
28+
class CMakeBuild(build_ext):
    """build_ext command that configures and builds extensions with CMake."""

    def build_extension(self, ext):
        """Configure and build one :class:`CMakeExtension`.

        Runs ``cmake <sourcedir>`` followed by ``cmake --build .`` in a
        per-extension temp directory; the finished module is written to the
        directory setuptools expects via CMAKE_LIBRARY_OUTPUT_DIRECTORY.
        Raises ``subprocess.CalledProcessError`` if either cmake step fails.
        """
        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))

        # Required for auto-detection & inclusion of auxiliary "native" libs.
        if not extdir.endswith(os.path.sep):
            extdir += os.path.sep

        # DEBUG env var overrides self.debug; `or 0` tolerates DEBUG="" which
        # would otherwise make int() raise ValueError.
        debug = (
            int(os.environ.get("DEBUG", "0") or 0)
            if self.debug is None
            else self.debug
        )
        cfg = "Debug" if debug else "Release"

        # CMake lets you override the generator - we check this.
        # Can be set with Conda-Build, for example.
        cmake_generator = os.environ.get("CMAKE_GENERATOR", "")

        # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
        cmake_args = [
            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}",
            f"-DPYTHON_EXECUTABLE={sys.executable}",
            f"-DCMAKE_BUILD_TYPE={cfg}",  # not used on MSVC, but no harm
            "-DSUPPORT_REGEX_LOOKAHEAD=ON",
            "-DTOKENIZERS_BUILD_PYTHON=ON",
            "-DCMAKE_POSITION_INDEPENDENT_CODE=ON",
        ]
        build_args = []

        # Adding CMake arguments set as environment variable
        # (needed e.g. to build for ARM OSX on conda-forge)
        if "CMAKE_ARGS" in os.environ:
            cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]

        if self.compiler.compiler_type != "msvc":
            # Using Ninja-build since it a) is available as a wheel and b)
            # multithreads automatically. Users can override the generator
            # with CMAKE_GENERATOR in CMake 3.15+.
            if not cmake_generator or cmake_generator == "Ninja":
                try:
                    import ninja  # noqa: F401

                    ninja_executable_path = os.path.join(ninja.BIN_DIR, "ninja")
                    cmake_args += [
                        "-GNinja",
                        f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
                    ]
                except ImportError:
                    # Fall back to CMake's default generator.
                    pass
        else:
            # Single config generators are handled "normally".
            single_config = any(x in cmake_generator for x in {"NMake", "Ninja"})

            # CMake allows an arch-in-generator style for backward
            # compatibility (e.g. "Visual Studio 15 2017 Win64").
            contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"})

            # Specify the arch if using MSVC generator, but only if it
            # doesn't already carry a backward-compatibility arch spec.
            if not single_config and not contains_arch:
                cmake_args += ["-A", "x64"]

            # Multi-config generators have a different way to specify configs.
            if not single_config:
                cmake_args += [
                    f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"
                ]
                build_args += ["--config", cfg]

        if sys.platform.startswith("darwin"):
            # Cross-compile support for macOS - respect ARCHFLAGS if set.
            archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
            if archs:
                cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]

        # CMAKE_BUILD_PARALLEL_LEVEL controls the parallel build level across
        # all generators; only fall back to self.parallel when it is unset.
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
            # self.parallel is set by `build_ext -j`; not supported by
            # pip or PyPA-build. CMake 3.12+ only.
            if hasattr(self, "parallel") and self.parallel:
                build_args += [f"-j{self.parallel}"]

        # Per-extension build dir; exist_ok avoids a check-then-create race.
        build_temp = Path(self.build_temp) / ext.name
        build_temp.mkdir(parents=True, exist_ok=True)

        subprocess.run(
            ["cmake", ext.sourcedir] + cmake_args, cwd=build_temp, check=True
        )
        subprocess.run(
            ["cmake", "--build", "."] + build_args, cwd=build_temp, check=True
        )
13123
setup(
    name="pytorch-tokenizers",
    version="0.1.0",
    description="Fast C++ tokenizers with Python bindings for PyTorch",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/pytorch-labs/tokenizers",
    packages=find_packages(),
    # The extension must be named with its package-qualified path:
    # pytorch_tokenizers/__init__.py imports it as `.pytorch_tokenizers_cpp`,
    # so get_ext_fullpath() has to place the module INSIDE the package.
    # A bare "pytorch_tokenizers_cpp" would land at the top level of the
    # wheel and break the relative import.
    ext_modules=[CMakeExtension("pytorch_tokenizers.pytorch_tokenizers_cpp")],
    cmdclass={"build_ext": CMakeBuild},
    # Extension modules are platform-specific; never run zipped.
    zip_safe=False,
    python_requires=">=3.8",
    install_requires=[
        "pybind11>=2.6.0",
    ],
    # NOTE(review): setup_requires is ignored by PEP 517/518 builds; these
    # belong in pyproject.toml's [build-system] requires. Kept here for
    # legacy `python setup.py` invocations.
    setup_requires=[
        "pybind11>=2.6.0",
        "cmake>=3.18",
    ],
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: C++",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
)

0 commit comments

Comments
 (0)