From 4d196097330c1a2e24b96e296bc3618bcb23e078 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Thu, 17 Jul 2025 12:42:17 -0700 Subject: [PATCH 1/3] Don't build any binaries for 3rd party deps like sentencepiece --- CMakeLists.txt | 66 ++++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 43 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 501305e..f8e2c32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,12 +13,11 @@ # cmake_minimum_required(VERSION 3.18) set(CMAKE_CXX_STANDARD 17) -set(CMAKE_POLICY_VERSION_MINIMUM 3.5) + project(Tokenizers) option(TOKENIZERS_BUILD_TEST "Build tests" OFF) option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF) -option(TOKENIZERS_BUILD_PYTHON "Build Python bindings" OFF) option(SUPPORT_REGEX_LOOKAHEAD "Support regex lookahead patterns (requires PCRE2)" OFF ) @@ -38,7 +37,20 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece) + +if(NOT DEFINED SPM_BUILD_TEST) + set(SPM_BUILD_TEST OFF CACHE BOOL "") +endif() + +if(NOT DEFINED SPM_ENABLE_SHARED) + set(SPM_ENABLE_SHARED OFF CACHE BOOL "") +endif() + +add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece + ${CMAKE_CURRENT_BINARY_DIR}/sentencepiece-build + EXCLUDE_FROM_ALL +) set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) @@ -123,49 +135,17 @@ if(TOKENIZERS_BUILD_TOOLS) add_subdirectory(examples/tokenize_tool) endif() -# Build Python bindings -if(TOKENIZERS_BUILD_PYTHON) - include(FetchContent) - FetchContent_Declare( - pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11.git - GIT_TAG v2.13.6 - ) - FetchContent_MakeAvailable(pybind11) - - # Create the Python extension module - pybind11_add_module(pytorch_tokenizers_cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/python_bindings.cpp - ) - - # Link with the tokenizers library - target_link_libraries(pytorch_tokenizers_cpp PRIVATE tokenizers) - - # Set properties for the Python extension - target_compile_definitions(pytorch_tokenizers_cpp PRIVATE VERSION_INFO=${PROJECT_VERSION}) - - # Set the output name and let setuptools control the output directory - set_target_properties(pytorch_tokenizers_cpp PROPERTIES - OUTPUT_NAME "pytorch_tokenizers_cpp" - ) - - # Don't install the Python extension here - let setuptools handle it - # The setup.py will copy the built extension to the appropriate location -endif() - # Installation rules include(GNUInstallDirs) -if(NOT TOKENIZERS_BUILD_PYTHON) - # Install the library and its dependencies - install( - TARGETS tokenizers re2 sentencepiece-static - EXPORT tokenizers-targets - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - ) -endif() +# Install the library and its dependencies +install( + TARGETS tokenizers re2 sentencepiece-static + EXPORT tokenizers-targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) # Install header files install( From 9caad9699eb654d5206531606f57e97d2d6564f5 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Thu, 17 Jul 2025 12:43:58 -0700 Subject: [PATCH 2/3] Update CMakeLists.txt --- CMakeLists.txt | 53 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f8e2c32..0812808 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,11 +13,12 @@ # cmake_minimum_required(VERSION 3.18) set(CMAKE_CXX_STANDARD 17) - +set(CMAKE_POLICY_VERSION_MINIMUM 3.5) project(Tokenizers) option(TOKENIZERS_BUILD_TEST "Build tests" OFF) option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF) +option(TOKENIZERS_BUILD_PYTHON "Build Python bindings" OFF) option(SUPPORT_REGEX_LOOKAHEAD "Support regex lookahead patterns (requires PCRE2)" OFF ) @@ -36,8 +37,8 @@ set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) set(CMAKE_POSITION_INDEPENDENT_CODE ON) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2) if(NOT DEFINED SPM_BUILD_TEST) set(SPM_BUILD_TEST OFF CACHE BOOL "") endif() @@ -135,17 +136,49 @@ if(TOKENIZERS_BUILD_TOOLS) add_subdirectory(examples/tokenize_tool) endif() +# Build Python bindings +if(TOKENIZERS_BUILD_PYTHON) + include(FetchContent) + FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11.git + GIT_TAG v2.13.6 + ) + FetchContent_MakeAvailable(pybind11) + + # Create the Python extension module + pybind11_add_module(pytorch_tokenizers_cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/python_bindings.cpp + ) + + # Link with the tokenizers library + target_link_libraries(pytorch_tokenizers_cpp PRIVATE tokenizers) + + # Set properties for the Python extension + target_compile_definitions(pytorch_tokenizers_cpp PRIVATE VERSION_INFO=${PROJECT_VERSION}) + + # Set the output name and let setuptools control the output directory + set_target_properties(pytorch_tokenizers_cpp PROPERTIES + OUTPUT_NAME "pytorch_tokenizers_cpp" + ) + + # Don't install the Python extension here - let setuptools handle it + # The setup.py will copy the built extension to the appropriate location +endif() + # Installation rules include(GNUInstallDirs) -# Install the library and its dependencies -install( - TARGETS tokenizers re2 sentencepiece-static - EXPORT tokenizers-targets - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} -) +if(NOT TOKENIZERS_BUILD_PYTHON) + # Install the library and its dependencies + install( + TARGETS tokenizers re2 sentencepiece-static + EXPORT tokenizers-targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ) +endif() # Install header files install( From 68e1e6d3366a8b2e85b2922b03b677fab2312482 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Thu, 17 Jul 2025 12:45:12 -0700 Subject: [PATCH 3/3] Update CMakeLists.txt --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0812808..fb6da74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,8 +37,8 @@ set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) set(CMAKE_POSITION_INDEPENDENT_CODE ON) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2) + if(NOT DEFINED SPM_BUILD_TEST) set(SPM_BUILD_TEST OFF CACHE BOOL "") endif()