Skip to content

Commit 3d67b29

Browse files
authored
Enable install find package (#82)
* Support cmake install: allow a parent repo to run cmake --target install and then call find_package(tokenizers CONFIG). This comes in handy when we want to build tokenizers as an external project or rely on FetchContent. * Enable install * Clean up and fix a bunch * Lint
1 parent fc32028 commit 3d67b29

File tree

3 files changed

+191
-50
lines changed

3 files changed

+191
-50
lines changed

CMakeLists.txt

Lines changed: 90 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,13 @@ project(Tokenizers)
1919
option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
2020
option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)
2121
option(SUPPORT_REGEX_LOOKAHEAD
22-
"Support regex lookahead patterns (requires PCRE2)" OFF)
22+
"Support regex lookahead patterns (requires PCRE2)" OFF
23+
)
2324

25+
# Include CMakePackageConfigHelpers for configure_package_config_file
26+
include(CMakePackageConfigHelpers)
# Load GNUInstallDirs before any install() rule: the regex_lookahead install
# rule further down uses CMAKE_INSTALL_LIBDIR / CMAKE_INSTALL_BINDIR, which
# would otherwise depend on some subproject having defined them first.
include(GNUInstallDirs)
2427
include(Utils.cmake)
28+
2529
# Ignore weak attribute warning
2630
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes")
2731

@@ -47,25 +51,31 @@ set(tokenizers_source_files
4751
${CMAKE_CURRENT_SOURCE_DIR}/src/regex.cpp
4852
${CMAKE_CURRENT_SOURCE_DIR}/src/sentencepiece.cpp
4953
${CMAKE_CURRENT_SOURCE_DIR}/src/tiktoken.cpp
50-
${CMAKE_CURRENT_SOURCE_DIR}/src/token_decoder.cpp)
54+
${CMAKE_CURRENT_SOURCE_DIR}/src/token_decoder.cpp
55+
)
5156

5257
file(GLOB unicode_source_files
53-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
54-
add_library(tokenizers STATIC ${tokenizers_source_files}
55-
${unicode_source_files})
58+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp
59+
)
60+
add_library(
61+
tokenizers STATIC ${tokenizers_source_files} ${unicode_source_files}
62+
)
5663

5764
# Using abseil from sentencepiece/third_party
5865
target_include_directories(
5966
tokenizers
60-
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
61-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
62-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
63-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
64-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
65-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include)
67+
PUBLIC
68+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
69+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece>
70+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src>
71+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2>
72+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include>
73+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include>
74+
)
6675
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)
6776

68-
if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
77+
if(SUPPORT_REGEX_LOOKAHEAD)
78+
set(PCRE2_STATIC_PIC ON)
6979
set(PCRE2_BUILD_PCRE2_8 ON)
7080
set(PCRE2_BUILD_PCRE2_16 OFF)
7181
set(PCRE2_BUILD_PCRE2_32 OFF)
@@ -76,55 +86,85 @@ if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
7686
set(PCRE2_BUILD_DOCS OFF)
7787
set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
7888
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
89+
90+
# Set the INTERFACE_INCLUDE_DIRECTORIES property for pcre2-8-static
91+
set_target_properties(
92+
pcre2-8-static
93+
PROPERTIES
94+
INTERFACE_INCLUDE_DIRECTORIES
95+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src>
96+
)
7997
add_library(
8098
regex_lookahead STATIC
8199
${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp
82100
${CMAKE_CURRENT_SOURCE_DIR}/src/regex_lookahead.cpp
83-
${CMAKE_CURRENT_SOURCE_DIR}/src/std_regex.cpp)
101+
${CMAKE_CURRENT_SOURCE_DIR}/src/std_regex.cpp
102+
)
84103
target_link_libraries(regex_lookahead PUBLIC pcre2-8)
85104
target_include_directories(
86-
regex_lookahead PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
87-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
88-
target_link_options_shared_lib(regex_lookahead)
105+
regex_lookahead
106+
PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
107+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src>
108+
)
109+
target_link_options_shared_lib(regex_lookahead)
89110
target_link_libraries(tokenizers PUBLIC regex_lookahead)
90-
endif()
91-
92-
# Build test
93-
if(TOKENIZERS_BUILD_TEST)
94-
enable_testing()
95-
include(FetchContent)
96-
# CMAKE
97-
FetchContent_Declare(
98-
googletest
99-
# Specify the commit you depend on and update it regularly.
100-
URL https://github.com/google/googletest/archive/5376968f6948923e2411081fd9372e71a59d8e77.zip
111+
install(
112+
TARGETS regex_lookahead pcre2-8-static
113+
EXPORT tokenizers-targets
114+
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
115+
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
116+
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
101117
)
102-
set(gtest_force_shared_crt
103-
ON
104-
CACHE BOOL "" FORCE)
105-
FetchContent_MakeAvailable(googletest)
106-
107-
file(GLOB test_source_files ${CMAKE_CURRENT_SOURCE_DIR}/test/test_*.cpp)
108-
109-
set(test_env "RESOURCES_PATH=${CMAKE_CURRENT_SOURCE_DIR}/test/resources")
110-
foreach(test_source_file ${test_source_files})
111-
get_filename_component(test_name ${test_source_file} NAME_WE)
112-
message(STATUS "Configuring unit test ${test_name}")
113-
add_executable(${test_name} ${test_source_file})
114-
target_include_directories(
115-
${test_name}
116-
PRIVATE GTEST_INCLUDE_PATH
117-
${CMAKE_CURRENT_SOURCE_DIR}/include
118-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
119-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
120-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include)
121-
target_link_libraries(${test_name} gtest_main GTest::gmock tokenizers)
122-
add_test(${test_name} "${test_name}")
123-
set_tests_properties(${test_name} PROPERTIES ENVIRONMENT ${test_env})
124-
endforeach()
125118
endif()
126119

127120
# Build tools
128121
if(TOKENIZERS_BUILD_TOOLS)
129122
add_subdirectory(examples/tokenize_tool)
130123
endif()
124+
125+
# Installation rules
126+
include(GNUInstallDirs)
127+
128+
# Install the library and its dependencies
129+
install(
130+
TARGETS tokenizers re2 sentencepiece-static
131+
EXPORT tokenizers-targets
132+
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
133+
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
134+
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
135+
)
136+
137+
# Install header files
138+
install(
139+
DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/
140+
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
141+
FILES_MATCHING
142+
PATTERN "*.h"
143+
)
144+
145+
install(
146+
DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include/
147+
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
148+
FILES_MATCHING
149+
PATTERN "*.hpp"
150+
)
151+
152+
# Install the CMake config files
153+
install(
154+
EXPORT tokenizers-targets
155+
FILE tokenizers-targets.cmake
156+
NAMESPACE tokenizers::
157+
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/tokenizers
158+
)
159+
160+
# Generate and install the config file
161+
configure_package_config_file(
162+
${CMAKE_CURRENT_SOURCE_DIR}/cmake/tokenizers-config.cmake.in
163+
${CMAKE_CURRENT_BINARY_DIR}/tokenizers-config.cmake
164+
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/tokenizers
165+
PATH_VARS CMAKE_INSTALL_INCLUDEDIR
166+
)
167+
168+
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/tokenizers-config.cmake
169+
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/tokenizers
170+
)

cmake/tokenizers-config.cmake.in

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Config file for the tokenizers package
2+
# It defines the following variables:
3+
# TOKENIZERS_FOUND - True if the tokenizers library was found
4+
# TOKENIZERS_INCLUDE_DIRS - Include directories for tokenizers
5+
# TOKENIZERS_LIBRARIES - Libraries to link against
6+
7+
@PACKAGE_INIT@
8+
9+
include(CMakeFindDependencyMacro)
10+
11+
# Find dependencies
12+
find_dependency(re2 REQUIRED)
13+
find_dependency(absl REQUIRED)
14+
# Check that the installed sentencepiece static library is present
15+
# Resolve the archive relative to this config file rather than the consumer's
# CMAKE_INSTALL_PREFIX and a hard-coded lib64: this file is installed to
# <libdir>/cmake/tokenizers, so the library directory is two levels up.
set(SENTENCEPIECE_LIBRARY "${CMAKE_CURRENT_LIST_DIR}/../../libsentencepiece.a")
16+
if(NOT EXISTS "${SENTENCEPIECE_LIBRARY}")
17+
message(
18+
FATAL_ERROR
19+
"Could not find sentencepiece library at ${SENTENCEPIECE_LIBRARY}"
20+
)
21+
endif()
22+
23+
# Include the exported targets file
24+
include("${CMAKE_CURRENT_LIST_DIR}/tokenizers-targets.cmake")
25+
26+
# Set the include directories
27+
set_and_check(TOKENIZERS_INCLUDE_DIRS "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@")
28+
29+
# Build whole-archive link options for the regex_lookahead library (the flag spelling differs per platform)
30+
if(APPLE)
31+
# Apple linkers spell whole-archive as -force_load,<archive>. The LINKER:
# prefix (also used by the non-Apple, non-MSVC branch) makes CMake forward the
# flag to the linker instead of handing it to the compiler driver as-is.
set(TOKENIZERS_LINK_OPTIONS
"SHELL:LINKER:-force_load,$<TARGET_FILE:tokenizers::regex_lookahead>"
)
34+
elseif(MSVC)
35+
set(TOKENIZERS_LINK_OPTIONS
36+
"SHELL:/WHOLEARCHIVE:$<TARGET_FILE:tokenizers::regex_lookahead>"
37+
)
38+
else()
39+
set(TOKENIZERS_LINK_OPTIONS
40+
"SHELL:LINKER:--whole-archive $<TARGET_FILE:tokenizers::regex_lookahead> LINKER:--no-whole-archive"
41+
)
42+
endif()
43+
44+
# Set the libraries and link options
45+
set(TOKENIZERS_LIBRARIES tokenizers::tokenizers)
46+
# Attach the whole-archive options only when the package was built with
# SUPPORT_REGEX_LOOKAHEAD. Otherwise tokenizers::regex_lookahead is not part of
# the exported targets and the $<TARGET_FILE:...> expression inside
# TOKENIZERS_LINK_OPTIONS would fail when the consumer generates its build.
if(TARGET tokenizers::regex_lookahead)
set_property(
TARGET tokenizers::tokenizers
APPEND
PROPERTY INTERFACE_LINK_OPTIONS "${TOKENIZERS_LINK_OPTIONS}"
)
endif()
51+
52+
# Verify that every requested package component was found
53+
check_required_components(tokenizers)

test/CMakeLists.txt

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
2+
3+
#
4+
# Build tokenizers tests.
5+
#
6+
cmake_minimum_required(VERSION 3.18)
7+
set(CMAKE_CXX_STANDARD 17)
8+
9+
project(TokenizersTests)
10+
11+
# Include helper modules (FetchContent is used below for tokenizers and googletest)
12+
include(FindPackageHandleStandardArgs)
13+
include(FetchContent)
14+
include(ExternalProject)
15+
FetchContent_Declare(
16+
tokenizers SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/.. BUILD_ALWAYS ON
17+
)
18+
set(SUPPORT_REGEX_LOOKAHEAD ON)
19+
FetchContent_MakeAvailable(tokenizers)
20+
21+
# Build test
22+
FetchContent_Declare(
23+
googletest
24+
DOWNLOAD_EXTRACT_TIMESTAMP ON
25+
# Specify the commit you depend on and update it regularly.
26+
URL https://github.com/google/googletest/archive/5376968f6948923e2411081fd9372e71a59d8e77.zip
27+
)
28+
set(gtest_force_shared_crt
29+
ON
30+
CACHE BOOL "" FORCE
31+
)
32+
FetchContent_MakeAvailable(googletest)
33+
34+
file(GLOB test_source_files test_*.cpp)
35+
36+
set(test_env "RESOURCES_PATH=${CMAKE_CURRENT_SOURCE_DIR}/resources")
37+
enable_testing()
38+
foreach(test_source_file ${test_source_files})
39+
get_filename_component(test_name ${test_source_file} NAME_WE)
40+
message(STATUS "Configuring unit test ${test_name}")
41+
add_executable(${test_name} ${test_source_file})
42+
target_include_directories(
43+
${test_name} PRIVATE GTEST_INCLUDE_PATH ${TOKENIZERS_INCLUDE_DIRS}
44+
)
45+
target_link_libraries(${test_name} PUBLIC gtest_main GTest::gmock tokenizers)
46+
add_test(${test_name} "${test_name}")
47+
set_tests_properties(${test_name} PROPERTIES ENVIRONMENT ${test_env})
48+
endforeach()

0 commit comments

Comments
 (0)