Skip to content

Commit 6e59981

Browse files
authored
Use tokenizers add_subdirectory in MediaTek CMake build (#17683)
The MediaTek CMakeLists.txt duplicated the entire tokenizers build setup: the abseil, re2, json, unicode, and pcre2 subdirectories, a hardcoded source file list, and the regex_lookahead configuration. That hardcoded source file list was missing post_processor.cpp, causing linker errors for undefined PostProcessorConfig symbols. Replace ~110 lines of duplicated build logic with a single add_subdirectory call to the canonical tokenizers CMakeLists.txt, which already handles all dependencies and source files correctly.
1 parent 41e31c2 commit 6e59981

1 file changed

Lines changed: 9 additions & 112 deletions

File tree

examples/mediatek/CMakeLists.txt

Lines changed: 9 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -116,116 +116,11 @@ if(${ANDROID})
116116
list(PREPEND _mtk_llama_executor_runner__srcs
117117
${CMAKE_CURRENT_LIST_DIR}/executor_runner/mtk_llama_executor_runner.cpp
118118
)
119-
# Build ABSL and RE2
120-
set(EXTENSIONS_LLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm)
121-
set(THIRD_PARTY_ABSL_DIR
122-
${EXTENSIONS_LLM_DIR}/tokenizers/third-party/abseil-cpp
123-
)
124-
set(THIRD_PARTY_RE2_DIR ${EXTENSIONS_LLM_DIR}/tokenizers/third-party/re2)
125-
set(THIRD_PARTY_JSON_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/json)
126-
set(THIRD_PARTY_UNICODE_DIR
127-
${EXTENSIONS_LLM_DIR}/tokenizers/third-party/llama.cpp-unicode
128-
)
129-
set(THIRD_PARTY_PCRE2_DIR ${EXTENSIONS_LLM_DIR}/tokenizers/third-party/pcre2)
130-
set(ABSL_ENABLE_INSTALL ON)
131-
set(ABSL_PROPAGATE_CXX_STD ON)
132-
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
133-
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
134-
add_subdirectory(
135-
${THIRD_PARTY_ABSL_DIR}
136-
${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/abseil
137-
)
138-
add_subdirectory(
139-
${THIRD_PARTY_RE2_DIR}
140-
${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/re2
141-
)
142-
add_subdirectory(
143-
${THIRD_PARTY_JSON_DIR}
144-
${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/json
145-
)
146-
add_subdirectory(
147-
${THIRD_PARTY_UNICODE_DIR}
148-
${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/llama.cpp-unicode
149-
)
150-
add_subdirectory(
151-
${THIRD_PARTY_PCRE2_DIR}
152-
${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/pcre2
153-
)
154-
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
155-
156-
# Build tokenizers
119+
# Build tokenizers library
157120
set(SUPPORT_REGEX_LOOKAHEAD ON)
158-
set(LLAMA2_TOKENIZER_DIR ${EXTENSIONS_LLM_DIR}/tokenizers)
159-
add_library(tokenizer STATIC)
160-
target_include_directories(
161-
tokenizer
162-
PUBLIC ${_common_include_directories}
163-
${THIRD_PARTY_ABSL_DIR}
164-
${THIRD_PARTY_RE2_DIR}
165-
${LLAMA2_TOKENIZER_DIR}/include
166-
${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/pcre2
167-
${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
168-
${THIRD_PARTY_JSON_DIR}
169-
${THIRD_PARTY_UNICODE_DIR}/include
170-
${THIRD_PARTY_PCRE2_DIR}
171-
)
172-
target_link_libraries(tokenizer PRIVATE re2::re2)
173-
174-
target_sources(
175-
tokenizer
176-
PRIVATE
177-
${LLAMA2_TOKENIZER_DIR}/src/tiktoken.cpp
178-
${LLAMA2_TOKENIZER_DIR}/src/llama2c_tokenizer.cpp
179-
${LLAMA2_TOKENIZER_DIR}/src/regex.cpp
180-
${LLAMA2_TOKENIZER_DIR}/src/bpe_tokenizer_base.cpp
181-
${LLAMA2_TOKENIZER_DIR}/src/re2_regex.cpp
182-
${LLAMA2_TOKENIZER_DIR}/src/hf_tokenizer.cpp
183-
${LLAMA2_TOKENIZER_DIR}/src/pre_tokenizer.cpp
184-
${LLAMA2_TOKENIZER_DIR}/src/token_decoder.cpp
185-
${LLAMA2_TOKENIZER_DIR}/src/normalizer.cpp
186-
${LLAMA2_TOKENIZER_DIR}/third-party/llama.cpp-unicode/src/unicode.cpp
187-
${LLAMA2_TOKENIZER_DIR}/third-party/llama.cpp-unicode/src/unicode-data.cpp
188-
${CMAKE_CURRENT_SOURCE_DIR}/../models/llama/tokenizer/llama_tiktoken.cpp
189-
)
190-
191-
# Add support for regex_lookahead
192-
set(PCRE2_STATIC_PIC ON)
193-
set(PCRE2_BUILD_PCRE2_8 ON)
194-
set(PCRE2_BUILD_PCRE2_16 OFF)
195-
set(PCRE2_BUILD_PCRE2_32 OFF)
196-
set(PCRE2_BUILD_TESTS OFF)
197-
set(PCRE2_BUILD_PCRE2GREP OFF)
198-
set(PCRE2_BUILD_PCRE2TEST OFF)
199-
set(PCRE2_BUILD_PCRE2GPERF OFF)
200-
set(PCRE2_BUILD_DOCS OFF)
201-
set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
202-
203-
# Set the INTERFACE_INCLUDE_DIRECTORIES property for pcre2-8-static
204-
set_target_properties(
205-
pcre2-8-static
206-
PROPERTIES
207-
INTERFACE_INCLUDE_DIRECTORIES
208-
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/pcre2>
209-
)
210-
add_library(
211-
regex_lookahead STATIC
212-
${LLAMA2_TOKENIZER_DIR}/src/pcre2_regex.cpp
213-
${LLAMA2_TOKENIZER_DIR}/src/regex_lookahead.cpp
214-
${LLAMA2_TOKENIZER_DIR}/src/std_regex.cpp
215-
)
216-
add_library(tokenizer::regex_lookahead ALIAS regex_lookahead)
217-
target_link_libraries(regex_lookahead PUBLIC pcre2-8-static)
218-
target_include_directories(
219-
regex_lookahead
220-
PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
221-
)
222-
target_link_libraries(tokenizer PUBLIC regex_lookahead)
223-
install(
224-
TARGETS regex_lookahead pcre2-8-static
225-
EXPORT tokenizers-targets
226-
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
227-
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
228-
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
121+
add_subdirectory(
122+
${EXECUTORCH_ROOT}/extension/llm/tokenizers
123+
${CMAKE_CURRENT_BINARY_DIR}/tokenizers
229124
)
230125

231126
# Include directory for neuron headers
@@ -244,10 +139,12 @@ if(${ANDROID})
244139
mtk_llama_executor_runner ${_executor_runner_libs} neuron_backend gflags
245140
mtk_llama_executor_lib
246141
)
247-
target_link_libraries(
248-
mtk_llama_executor_runner tokenizer
249-
$<LINK_LIBRARY:WHOLE_ARCHIVE,regex_lookahead>
142+
target_sources(
143+
mtk_llama_executor_runner
144+
PRIVATE
145+
${CMAKE_CURRENT_SOURCE_DIR}/../models/llama/tokenizer/llama_tiktoken.cpp
250146
)
147+
target_link_libraries(mtk_llama_executor_runner tokenizers)
251148
target_compile_options(
252149
mtk_llama_executor_runner PUBLIC ${_common_compile_options}
253150
)

0 commit comments

Comments
 (0)