Skip to content

Commit 2094659

Browse files
authored
Add pytorch-labs/tokenizers into ET submodules
Differential Revision: D70862880
Pull Request resolved: #9074
1 parent 7fb9d96 commit 2094659

File tree

17 files changed

+53
-77
lines changed

17 files changed

+53
-77
lines changed

.ci/scripts/build_llama_android.sh

+4
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,10 @@ install_executorch_and_backend_lib() {
4242

4343
build_llama_runner() {
4444
echo "Building llama runner for Android..."
45+
pushd extension/llm/tokenizers
46+
echo "Updating tokenizers submodule"
47+
git submodule update --init
48+
popd
4549
ANDROID_ABI=arm64-v8a
4650
cmake -DBUCK2="${BUCK2}" \
4751
-DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK"/build/cmake/android.toolchain.cmake \

.ci/scripts/test_ane_static_llama.sh

+6
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ fi
1717

1818
which "${PYTHON_EXECUTABLE}"
1919

20+
# Update tokenizers submodule
21+
pushd $EXECUTORCH_ROOT/extension/llm/tokenizers
22+
echo "Update tokenizers submodule"
23+
git submodule update --init
24+
popd
25+
2026
pushd $EXECUTORCH_ROOT/examples/apple/coreml/llama
2127

2228
# Download stories llama110m artifacts

.ci/scripts/test_llama.sh

+4
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,10 @@ cmake_install_executorch_libraries() {
173173

174174
cmake_build_llama_runner() {
175175
echo "Building llama runner"
176+
pushd extension/llm/tokenizers
177+
echo "Updating tokenizers submodule"
178+
git submodule update --init
179+
popd
176180
dir="examples/models/llama"
177181
retry cmake \
178182
-DCMAKE_INSTALL_PREFIX=cmake-out \

.gitmodules

+3-9
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,9 @@
2828
[submodule "backends/xnnpack/third-party/pthreadpool"]
2929
path = backends/xnnpack/third-party/pthreadpool
3030
url = https://github.com/Maratyszcza/pthreadpool.git
31-
[submodule "extension/llm/third-party/abseil-cpp"]
32-
path = extension/llm/third-party/abseil-cpp
33-
url = https://github.com/abseil/abseil-cpp.git
34-
[submodule "extension/llm/third-party/re2"]
35-
path = extension/llm/third-party/re2
36-
url = https://github.com/google/re2.git
37-
[submodule "extension/llm/third-party/sentencepiece"]
38-
path = extension/llm/third-party/sentencepiece
39-
url = https://github.com/google/sentencepiece.git
31+
[submodule "extension/llm/tokenizers"]
32+
path = extension/llm/tokenizers
33+
url = https://github.com/pytorch-labs/tokenizers.git
4034
[submodule "kernels/optimized/third-party/eigen"]
4135
path = kernels/optimized/third-party/eigen
4236
url = https://gitlab.com/libeigen/eigen.git

backends/qualcomm/scripts/build.sh

+4
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,10 @@ if [ "$BUILD_X86_64" = true ]; then
144144
EXAMPLE_ROOT=examples/qualcomm
145145
CMAKE_PREFIX_PATH="${BUILD_ROOT}/lib/cmake/ExecuTorch;${BUILD_ROOT}/third-party/gflags;"
146146

147+
echo "Update tokenizers submodule..."
148+
pushd $PRJ_ROOT/extension/llm/tokenizers
149+
git submodule update --init
150+
popd
147151
cmake $PRJ_ROOT/$EXAMPLE_ROOT \
148152
-DCMAKE_BUILD_TYPE=$BUILD_TYPE \
149153
-DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \

build/build_android_library.sh

+5
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,11 @@ build_android_native_library() {
7070
fi
7171
cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config "${EXECUTORCH_CMAKE_BUILD_TYPE}"
7272

73+
# Update tokenizers submodule
74+
pushd extension/llm/tokenizers
75+
echo "Update tokenizers submodule"
76+
git submodule update --init
77+
popd
7378
cmake extension/android \
7479
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
7580
-DANDROID_ABI="${ANDROID_ABI}" \

examples/mediatek/CMakeLists.txt

+4-4
Original file line numberDiff line numberDiff line change
@@ -122,17 +122,17 @@ if(${ANDROID})
122122
)
123123
# Build ABSL and RE2
124124
set(EXTENSIONS_LLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm)
125-
set(THIRD_PARTY_ABSL_DIR ${EXTENSIONS_LLM_DIR}/third-party/abseil-cpp)
126-
set(THIRD_PARTY_RE2_DIR ${EXTENSIONS_LLM_DIR}/third-party/re2)
125+
set(THIRD_PARTY_ABSL_DIR ${EXTENSIONS_LLM_DIR}/tokenizers/third-party/abseil-cpp)
126+
set(THIRD_PARTY_RE2_DIR ${EXTENSIONS_LLM_DIR}/tokenizers/third-party/re2)
127127
set(ABSL_ENABLE_INSTALL ON)
128128
set(ABSL_PROPAGATE_CXX_STD ON)
129129
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
130130
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
131131
add_subdirectory(
132-
${THIRD_PARTY_ABSL_DIR} ${CMAKE_CURRENT_BINARY_DIR}/third-party/abseil
132+
${THIRD_PARTY_ABSL_DIR} ${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/abseil
133133
)
134134
add_subdirectory(
135-
${THIRD_PARTY_RE2_DIR} ${CMAKE_CURRENT_BINARY_DIR}/third-party/re2
135+
${THIRD_PARTY_RE2_DIR} ${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/re2
136136
)
137137
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
138138

examples/models/llama/runner/CMakeLists.txt

+5-3
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,11 @@ set(ABSL_PROPAGATE_CXX_STD ON)
6666
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
6767
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
6868
add_subdirectory(
69-
${EXECUTORCH_ROOT}/extension/llm/third-party/abseil-cpp
69+
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/abseil-cpp
7070
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
7171
)
7272
add_subdirectory(
73-
${EXECUTORCH_ROOT}/extension/llm/third-party/re2
73+
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/re2
7474
${CMAKE_CURRENT_BINARY_DIR}/re2
7575
)
7676
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
@@ -82,6 +82,8 @@ set(llama_runner_deps executorch extension_data_loader extension_module
8282
target_link_libraries(llama_runner PUBLIC ${llama_runner_deps})
8383

8484
target_include_directories(
85-
llama_runner INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT}
85+
llama_runner
86+
INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT}
87+
${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
8688
)
8789
target_compile_options(llama_runner PUBLIC ${_preprocessor_flag})

examples/models/llama/tokenizer/test/CMakeLists.txt

+5-4
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ include(${EXECUTORCH_ROOT}/build/Test.cmake)
1919

2020
set(_tokenizer_test_srcs
2121
test_tiktoken.cpp
22-
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizer/tiktoken.cpp
22+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/src/tiktoken.cpp
2323
${CMAKE_CURRENT_SOURCE_DIR}/../llama_tiktoken.cpp
2424
)
2525

@@ -29,11 +29,11 @@ set(ABSL_PROPAGATE_CXX_STD ON)
2929
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
3030
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
3131
add_subdirectory(
32-
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp
32+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/third-party/abseil-cpp
3333
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
3434
)
3535
add_subdirectory(
36-
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/re2
36+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/third-party/re2
3737
${CMAKE_CURRENT_BINARY_DIR}/re2
3838
)
3939
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
@@ -43,5 +43,6 @@ target_include_directories(
4343
tokenizer_test
4444
PRIVATE
4545
${CMAKE_INSTALL_PREFIX}/include
46-
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp
46+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/include
47+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/third-party/abseil-cpp
4748
)

examples/qualcomm/CMakeLists.txt

+6-3
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,10 @@ target_compile_options(
6363
full_portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED
6464
)
6565
target_include_directories(
66-
full_portable_ops_lib PUBLIC ${_common_include_directories}
66+
full_portable_ops_lib
67+
PUBLIC
68+
${_common_include_directories}
69+
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/include
6770
)
6871

6972
# find RE2 for tokenizer
@@ -72,11 +75,11 @@ set(ABSL_PROPAGATE_CXX_STD ON)
7275
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
7376
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
7477
add_subdirectory(
75-
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/abseil-cpp
78+
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/third-party/abseil-cpp
7679
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
7780
)
7881
add_subdirectory(
79-
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/re2
82+
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/third-party/re2
8083
${CMAKE_CURRENT_BINARY_DIR}/re2
8184
)
8285
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

extension/llm/third-party/TARGETS

-47
This file was deleted.

extension/llm/third-party/abseil-cpp

-1
This file was deleted.

extension/llm/third-party/re2

-1
This file was deleted.

extension/llm/third-party/sentencepiece

-1
This file was deleted.

extension/llm/tokenizer/CMakeLists.txt

+4-3
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ set(ABSL_PROPAGATE_CXX_STD ON)
2121
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
2222
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
2323
add_subdirectory(
24-
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp
24+
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/third-party/abseil-cpp
2525
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
2626
)
2727
add_subdirectory(
28-
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/re2
28+
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/third-party/re2
2929
${CMAKE_CURRENT_BINARY_DIR}/re2
3030
)
3131
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
@@ -35,6 +35,7 @@ add_library(extension_llm_tokenizer ${_extension_llm_tokenizer__srcs})
3535
target_include_directories(
3636
extension_llm_tokenizer PUBLIC ${EXECUTORCH_ROOT}/..
3737
${_common_include_directories}
38+
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/include
3839
)
3940

4041
target_link_libraries(extension_llm_tokenizer re2::re2)
@@ -53,7 +54,7 @@ install(
5354
target_include_directories(
5455
extension_llm_tokenizer
5556
PRIVATE ${CMAKE_INSTALL_PREFIX}/include
56-
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp
57+
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/third-party/abseil-cpp
5758
)
5859

5960
if(BUILD_TESTING)

extension/llm/tokenizers

Submodule tokenizers added at 4da2387

shim_et/xplat/executorch/build/env_interface.bzl

+2-1
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,10 @@ _EXTERNAL_DEPS = {
4343
"nlohmann_json": [], # Intentionally not supporting OSS buck build HF tokenizer.
4444
"prettytable": "//third-party:prettytable",
4545
"pybind11": "//third-party:pybind11",
46-
"re2": "//extension/llm/third-party:re2",
46+
"re2": "//extension/llm/tokenizers/third-party:re2",
4747
"sentencepiece": [], # Intentionally not supporting OSS buck build of sentencepiece.
4848
"sentencepiece-py": [],
49+
"tiktoken": "//extension/llm/tokenizers:tiktoken",
4950
# Core C++ PyTorch functionality like Tensor and ScalarType.
5051
"torch-core-cpp": "//third-party:libtorch",
5152
"torchgen": "//third-party:torchgen",

0 commit comments

Comments
 (0)