Skip to content

Commit e8b2dcb

Browse files
authored
feat: add Jieba tokenizer for full-text search (#111)
1 parent cbacf3a commit e8b2dcb

20 files changed

Lines changed: 1572 additions & 589 deletions

CMakeLists.txt

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,22 @@ if(PAIMON_ENABLE_LUMINA)
290290
DESTINATION ${CMAKE_INSTALL_LIBDIR})
291291
endif()
292292
293+
# Ship the jieba dictionary files alongside the library when Lucene support is enabled.
if(PAIMON_ENABLE_LUCENE)
    # Install destination of the dictionaries, relative to the install prefix.
    set(PAIMON_DICT_DEST "share/paimon/dict")

    # Only the listed dictionary entries are matched; VCS metadata and markdown
    # files found in the dict directory are excluded.
    install(DIRECTORY ${JIEBA_DICT_DIR}/
            DESTINATION ${PAIMON_DICT_DEST}
            FILES_MATCHING
            PATTERN "jieba.dict.utf8"
            PATTERN "hmm_model.utf8"
            PATTERN "idf.utf8"
            PATTERN "stop_words.utf8"
            PATTERN "user.dict.utf8"
            PATTERN "pos_dict"
            PATTERN ".git*" EXCLUDE
            PATTERN "*.md" EXCLUDE)
endif()
308+
293309
install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/
294310
DESTINATION "include"
295311
FILES_MATCHING
@@ -389,7 +405,6 @@ if(PAIMON_BUILD_TESTS)
389405
list(APPEND TEST_STATIC_LINK_LIBS paimon_lucene_index_shared)
390406
list(APPEND TEST_STATIC_LINK_LIBS "-Wl,--as-needed")
391407
endif()
392-
393408
endif()
394409
395410
include(CMakePackageConfigHelpers)

cmake_modules/ThirdpartyToolchain.cmake

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -322,8 +322,7 @@ macro(build_lucene)
322322
set(LUCENE_INCLUDE_DIR "${LUCENE_PREFIX}/include")
323323
# The include directory must exist before it is referenced by a target.
324324
file(MAKE_DIRECTORY "${LUCENE_INCLUDE_DIR}")
325-
include_directories(SYSTEM ${LUCENE_INCLUDE_DIR} ${BOOST_INCLUDE_DIR}
326-
${BOOST_EXTRA_INCLUDE_DIR})
325+
include_directories(SYSTEM ${LUCENE_INCLUDE_DIR} ${BOOST_INCLUDE_DIR})
327326
add_library(lucene INTERFACE IMPORTED)
328327
target_include_directories(lucene SYSTEM INTERFACE "${LUCENE_INCLUDE_DIR}")
329328
target_compile_options(lucene INTERFACE -pthread)
@@ -343,6 +342,43 @@ macro(build_lucene)
343342
add_dependencies(lucene lucene_ep)
344343
endmacro()
345344

345+
# Fetches cppjieba as an external project, applies cmake_modules/jieba.diff to it and
# copies its headers (plus the bundled limonp headers) and dictionaries into
# ${JIEBA_INSTALL}. Exposes the result through the `jieba` INTERFACE IMPORTED target
# and sets JIEBA_INCLUDE_DIR / JIEBA_DICT_DIR for the caller.
macro(build_jieba)
    message(STATUS "Building jieba from source")
    set(JIEBA_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/jieba_ep-prefix")
    set(JIEBA_INSTALL "${CMAKE_CURRENT_BINARY_DIR}/jieba_ep-install")
    set(JIEBA_INCLUDE_DIR "${JIEBA_INSTALL}/include")
    set(JIEBA_DICT_DIR "${JIEBA_INSTALL}/dict")
    # The include and dict directories must exist before they are referenced by a target.
    file(MAKE_DIRECTORY ${JIEBA_INCLUDE_DIR})
    file(MAKE_DIRECTORY ${JIEBA_DICT_DIR})

    set(JIEBA_CMAKE_ARGS
        ${EP_COMMON_CMAKE_ARGS} "-DENABLE_TEST=OFF" "-DCPPJIEBA_TOP_LEVEL_PROJECT=OFF"
        "-DCMAKE_INSTALL_PREFIX=${JIEBA_INSTALL}")

    set(PATCH_FILE "${CMAKE_CURRENT_LIST_DIR}/jieba.diff")
    # The patch step drops a `.patched` marker file in the source tree so that the
    # patch is not applied a second time when the step is re-run.
    # The install step is a plain copy of headers and dictionaries out of the source tree.
    externalproject_add(jieba_ep
                        ${EP_COMMON_OPTIONS}
                        GIT_REPOSITORY https://github.com/yanyiwu/cppjieba.git
                        GIT_TAG ${PAIMON_JIEBA_BUILD_VERSION}
                        GIT_SHALLOW FALSE
                        GIT_PROGRESS TRUE
                        GIT_SUBMODULES_RECURSE TRUE
                        CMAKE_ARGS ${JIEBA_CMAKE_ARGS}
                        LOG_PATCH ON
                        PATCH_COMMAND ${CMAKE_COMMAND} -E chdir <SOURCE_DIR> bash -c
                                      "[ -f .patched ] && echo '<SOURCE_DIR> patch already applied, ignore...' || patch -s -N -p1 -i '${PATCH_FILE}' && touch .patched"
                        INSTALL_COMMAND bash -c
                                        "cp -r ${JIEBA_PREFIX}/src/jieba_ep/include/* ${JIEBA_INSTALL}/include/ && cp -r ${JIEBA_PREFIX}/src/jieba_ep/dict/* ${JIEBA_INSTALL}/dict/ && cp -r ${JIEBA_PREFIX}/src/jieba_ep/deps/limonp/include/* ${JIEBA_INSTALL}/include/"
    )

    include_directories(SYSTEM ${JIEBA_INCLUDE_DIR} ${JIEBA_DICT_DIR})
    add_library(jieba INTERFACE IMPORTED)
    # Each directory is its own argument: quoting both inside one string would
    # register a single, non-existent path containing a space.
    target_include_directories(jieba SYSTEM
                               INTERFACE "${JIEBA_INCLUDE_DIR}" "${JIEBA_DICT_DIR}")
    add_dependencies(jieba jieba_ep)
endmacro()
381+
346382
macro(build_rapidjson)
347383
message(STATUS "Building RapidJSON from source")
348384
set(RAPIDJSON_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/rapidjson_ep-install")
@@ -1272,4 +1308,5 @@ endif()
12721308
# Third-party builds that are only needed when Lucene support is enabled.
if(PAIMON_ENABLE_LUCENE)
    build_boost()
    build_lucene()
    build_jieba()
endif()

cmake_modules/jieba.diff

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
diff --git a/include/cppjieba/KeywordExtractor.hpp b/include/cppjieba/KeywordExtractor.hpp
2+
index 24b2c40..c7c6a94 100644
3+
--- a/include/cppjieba/KeywordExtractor.hpp
4+
+++ b/include/cppjieba/KeywordExtractor.hpp
5+
@@ -89,6 +89,11 @@ class KeywordExtractor {
6+
std::partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
7+
keywords.resize(topN);
8+
}
9+
+
10+
+ const std::unordered_set<std::string>& GetStopWords() const {
11+
+ return stopWords_;
12+
+ }
13+
+
14+
private:
15+
void LoadIdfDict(const std::string& idfPath) {
16+
std::ifstream ifs(idfPath.c_str());

src/paimon/global_index/lucene/CMakeLists.txt

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,25 @@
1313
# limitations under the License.
1414

1515
if(PAIMON_ENABLE_LUCENE)
16-
set(PAIMON_LUCENE lucene_global_index.cpp lucene_directory.cpp
17-
lucene_global_index_factory.cpp)
16+
set(PAIMON_LUCENE
17+
lucene_global_index.cpp
18+
lucene_directory.cpp
19+
lucene_utils.cpp
20+
jieba_analyzer.cpp
21+
lucene_global_index_writer.cpp
22+
lucene_global_index_reader.cpp
23+
lucene_global_index_factory.cpp)
1824

1925
add_paimon_lib(paimon_lucene_index
2026
SOURCES
2127
${PAIMON_LUCENE}
2228
EXTRA_INCLUDES
2329
${LUCENE_INCLUDE_DIR}
30+
${JIEBA_INCLUDE_DIR}
2431
DEPENDENCIES
2532
paimon_shared
2633
lucene
34+
jieba
2735
STATIC_LINK_LIBS
2836
lucene
2937
arrow
@@ -34,11 +42,18 @@ if(PAIMON_ENABLE_LUCENE)
3442
paimon_shared
3543
SHARED_LINK_FLAGS
3644
${PAIMON_VERSION_SCRIPT_FLAGS})
45+
if(PAIMON_BUILD_TESTS)
46+
target_compile_definitions(paimon_lucene_index_objlib
47+
PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}")
48+
49+
endif()
3750

3851
if(PAIMON_BUILD_TESTS)
3952
add_paimon_test(lucene_index_test
4053
SOURCES
54+
jieba_analyzer_test.cpp
4155
lucene_api_test.cpp
56+
jieba_api_test.cpp
4257
lucene_directory_test.cpp
4358
lucene_global_index_test.cpp
4459
lucene_filter_test.cpp
@@ -52,5 +67,6 @@ if(PAIMON_ENABLE_LUCENE)
5267
paimon_lucene_index_static
5368
"-Wl,--no-whole-archive"
5469
${GTEST_LINK_TOOLCHAIN})
70+
5571
endif()
5672
endif()
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
/*
2+
* Copyright 2026-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include "paimon/global_index/lucene/jieba_analyzer.h"
17+
18+
#include "paimon/global_index/lucene/lucene_utils.h"
19+
20+
namespace paimon::lucene {
21+
JiebaTokenizerContext::JiebaTokenizerContext(const std::string& _tokenize_mode, bool _with_position,
22+
const std::shared_ptr<cppjieba::Jieba> _jieba,
23+
const std::shared_ptr<MemoryPool>& _pool,
24+
int32_t _buffer_size)
25+
: pool(_pool),
26+
tokenize_mode(_tokenize_mode),
27+
with_position(_with_position),
28+
buffer_size(_buffer_size),
29+
jieba(_jieba) {}
30+
31+
// Registers the term and position-increment attributes and allocates the read buffer
// (`buffer_size` wchar_t elements) from the context's memory pool.
// No tokenization happens here; terms are produced by reset().
JiebaTokenizer::JiebaTokenizer(const JiebaTokenizerContext& context, const Lucene::ReaderPtr& input)
    : Lucene::Tokenizer(input), context_(context) {
    term_att_ = addAttribute<Lucene::TermAttribute>();
    pos_att_ = addAttribute<Lucene::PositionIncrementAttribute>();

    const auto buffer_bytes = context_.buffer_size * sizeof(wchar_t);
    void* raw_buffer = context_.pool->Malloc(buffer_bytes, /*alignment=*/8);
    buffer_ = static_cast<wchar_t*>(raw_buffer);
}
38+
39+
// Returns the read buffer to the pool it was allocated from, using the same size and
// alignment that were passed to Malloc in the constructor.
JiebaTokenizer::~JiebaTokenizer() {
    if (buffer_ == nullptr) {
        return;
    }
    const auto buffer_bytes = context_.buffer_size * sizeof(wchar_t);
    context_.pool->Free(static_cast<void*>(buffer_), buffer_bytes, /*alignment=*/8);
    buffer_ = nullptr;
}
47+
48+
bool JiebaTokenizer::incrementToken() {
49+
if (term_index_ >= normalized_terms_.size()) {
50+
return false;
51+
}
52+
53+
const auto& term = normalized_terms_[term_index_++];
54+
clearAttributes();
55+
56+
term_att_->setTermBuffer(LuceneUtils::StringToWstring(term));
57+
58+
if (context_.with_position) {
59+
pos_att_->setPositionIncrement(1);
60+
} else {
61+
pos_att_->setPositionIncrement(0);
62+
}
63+
return true;
64+
}
65+
66+
// Segments `str` into `*terms_ptr` with the cppjieba routine selected by `tokenize_mode`:
//   "mp"    -> CutSmall (bounded by JiebaTokenizerContext::kMaxWordLen)
//   "hmm"   -> CutHMM
//   "mix"   -> Cut with hmm enabled
//   "full"  -> CutAll
//   "query" -> CutForSearch with hmm enabled
// Throws Lucene::IllegalArgumentException for any other mode.
void JiebaTokenizer::CutWithMode(const std::string& tokenize_mode, const cppjieba::Jieba* jieba,
                                 const std::string& str, std::vector<std::string>* terms_ptr) {
    std::vector<std::string>& out = *terms_ptr;

    if (tokenize_mode == "mp") {
        jieba->CutSmall(str, out, /*max_word_len=*/JiebaTokenizerContext::kMaxWordLen);
        return;
    }
    if (tokenize_mode == "hmm") {
        jieba->CutHMM(str, out);
        return;
    }
    if (tokenize_mode == "mix") {
        jieba->Cut(str, out, /*hmm=*/true);
        return;
    }
    if (tokenize_mode == "full") {
        jieba->CutAll(str, out);
        return;
    }
    if (tokenize_mode == "query") {
        jieba->CutForSearch(str, out, /*hmm=*/true);
        return;
    }

    throw Lucene::IllegalArgumentException(
        L"only support mp/hmm/mix/full/query in jieba tokenizer");
}
84+
85+
// Filters and lower-cases the segmented terms.
//
// Terms found in `stop_words` are dropped. A non-empty term whose bytes are all
// alphanumeric (per std::isalnum) is lower-cased in place inside `*input_ptr`.
// Every surviving term is appended to `*output_ptr` as a view into the corresponding
// string of `*input_ptr`, so the views are only valid while those strings are alive
// and unmodified. `*output_ptr` is cleared first.
//
// NOTE(review): the stop-word lookup uses the term as produced by the tokenizer, i.e.
// before lower-casing - confirm that case-sensitive stop-word matching is intended.
void JiebaTokenizer::Normalize(const std::unordered_set<std::string>& stop_words,
                               std::vector<std::string>* input_ptr,
                               std::vector<std::string_view>* output_ptr) {
    const auto is_alnum_byte = [](char c) {
        return std::isalnum(static_cast<unsigned char>(c)) != 0;
    };
    const auto to_lower_byte = [](char c) {
        return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    };

    std::vector<std::string>& words = *input_ptr;
    std::vector<std::string_view>& kept = *output_ptr;
    kept.clear();
    kept.reserve(words.size());

    for (std::string& word : words) {
        // remove stop words
        if (stop_words.count(word) > 0) {
            continue;
        }

        // to lower case
        if (!word.empty() && std::all_of(word.begin(), word.end(), is_alnum_byte)) {
            std::transform(word.begin(), word.end(), word.begin(), to_lower_byte);
        }
        kept.emplace_back(word.data(), word.length());
    }
}
114+
115+
void JiebaTokenizer::reset() {
116+
Lucene::Tokenizer::reset();
117+
InnerReset();
118+
}
119+
120+
// Hands the new `input` reader to the base tokenizer, then reads and tokenizes it.
void JiebaTokenizer::reset(const Lucene::ReaderPtr& input) {
    Lucene::Tokenizer::reset(input);
    this->InnerReset();
}
124+
125+
void JiebaTokenizer::InnerReset() {
126+
terms_.clear();
127+
normalized_terms_.clear();
128+
term_index_ = 0;
129+
130+
// read wchar from input
131+
Lucene::String wstr;
132+
wstr.reserve(context_.buffer_size);
133+
while (true) {
134+
int32_t length = input->read(buffer_, /*offset=*/0, context_.buffer_size);
135+
if (length <= 0) {
136+
break;
137+
}
138+
wstr.append(buffer_, length);
139+
}
140+
141+
// jieba tokenize
142+
std::string doc_str = LuceneUtils::WstringToString(wstr);
143+
// TODO(xinyu.lxy): support porter2 stemmer
144+
CutWithMode(context_.tokenize_mode, context_.jieba.get(), doc_str, &terms_);
145+
Normalize(context_.jieba->extractor.GetStopWords(), &terms_, &normalized_terms_);
146+
}
147+
148+
} // namespace paimon::lucene
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/*
2+
* Copyright 2026-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
#include "cppjieba/Jieba.hpp"
18+
#include "lucene++/LuceneHeaders.h"
19+
#include "lucene++/MiscUtils.h"
20+
#include "lucene++/PositionIncrementAttribute.h"
21+
#include "lucene++/TermAttribute.h"
22+
#include "paimon/global_index/lucene/lucene_utils.h"
23+
#include "paimon/memory/memory_pool.h"
24+
namespace paimon::lucene {
25+
struct JiebaTokenizerContext {
26+
JiebaTokenizerContext(const std::string& _tokenize_mode, bool _with_position,
27+
const std::shared_ptr<cppjieba::Jieba> _jieba,
28+
const std::shared_ptr<MemoryPool>& _pool,
29+
int32_t _buffer_size = kReadBufferSize);
30+
31+
std::shared_ptr<MemoryPool> pool;
32+
std::string tokenize_mode;
33+
bool with_position;
34+
int32_t buffer_size;
35+
std::shared_ptr<cppjieba::Jieba> jieba;
36+
37+
static inline const int32_t kReadBufferSize = 5 * 1024 * 1024;
38+
static inline const int32_t kMaxWordLen = 1024;
39+
};
40+
41+
class JiebaTokenizer : public Lucene::Tokenizer {
42+
public:
43+
JiebaTokenizer(const JiebaTokenizerContext& context, const Lucene::ReaderPtr& input);
44+
45+
~JiebaTokenizer() override;
46+
47+
bool incrementToken() override;
48+
49+
void reset(const Lucene::ReaderPtr& input) override;
50+
51+
void reset() override;
52+
53+
static void CutWithMode(const std::string& tokenize_mode, const cppjieba::Jieba* jieba,
54+
const std::string& str, std::vector<std::string>* terms_ptr);
55+
56+
// In-place converts each string in `input` to lowercase to avoid data copying.
57+
static void Normalize(const std::unordered_set<std::string>& stop_words,
58+
std::vector<std::string>* input, std::vector<std::string_view>* output);
59+
60+
private:
61+
void InnerReset();
62+
63+
private:
64+
JiebaTokenizerContext context_;
65+
size_t term_index_ = 0;
66+
std::vector<std::string> terms_;
67+
std::vector<std::string_view> normalized_terms_;
68+
wchar_t* buffer_;
69+
Lucene::TermAttributePtr term_att_;
70+
Lucene::PositionIncrementAttributePtr pos_att_;
71+
};
72+
73+
class JiebaAnalyzer : public Lucene::Analyzer {
74+
public:
75+
explicit JiebaAnalyzer(const JiebaTokenizerContext& context) : context_(context) {}
76+
77+
~JiebaAnalyzer() override = default;
78+
79+
Lucene::TokenStreamPtr tokenStream(const Lucene::String& field_name,
80+
const Lucene::ReaderPtr& reader) override {
81+
return Lucene::newLucene<JiebaTokenizer>(context_, reader);
82+
}
83+
84+
private:
85+
JiebaTokenizerContext context_;
86+
};
87+
} // namespace paimon::lucene

0 commit comments

Comments
 (0)