apache
diff --git a/‎CMakeLists.txt‎
Lines changed: 16 additions & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 16 additions & 1 deletion
diff --git a/‎cmake_modules/ThirdpartyToolchain.cmake‎
Lines changed: 39 additions & 2 deletions b/‎cmake_modules/ThirdpartyToolchain.cmake‎
Lines changed: 39 additions & 2 deletions
diff --git a/‎cmake_modules/jieba.diff‎
Lines changed: 16 additions & 0 deletions b/‎cmake_modules/jieba.diff‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎src/paimon/global_index/lucene/CMakeLists.txt‎
Lines changed: 18 additions & 2 deletions b/‎src/paimon/global_index/lucene/CMakeLists.txt‎
Lines changed: 18 additions & 2 deletions
diff --git a/‎src/paimon/global_index/lucene/jieba_analyzer.cpp‎
Lines changed: 148 additions & 0 deletions b/‎src/paimon/global_index/lucene/jieba_analyzer.cpp‎
Lines changed: 148 additions & 0 deletions
diff --git a/‎src/paimon/global_index/lucene/jieba_analyzer.h‎
Lines changed: 87 additions & 0 deletions b/‎src/paimon/global_index/lucene/jieba_analyzer.h‎
Lines changed: 87 additions & 0 deletions
@@ -290,6 +290,22 @@ if(PAIMON_ENABLE_LUMINA)
             DESTINATION ${CMAKE_INSTALL_LIBDIR})
 endif()
 
+if(PAIMON_ENABLE_LUCENE)
+    # Install the jieba dictionary files staged under JIEBA_DICT_DIR. The
+    # trailing slash on the source directory installs its contents directly
+    # into the destination instead of creating a nested "dict" directory.
+    set(PAIMON_DICT_DEST "share/paimon/dict")
+
+    # FILES_MATCHING restricts the installed files to the names listed below.
+    # NOTE(review): "pos_dict" only matches an entry with exactly that name;
+    # confirm whether files inside a pos_dict directory are expected to be
+    # installed as well.
+    install(DIRECTORY "${JIEBA_DICT_DIR}/"
+            DESTINATION "${PAIMON_DICT_DEST}"
+            FILES_MATCHING
+            PATTERN "jieba.dict.utf8"
+            PATTERN "hmm_model.utf8"
+            PATTERN "idf.utf8"
+            PATTERN "stop_words.utf8"
+            PATTERN "user.dict.utf8"
+            PATTERN "pos_dict"
+            PATTERN ".git*" EXCLUDE
+            PATTERN "*.md" EXCLUDE)
+endif()
+
+
 install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/
         DESTINATION "include"
         FILES_MATCHING
@@ -389,7 +405,6 @@ if(PAIMON_BUILD_TESTS)
         list(APPEND TEST_STATIC_LINK_LIBS paimon_lucene_index_shared)
         list(APPEND TEST_STATIC_LINK_LIBS "-Wl,--as-needed")
     endif()
-
 endif()
 
 include(CMakePackageConfigHelpers)
 
@@ -322,8 +322,7 @@ macro(build_lucene)
     set(LUCENE_INCLUDE_DIR "${LUCENE_PREFIX}/include")
     # The include directory must exist before it is referenced by a target.
     file(MAKE_DIRECTORY "${LUCENE_INCLUDE_DIR}")
-    include_directories(SYSTEM ${LUCENE_INCLUDE_DIR} ${BOOST_INCLUDE_DIR}
-                        ${BOOST_EXTRA_INCLUDE_DIR})
+    include_directories(SYSTEM ${LUCENE_INCLUDE_DIR} ${BOOST_INCLUDE_DIR})
     add_library(lucene INTERFACE IMPORTED)
     target_include_directories(lucene SYSTEM INTERFACE "${LUCENE_INCLUDE_DIR}")
     target_compile_options(lucene INTERFACE -pthread)
@@ -343,6 +342,43 @@ macro(build_lucene)
     add_dependencies(lucene lucene_ep)
 endmacro()
 
+# Stages cppjieba's headers and dictionaries via ExternalProject and exposes
+# them through the INTERFACE IMPORTED target `jieba`.
+#
+# Because this is a macro, the following variables are set in the caller's scope:
+#   JIEBA_PREFIX       default ExternalProject prefix directory of jieba_ep
+#   JIEBA_INSTALL      staging root that headers and dictionaries are copied into
+#   JIEBA_INCLUDE_DIR  ${JIEBA_INSTALL}/include (cppjieba and limonp headers)
+#   JIEBA_DICT_DIR     ${JIEBA_INSTALL}/dict (dictionary files)
+macro(build_jieba)
+    message(STATUS "Building jieba from source")
+    set(JIEBA_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/jieba_ep-prefix")
+    set(JIEBA_INSTALL "${CMAKE_CURRENT_BINARY_DIR}/jieba_ep-install")
+    set(JIEBA_INCLUDE_DIR "${JIEBA_INSTALL}/include")
+    set(JIEBA_DICT_DIR "${JIEBA_INSTALL}/dict")
+    # The include directories must exist before they are referenced by a target.
+    file(MAKE_DIRECTORY "${JIEBA_INCLUDE_DIR}")
+    file(MAKE_DIRECTORY "${JIEBA_DICT_DIR}")
+
+    set(JIEBA_CMAKE_ARGS
+        ${EP_COMMON_CMAKE_ARGS} "-DENABLE_TEST=OFF" "-DCPPJIEBA_TOP_LEVEL_PROJECT=OFF"
+        "-DCMAKE_INSTALL_PREFIX=${JIEBA_INSTALL}")
+
+    set(PATCH_FILE "${CMAKE_CURRENT_LIST_DIR}/jieba.diff")
+    # PATCH_COMMAND: the `.patched` marker file makes the patch step a no-op on
+    # re-runs, so an already patched checkout is not patched twice.
+    # INSTALL_COMMAND: copies headers, dictionaries and the bundled limonp
+    # headers into the staging tree. <SOURCE_DIR> is expanded by ExternalProject
+    # to the actual checkout, so the copy does not depend on the prefix layout.
+    externalproject_add(jieba_ep
+                        ${EP_COMMON_OPTIONS}
+                        GIT_REPOSITORY https://github.com/yanyiwu/cppjieba.git
+                        GIT_TAG ${PAIMON_JIEBA_BUILD_VERSION}
+                        GIT_SHALLOW FALSE
+                        GIT_PROGRESS TRUE
+                        GIT_SUBMODULES_RECURSE TRUE
+                        CMAKE_ARGS ${JIEBA_CMAKE_ARGS}
+                        LOG_PATCH ON
+                        PATCH_COMMAND ${CMAKE_COMMAND} -E chdir <SOURCE_DIR> bash -c
+                                      "[ -f .patched ] && echo '<SOURCE_DIR> patch already applied, ignore...' || patch -s -N -p1 -i '${PATCH_FILE}' && touch .patched"
+                        INSTALL_COMMAND bash -c
+                                        "cp -r <SOURCE_DIR>/include/* ${JIEBA_INSTALL}/include/ && cp -r <SOURCE_DIR>/dict/* ${JIEBA_INSTALL}/dict/ && cp -r <SOURCE_DIR>/deps/limonp/include/* ${JIEBA_INSTALL}/include/"
+    )
+
+    include_directories(SYSTEM ${JIEBA_INCLUDE_DIR} ${JIEBA_DICT_DIR})
+    add_library(jieba INTERFACE IMPORTED)
+    # Each directory must be its own argument: quoting both in one string would
+    # register a single, non-existent path "<include dir> <dict dir>".
+    target_include_directories(jieba SYSTEM INTERFACE "${JIEBA_INCLUDE_DIR}"
+                                                      "${JIEBA_DICT_DIR}")
+    # Ensure the headers are staged before anything depending on `jieba` builds.
+    add_dependencies(jieba jieba_ep)
+endmacro()
+
+
 macro(build_rapidjson)
     message(STATUS "Building RapidJSON from source")
     set(RAPIDJSON_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/rapidjson_ep-install")
@@ -1272,4 +1308,5 @@ endif()
 if(PAIMON_ENABLE_LUCENE)
+    # Third-party pieces needed when the Lucene global index is enabled:
+    # Boost, Lucene++ and the jieba tokenizer.
     build_boost()
     build_lucene()
+    build_jieba()
 endif()
@@ -0,0 +1,16 @@
+diff --git a/include/cppjieba/KeywordExtractor.hpp b/include/cppjieba/KeywordExtractor.hpp
+index 24b2c40..c7c6a94 100644
+--- a/include/cppjieba/KeywordExtractor.hpp
++++ b/include/cppjieba/KeywordExtractor.hpp
+@@ -89,6 +89,11 @@ class KeywordExtractor {
+     std::partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
+     keywords.resize(topN);
+   }
++
++  const std::unordered_set<std::string>& GetStopWords() const {
++    return stopWords_;
++  }
++
+  private:
+   void LoadIdfDict(const std::string& idfPath) {
+     std::ifstream ifs(idfPath.c_str());
@@ -13,17 +13,25 @@
 # limitations under the License.
 
 if(PAIMON_ENABLE_LUCENE)
-    set(PAIMON_LUCENE lucene_global_index.cpp lucene_directory.cpp
-                      lucene_global_index_factory.cpp)
+    # Sources passed to add_paimon_lib() for the paimon_lucene_index library.
+    set(PAIMON_LUCENE
+        lucene_global_index.cpp
+        lucene_directory.cpp
+        lucene_utils.cpp
+        jieba_analyzer.cpp
+        lucene_global_index_writer.cpp
+        lucene_global_index_reader.cpp
+        lucene_global_index_factory.cpp)
 
     add_paimon_lib(paimon_lucene_index
                    SOURCES
                    ${PAIMON_LUCENE}
                    EXTRA_INCLUDES
                    ${LUCENE_INCLUDE_DIR}
+                   ${JIEBA_INCLUDE_DIR}
                    DEPENDENCIES
                    paimon_shared
                    lucene
+                   jieba
                    STATIC_LINK_LIBS
                    lucene
                    arrow
@@ -34,11 +42,18 @@ if(PAIMON_ENABLE_LUCENE)
                    paimon_shared
                    SHARED_LINK_FLAGS
                    ${PAIMON_VERSION_SCRIPT_FLAGS})
+    # When tests are built, compile the library objects with
+    # JIEBA_TEST_DICT_DIR defined as a string literal holding JIEBA_DICT_DIR.
+    if(PAIMON_BUILD_TESTS)
+        target_compile_definitions(paimon_lucene_index_objlib
+                                   PRIVATE "JIEBA_TEST_DICT_DIR=\"${JIEBA_DICT_DIR}\"")
+    endif()
 
     if(PAIMON_BUILD_TESTS)
         add_paimon_test(lucene_index_test
                         SOURCES
+                        jieba_analyzer_test.cpp
                         lucene_api_test.cpp
+                        jieba_api_test.cpp
                         lucene_directory_test.cpp
                         lucene_global_index_test.cpp
                         lucene_filter_test.cpp
@@ -52,5 +67,6 @@ if(PAIMON_ENABLE_LUCENE)
                         paimon_lucene_index_static
                         "-Wl,--no-whole-archive"
                         ${GTEST_LINK_TOOLCHAIN})
+
     endif()
 endif()
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "paimon/global_index/lucene/jieba_analyzer.h"
+
+#include "paimon/global_index/lucene/lucene_utils.h"
+
+namespace paimon::lucene {
+JiebaTokenizerContext::JiebaTokenizerContext(const std::string& _tokenize_mode, bool _with_position,
+                                             const std::shared_ptr<cppjieba::Jieba> _jieba,
+                                             const std::shared_ptr<MemoryPool>& _pool,
+                                             int32_t _buffer_size)
+    : pool(_pool),
+      tokenize_mode(_tokenize_mode),
+      with_position(_with_position),
+      buffer_size(_buffer_size),
+      jieba(_jieba) {}
+
+// Copies the context, registers the term and position-increment attributes,
+// and allocates the wide-character read buffer from the context's memory pool.
+JiebaTokenizer::JiebaTokenizer(const JiebaTokenizerContext& context, const Lucene::ReaderPtr& input)
+    : Lucene::Tokenizer(input), context_(context) {
+    term_att_ = addAttribute<Lucene::TermAttribute>();
+    pos_att_ = addAttribute<Lucene::PositionIncrementAttribute>();
+
+    // buffer_size counts wchar_t elements; the pool works in bytes.
+    const auto buffer_bytes = context_.buffer_size * sizeof(wchar_t);
+    void* raw_buffer = context_.pool->Malloc(buffer_bytes, /*alignment=*/8);
+    buffer_ = static_cast<wchar_t*>(raw_buffer);
+}
+
+// Returns the read buffer to the memory pool, using the same byte size and
+// alignment it was allocated with.
+JiebaTokenizer::~JiebaTokenizer() {
+    if (buffer_ == nullptr) {
+        return;
+    }
+    const auto buffer_bytes = context_.buffer_size * sizeof(wchar_t);
+    context_.pool->Free(static_cast<void*>(buffer_), buffer_bytes, /*alignment=*/8);
+    buffer_ = nullptr;
+}
+
+// Emits the next normalized term produced by the last reset.
+// Returns false once every term has been consumed.
+bool JiebaTokenizer::incrementToken() {
+    const bool exhausted = term_index_ >= normalized_terms_.size();
+    if (exhausted) {
+        return false;
+    }
+
+    const auto& current = normalized_terms_[term_index_];
+    ++term_index_;
+
+    clearAttributes();
+    term_att_->setTermBuffer(LuceneUtils::StringToWstring(current));
+
+    // Each token advances the position by one when positions are enabled;
+    // otherwise the increment is zero.
+    pos_att_->setPositionIncrement(context_.with_position ? 1 : 0);
+    return true;
+}
+
+// Segments `str` with the cppjieba method selected by `tokenize_mode` and
+// writes the resulting terms into `*terms_ptr`.
+//
+// Supported modes: "mp", "hmm", "mix", "full", "query". Any other value throws
+// Lucene::IllegalArgumentException.
+void JiebaTokenizer::CutWithMode(const std::string& tokenize_mode, const cppjieba::Jieba* jieba,
+                                 const std::string& str, std::vector<std::string>* terms_ptr) {
+    std::vector<std::string>& out = *terms_ptr;
+
+    if (tokenize_mode == "mp") {
+        jieba->CutSmall(str, out, /*max_word_len=*/JiebaTokenizerContext::kMaxWordLen);
+        return;
+    }
+    if (tokenize_mode == "hmm") {
+        jieba->CutHMM(str, out);
+        return;
+    }
+    if (tokenize_mode == "mix") {
+        jieba->Cut(str, out, /*hmm=*/true);
+        return;
+    }
+    if (tokenize_mode == "full") {
+        jieba->CutAll(str, out);
+        return;
+    }
+    if (tokenize_mode == "query") {
+        jieba->CutForSearch(str, out, /*hmm=*/true);
+        return;
+    }
+
+    throw Lucene::IllegalArgumentException(
+        L"only support mp/hmm/mix/full/query in jieba tokenizer");
+}
+
+// Filters and normalizes the terms in `*input_ptr`, writing one view per kept
+// term into `*output_ptr` (which is cleared first).
+//
+// - Terms found in `stop_words` are dropped.
+// - Terms consisting solely of alphanumeric bytes are lowercased in place.
+//
+// The views in the output refer to the strings stored in the input vector, so
+// the input must stay alive and unmodified while the output is in use.
+//
+// NOTE(review): the stop-word lookup runs before lowercasing, so a capitalized
+// variant of a lowercase stop word is not filtered - confirm this is intended.
+void JiebaTokenizer::Normalize(const std::unordered_set<std::string>& stop_words,
+                               std::vector<std::string>* input_ptr,
+                               std::vector<std::string_view>* output_ptr) {
+    std::vector<std::string>& raw_terms = *input_ptr;
+    std::vector<std::string_view>& kept = *output_ptr;
+
+    kept.clear();
+    kept.reserve(raw_terms.size());
+
+    for (std::string& word : raw_terms) {
+        if (stop_words.count(word) != 0) {
+            continue;  // stop word: skip entirely
+        }
+
+        const bool only_alnum = std::all_of(word.begin(), word.end(), [](char ch) {
+            return std::isalnum(static_cast<unsigned char>(ch)) != 0;
+        });
+        if (!word.empty() && only_alnum) {
+            for (char& ch : word) {
+                ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
+            }
+        }
+
+        kept.emplace_back(word.data(), word.length());
+    }
+}
+
+// Resets the base tokenizer, then re-reads and re-tokenizes the current input
+// via InnerReset().
+void JiebaTokenizer::reset() {
+    Lucene::Tokenizer::reset();
+    InnerReset();
+}
+
+// Resets the base tokenizer with the reader `input`, then reads and tokenizes
+// the input via InnerReset().
+void JiebaTokenizer::reset(const Lucene::ReaderPtr& input) {
+    Lucene::Tokenizer::reset(input);
+    InnerReset();
+}
+
+// Drops all state from the previous document, reads the entire input into a
+// wide string, segments it with jieba and prepares the normalized term list
+// that incrementToken() iterates over.
+void JiebaTokenizer::InnerReset() {
+    term_index_ = 0;
+    normalized_terms_.clear();
+    terms_.clear();
+
+    // Read the input chunk by chunk until the reader reports no more data.
+    Lucene::String text;
+    text.reserve(context_.buffer_size);
+    for (int32_t chunk_len = input->read(buffer_, /*offset=*/0, context_.buffer_size);
+         chunk_len > 0;
+         chunk_len = input->read(buffer_, /*offset=*/0, context_.buffer_size)) {
+        text.append(buffer_, chunk_len);
+    }
+
+    // Convert the wide text with LuceneUtils::WstringToString and segment it.
+    const std::string utf8_doc = LuceneUtils::WstringToString(text);
+    // TODO(xinyu.lxy): support porter2 stemmer
+    CutWithMode(context_.tokenize_mode, context_.jieba.get(), utf8_doc, &terms_);
+
+    // Stop words come from the jieba keyword extractor's stop-word set.
+    const auto& stop_words = context_.jieba->extractor.GetStopWords();
+    Normalize(stop_words, &terms_, &normalized_terms_);
+}
+
+}  // namespace paimon::lucene
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "cppjieba/Jieba.hpp"
+#include "lucene++/LuceneHeaders.h"
+#include "lucene++/MiscUtils.h"
+#include "lucene++/PositionIncrementAttribute.h"
+#include "lucene++/TermAttribute.h"
+#include "paimon/global_index/lucene/lucene_utils.h"
+#include "paimon/memory/memory_pool.h"
+namespace paimon::lucene {
+// Settings shared by JiebaAnalyzer and the JiebaTokenizer instances it creates.
+struct JiebaTokenizerContext {
+    // Default read-buffer size, counted in wchar_t elements.
+    static constexpr int32_t kReadBufferSize = 5 * 1024 * 1024;
+    // Maximum word length passed to cppjieba's CutSmall in "mp" mode.
+    static constexpr int32_t kMaxWordLen = 1024;
+
+    // _tokenize_mode: one of "mp", "hmm", "mix", "full", "query".
+    // _with_position: whether emitted tokens advance the position.
+    // _jieba:         shared cppjieba instance used for segmentation.
+    // _pool:          memory pool the tokenizer's read buffer is allocated from.
+    // _buffer_size:   read-buffer size in wchar_t elements.
+    JiebaTokenizerContext(const std::string& _tokenize_mode, bool _with_position,
+                          const std::shared_ptr<cppjieba::Jieba> _jieba,
+                          const std::shared_ptr<MemoryPool>& _pool,
+                          int32_t _buffer_size = kReadBufferSize);
+
+    std::shared_ptr<MemoryPool> pool;
+    std::string tokenize_mode;
+    bool with_position;
+    int32_t buffer_size;
+    std::shared_ptr<cppjieba::Jieba> jieba;
+};
+
+// Lucene tokenizer that reads the whole input on reset, segments it with
+// cppjieba and then emits the normalized terms one by one.
+class JiebaTokenizer : public Lucene::Tokenizer {
+ public:
+    // Allocates the read buffer from `context.pool`; released in the destructor.
+    JiebaTokenizer(const JiebaTokenizerContext& context, const Lucene::ReaderPtr& input);
+    ~JiebaTokenizer() override;
+
+    // Emits the next term; returns false when no terms are left.
+    bool incrementToken() override;
+
+    // Both overloads reset the base tokenizer and re-tokenize the input.
+    void reset(const Lucene::ReaderPtr& input) override;
+    void reset() override;
+
+    // Segments `str` according to `tokenize_mode` ("mp", "hmm", "mix", "full"
+    // or "query") and stores the terms in `*terms_ptr`. Throws
+    // Lucene::IllegalArgumentException for any other mode.
+    static void CutWithMode(const std::string& tokenize_mode, const cppjieba::Jieba* jieba,
+                            const std::string& str, std::vector<std::string>* terms_ptr);
+
+    // Removes stop words and lowercases purely alphanumeric terms. Lowercasing
+    // happens in place inside `input` to avoid copies; `output` holds views
+    // into those strings, so `input` must outlive `output`.
+    static void Normalize(const std::unordered_set<std::string>& stop_words,
+                          std::vector<std::string>* input, std::vector<std::string_view>* output);
+
+ private:
+    // Shared implementation of both reset() overloads.
+    void InnerReset();
+
+    JiebaTokenizerContext context_;
+    // Index of the next entry of normalized_terms_ to emit.
+    size_t term_index_ = 0;
+    // Raw terms produced by jieba; owns the storage normalized_terms_ views.
+    std::vector<std::string> terms_;
+    std::vector<std::string_view> normalized_terms_;
+    // Read buffer of context_.buffer_size wchar_t elements.
+    wchar_t* buffer_;
+    Lucene::TermAttributePtr term_att_;
+    Lucene::PositionIncrementAttributePtr pos_att_;
+};
+
+// Lucene analyzer whose token streams are JiebaTokenizer instances configured
+// with a copy of the context given at construction.
+class JiebaAnalyzer : public Lucene::Analyzer {
+ public:
+    explicit JiebaAnalyzer(const JiebaTokenizerContext& context) : context_(context) {}
+    ~JiebaAnalyzer() override = default;
+
+    // Creates a new JiebaTokenizer over `reader`; `field_name` is not used.
+    Lucene::TokenStreamPtr tokenStream(const Lucene::String& field_name,
+                                       const Lucene::ReaderPtr& reader) override {
+        auto tokenizer = Lucene::newLucene<JiebaTokenizer>(context_, reader);
+        return tokenizer;
+    }
+
+ private:
+    JiebaTokenizerContext context_;
+};
+}  // namespace paimon::lucene