Add session variable ai_context_window_size

linrrzqqq · linrrzqqq · commit 9b321e321e24 · 2026-04-16T16:00:51.000+08:00
diff --git a/be/src/exprs/aggregate/aggregate_function_ai_agg.h b/be/src/exprs/aggregate/aggregate_function_ai_agg.h
@@ -37,11 +37,6 @@ class AggregateFunctionAIAggData {
     static constexpr const char* SEPARATOR = "\n";
     static constexpr uint8_t SEPARATOR_SIZE = sizeof(*SEPARATOR);
 
-    // 128K tokens is a relatively small context limit among mainstream AIs.
-    // currently, token count is conservatively approximated by size; this is a safe lower bound.
-    // a more efficient and accurate token calculation method may be introduced.
-    static constexpr size_t MAX_CONTEXT_SIZE = 128 * 1024;
-
     ColumnString::Chars data;
     bool inited = false;
 
@@ -196,14 +191,22 @@ class AggregateFunctionAIAggData {
 
     // handle overflow situations when adding content.
     bool handle_overflow(size_t additional_size) {
-        if (additional_size + data.size() <= MAX_CONTEXT_SIZE) {
+        const size_t max_context_size = get_ai_context_window_size(); // shared by both checks
+        if (additional_size + data.size() <= max_context_size) {
             return false;
         }
 
         process_current_context();
 
         // check if there is still an overflow after replacement.
-        return (additional_size + data.size() > MAX_CONTEXT_SIZE);
+        return (additional_size + data.size() > max_context_size);
+    }
+
+    static size_t get_ai_context_window_size() { // limit read from the query options
+        DORIS_CHECK(_ctx); // _ctx is dereferenced below
+        // Non-positive option values fall back to 128 * 1024.
+        int64_t context_window_size = _ctx->query_options().ai_context_window_size;
+        return static_cast<size_t>(context_window_size > 0 ? context_window_size : 128 * 1024);
     }
 
     void append_data(const void* source, size_t size) {
diff --git a/be/src/exprs/function/ai/ai_adapter.h b/be/src/exprs/function/ai/ai_adapter.h
@@ -180,7 +180,8 @@ class AIAdapter {
             --end;
         }
 
-        if (begin < end && text[begin] == '[' && text[end - 1] == ']') {
+        if (begin < end && text[begin] == '[' && text[end - 1] == ']' && end - begin >= 4 &&
+            (text[begin + 1] == '"' || text[begin + 1] == '\'')) {
             rapidjson::Document doc;
             doc.Parse(text.data() + begin, end - begin);
             if (doc.HasParseError()) {
@@ -217,6 +218,50 @@ class AIAdapter {
             doc.AddMember(name, _config.dimensions, allocator);
         }
     }
+
+    // Shared provider checks: InvalidArgument on empty urls, size mismatch, or unsupported type.
+    Status validate_multimodal_embedding_inputs(
+            std::string_view provider_name, const std::vector<MultimodalType>& media_types,
+            const std::vector<std::string>& media_urls,
+            std::initializer_list<MultimodalType> supported_types) const {
+        if (media_urls.empty()) {
+            return Status::InvalidArgument("{} multimodal embed inputs can not be empty",
+                                           provider_name);
+        }
+        if (media_types.size() != media_urls.size()) { // one media type is expected per url
+            return Status::InvalidArgument(
+                    "{} multimodal embed input size mismatch, media_types={}, media_urls={}",
+                    provider_name, media_types.size(), media_urls.size());
+        }
+        for (MultimodalType media_type : media_types) {
+            bool supported = false; // set when media_type appears in supported_types
+            for (MultimodalType supported_type : supported_types) {
+                if (media_type == supported_type) {
+                    supported = true;
+                    break;
+                }
+            }
+            if (!supported) [[unlikely]] { // fail on the first unsupported type
+                return Status::InvalidArgument(
+                        "{} only supports {} multimodal embed, got {}", provider_name,
+                        supported_multimodal_types_to_string(supported_types),
+                        multimodal_type_to_string(media_type));
+            }
+        }
+        return Status::OK(); // every check passed
+    }
+
+    static std::string supported_multimodal_types_to_string(
+            std::initializer_list<MultimodalType> supported_types) {
+        std::string result; // slash-separated type names, in the order given
+        for (MultimodalType type : supported_types) {
+            if (!result.empty()) { // separator goes only between entries
+                result += "/";
+            }
+            result += multimodal_type_to_string(type);
+        }
+        return result; // empty when supported_types is empty
+    }
 };
 
 // Most LLM-providers' Embedding formats are based on VoyageAI.
@@ -265,22 +310,9 @@ class VoyageAIAdapter : public AIAdapter {
     Status build_multimodal_embedding_request(const std::vector<MultimodalType>& media_types,
                                               const std::vector<std::string>& media_urls,
                                               std::string& request_body) const override {
-        if (media_urls.empty()) {
-            return Status::InvalidArgument("VoyageAI multimodal embed inputs can not be empty");
-        }
-        if (media_types.size() != media_urls.size()) {
-            return Status::InvalidArgument(
-                    "VoyageAI multimodal embed input size mismatch, media_types={}, media_urls={}",
-                    media_types.size(), media_urls.size());
-        }
-        for (MultimodalType media_type : media_types) {
-            if (media_type != MultimodalType::IMAGE && media_type != MultimodalType::VIDEO)
-                    [[unlikely]] {
-                return Status::InvalidArgument(
-                        "VoyageAI only supports image/video multimodal embed, got {}",
-                        multimodal_type_to_string(media_type));
-            }
-        }
+        RETURN_IF_ERROR(validate_multimodal_embedding_inputs(
+                "VoyageAI", media_types, media_urls,
+                {MultimodalType::IMAGE, MultimodalType::VIDEO}));
         if (_config.dimensions != -1) {
             LOG(WARNING) << "VoyageAI multimodal embedding currently ignores dimensions parameter, "
                          << "model=" << _config.model_name << ", dimensions=" << _config.dimensions;
@@ -937,21 +969,8 @@ class QwenAdapter : public OpenAIAdapter {
     Status build_multimodal_embedding_request(const std::vector<MultimodalType>& media_types,
                                               const std::vector<std::string>& media_urls,
                                               std::string& request_body) const override {
-        if (media_urls.empty()) {
-            return Status::InvalidArgument("QWEN multimodal embed inputs can not be empty");
-        }
-        if (media_types.size() != media_urls.size()) {
-            return Status::InvalidArgument(
-                    "QWEN multimodal embed input size mismatch, media_types={}, media_urls={}",
-                    media_types.size(), media_urls.size());
-        }
-        for (MultimodalType media_type : media_types) {
-            if (media_type != MultimodalType::IMAGE && media_type != MultimodalType::VIDEO) {
-                return Status::InvalidArgument(
-                        "QWEN only supports image/video multimodal embed, got {}",
-                        multimodal_type_to_string(media_type));
-            }
-        }
+        RETURN_IF_ERROR(validate_multimodal_embedding_inputs(
+                "QWEN", media_types, media_urls, {MultimodalType::IMAGE, MultimodalType::VIDEO}));
 
         rapidjson::Document doc;
         doc.SetObject();
@@ -1058,22 +1077,8 @@ class JinaAdapter : public VoyageAIAdapter {
     Status build_multimodal_embedding_request(const std::vector<MultimodalType>& media_types,
                                               const std::vector<std::string>& media_urls,
                                               std::string& request_body) const override {
-        if (media_urls.empty()) {
-            return Status::InvalidArgument("JINA multimodal embed inputs can not be empty");
-        }
-        if (media_types.size() != media_urls.size()) {
-            return Status::InvalidArgument(
-                    "JINA multimodal embed input size mismatch, media_types={}, media_urls={}",
-                    media_types.size(), media_urls.size());
-        }
-        for (MultimodalType media_type : media_types) {
-            if (media_type != MultimodalType::IMAGE && media_type != MultimodalType::VIDEO)
-                    [[unlikely]] {
-                return Status::InvalidArgument(
-                        "JINA only supports image/video multimodal embed, got {}",
-                        multimodal_type_to_string(media_type));
-            }
-        }
+        RETURN_IF_ERROR(validate_multimodal_embedding_inputs(
+                "JINA", media_types, media_urls, {MultimodalType::IMAGE, MultimodalType::VIDEO}));
 
         rapidjson::Document doc;
         doc.SetObject();
@@ -1318,14 +1323,9 @@ class GeminiAdapter : public AIAdapter {
     Status build_multimodal_embedding_request(const std::vector<MultimodalType>& media_types,
                                               const std::vector<std::string>& media_urls,
                                               std::string& request_body) const override {
-        if (media_urls.empty()) {
-            return Status::InvalidArgument("Gemini multimodal embed inputs can not be empty");
-        }
-        if (media_types.size() != media_urls.size()) {
-            return Status::InvalidArgument(
-                    "Gemini multimodal embed input size mismatch, media_types={}, media_urls={}",
-                    media_types.size(), media_urls.size());
-        }
+        RETURN_IF_ERROR(validate_multimodal_embedding_inputs(
+                "Gemini", media_types, media_urls,
+                {MultimodalType::IMAGE, MultimodalType::AUDIO, MultimodalType::VIDEO}));
 
         rapidjson::Document doc;
         doc.SetObject();
diff --git a/be/src/exprs/function/ai/ai_functions.h b/be/src/exprs/function/ai/ai_functions.h
@@ -54,8 +54,6 @@ namespace doris {
 template <typename Derived>
 class AIFunction : public IFunction {
 public:
-    static constexpr size_t max_batch_prompt_size = 128 * 1024;
-
     std::string get_name() const override { return assert_cast<const Derived&>(*this).name; }
 
     // If the user doesn't provide the first arg, `resource_name`
@@ -90,6 +88,17 @@ class AIFunction : public IFunction {
     }
 
 protected:
+    // Reads the shared AI context window size from query options. String AI batch functions and
+    // ai_agg both use the same byte-based session variable so batching behavior stays consistent.
+    static int64_t get_ai_context_window_size(FunctionContext* context) {
+        DORIS_CHECK(context != nullptr); // NOTE(review): state() is not checked; confirm non-null
+        QueryContext* query_ctx = context->state()->get_query_ctx();
+        DORIS_CHECK(query_ctx != nullptr);
+        // Non-positive option values fall back to 128 * 1024.
+        int64_t context_window_size = query_ctx->query_options().ai_context_window_size;
+        return context_window_size > 0 ? context_window_size : 128 * 1024;
+    }
+
     // Derived classes can override this method for non-text/default behavior.
     // The base implementation handles all string-input/string-output batchable functions.
     Status execute_with_adapter(FunctionContext* context, Block& block,
@@ -117,10 +126,18 @@ class AIFunction : public IFunction {
     }
 
     static void normalize_endpoint(TAIResource& config) {
-        // If users configure only the version root like `.../v1` or `.../v1beta`, append
-        // `models/<model>:batchEmbedContents` for `embed`, and `models/<model>:generateContent`
-        // for other AI scalar functions. If the endpoint is already a full method path, keep it.
+        // 1. If users configure only the version root like `.../v1` or `.../v1beta`, append
+        //    `models/<model>:batchEmbedContents` for `embed`, and `models/<model>:generateContent`
+        //    for other AI scalar functions.
+        // 2. `:embedContent` -> `:batchEmbedContents`
         if (iequal(config.provider_type, "GEMINI")) {
+            if (iequal(Derived::name, "embed") && config.endpoint.ends_with(":embedContent")) {
+                static constexpr std::string_view legacy_suffix = ":embedContent";
+                config.endpoint.replace(config.endpoint.size() - legacy_suffix.size(),
+                                        legacy_suffix.size(), ":batchEmbedContents");
+                return;
+            }
+
             if (!config.endpoint.ends_with("v1") && !config.endpoint.ends_with("v1beta")) {
                 return;
             }
@@ -270,6 +287,8 @@ class AIFunction : public IFunction {
                                    IColumn& col_result) const {
         std::vector<std::string> batch_prompts;
         size_t current_batch_size = 2; // []
+        const size_t max_batch_prompt_size =
+                static_cast<size_t>(get_ai_context_window_size(context));
 
         for (size_t i = 0; i < input_rows_count; ++i) {
             std::string prompt;
diff --git a/be/src/exprs/function/ai/embed.h b/be/src/exprs/function/ai/embed.h
@@ -87,14 +87,16 @@ class FunctionEmbed : public AIFunction<FunctionEmbed> {
         std::vector<std::string> batch_prompts;
         size_t current_batch_size = 0;
         const int32_t max_batch_size = _get_embed_max_batch_size(context);
+        const size_t max_context_window_size =
+                static_cast<size_t>(get_ai_context_window_size(context));
 
         for (size_t i = 0; i < input_rows_count; ++i) {
             std::string prompt;
             RETURN_IF_ERROR(build_prompt(block, arguments, i, prompt));
 
             const size_t prompt_size = prompt.size();
 
-            if (prompt_size > max_batch_prompt_size) {
+            if (prompt_size > max_context_window_size) {
                 // flush history batch
                 RETURN_IF_ERROR(_flush_text_embedding_batch(batch_prompts, *col_result, config,
                                                             adapter, context));
@@ -107,7 +109,7 @@ class FunctionEmbed : public AIFunction<FunctionEmbed> {
             }
 
             if (!batch_prompts.empty() &&
-                (current_batch_size + prompt_size > max_batch_prompt_size ||
+                (current_batch_size + prompt_size > max_context_window_size ||
                  batch_prompts.size() >= static_cast<size_t>(max_batch_size))) {
                 RETURN_IF_ERROR(_flush_text_embedding_batch(batch_prompts, *col_result, config,
                                                             adapter, context));
diff --git a/be/test/ai/aggregate_function_ai_agg_test.cpp b/be/test/ai/aggregate_function_ai_agg_test.cpp
@@ -389,6 +389,41 @@ TEST_F(AggregateFunctionAIAggTest, add_batch_single_place_multiple_calls_test) {
     _agg_function->destroy(place);
 }
 
+TEST_F(AggregateFunctionAIAggTest, ai_context_window_size_session_variable_test) {
+    TQueryOptions query_options = create_fake_query_options();
+    query_options.__set_ai_context_window_size(8); // tiny window; presumably makes row 2 overflow
+    auto query_ctx = MockQueryContext::create(TUniqueId(), ExecEnv::GetInstance(), query_options);
+    query_ctx->set_mock_ai_resource();
+    _query_ctx = query_ctx;
+    _agg_function->set_query_context(query_ctx.get());
+
+    auto resource_col = ColumnString::create();
+    auto text_col = ColumnString::create();
+    auto task_col = ColumnString::create();
+
+    resource_col->insert_data("mock_resource", 13); // row 0
+    text_col->insert_data("abcd", 4);
+    task_col->insert_data("summarize", 9);
+
+    resource_col->insert_data("mock_resource", 13); // row 1
+    text_col->insert_data("efgh", 4);
+    task_col->insert_data("summarize", 9);
+
+    std::unique_ptr<char[]> memory(new char[_agg_function->size_of_data()]);
+    AggregateDataPtr place = memory.get();
+    _agg_function->create(place);
+
+    const IColumn* columns[3] = {resource_col.get(), text_col.get(), task_col.get()};
+    _agg_function->add(place, columns, 0, _arena);
+    _agg_function->add(place, columns, 1, _arena);
+
+    const auto& data = *reinterpret_cast<const AggregateFunctionAIAggData*>(place);
+    std::string actual(reinterpret_cast<const char*>(data.data.data()), data.data.size());
+    EXPECT_EQ(actual, "this is a mock response\nefgh"); // expects abcd replaced by the mock reply
+
+    _agg_function->destroy(place);
+}
+
 TEST_F(AggregateFunctionAIAggTest, mock_resource_send_request_test) {
     std::vector<std::string> resources = {"mock_resource"};
     std::vector<std::string> texts = {"test input"};
diff --git a/be/test/ai/ai_adapter_test.cpp b/be/test/ai/ai_adapter_test.cpp
@@ -391,6 +391,23 @@ TEST(AI_ADAPTER_TEST, openai_adapter_responses_parse_response) {
     ASSERT_EQ(results[0], "openai response result");
 }
 
+TEST(AI_ADAPTER_TEST, openai_adapter_parse_response_keeps_mask_literals) {
+    OpenAIAdapter adapter; // default-constructed; only parse_response is exercised
+    std::string resp = R"({"choices":[{"message":{"content":"[MSKED]"}}]})";
+    std::vector<std::string> results;
+    Status st = adapter.parse_response(resp, results);
+    ASSERT_TRUE(st.ok()) << st.to_string();
+    ASSERT_EQ(results.size(), 1);
+    ASSERT_EQ(results[0], "[MSKED]"); // bracketed plain text must come back unchanged
+
+    resp = R"({"choices":[{"message":{"content":"[MASK]"}}]})";
+    results.clear(); // reuse the output vector for the second case
+    st = adapter.parse_response(resp, results);
+    ASSERT_TRUE(st.ok()) << st.to_string();
+    ASSERT_EQ(results.size(), 1);
+    ASSERT_EQ(results[0], "[MASK]");
+}
+
 TEST(AI_ADAPTER_TEST, gemini_adapter_request) {
     GeminiAdapter adapter;
     TAIResource config;
diff --git a/be/test/ai/ai_function_test.cpp b/be/test/ai/ai_function_test.cpp
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift