Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions tests/test-chat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5503,6 +5503,71 @@ static void test_developer_role_to_system_workaround() {
}
}

static void test_reasoning_budget_tokens_per_request() {
LOG_DBG("%s\n", __func__);
// Use Qwen3 template which has <think>...</think> reasoning markers.
// The autoparser detects them and sets thinking_start/end_tag, which enables
// the reasoning-budget code path in oaicompat_chat_params_parse.
auto tmpls = read_templates("models/templates/Qwen-Qwen3-0.6B.jinja");

server_chat_params opt;
opt.tmpls = std::move(tmpls);
opt.use_jinja = true;
opt.enable_thinking = true;
opt.reasoning_budget = -1;
opt.reasoning_format = COMMON_REASONING_FORMAT_NONE;

// Body with per-request reasoning_budget_tokens=0 (suppress thinking).
json body = {
{"messages", json::array({json{{"role", "user"}, {"content", "hello"}}})},
{"reasoning_budget_tokens", 0},
};
std::vector<raw_buffer> out_files;
auto llama_params = oaicompat_chat_params_parse(body, opt, out_files);

// The per-request value must win over the server default (-1).
if (!llama_params.contains("reasoning_budget_tokens")) {
throw std::runtime_error("reasoning_budget_tokens missing from llama_params (thinking_end_tag may be empty for this template)");
}
int got = llama_params["reasoning_budget_tokens"].get<int>();
if (got != 0) {
throw std::runtime_error(std::string("Expected reasoning_budget_tokens=0, got ") + std::to_string(got));
}
}

static void test_reasoning_budget_message_per_request() {
LOG_DBG("%s\n", __func__);
// Same code path as test_reasoning_budget_tokens_per_request: the Qwen3 template's
// <think>...</think> markers enable the reasoning-budget block in oaicompat_chat_params_parse.
auto tmpls = read_templates("models/templates/Qwen-Qwen3-0.6B.jinja");

server_chat_params opt;
opt.tmpls = std::move(tmpls);
opt.use_jinja = true;
opt.enable_thinking = true;
opt.reasoning_budget = -1;
opt.reasoning_format = COMMON_REASONING_FORMAT_NONE;
opt.reasoning_budget_message = "server default";

// Body with a per-request reasoning_budget_message override.
const std::string per_request_message = "per-request message";
json body = {
{"messages", json::array({json{{"role", "user"}, {"content", "hello"}}})},
{"reasoning_budget_message", per_request_message},
};
std::vector<raw_buffer> out_files;
auto llama_params = oaicompat_chat_params_parse(body, opt, out_files);

// The per-request value must win over the server default.
if (!llama_params.contains("reasoning_budget_message")) {
throw std::runtime_error("reasoning_budget_message missing from llama_params (thinking_end_tag may be empty for this template)");
}
std::string got = llama_params["reasoning_budget_message"].get<std::string>();
if (got != per_request_message) {
throw std::runtime_error("Expected reasoning_budget_message='" + per_request_message + "', got '" + got + "'");
}
}

static void test_msg_diffs_compute() {
LOG_DBG("%s\n", __func__);
{
Expand Down Expand Up @@ -5660,6 +5725,8 @@ int main(int argc, char ** argv) {
test_convert_responses_to_chatcmpl();
test_developer_role_to_system_workaround();
test_template_generation_prompt();
test_reasoning_budget_tokens_per_request();
test_reasoning_budget_message_per_request();
test_template_output_peg_parsers(detailed_debug);
std::cout << "\n[chat] All tests passed!" << '\n';
}
Expand Down
12 changes: 10 additions & 2 deletions tools/server/server-common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1134,16 +1134,24 @@ json oaicompat_chat_params_parse(

// Reasoning budget: pass parameters through to sampling layer
{
int reasoning_budget = json_value(body, "thinking_budget_tokens", -1);
// Per-request overrides, read before writing to llama_params so the generic copy
// loop (which skips keys already present) won't clobber the caller-supplied values.
// Precedence: canonical reasoning_budget_tokens > Anthropic thinking_budget_tokens
// alias > server-level default.
int reasoning_budget = json_value(body, "reasoning_budget_tokens", -1);
if (reasoning_budget == -1) {
reasoning_budget = json_value(body, "thinking_budget_tokens", -1);
}
if (reasoning_budget == -1) {
reasoning_budget = opt.reasoning_budget;
}
std::string reasoning_budget_message = json_value(body, "reasoning_budget_message", opt.reasoning_budget_message);

if (!chat_params.thinking_end_tag.empty()) {
llama_params["reasoning_budget_tokens"] = reasoning_budget;
llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag;
llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag;
llama_params["reasoning_budget_message"] = opt.reasoning_budget_message;
llama_params["reasoning_budget_message"] = reasoning_budget_message;
llama_params["reasoning_control"] = json_value(body, "reasoning_control", false);
}
}
Expand Down
Loading