server, webui: accept continue_final_message flag for vLLM API compat (#23012)

ServeurpersoCom · web-flow · commit 95d469a91533 · 2026-05-13T20:47:58.000+02:00
* server, webui: accept continue_final_message flag for vLLM API compat

Add the continue_final_message body flag from the vLLM and transformers
API. When it is true and add_generation_prompt is false, a trailing
assistant message is continued through the existing prefill_assistant
code path, regardless of the server-side opt.prefill_assistant option.
Setting it together with add_generation_prompt true is rejected, matching
vLLM behavior. Because add_generation_prompt defaults to true on the
server, clients must send add_generation_prompt false explicitly.

WebUI sends continue_final_message true and add_generation_prompt false
when the Continue button is used, via a new opt-in continueFinalMessage
option on the chat service.

Pure API alignment, no change to the prefill logic itself. Paves the way
for the upcoming per-template prefill plumbing in common/chat.

* test: add coverage for continue_final_message vLLM compat flag

Two cases on top of the existing assistant prefill coverage. First,
continue_final_message true with add_generation_prompt false produces
the same rendered prompt as the prefill_assistant heuristic, showing that
the new flag is a correct alias of the existing path. Second, a request
with both flags set to true is rejected with HTTP 400, matching the
vLLM/transformers mutual exclusion contract.

* chore: update webui build output
diff --git a/tools/server/public/bundle.js b/tools/server/public/bundle.js
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
@@ -1040,6 +1040,10 @@ json oaicompat_chat_params_parse(
     inputs.use_jinja             = opt.use_jinja;
     inputs.parallel_tool_calls   = json_value(body, "parallel_tool_calls", caps["supports_parallel_tool_calls"]);
     inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
+    const bool continue_final_message = json_value(body, "continue_final_message", false);
+    if (continue_final_message && inputs.add_generation_prompt) {
+        throw std::invalid_argument("Cannot set both add_generation_prompt and continue_final_message to true.");
+    }
     inputs.reasoning_format      = opt.reasoning_format;
     if (body.contains("reasoning_format")) {
         inputs.reasoning_format = common_reasoning_format_from_name(body.at("reasoning_format").get<std::string>());
@@ -1071,7 +1075,10 @@ json oaicompat_chat_params_parse(
 
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
-    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
+    // continue_final_message is the explicit opt-in flag from the vLLM/transformers API;
+    // it enables the same prefill path even when opt.prefill_assistant is disabled
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant"
+        && (continue_final_message || opt.prefill_assistant);
     common_chat_msg last_message;
     if (prefill_assistant_message) {
         last_message = inputs.messages.back();
diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py
@@ -178,6 +178,45 @@ def test_chat_template_assistant_prefill(prefill, re_prefill):
     assert res.body["__verbose"]["prompt"] == f"<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{re_prefill}"
 
 
+def test_chat_template_continue_final_message_vllm_compat():
+    """continue_final_message is the vLLM/transformers explicit alias for the prefill_assistant heuristic.
+    Both must produce the same prompt."""
+    global server
+    server.chat_template = "llama3"
+    server.debug = True
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": 8,
+        "add_generation_prompt": False,
+        "continue_final_message": True,
+        "messages": [
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+            {"role": "assistant", "content": "Whill"},
+        ]
+    })
+    assert res.status_code == 200
+    assert "__verbose" in res.body
+    assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWhill"
+
+
+def test_chat_template_continue_final_message_mutual_exclusion():
+    """add_generation_prompt and continue_final_message both set to true must be rejected"""
+    global server
+    server.chat_template = "llama3"
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": 8,
+        "add_generation_prompt": True,
+        "continue_final_message": True,
+        "messages": [
+            {"role": "user", "content": "Hi"},
+            {"role": "assistant", "content": "Hello"},
+        ]
+    })
+    assert res.status_code == 400
+
+
 def test_apply_chat_template():
     global server
     server.chat_template = "command-r"
diff --git a/tools/server/webui/src/lib/services/chat.service.ts b/tools/server/webui/src/lib/services/chat.service.ts
@@ -130,7 +130,8 @@ export class ChatService {
 			timings_per_token,
 			// Config options
 			disableReasoningParsing,
-			excludeReasoningFromContext
+			excludeReasoningFromContext,
+			continueFinalMessage
 		} = options;
 
 		const normalizedMessages: ApiChatMessageData[] = messages
@@ -209,6 +210,11 @@ export class ChatService {
 			? ReasoningFormat.NONE
 			: ReasoningFormat.AUTO;
 
+		if (continueFinalMessage) {
+			requestBody.continue_final_message = true;
+			requestBody.add_generation_prompt = false;
+		}
+
 		if (temperature !== undefined) requestBody.temperature = temperature;
 		if (max_tokens !== undefined) {
 			// Set max_tokens to -1 (infinite) when explicitly configured as 0 or null
diff --git a/tools/server/webui/src/lib/stores/chat.svelte.ts b/tools/server/webui/src/lib/stores/chat.svelte.ts
@@ -1301,6 +1301,7 @@ class ChatStore {
 				contextWithContinue,
 				{
 					...this.getApiOptions(),
+					continueFinalMessage: true,
 					onChunk: (chunk: string) => {
 						appendedContent += chunk;
 						hasReceivedContent = true;
diff --git a/tools/server/webui/src/lib/types/api.d.ts b/tools/server/webui/src/lib/types/api.d.ts
@@ -239,6 +239,9 @@ export interface ApiChatCompletionRequest {
 	// Custom parameters (JSON string)
 	custom?: Record<string, unknown>;
 	timings_per_token?: boolean;
+	// Continuation control (vLLM compat)
+	add_generation_prompt?: boolean;
+	continue_final_message?: boolean;
 }
 
 export interface ApiChatCompletionToolCallFunctionDelta {
diff --git a/tools/server/webui/src/lib/types/settings.d.ts b/tools/server/webui/src/lib/types/settings.d.ts
@@ -92,6 +92,8 @@ export interface SettingsChatServiceOptions {
 	// Custom parameters
 	custom?: string;
 	timings_per_token?: boolean;
+	// Continuation control (vLLM compat), opt in to the explicit continue final message flag
+	continueFinalMessage?: boolean;
 	// Callbacks
 	onChunk?: (chunk: string) => void;
 	onReasoningChunk?: (chunk: string) => void;

Original file line number	Diff line number	Diff line change
`@@ -1301,6 +1301,7 @@ class ChatStore {`
`1301`	`1301`	`contextWithContinue,`
`1302`	`1302`	`{`
`1303`	`1303`	`...this.getApiOptions(),`
	`1304`	`+ continueFinalMessage: true,`
`1304`	`1305`	`onChunk: (chunk: string) => {`
`1305`	`1306`	`appendedContent += chunk;`
`1306`	`1307`	`hasReceivedContent = true;`
Original file line number	Diff line number	Diff line change
`@@ -239,6 +239,9 @@ export interface ApiChatCompletionRequest {`
`239`	`239`	`// Custom parameters (JSON string)`
`240`	`240`	`custom?: Record<string, unknown>;`
`241`	`241`	`timings_per_token?: boolean;`
	`242`	`+ // Continuation control (vLLM compat)`
	`243`	`+ add_generation_prompt?: boolean;`
	`244`	`+ continue_final_message?: boolean;`
`242`	`245`	`}`
`243`	`246`
`244`	`247`	`export interface ApiChatCompletionToolCallFunctionDelta {`