Skip to content

Commit 95d469a

Browse files
server, webui: accept continue_final_message flag for vLLM API compat (#23012)
* server, webui: accept continue_final_message flag for vLLM API compat

  Add the continue_final_message body flag from the vLLM and transformers API. When set together with add_generation_prompt false, it triggers the existing prefill_assistant code path, regardless of the server-side opt.prefill_assistant option. Mutual exclusion with add_generation_prompt true is enforced, matching vLLM behavior. The WebUI sends continue_final_message and add_generation_prompt false on the Continue button, with the matching opt-in option on the chat service. Pure API alignment, no change to the prefill logic itself. Paves the way for the upcoming per-template prefill plumbing in common/chat.

* test: add coverage for continue_final_message vLLM compat flag

  Two cases on top of the existing assistant prefill coverage. First, continue_final_message true with add_generation_prompt false produces the same rendered prompt as the prefill_assistant heuristic, proving the new flag is a correct alias of the existing path. Second, a request with both flags set to true is rejected with HTTP 400, matching the vLLM/transformers mutual exclusion contract.

* chore: update webui build output
1 parent 1e4579f commit 95d469a

7 files changed

Lines changed: 176 additions & 118 deletions

File tree

tools/server/public/bundle.js

Lines changed: 116 additions & 116 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tools/server/server-common.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1040,6 +1040,10 @@ json oaicompat_chat_params_parse(
10401040
inputs.use_jinja = opt.use_jinja;
10411041
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", caps["supports_parallel_tool_calls"]);
10421042
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
1043+
const bool continue_final_message = json_value(body, "continue_final_message", false);
1044+
if (continue_final_message && inputs.add_generation_prompt) {
1045+
throw std::invalid_argument("Cannot set both add_generation_prompt and continue_final_message to true.");
1046+
}
10431047
inputs.reasoning_format = opt.reasoning_format;
10441048
if (body.contains("reasoning_format")) {
10451049
inputs.reasoning_format = common_reasoning_format_from_name(body.at("reasoning_format").get<std::string>());
@@ -1071,7 +1075,10 @@ json oaicompat_chat_params_parse(
10711075

10721076
// if the assistant message appears at the end of list, we do not add end-of-turn token
10731077
// for ex. this can be useful to modify the reasoning process in reasoning models
1074-
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
1078+
// continue_final_message is the explicit opt-in alias from the vLLM/transformers API,
1079+
// equivalent to the prefill_assistant heuristic
1080+
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant"
1081+
&& (continue_final_message || opt.prefill_assistant);
10751082
common_chat_msg last_message;
10761083
if (prefill_assistant_message) {
10771084
last_message = inputs.messages.back();

tools/server/tests/unit/test_chat_completion.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,45 @@ def test_chat_template_assistant_prefill(prefill, re_prefill):
178178
assert res.body["__verbose"]["prompt"] == f"<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{re_prefill}"
179179

180180

181+
def test_chat_template_continue_final_message_vllm_compat():
182+
"""continue_final_message is the vLLM/transformers explicit alias for the prefill_assistant heuristic.
183+
Both must produce the same prompt."""
184+
global server
185+
server.chat_template = "llama3"
186+
server.debug = True
187+
server.start()
188+
res = server.make_request("POST", "/chat/completions", data={
189+
"max_tokens": 8,
190+
"add_generation_prompt": False,
191+
"continue_final_message": True,
192+
"messages": [
193+
{"role": "system", "content": "Book"},
194+
{"role": "user", "content": "What is the best book"},
195+
{"role": "assistant", "content": "Whill"},
196+
]
197+
})
198+
assert res.status_code == 200
199+
assert "__verbose" in res.body
200+
assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWhill"
201+
202+
203+
def test_chat_template_continue_final_message_mutual_exclusion():
204+
"""add_generation_prompt and continue_final_message both set to true must be rejected"""
205+
global server
206+
server.chat_template = "llama3"
207+
server.start()
208+
res = server.make_request("POST", "/chat/completions", data={
209+
"max_tokens": 8,
210+
"add_generation_prompt": True,
211+
"continue_final_message": True,
212+
"messages": [
213+
{"role": "user", "content": "Hi"},
214+
{"role": "assistant", "content": "Hello"},
215+
]
216+
})
217+
assert res.status_code == 400
218+
219+
181220
def test_apply_chat_template():
182221
global server
183222
server.chat_template = "command-r"

tools/server/webui/src/lib/services/chat.service.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,8 @@ export class ChatService {
130130
timings_per_token,
131131
// Config options
132132
disableReasoningParsing,
133-
excludeReasoningFromContext
133+
excludeReasoningFromContext,
134+
continueFinalMessage
134135
} = options;
135136

136137
const normalizedMessages: ApiChatMessageData[] = messages
@@ -209,6 +210,11 @@ export class ChatService {
209210
? ReasoningFormat.NONE
210211
: ReasoningFormat.AUTO;
211212

213+
if (continueFinalMessage) {
214+
requestBody.continue_final_message = true;
215+
requestBody.add_generation_prompt = false;
216+
}
217+
212218
if (temperature !== undefined) requestBody.temperature = temperature;
213219
if (max_tokens !== undefined) {
214220
// Set max_tokens to -1 (infinite) when explicitly configured as 0 or null

tools/server/webui/src/lib/stores/chat.svelte.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,6 +1301,7 @@ class ChatStore {
13011301
contextWithContinue,
13021302
{
13031303
...this.getApiOptions(),
1304+
continueFinalMessage: true,
13041305
onChunk: (chunk: string) => {
13051306
appendedContent += chunk;
13061307
hasReceivedContent = true;

tools/server/webui/src/lib/types/api.d.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,9 @@ export interface ApiChatCompletionRequest {
239239
// Custom parameters (JSON string)
240240
custom?: Record<string, unknown>;
241241
timings_per_token?: boolean;
242+
// Continuation control (vLLM compat)
243+
add_generation_prompt?: boolean;
244+
continue_final_message?: boolean;
242245
}
243246

244247
export interface ApiChatCompletionToolCallFunctionDelta {

tools/server/webui/src/lib/types/settings.d.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ export interface SettingsChatServiceOptions {
9292
// Custom parameters
9393
custom?: string;
9494
timings_per_token?: boolean;
95+
// Continuation control (vLLM compat), opt in to the explicit continue final message flag
96+
continueFinalMessage?: boolean;
9597
// Callbacks
9698
onChunk?: (chunk: string) => void;
9799
onReasoningChunk?: (chunk: string) => void;

0 commit comments

Comments
 (0)