Skip to content

Commit 712eb2a

Browse files
authored
webui: Add option to pre-encode conversation for faster next turns (ggml-org#21034)
1 parent dc5a3f9 commit 712eb2a

7 files changed

Lines changed: 267 additions & 81 deletions

File tree

tools/server/public/bundle.js

Lines changed: 75 additions & 75 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tools/server/public/index.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
<div style="display: contents">
1919
<script>
2020
{
21-
__sveltekit_1610ad9 = {
21+
__sveltekit_nl4lme = {
2222
base: new URL('.', location).pathname.slice(0, -1)
2323
};
2424

tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -291,14 +291,19 @@
291291
title: SETTINGS_SECTION_TITLES.DEVELOPER,
292292
icon: Code,
293293
fields: [
294+
{
295+
key: SETTINGS_KEYS.PRE_ENCODE_CONVERSATION,
296+
label: 'Pre-fill KV cache after response',
297+
type: SettingsFieldType.CHECKBOX
298+
},
294299
{
295300
key: SETTINGS_KEYS.DISABLE_REASONING_PARSING,
296-
label: 'Disable reasoning content parsing',
301+
label: 'Disable server-side thinking extraction',
297302
type: SettingsFieldType.CHECKBOX
298303
},
299304
{
300305
key: SETTINGS_KEYS.EXCLUDE_REASONING_FROM_CONTEXT,
301-
label: 'Exclude reasoning from context',
306+
label: 'Strip thinking from message history',
302307
type: SettingsFieldType.CHECKBOX
303308
},
304309
{

tools/server/webui/src/lib/constants/settings-config.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean |
5656
dry_penalty_last_n: undefined,
5757
max_tokens: undefined,
5858
custom: '', // custom json-stringified object
59+
preEncodeConversation: false,
5960
// experimental features
6061
pyInterpreterEnabled: false,
6162
enableContinueGeneration: false
@@ -106,9 +107,9 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
106107
custom: 'Custom JSON parameters to send to the API. Must be valid JSON format.',
107108
showThoughtInProgress: 'Expand thought process by default when generating messages.',
108109
disableReasoningParsing:
109-
'Send reasoning_format=none to prevent server-side extraction of reasoning tokens into separate field',
110+
'Send reasoning_format=none so the server returns thinking tokens inline instead of extracting them into a separate field.',
110111
excludeReasoningFromContext:
111-
'Strip reasoning content from previous messages before sending to the model. When unchecked, reasoning is sent back via the reasoning_content field so the model can see its own chain-of-thought across turns.',
112+
'Strip thinking from previous messages before sending. When off, thinking is sent back via the reasoning_content field so the model sees its own chain-of-thought across turns.',
112113
showRawOutputSwitch:
113114
'Show toggle button to display messages as plain text instead of Markdown-formatted content',
114115
keepStatsVisible: 'Keep processing statistics visible after generation finishes.',
@@ -143,6 +144,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
143144
'Automatically expand tool call details while executing and keep them expanded after completion.',
144145
pyInterpreterEnabled:
145146
'Enable Python interpreter using Pyodide. Allows running Python code in markdown code blocks.',
147+
preEncodeConversation:
148+
'After each response, re-submit the conversation to pre-fill the server KV cache. Makes the next turn faster since the prompt is already encoded while you read the response.',
146149
enableContinueGeneration:
147150
'Enable "Continue" button for assistant messages. Currently works only with non-reasoning models.'
148151
};

tools/server/webui/src/lib/constants/settings-keys.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ export const SETTINGS_KEYS = {
5252
ALWAYS_SHOW_AGENTIC_TURNS: 'alwaysShowAgenticTurns',
5353
AGENTIC_MAX_TOOL_PREVIEW_LINES: 'agenticMaxToolPreviewLines',
5454
SHOW_TOOL_CALL_IN_PROGRESS: 'showToolCallInProgress',
55+
// Performance
56+
PRE_ENCODE_CONVERSATION: 'preEncodeConversation',
5557
// Developer
5658
DISABLE_REASONING_PARSING: 'disableReasoningParsing',
5759
EXCLUDE_REASONING_FROM_CONTEXT: 'excludeReasoningFromContext',

tools/server/webui/src/lib/services/chat.service.ts

Lines changed: 125 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@ import { isAbortError } from '$lib/utils/abort';
44
import {
55
ATTACHMENT_LABEL_PDF_FILE,
66
ATTACHMENT_LABEL_MCP_PROMPT,
7-
ATTACHMENT_LABEL_MCP_RESOURCE
7+
ATTACHMENT_LABEL_MCP_RESOURCE,
8+
LEGACY_AGENTIC_REGEX
89
} from '$lib/constants';
910
import {
1011
AttachmentType,
@@ -279,6 +280,107 @@ export class ChatService {
279280
}
280281
}
281282

283+
/**
 * Checks whether all server slots are currently idle (not processing any requests).
 * Queries the /slots endpoint (requires --slots flag on the server).
 *
 * This is a best-effort probe: it resolves to true whenever the answer cannot be
 * determined — the endpoint responds with a non-OK status, the request throws
 * (including when it is aborted through `signal`), or the payload is not an array.
 *
 * @param model - Optional model name to check slots for (required in ROUTER mode)
 * @param signal - Optional AbortSignal to cancel the request if needed
 * @returns true if no reported slot is processing (or the state is unknown), false if any slot is processing
 */
static async areAllSlotsIdle(model?: string | null, signal?: AbortSignal): Promise<boolean> {
	try {
		const url = model ? `./slots?model=${encodeURIComponent(model)}` : './slots';
		const res = await fetch(url, { signal });
		if (!res.ok) return true;

		const payload: unknown = await res.json();
		// Treat an unexpected payload shape the same as an unavailable endpoint
		// instead of relying on a TypeError being swallowed by the catch below.
		if (!Array.isArray(payload)) return true;

		return payload.every(
			(slot: { is_processing?: unknown } | null | undefined) => !slot?.is_processing
		);
	} catch {
		// Network failure, abort, or invalid JSON: fall back to "idle".
		return true;
	}
}
305+
306+
/**
 * Sends a best-effort request to pre-encode the conversation in the server's KV cache.
 * After a response completes, this re-submits the full conversation
 * using n_predict=0 and stream=false so the server processes the prompt without generating tokens.
 * This warms the cache for the next turn, making it faster.
 *
 * When excludeReasoning is true, message content is passed through
 * ChatService.stripReasoningContent and reasoning_content is omitted, to match what
 * sendMessage would send on the next turn (avoiding cache misses).
 * When false, reasoning_content is preserved so the cached prompt matches the next request.
 *
 * The call never rejects: abort errors are ignored, and any other failure
 * (network error or non-OK HTTP status) is only logged as a warning.
 * No request is sent when the signal is already aborted or when no messages
 * remain after empty system messages are filtered out.
 *
 * @param messages - The full conversation including the latest assistant response
 * @param model - Optional model name (required in ROUTER mode)
 * @param excludeReasoning - Whether to strip reasoning content (should match excludeReasoningFromContext setting)
 * @param signal - Optional AbortSignal to cancel the pre-encode request
 */
static async preEncode(
	messages: ApiChatMessageData[] | (DatabaseMessage & { extra?: DatabaseMessageExtra[] })[],
	model?: string | null,
	excludeReasoning?: boolean,
	signal?: AbortSignal
): Promise<void> {
	// The caller already cancelled; skip the normalization work and the round-trip.
	if (signal?.aborted) return;

	const normalizedMessages: ApiChatMessageData[] = messages
		.map((msg) => {
			// Database messages are recognised by their persistence fields and converted
			// to the API shape; everything else is assumed to already be API-shaped.
			if ('id' in msg && 'convId' in msg && 'timestamp' in msg) {
				return ChatService.convertDbMessageToApiChatMessageData(
					msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] }
				);
			}

			return msg as ApiChatMessageData;
		})
		.filter((msg) => {
			// Drop system messages whose string content is empty or whitespace-only
			// (non-string system content is treated as empty).
			if (msg.role === MessageRole.SYSTEM) {
				const content = typeof msg.content === 'string' ? msg.content : '';

				return content.trim().length > 0;
			}

			return true;
		});

	// Nothing to encode: avoid sending a chat completion request with no messages.
	if (normalizedMessages.length === 0) return;

	const requestBody: Record<string, unknown> = {
		messages: normalizedMessages.map((msg: ApiChatMessageData) => {
			const mapped: Record<string, unknown> = {
				role: msg.role,
				content: excludeReasoning ? ChatService.stripReasoningContent(msg.content) : msg.content,
				tool_calls: msg.tool_calls,
				tool_call_id: msg.tool_call_id
			};

			if (!excludeReasoning && msg.reasoning_content) {
				mapped.reasoning_content = msg.reasoning_content;
			}

			return mapped;
		}),
		stream: false,
		n_predict: 0
	};

	if (model) {
		requestBody.model = model;
	}

	try {
		const response = await fetch(`./v1/chat/completions`, {
			method: 'POST',
			headers: getJsonHeaders(),
			body: JSON.stringify(requestBody),
			signal
		});

		// fetch only rejects on network-level failures; surface HTTP errors as well
		// so a misconfigured or rejected pre-encode does not fail silently.
		if (!response.ok) {
			console.warn(
				`[ChatService] Pre-encode request returned HTTP ${response.status} ${response.statusText}`
			);
		}
	} catch (error) {
		if (!isAbortError(error)) {
			console.warn('[ChatService] Pre-encode request failed:', error);
		}
	}
}
383+
282384
/**
283385
*
284386
*
@@ -799,6 +901,28 @@ export class ChatService {
799901
*
800902
*/
801903

904+
/**
 * Removes every match of LEGACY_AGENTIC_REGEX.REASONING_BLOCK from message content
 * and trims the result.
 *
 * - String content is cleaned and returned as a string.
 * - Array content is mapped part by part: text parts with a non-empty `text` are
 *   copied with the cleaned text; every other part is returned unchanged.
 */
private static stripReasoningContent(
	content: string | ApiChatMessageContentPart[]
): string | ApiChatMessageContentPart[] {
	function withoutReasoning(value: string): string {
		const cleaned = value.replace(LEGACY_AGENTIC_REGEX.REASONING_BLOCK, '');
		return cleaned.trim();
	}

	if (typeof content !== 'string') {
		return content.map((part) =>
			part.type === ContentPartType.TEXT && part.text
				? { ...part, text: withoutReasoning(part.text) }
				: part
		);
	}

	return withoutReasoning(content);
}
925+
802926
/**
803927
* Parses error response and creates appropriate error with context information
804928
* @param response - HTTP response object

tools/server/webui/src/lib/stores/chat.svelte.ts

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ class ChatStore {
5858
chatLoadingStates = new SvelteMap<string, boolean>();
5959
chatStreamingStates = new SvelteMap<string, { response: string; messageId: string }>();
6060
private abortControllers = new SvelteMap<string, AbortController>();
61+
private preEncodeAbortController: AbortController | null = null;
6162
private processingStates = new SvelteMap<string, ApiProcessingState | null>();
6263
private conversationStateTimestamps = new SvelteMap<string, ConversationStateEntry>();
6364
private activeConversationId = $state<string | null>(null);
@@ -462,6 +463,9 @@ class ChatStore {
462463
const activeConv = conversationsStore.activeConversation;
463464
if (activeConv && this.isChatLoadingInternal(activeConv.id)) return;
464465

466+
// Cancel any in-flight pre-encode request
467+
this.cancelPreEncode();
468+
465469
// Consume MCP resource attachments - converts them to extras and clears the live store
466470
const resourceExtras = mcpStore.consumeResourceAttachmentsAsExtras();
467471
const allExtras = resourceExtras.length > 0 ? [...(extras || []), ...resourceExtras] : extras;
@@ -724,6 +728,16 @@ class ChatStore {
724728

725729
if (onComplete) onComplete(streamedContent);
726730
if (isRouterMode()) modelsStore.fetchRouterModels().catch(console.error);
731+
// Pre-encode conversation in KV cache for faster next turn
732+
if (config().preEncodeConversation) {
733+
this.triggerPreEncode(
734+
allMessages,
735+
assistantMessage,
736+
streamedContent,
737+
effectiveModel,
738+
!!config().excludeReasoningFromContext
739+
);
740+
}
727741
},
728742
onError: (error: Error) => {
729743
this.setStreamingActive(false);
@@ -911,6 +925,7 @@ class ChatStore {
911925
async regenerateMessage(messageId: string): Promise<void> {
912926
const activeConv = conversationsStore.activeConversation;
913927
if (!activeConv || this.isChatLoadingInternal(activeConv.id)) return;
928+
this.cancelPreEncode();
914929
const result = this.getMessageByIdWithRole(messageId, MessageRole.ASSISTANT);
915930
if (!result) return;
916931
const { index: messageIndex } = result;
@@ -940,6 +955,7 @@ class ChatStore {
940955
async regenerateMessageWithBranching(messageId: string, modelOverride?: string): Promise<void> {
941956
const activeConv = conversationsStore.activeConversation;
942957
if (!activeConv || this.isChatLoadingInternal(activeConv.id)) return;
958+
this.cancelPreEncode();
943959
try {
944960
const idx = conversationsStore.findMessageIndex(messageId);
945961
if (idx === -1) return;
@@ -1616,6 +1632,42 @@ class ChatStore {
16161632

16171633
return apiOptions;
16181634
}
1635+
1636+
/**
 * Aborts the currently tracked pre-encode request, if there is one,
 * and clears the stored controller. Does nothing when none is tracked.
 */
private cancelPreEncode(): void {
	const controller = this.preEncodeAbortController;
	if (!controller) return;

	controller.abort();
	this.preEncodeAbortController = null;
}
1642+
1643+
/**
 * Runs a background pre-encode of the conversation, including the assistant reply
 * that was just produced.
 *
 * Any previously tracked pre-encode is cancelled first, so at most one is tracked at a time.
 * The request is skipped when ChatService.areAllSlotsIdle reports a busy slot or when the
 * run is aborted while that check is in flight. Failures are logged as warnings
 * (abort errors are ignored) and never propagate to the caller.
 *
 * @param allMessages - Conversation messages preceding the assistant reply
 * @param assistantMessage - The assistant message record for the reply
 * @param assistantContent - Final content to use for the assistant message
 * @param model - Optional model name forwarded to the slot check and the pre-encode request
 * @param excludeReasoning - Forwarded to ChatService.preEncode
 */
private async triggerPreEncode(
	allMessages: DatabaseMessage[],
	assistantMessage: DatabaseMessage,
	assistantContent: string,
	model?: string | null,
	excludeReasoning?: boolean
): Promise<void> {
	// Supersede any pre-encode that is still running.
	this.cancelPreEncode();

	const controller = new AbortController();
	this.preEncodeAbortController = controller;
	const { signal } = controller;

	try {
		const allIdle = await ChatService.areAllSlotsIdle(model, signal);
		// areAllSlotsIdle resolves to true on abort, so the signal is re-checked here.
		if (!allIdle || signal.aborted) return;

		const messagesWithAssistant: DatabaseMessage[] = [
			...allMessages,
			{ ...assistantMessage, content: assistantContent }
		];

		await ChatService.preEncode(messagesWithAssistant, model, excludeReasoning, signal);
	} catch (err) {
		if (!isAbortError(err)) {
			console.warn('[ChatStore] Pre-encode failed:', err);
		}
	} finally {
		// Release the finished controller so it is not retained (and later aborted
		// needlessly), but only if a newer run has not already replaced it.
		if (this.preEncodeAbortController === controller) {
			this.preEncodeAbortController = null;
		}
	}
}
16191671
}
16201672

16211673
export const chatStore = new ChatStore();

0 commit comments

Comments
 (0)