webui: Add option to pre-encode conversation for faster next turns (ggml-org#21034)

allozaur · web-flow · commit 712eb2a55a68 · 2026-04-09T09:10:18.000+02:00
diff --git a/tools/server/public/bundle.js b/tools/server/public/bundle.js
diff --git a/tools/server/public/index.html b/tools/server/public/index.html
@@ -18,7 +18,7 @@
 		<div style="display: contents">
 			<script>
 				{
-					__sveltekit_1610ad9 = {
+					__sveltekit_nl4lme = {
 						base: new URL('.', location).pathname.slice(0, -1)
 					};
 
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte
@@ -291,14 +291,19 @@
 			title: SETTINGS_SECTION_TITLES.DEVELOPER,
 			icon: Code,
 			fields: [
+				{
+					key: SETTINGS_KEYS.PRE_ENCODE_CONVERSATION,
+					label: 'Pre-fill KV cache after response',
+					type: SettingsFieldType.CHECKBOX
+				},
 				{
 					key: SETTINGS_KEYS.DISABLE_REASONING_PARSING,
-					label: 'Disable reasoning content parsing',
+					label: 'Disable server-side thinking extraction',
 					type: SettingsFieldType.CHECKBOX
 				},
 				{
 					key: SETTINGS_KEYS.EXCLUDE_REASONING_FROM_CONTEXT,
-					label: 'Exclude reasoning from context',
+					label: 'Strip thinking from message history',
 					type: SettingsFieldType.CHECKBOX
 				},
 				{
diff --git a/tools/server/webui/src/lib/constants/settings-config.ts b/tools/server/webui/src/lib/constants/settings-config.ts
@@ -56,6 +56,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean |
 	dry_penalty_last_n: undefined,
 	max_tokens: undefined,
 	custom: '', // custom json-stringified object
+	preEncodeConversation: false,
 	// experimental features
 	pyInterpreterEnabled: false,
 	enableContinueGeneration: false
@@ -106,9 +107,9 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
 	custom: 'Custom JSON parameters to send to the API. Must be valid JSON format.',
 	showThoughtInProgress: 'Expand thought process by default when generating messages.',
 	disableReasoningParsing:
-		'Send reasoning_format=none to prevent server-side extraction of reasoning tokens into separate field',
+		'Send reasoning_format=none so the server returns thinking tokens inline instead of extracting them into a separate field.',
 	excludeReasoningFromContext:
-		'Strip reasoning content from previous messages before sending to the model. When unchecked, reasoning is sent back via the reasoning_content field so the model can see its own chain-of-thought across turns.',
+		'Strip thinking from previous messages before sending. When off, thinking is sent back via the reasoning_content field so the model sees its own chain-of-thought across turns.',
 	showRawOutputSwitch:
 		'Show toggle button to display messages as plain text instead of Markdown-formatted content',
 	keepStatsVisible: 'Keep processing statistics visible after generation finishes.',
@@ -143,6 +144,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
 		'Automatically expand tool call details while executing and keep them expanded after completion.',
 	pyInterpreterEnabled:
 		'Enable Python interpreter using Pyodide. Allows running Python code in markdown code blocks.',
+	preEncodeConversation:
+		'After each response, re-submit the conversation to pre-fill the server KV cache. Makes the next turn faster since the prompt is already encoded while you read the response.',
 	enableContinueGeneration:
 		'Enable "Continue" button for assistant messages. Currently works only with non-reasoning models.'
 };
diff --git a/tools/server/webui/src/lib/constants/settings-keys.ts b/tools/server/webui/src/lib/constants/settings-keys.ts
@@ -52,6 +52,8 @@ export const SETTINGS_KEYS = {
 	ALWAYS_SHOW_AGENTIC_TURNS: 'alwaysShowAgenticTurns',
 	AGENTIC_MAX_TOOL_PREVIEW_LINES: 'agenticMaxToolPreviewLines',
 	SHOW_TOOL_CALL_IN_PROGRESS: 'showToolCallInProgress',
+	// Performance
+	PRE_ENCODE_CONVERSATION: 'preEncodeConversation',
 	// Developer
 	DISABLE_REASONING_PARSING: 'disableReasoningParsing',
 	EXCLUDE_REASONING_FROM_CONTEXT: 'excludeReasoningFromContext',
diff --git a/tools/server/webui/src/lib/services/chat.service.ts b/tools/server/webui/src/lib/services/chat.service.ts
@@ -4,7 +4,8 @@ import { isAbortError } from '$lib/utils/abort';
 import {
 	ATTACHMENT_LABEL_PDF_FILE,
 	ATTACHMENT_LABEL_MCP_PROMPT,
-	ATTACHMENT_LABEL_MCP_RESOURCE
+	ATTACHMENT_LABEL_MCP_RESOURCE,
+	LEGACY_AGENTIC_REGEX
 } from '$lib/constants';
 import {
 	AttachmentType,
@@ -279,6 +280,107 @@ export class ChatService {
 		}
 	}
 
+	/**
+	 * Checks whether all server slots are currently idle (not processing any requests).
+	 * Queries the /slots endpoint (requires --slots flag on the server).
+	 * Returns true if all slots are idle, false if any is processing.
+	 * If the endpoint is unavailable or errors out, returns true (best-effort fallback).
+	 *
+	 * @param model - Optional model name to check slots for (required in ROUTER mode)
+	 * @param signal - Optional AbortSignal to cancel the request if needed
+	 * @returns Promise that resolves to true if all slots are idle, false if any is processing
+	 */
+	static async areAllSlotsIdle(model?: string | null, signal?: AbortSignal): Promise<boolean> {
+		try {
+			const url = model ? `./slots?model=${encodeURIComponent(model)}` : './slots';
+			const res = await fetch(url, { signal });
+			if (!res.ok) return true; // non-2xx response: fall back to "idle"
+
+			const slots: { is_processing: boolean }[] = await res.json();
+			return slots.every((s) => !s.is_processing);
+		} catch { // any failure (network error, abort, unexpected body) also reports "idle"; callers that care about aborts must check the signal themselves
+			return true;
+		}
+	}
+
+	/**
+	 * Sends a fire-and-forget request to pre-encode the conversation in the server's KV cache.
+	 * After a response completes, this re-submits the full conversation
+	 * using n_predict=0 and stream=false so the server processes the prompt without generating tokens.
+	 * This warms the cache for the next turn, making it faster.
+	 *
+	 * When excludeReasoningFromContext is true, reasoning content is stripped from the messages
+	 * to match what sendMessage would send on the next turn (avoiding cache misses).
+	 * When false, reasoning_content is preserved so the cached prompt matches the next request.
+	 *
+	 * @param messages - The full conversation including the latest assistant response
+	 * @param model - Optional model name (required in ROUTER mode)
+	 * @param excludeReasoning - Whether to strip reasoning content (should match excludeReasoningFromContext setting)
+	 * @param signal - Optional AbortSignal to cancel the pre-encode request
+	 */
+	static async preEncode(
+		messages: ApiChatMessageData[] | (DatabaseMessage & { extra?: DatabaseMessageExtra[] })[],
+		model?: string | null,
+		excludeReasoning?: boolean,
+		signal?: AbortSignal
+	): Promise<void> {
+		const normalizedMessages: ApiChatMessageData[] = messages
+			.map((msg) => {
+				if ('id' in msg && 'convId' in msg && 'timestamp' in msg) { // carries the database-record fields, so convert it to the API shape
+					return ChatService.convertDbMessageToApiChatMessageData(
+						msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] }
+					);
+				}
+
+				return msg as ApiChatMessageData;
+			})
+			.filter((msg) => {
+				if (msg.role === MessageRole.SYSTEM) { // system messages are kept only when their content is a non-blank string
+					const content = typeof msg.content === 'string' ? msg.content : '';
+
+					return content.trim().length > 0;
+				}
+
+				return true;
+			});
+
+		const requestBody: Record<string, unknown> = {
+			messages: normalizedMessages.map((msg: ApiChatMessageData) => {
+				const mapped: Record<string, unknown> = {
+					role: msg.role,
+					content: excludeReasoning ? ChatService.stripReasoningContent(msg.content) : msg.content,
+					tool_calls: msg.tool_calls,
+					tool_call_id: msg.tool_call_id
+				};
+
+				if (!excludeReasoning && msg.reasoning_content) { // reasoning_content is forwarded only when reasoning is not being excluded
+					mapped.reasoning_content = msg.reasoning_content;
+				}
+
+				return mapped;
+			}),
+			stream: false,
+			n_predict: 0 // prompt-only pass: no tokens are requested
+		};
+
+		if (model) {
+			requestBody.model = model;
+		}
+
+		try {
+			await fetch(`./v1/chat/completions`, {
+				method: 'POST',
+				headers: getJsonHeaders(),
+				body: JSON.stringify(requestBody),
+				signal
+			}); // the response is never read, so a non-2xx status goes unreported
+		} catch (error) {
+			if (!isAbortError(error)) { // aborts are expected when a new turn starts; only other failures are logged
+				console.warn('[ChatService] Pre-encode request failed:', error);
+			}
+		}
+	}
+
 	/**
 	 *
 	 *
@@ -799,6 +901,28 @@ export class ChatService {
 	 *
 	 */
 
+	/**
+	 * Removes every LEGACY_AGENTIC_REGEX.REASONING_BLOCK match from message content and trims the result.
+	 * Strings are processed directly; in multipart arrays only non-empty text parts are rewritten, other parts pass through.
+	 */
+	private static stripReasoningContent(
+		content: string | ApiChatMessageContentPart[]
+	): string | ApiChatMessageContentPart[] {
+		const stripFromString = (text: string): string =>
+			text.replace(LEGACY_AGENTIC_REGEX.REASONING_BLOCK, '').trim();
+
+		if (typeof content === 'string') {
+			return stripFromString(content);
+		}
+
+		return content.map((part) => {
+			if (part.type === ContentPartType.TEXT && part.text) {
+				return { ...part, text: stripFromString(part.text) }; // shallow copy, so the input part is not mutated
+			}
+			return part;
+		});
+	}
+
 	/**
 	 * Parses error response and creates appropriate error with context information
 	 * @param response - HTTP response object
diff --git a/tools/server/webui/src/lib/stores/chat.svelte.ts b/tools/server/webui/src/lib/stores/chat.svelte.ts
@@ -58,6 +58,7 @@ class ChatStore {
 	chatLoadingStates = new SvelteMap<string, boolean>();
 	chatStreamingStates = new SvelteMap<string, { response: string; messageId: string }>();
 	private abortControllers = new SvelteMap<string, AbortController>();
+	private preEncodeAbortController: AbortController | null = null;
 	private processingStates = new SvelteMap<string, ApiProcessingState | null>();
 	private conversationStateTimestamps = new SvelteMap<string, ConversationStateEntry>();
 	private activeConversationId = $state<string | null>(null);
@@ -462,6 +463,9 @@ class ChatStore {
 		const activeConv = conversationsStore.activeConversation;
 		if (activeConv && this.isChatLoadingInternal(activeConv.id)) return;
 
+		// Cancel any in-flight pre-encode request
+		this.cancelPreEncode();
+
 		// Consume MCP resource attachments - converts them to extras and clears the live store
 		const resourceExtras = mcpStore.consumeResourceAttachmentsAsExtras();
 		const allExtras = resourceExtras.length > 0 ? [...(extras || []), ...resourceExtras] : extras;
@@ -724,6 +728,16 @@ class ChatStore {
 
 				if (onComplete) onComplete(streamedContent);
 				if (isRouterMode()) modelsStore.fetchRouterModels().catch(console.error);
+				// Pre-encode conversation in KV cache for faster next turn
+				if (config().preEncodeConversation) {
+					this.triggerPreEncode(
+						allMessages,
+						assistantMessage,
+						streamedContent,
+						effectiveModel,
+						!!config().excludeReasoningFromContext
+					);
+				}
 			},
 			onError: (error: Error) => {
 				this.setStreamingActive(false);
@@ -911,6 +925,7 @@ class ChatStore {
 	async regenerateMessage(messageId: string): Promise<void> {
 		const activeConv = conversationsStore.activeConversation;
 		if (!activeConv || this.isChatLoadingInternal(activeConv.id)) return;
+		this.cancelPreEncode();
 		const result = this.getMessageByIdWithRole(messageId, MessageRole.ASSISTANT);
 		if (!result) return;
 		const { index: messageIndex } = result;
@@ -940,6 +955,7 @@ class ChatStore {
 	async regenerateMessageWithBranching(messageId: string, modelOverride?: string): Promise<void> {
 		const activeConv = conversationsStore.activeConversation;
 		if (!activeConv || this.isChatLoadingInternal(activeConv.id)) return;
+		this.cancelPreEncode();
 		try {
 			const idx = conversationsStore.findMessageIndex(messageId);
 			if (idx === -1) return;
@@ -1616,6 +1632,42 @@ class ChatStore {
 
 		return apiOptions;
 	}
+
+	private cancelPreEncode(): void { // aborts the pending pre-encode, if any; safe to call when none is pending
+		if (this.preEncodeAbortController) {
+			this.preEncodeAbortController.abort(); // signals the slots check and the pre-encode fetch that share this controller's signal
+			this.preEncodeAbortController = null; // cleared so repeated calls are no-ops
+		}
+	}
+
+	private async triggerPreEncode( // starts a pre-encode of allMessages plus the finished assistant reply, unless a slot is busy
+		allMessages: DatabaseMessage[],
+		assistantMessage: DatabaseMessage,
+		assistantContent: string,
+		model?: string | null,
+		excludeReasoning?: boolean
+	): Promise<void> {
+		this.cancelPreEncode(); // a single controller field is used, so any earlier pre-encode is aborted first
+		this.preEncodeAbortController = new AbortController();
+
+		const signal = this.preEncodeAbortController.signal;
+
+		try {
+			const allIdle = await ChatService.areAllSlotsIdle(model, signal);
+			if (!allIdle || signal.aborted) return; // skip when a slot is processing or when cancelled during the slots check
+
+			const messagesWithAssistant: DatabaseMessage[] = [
+				...allMessages,
+				{ ...assistantMessage, content: assistantContent } // copy of the assistant message with its content replaced by assistantContent
+			];
+
+			await ChatService.preEncode(messagesWithAssistant, model, excludeReasoning, signal);
+		} catch (err) {
+			if (!isAbortError(err)) { // abort errors are not logged
+				console.warn('[ChatStore] Pre-encode failed:', err);
+			}
+		}
+	}
 }
 
 export const chatStore = new ChatStore();

Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,7 @@`
`18`	`18`	`<div style="display: contents">`
`19`	`19`	`<script>`
`20`	`20`	`{`
`21`		`- __sveltekit_1610ad9 = {`
	`21`	`+ __sveltekit_nl4lme = {`
`22`	`22`	`base: new URL('.', location).pathname.slice(0, -1)`
`23`	`23`	`};`
`24`	`24`