Add support for Anthropic prompt caching. Display the portion of cached input tokens in the chat UI.

jsourcebot · jsourcebot · commit 499b7e81e63c · 2026-06-04T14:41:22.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+- [EE] Added prompt caching for Ask Sourcebot. For Anthropic models, the static prompt prefix (tool definitions, system prompt, and conversation history) is marked with a cache breakpoint so it is billed at the provider's discounted cache-read rate on subsequent agent steps and follow-up turns. Toggle with `SOURCEBOT_CHAT_PROMPT_CACHING_ENABLED` (default `true`). [#<PR>](https://github.com/sourcebot-dev/sourcebot/pull/<PR>)
+- [EE] Added a cached-token breakdown to the Ask Sourcebot message details, showing what share of the input tokens were served from the model provider's prompt cache. [#<PR>](https://github.com/sourcebot-dev/sourcebot/pull/<PR>)
+
 ## [5.0.1] - 2026-06-04
 
 ### Fixed
diff --git a/packages/shared/src/env.server.ts b/packages/shared/src/env.server.ts
@@ -283,6 +283,7 @@ const options = {
          */
         SOURCEBOT_CHAT_MODEL_TEMPERATURE: numberSchema.optional(),
         SOURCEBOT_CHAT_MAX_STEP_COUNT: numberSchema.default(100),
+        SOURCEBOT_CHAT_PROMPT_CACHING_ENABLED: booleanSchema.default('true'),
         SOURCEBOT_MCP_TOOL_CALL_TIMEOUT_MS: numberSchema.int().positive().max(maxTimerDelayMs).default(60000),
 
         DEBUG_WRITE_CHAT_MESSAGES_TO_FILE: booleanSchema.default('false'),
diff --git a/packages/web/src/ee/features/chat/agent.ts b/packages/web/src/ee/features/chat/agent.ts
@@ -197,6 +197,8 @@ export const createMessageStream = async ({
                     totalTokens: (priorMetadata?.totalTokens ?? 0) + (totalUsage.totalTokens ?? 0),
                     totalInputTokens: (priorMetadata?.totalInputTokens ?? 0) + (totalUsage.inputTokens ?? 0),
                     totalOutputTokens: (priorMetadata?.totalOutputTokens ?? 0) + (totalUsage.outputTokens ?? 0),
+                    totalCacheReadTokens: (priorMetadata?.totalCacheReadTokens ?? 0) + (totalUsage.inputTokenDetails?.cacheReadTokens ?? 0),
+                    totalCacheWriteTokens: (priorMetadata?.totalCacheWriteTokens ?? 0) + (totalUsage.inputTokenDetails?.cacheWriteTokens ?? 0),
                     totalResponseTimeMs: (priorMetadata?.totalResponseTimeMs ?? 0) + (new Date().getTime() - startTime.getTime()),
                     modelName,
                     traceId,
@@ -343,11 +345,42 @@ const createAgentStream = async ({
         ...(hasMcpTools ? { tool_request_activation: toolRequestActivation, ...mcpToolSetsObj.tools } : {}),
     };
 
+    // Anthropic prompt caching: mark the end of the prompt's static prefix —
+    // tool definitions, the system prompt (including any resolved file sources),
+    // and the conversation history — with an ephemeral (5m) cache breakpoint on
+    // the last input message. Anthropic caches everything up to and including
+    // this point, so the large prefix is written once (~1.25x input price) and
+    // read back at ~0.1x on later agent steps and follow-up turns within the 5m
+    // TTL instead of being reprocessed in full. Non-Anthropic providers ignore
+    // the `anthropic` provider-options namespace, so this is safe to apply unconditionally.
+    //
+    // Caveat: when MCP tools are lazily activated mid-run via prepareStep, the
+    // tools section (which precedes everything else in the prefix) grows and
+    // invalidates the cache for that step; the cache re-warms on subsequent
+    // steps once the active tool set is stable.
+    const isPromptCachingEnabled = env.SOURCEBOT_CHAT_PROMPT_CACHING_ENABLED === 'true';
+    const messagesWithCachedPrefix: ModelMessage[] = inputMessages.map((message, index) => {
+        if (!isPromptCachingEnabled || index !== inputMessages.length - 1) {
+            return message;
+        }
+
+        return {
+            ...message,
+            providerOptions: {
+                ...message.providerOptions,
+                anthropic: {
+                    ...message.providerOptions?.anthropic,
+                    cacheControl: { type: 'ephemeral' },
+                },
+            },
+        };
+    });
+
     try {
         const stream = streamText({
             model,
             providerOptions,
-            messages: inputMessages,
+            messages: messagesWithCachedPrefix,
             system: systemPrompt,
             tools: allTools,
             activeTools: [
diff --git a/packages/web/src/ee/features/chat/components/chatThread/detailsCard.tsx b/packages/web/src/ee/features/chat/components/chatThread/detailsCard.tsx
@@ -59,6 +59,11 @@ const DetailsCardComponent = ({
         (part.type === 'dynamic-tool' && part.toolName.startsWith('mcp_'))
     ).length, [thinkingSteps]);
 
+    const cacheReadTokens = metadata?.totalCacheReadTokens ?? 0;
+    const cachedInputPercent = metadata?.totalInputTokens
+        ? Math.round((cacheReadTokens / metadata.totalInputTokens) * 100)
+        : 0;
+
     const handleExpandedChanged = useCallback((next: boolean) => {
         captureEvent('wa_chat_details_card_toggled', { chatId, isExpanded: next });
         onExpandedChanged(next);
@@ -132,6 +137,9 @@ const DetailsCardComponent = ({
                                                     <div className="flex items-center text-xs cursor-help">
                                                         <Zap className="w-3 h-3 mr-1 flex-shrink-0" />
                                                         {getShortenedNumberDisplayString(metadata.totalTokens, 0)} tokens
+                                                        {cachedInputPercent > 0 && (
+                                                            <span className="ml-1 text-muted-foreground">({cachedInputPercent}% cached)</span>
+                                                        )}
                                                     </div>
                                                 </TooltipTrigger>
                                                 <TooltipContent side="bottom">
@@ -140,6 +148,12 @@ const DetailsCardComponent = ({
                                                             <span className="text-muted-foreground">Input</span>
                                                             <span>{metadata.totalInputTokens?.toLocaleString() ?? '—'}</span>
                                                         </div>
+                                                        {cacheReadTokens > 0 && (
+                                                            <div className="flex justify-between gap-4 pl-3">
+                                                                <span className="text-muted-foreground">↳ Cached (discounted)</span>
+                                                                <span>{cacheReadTokens.toLocaleString()}</span>
+                                                            </div>
+                                                        )}
                                                         <div className="flex justify-between gap-4">
                                                             <span className="text-muted-foreground">Output</span>
                                                             <span>{metadata.totalOutputTokens?.toLocaleString() ?? '—'}</span>
diff --git a/packages/web/src/features/chat/types.ts b/packages/web/src/features/chat/types.ts
@@ -55,6 +55,9 @@ export const sbChatMessageMetadataSchema = z.object({
     totalInputTokens: z.number().optional(),
     totalOutputTokens: z.number().optional(),
     totalTokens: z.number().optional(),
+    // Portion of input tokens served from / written to the prompt cache.
+    totalCacheReadTokens: z.number().optional(),
+    totalCacheWriteTokens: z.number().optional(),
     totalResponseTimeMs: z.number().optional(),
     feedback: z.array(z.object({
         type: z.enum(['like', 'dislike']),