Skip to content

Commit 499b7e8

Browse files
committed
Added support for Anthropic prompt caching; the chat UI now displays the portion of input tokens served from the cache
1 parent bd47aab commit 499b7e8

5 files changed

Lines changed: 56 additions & 1 deletion

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Added
11+
- [EE] Added prompt caching for Ask Sourcebot. For Anthropic models, the static prompt prefix (tool definitions, system prompt, and conversation history) is marked with a cache breakpoint so it is billed at the provider's discounted cache-read rate on subsequent agent steps and follow-up turns. Toggle with `SOURCEBOT_CHAT_PROMPT_CACHING_ENABLED` (default `true`). [#<PR>](https://github.com/sourcebot-dev/sourcebot/pull/<PR>)
12+
- [EE] Added a cached-token breakdown to the Ask Sourcebot message details, showing what share of the input tokens were served from the model provider's prompt cache. [#<PR>](https://github.com/sourcebot-dev/sourcebot/pull/<PR>)
13+
1014
## [5.0.1] - 2026-06-04
1115

1216
### Fixed

packages/shared/src/env.server.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,7 @@ const options = {
283283
*/
284284
SOURCEBOT_CHAT_MODEL_TEMPERATURE: numberSchema.optional(),
285285
SOURCEBOT_CHAT_MAX_STEP_COUNT: numberSchema.default(100),
286+
SOURCEBOT_CHAT_PROMPT_CACHING_ENABLED: booleanSchema.default('true'),
286287
SOURCEBOT_MCP_TOOL_CALL_TIMEOUT_MS: numberSchema.int().positive().max(maxTimerDelayMs).default(60000),
287288

288289
DEBUG_WRITE_CHAT_MESSAGES_TO_FILE: booleanSchema.default('false'),

packages/web/src/ee/features/chat/agent.ts

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,8 @@ export const createMessageStream = async ({
197197
totalTokens: (priorMetadata?.totalTokens ?? 0) + (totalUsage.totalTokens ?? 0),
198198
totalInputTokens: (priorMetadata?.totalInputTokens ?? 0) + (totalUsage.inputTokens ?? 0),
199199
totalOutputTokens: (priorMetadata?.totalOutputTokens ?? 0) + (totalUsage.outputTokens ?? 0),
200+
totalCacheReadTokens: (priorMetadata?.totalCacheReadTokens ?? 0) + (totalUsage.inputTokenDetails?.cacheReadTokens ?? 0),
201+
totalCacheWriteTokens: (priorMetadata?.totalCacheWriteTokens ?? 0) + (totalUsage.inputTokenDetails?.cacheWriteTokens ?? 0),
200202
totalResponseTimeMs: (priorMetadata?.totalResponseTimeMs ?? 0) + (new Date().getTime() - startTime.getTime()),
201203
modelName,
202204
traceId,
@@ -343,11 +345,42 @@ const createAgentStream = async ({
343345
...(hasMcpTools ? { tool_request_activation: toolRequestActivation, ...mcpToolSetsObj.tools } : {}),
344346
};
345347

348+
// Anthropic prompt caching: mark the end of the prompt's static prefix —
349+
// tool definitions, the system prompt (including any resolved file sources),
350+
// and the conversation history — with an ephemeral (5m) cache breakpoint on
351+
// the last input message. Anthropic caches everything up to and including
352+
// this point, so the large prefix is written once (~1.25x) and read back at
353+
// ~0.1x on every subsequent agent step and follow-up turn instead of being
354+
// reprocessed in full. The `anthropic` provider-options namespace is ignored
355+
// by non-Anthropic providers, so no per-provider check is needed here.
356+
//
357+
// Caveat: when MCP tools are lazily activated mid-run via prepareStep, the
358+
// tools section (which precedes everything else in the prefix) grows and
359+
// invalidates the cache for that step; the cache re-warms on subsequent
360+
// steps once the active tool set is stable.
361+
const isPromptCachingEnabled = env.SOURCEBOT_CHAT_PROMPT_CACHING_ENABLED === 'true';
362+
const messagesWithCachedPrefix: ModelMessage[] = inputMessages.map((message, index) => {
363+
if (!isPromptCachingEnabled || index !== inputMessages.length - 1) {
364+
return message;
365+
}
366+
367+
return {
368+
...message,
369+
providerOptions: {
370+
...message.providerOptions,
371+
anthropic: {
372+
...message.providerOptions?.anthropic,
373+
cacheControl: { type: 'ephemeral' },
374+
},
375+
},
376+
};
377+
});
378+
346379
try {
347380
const stream = streamText({
348381
model,
349382
providerOptions,
350-
messages: inputMessages,
383+
messages: messagesWithCachedPrefix,
351384
system: systemPrompt,
352385
tools: allTools,
353386
activeTools: [

packages/web/src/ee/features/chat/components/chatThread/detailsCard.tsx

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@ const DetailsCardComponent = ({
5959
(part.type === 'dynamic-tool' && part.toolName.startsWith('mcp_'))
6060
).length, [thinkingSteps]);
6161

62+
const cacheReadTokens = metadata?.totalCacheReadTokens ?? 0;
63+
const cachedInputPercent = metadata?.totalInputTokens
64+
? Math.round((cacheReadTokens / metadata.totalInputTokens) * 100)
65+
: 0;
66+
6267
const handleExpandedChanged = useCallback((next: boolean) => {
6368
captureEvent('wa_chat_details_card_toggled', { chatId, isExpanded: next });
6469
onExpandedChanged(next);
@@ -132,6 +137,9 @@ const DetailsCardComponent = ({
132137
<div className="flex items-center text-xs cursor-help">
133138
<Zap className="w-3 h-3 mr-1 flex-shrink-0" />
134139
{getShortenedNumberDisplayString(metadata.totalTokens, 0)} tokens
140+
{cachedInputPercent > 0 && (
141+
<span className="ml-1 text-muted-foreground">({cachedInputPercent}% cached)</span>
142+
)}
135143
</div>
136144
</TooltipTrigger>
137145
<TooltipContent side="bottom">
@@ -140,6 +148,12 @@ const DetailsCardComponent = ({
140148
<span className="text-muted-foreground">Input</span>
141149
<span>{metadata.totalInputTokens?.toLocaleString() ?? '—'}</span>
142150
</div>
151+
{cacheReadTokens > 0 && (
152+
<div className="flex justify-between gap-4 pl-3">
153+
<span className="text-muted-foreground">↳ Cached (discounted)</span>
154+
<span>{cacheReadTokens.toLocaleString()}</span>
155+
</div>
156+
)}
143157
<div className="flex justify-between gap-4">
144158
<span className="text-muted-foreground">Output</span>
145159
<span>{metadata.totalOutputTokens?.toLocaleString() ?? '—'}</span>

packages/web/src/features/chat/types.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ export const sbChatMessageMetadataSchema = z.object({
5555
totalInputTokens: z.number().optional(),
5656
totalOutputTokens: z.number().optional(),
5757
totalTokens: z.number().optional(),
58+
// Portion of input tokens served from / written to the prompt cache.
59+
totalCacheReadTokens: z.number().optional(),
60+
totalCacheWriteTokens: z.number().optional(),
5861
totalResponseTimeMs: z.number().optional(),
5962
feedback: z.array(z.object({
6063
type: z.enum(['like', 'dislike']),

0 commit comments

Comments
 (0)