Skip to content

Commit ebe5447

Browse files
committed
feat(providers): prompt caching for Anthropic + Azure-Anthropic
Mark the static request prefix (system prompt + tools) with an ephemeral cache_control breakpoint so repeated calls — agent tool-loops and multi-turn — reuse the cached prefix (~90% cheaper cached input + lower latency). Azure-Anthropic inherits this via the shared core. - New providers/prompt-cache.ts gate: only caches when the static prefix is large enough to be cacheable AND likely reused (tools present, or a large system prompt), so a one-shot tool-less call never pays the cache-write surcharge. Kill switch: PROMPT_CACHE_DISABLED=true. - anthropic/core.ts: convert system string -> a cached text block (after the structured-output concat, which assumes a string) and tag the last tool. Uses 2 of Anthropic's 4 breakpoints; the tool-loop reuses the tagged payload. - Outputs are unchanged; cost accounting already reads cache_read/creation tokens (buildAnthropicSegmentTokens), so usage stays accurate. Matches the AI SDK / LangChain / Spring AI convention (explicit breakpoints for Claude; automatic for OpenAI/Gemini). Bedrock + OpenRouter to follow (they need cache-token accounting alongside).
1 parent cc56408 commit ebe5447

3 files changed

Lines changed: 135 additions & 0 deletions

File tree

apps/sim/providers/anthropic/core.ts

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import {
1616
supportsNativeStructuredOutputs,
1717
supportsTemperature,
1818
} from '@/providers/models'
19+
import { shouldCacheStaticPrefix } from '@/providers/prompt-cache'
1920
import { createStreamingExecution } from '@/providers/streaming-execution'
2021
import { adaptAnthropicToolSchema } from '@/providers/tool-schema-adapter'
2122
import { enrichLastModelSegment } from '@/providers/trace-enrichment'
@@ -324,6 +325,20 @@ export async function executeAnthropicProviderRequest(
324325
}
325326
}
326327

328+
// Prompt caching: mark the static prefix (system + tools) with an ephemeral
329+
// cache breakpoint so repeated calls (agent tool-loops, multi-turn) reuse it.
330+
// Must run after the structured-output block above, which assumes `system` is
331+
// still a string. Tools are tagged at their assignment below.
332+
const cacheStaticPrefix = shouldCacheStaticPrefix({
333+
systemPrompt: typeof payload.system === 'string' ? payload.system : '',
334+
hasTools: !!anthropicTools?.length,
335+
toolsApproxChars: anthropicTools ? JSON.stringify(anthropicTools).length : 0,
336+
})
337+
338+
if (cacheStaticPrefix && typeof payload.system === 'string' && payload.system.length > 0) {
339+
payload.system = [{ type: 'text', text: payload.system, cache_control: { type: 'ephemeral' } }]
340+
}
341+
327342
// Add extended thinking configuration if supported and requested
328343
// The 'none' sentinel means "disable thinking" — skip configuration entirely.
329344
if (request.thinkingLevel && request.thinkingLevel !== 'none') {
@@ -366,6 +381,13 @@ export async function executeAnthropicProviderRequest(
366381
}
367382

368383
if (anthropicTools?.length) {
384+
if (cacheStaticPrefix) {
385+
const lastIndex = anthropicTools.length - 1
386+
anthropicTools[lastIndex] = {
387+
...anthropicTools[lastIndex],
388+
cache_control: { type: 'ephemeral' },
389+
}
390+
}
369391
payload.tools = anthropicTools
370392
// Per Anthropic docs: forced tool_choice (type: "tool" or "any") is incompatible with
371393
// thinking. Only auto and none are supported when thinking is enabled.
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/**
2+
* @vitest-environment node
3+
*/
4+
import { afterEach, beforeEach, describe, expect, it } from 'vitest'
5+
import { shouldCacheStaticPrefix } from '@/providers/prompt-cache'
6+
7+
// Fixture prompts sized against the gate's ~4 chars/token estimate.
const LARGE = 'x'.repeat(8_000) // ~2,000 est. tokens, above the 1,024 gate
const SMALL = 'x'.repeat(400) // ~100 est. tokens, below the gate

/**
 * Unit tests for the prompt-cache gate: verifies when a static prefix
 * (system prompt + tool definitions) is considered worth caching, and that
 * the PROMPT_CACHE_DISABLED kill switch overrides everything.
 */
describe('shouldCacheStaticPrefix', () => {
  // Snapshot the ambient value so the suite leaves the environment untouched.
  const original = process.env.PROMPT_CACHE_DISABLED

  beforeEach(() => {
    // Node coerces values assigned to process.env to strings, so assigning
    // `undefined` would store the literal string "undefined". Delete the key
    // to genuinely unset the kill switch before each test.
    delete process.env.PROMPT_CACHE_DISABLED
  })

  afterEach(() => {
    // Restore the pre-suite state exactly: remove the key if it was unset,
    // otherwise put the original string back.
    if (original === undefined) {
      delete process.env.PROMPT_CACHE_DISABLED
    } else {
      process.env.PROMPT_CACHE_DISABLED = original
    }
  })

  it('caches a large system prompt that has tools (agent loop)', () => {
    expect(shouldCacheStaticPrefix({ systemPrompt: LARGE, hasTools: true })).toBe(true)
  })

  it('caches a large system prompt even without tools', () => {
    expect(shouldCacheStaticPrefix({ systemPrompt: LARGE, hasTools: false })).toBe(true)
  })

  it('reaches the threshold via tools when the system prompt alone is below it', () => {
    // Small system + large serialized tools clears the combined threshold, and
    // tools imply reuse, so it should cache.
    expect(
      shouldCacheStaticPrefix({ systemPrompt: SMALL, hasTools: true, toolsApproxChars: 8_000 })
    ).toBe(true)
  })

  it('does NOT cache a small, tool-less prompt (one-shot write surcharge avoided)', () => {
    expect(shouldCacheStaticPrefix({ systemPrompt: SMALL, hasTools: false })).toBe(false)
  })

  it('does NOT cache a small system even with tools when the combined prefix is below threshold', () => {
    expect(
      shouldCacheStaticPrefix({ systemPrompt: SMALL, hasTools: true, toolsApproxChars: 400 })
    ).toBe(false)
  })

  it('does NOT cache when there is no system prompt', () => {
    expect(
      shouldCacheStaticPrefix({ systemPrompt: '', hasTools: true, toolsApproxChars: 8_000 })
    ).toBe(false)
    expect(shouldCacheStaticPrefix({ systemPrompt: null, hasTools: true })).toBe(false)
  })

  it('is disabled by the PROMPT_CACHE_DISABLED kill switch', () => {
    process.env.PROMPT_CACHE_DISABLED = 'true'
    expect(shouldCacheStaticPrefix({ systemPrompt: LARGE, hasTools: true })).toBe(false)
  })
})

apps/sim/providers/prompt-cache.ts

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import { getEnv, isTruthy } from '@/lib/core/config/env'
2+
3+
/**
4+
* Minimum estimated static-prefix size (system + tool definitions) before it is
5+
* worth marking a prompt-cache breakpoint. This is a rough lower bound across
6+
* Claude models (some require more); below it, providers silently skip caching
7+
* anyway, so this only avoids spending a breakpoint on a trivially small prefix.
8+
*/
9+
const MIN_CACHEABLE_PREFIX_TOKENS = 1024
10+
11+
/**
 * Cheap token-count approximation used only for gating decisions: assumes
 * roughly four characters per token and rounds up.
 *
 * @param text - The text whose length is measured.
 * @returns The estimated token count (0 for an empty string).
 */
function estimateTokens(text: string): number {
  const charsPerToken = 4
  return Math.ceil(text.length / charsPerToken)
}
15+
16+
/**
17+
* Decides whether to inject prompt-cache breakpoints on the static prefix
18+
* (system prompt + tool definitions) for providers that require explicit cache
19+
* control (Anthropic, Bedrock, and Anthropic models via OpenRouter).
20+
*
21+
* Caching only pays off when the prefix is large enough to be cacheable AND is
22+
* actually re-read: agent tool-loops re-send the prefix on every iteration, and
23+
* a large system prompt is typically reused across runs within the cache TTL.
24+
* A small, tool-less prompt is intentionally skipped so a one-shot call never
25+
* pays the cache-write surcharge for a prefix that is never read back.
26+
*
27+
* Set `PROMPT_CACHE_DISABLED=true` to turn this off globally (kill switch).
28+
*/
29+
export function shouldCacheStaticPrefix(params: {
30+
systemPrompt: string | null | undefined
31+
hasTools: boolean
32+
toolsApproxChars?: number
33+
}): boolean {
34+
if (isTruthy(getEnv('PROMPT_CACHE_DISABLED'))) {
35+
return false
36+
}
37+
38+
const system = params.systemPrompt ?? ''
39+
if (!system) {
40+
return false
41+
}
42+
43+
const systemTokens = estimateTokens(system)
44+
const toolTokens = params.toolsApproxChars ? Math.ceil(params.toolsApproxChars / 4) : 0
45+
const prefixTokens = systemTokens + toolTokens
46+
47+
if (prefixTokens < MIN_CACHEABLE_PREFIX_TOKENS) {
48+
return false
49+
}
50+
51+
// Tools imply an agent loop (the prefix is re-read each iteration). Without
52+
// tools, only cache when the system prompt alone is large enough to be worth
53+
// the write on its own.
54+
return params.hasTools || systemTokens >= MIN_CACHEABLE_PREFIX_TOKENS
55+
}

0 commit comments

Comments
 (0)