feat(providers): prompt caching for Anthropic + Azure-Anthropic

waleedlatif1 · waleedlatif1 · commit ebe5447d0fac · 2026-06-16T14:12:08.000-07:00
Mark the static request prefix (system prompt + tools) with an ephemeral
cache_control breakpoint so repeated calls — agent tool-loops and multi-turn —
reuse the cached prefix (~90% cheaper cached input + lower latency). Azure-
Anthropic inherits this via the shared core.

- New providers/prompt-cache.ts gate: only caches when the static prefix is
  large enough to be cacheable AND likely reused (tools present, or a large
  system prompt), so a one-shot tool-less call never pays the cache-write
  surcharge. Kill switch: PROMPT_CACHE_DISABLED=true.
- anthropic/core.ts: convert system string -> a cached text block (after the
  structured-output concat, which assumes a string) and tag the last tool. Uses
  2 of Anthropic's 4 breakpoints; the tool-loop reuses the tagged payload.
- Outputs are unchanged; cost accounting already reads cache_read/creation
  tokens (buildAnthropicSegmentTokens), so usage stays accurate.

Matches the AI SDK / LangChain / Spring AI convention (explicit breakpoints for
Claude; automatic for OpenAI/Gemini). Bedrock + OpenRouter to follow (they need
cache-token accounting alongside).
diff --git a/apps/sim/providers/anthropic/core.ts b/apps/sim/providers/anthropic/core.ts
@@ -16,6 +16,7 @@ import {
   supportsNativeStructuredOutputs,
   supportsTemperature,
 } from '@/providers/models'
+import { shouldCacheStaticPrefix } from '@/providers/prompt-cache'
 import { createStreamingExecution } from '@/providers/streaming-execution'
 import { adaptAnthropicToolSchema } from '@/providers/tool-schema-adapter'
 import { enrichLastModelSegment } from '@/providers/trace-enrichment'
@@ -324,6 +325,20 @@ export async function executeAnthropicProviderRequest(
     }
   }
 
+  // Prompt caching: mark the static prefix (system + tools) with an ephemeral
+  // cache breakpoint so repeated calls (agent tool-loops, multi-turn) reuse it.
+  // Must run after the structured-output block above, which assumes `system` is
+  // still a string. Tools are tagged at their assignment below.
+  const cacheStaticPrefix = shouldCacheStaticPrefix({
+    systemPrompt: typeof payload.system === 'string' ? payload.system : '',
+    hasTools: !!anthropicTools?.length,
+    toolsApproxChars: anthropicTools ? JSON.stringify(anthropicTools).length : 0,
+  })
+
+  if (cacheStaticPrefix && typeof payload.system === 'string' && payload.system.length > 0) {
+    payload.system = [{ type: 'text', text: payload.system, cache_control: { type: 'ephemeral' } }]
+  }
+
   // Add extended thinking configuration if supported and requested
   // The 'none' sentinel means "disable thinking" — skip configuration entirely.
   if (request.thinkingLevel && request.thinkingLevel !== 'none') {
@@ -366,6 +381,13 @@ export async function executeAnthropicProviderRequest(
   }
 
   if (anthropicTools?.length) {
+    if (cacheStaticPrefix) {
+      const lastIndex = anthropicTools.length - 1
+      anthropicTools[lastIndex] = {
+        ...anthropicTools[lastIndex],
+        cache_control: { type: 'ephemeral' },
+      }
+    }
     payload.tools = anthropicTools
     // Per Anthropic docs: forced tool_choice (type: "tool" or "any") is incompatible with
     // thinking. Only auto and none are supported when thinking is enabled.
diff --git a/apps/sim/providers/prompt-cache.test.ts b/apps/sim/providers/prompt-cache.test.ts
@@ -0,0 +1,58 @@
+/**
+ * @vitest-environment node
+ */
+import { afterEach, beforeEach, describe, expect, it } from 'vitest'
+import { shouldCacheStaticPrefix } from '@/providers/prompt-cache'
+
+const LARGE = 'x'.repeat(8_000) // ~2,000 est. tokens, above the 1,024 gate
+const SMALL = 'x'.repeat(400) // ~100 est. tokens, below the gate
+
+describe('shouldCacheStaticPrefix', () => {
+  const original = process.env.PROMPT_CACHE_DISABLED
+  // process.env stringifies assignments (`= undefined` stores 'undefined'), so unset by deleting.
+  beforeEach(() => {
+    Reflect.deleteProperty(process.env, 'PROMPT_CACHE_DISABLED')
+  })
+
+  afterEach(() => {
+    if (original === undefined) Reflect.deleteProperty(process.env, 'PROMPT_CACHE_DISABLED')
+    else process.env.PROMPT_CACHE_DISABLED = original
+  })
+
+  it('caches a large system prompt that has tools (agent loop)', () => {
+    expect(shouldCacheStaticPrefix({ systemPrompt: LARGE, hasTools: true })).toBe(true)
+  })
+
+  it('caches a large system prompt even without tools', () => {
+    expect(shouldCacheStaticPrefix({ systemPrompt: LARGE, hasTools: false })).toBe(true)
+  })
+
+  it('reaches the threshold via tools when the system prompt alone is below it', () => {
+    // Small system + large serialized tools clear the combined threshold; tools imply reuse.
+    expect(
+      shouldCacheStaticPrefix({ systemPrompt: SMALL, hasTools: true, toolsApproxChars: 8_000 })
+    ).toBe(true)
+  })
+
+  it('does NOT cache a small, tool-less prompt (one-shot write surcharge avoided)', () => {
+    expect(shouldCacheStaticPrefix({ systemPrompt: SMALL, hasTools: false })).toBe(false)
+  })
+
+  it('does NOT cache a small system even with tools when the combined prefix is below threshold', () => {
+    expect(
+      shouldCacheStaticPrefix({ systemPrompt: SMALL, hasTools: true, toolsApproxChars: 400 })
+    ).toBe(false)
+  })
+
+  it('does NOT cache when there is no system prompt', () => {
+    expect(
+      shouldCacheStaticPrefix({ systemPrompt: '', hasTools: true, toolsApproxChars: 8_000 })
+    ).toBe(false)
+    expect(shouldCacheStaticPrefix({ systemPrompt: null, hasTools: true })).toBe(false)
+  })
+
+  it('is disabled by the PROMPT_CACHE_DISABLED kill switch', () => {
+    process.env.PROMPT_CACHE_DISABLED = 'true'
+    expect(shouldCacheStaticPrefix({ systemPrompt: LARGE, hasTools: true })).toBe(false)
+  })
+})
diff --git a/apps/sim/providers/prompt-cache.ts b/apps/sim/providers/prompt-cache.ts
@@ -0,0 +1,55 @@
+import { getEnv, isTruthy } from '@/lib/core/config/env'
+
+/**
+ * Minimum estimated static-prefix size (system + tool definitions) before it is
+ * worth marking a prompt-cache breakpoint. This is a rough lower bound across
+ * Claude models (some require more); below it, providers silently skip caching
+ * anyway, so this only avoids spending a breakpoint on a trivially small prefix.
+ */
+const MIN_CACHEABLE_PREFIX_TOKENS = 1024
+
+/** Rough token estimate (~4 chars/token, rounded up) — fast and good enough for a gate. */
+function estimateTokens(text: string): number {
+  return Math.ceil(text.length / 4)
+}
+
+/**
+ * Decides whether to inject prompt-cache breakpoints on the static prefix
+ * (system prompt + tool definitions) for providers that require explicit cache
+ * control (Anthropic, Bedrock, and Anthropic models via OpenRouter).
+ *
+ * Returns false if the kill switch is set, `systemPrompt` is empty/nullish, or
+ * the estimated prefix (system + tools, ~4 chars/token) is below
+ * MIN_CACHEABLE_PREFIX_TOKENS. Above it, `hasTools` suffices (tool-loops
+ * re-send the prefix each iteration); without tools the system prompt alone
+ * must reach the threshold, so a small one-shot call skips the cache write.
+ *
+ * Set `PROMPT_CACHE_DISABLED=true` to turn this off globally (kill switch).
+ */
+export function shouldCacheStaticPrefix(params: {
+  systemPrompt: string | null | undefined
+  hasTools: boolean
+  toolsApproxChars?: number
+}): boolean {
+  if (isTruthy(getEnv('PROMPT_CACHE_DISABLED'))) {
+    return false
+  }
+
+  const system = params.systemPrompt ?? ''
+  if (!system) {
+    return false
+  }
+
+  const systemTokens = estimateTokens(system)
+  const toolTokens = params.toolsApproxChars ? Math.ceil(params.toolsApproxChars / 4) : 0
+  const prefixTokens = systemTokens + toolTokens
+
+  if (prefixTokens < MIN_CACHEABLE_PREFIX_TOKENS) {
+    return false
+  }
+
+  // Tools imply an agent loop (the prefix is re-read each iteration). Without
+  // tools, only cache when the system prompt alone is large enough to be worth
+  // the write on its own.
+  return params.hasTools || systemTokens >= MIN_CACHEABLE_PREFIX_TOKENS
+}