github · lpcox · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026
diff --git a/.github/workflows/smoke-claude.lock.yml b/.github/workflows/smoke-claude.lock.yml
diff --git a/.github/workflows/smoke-claude.md b/.github/workflows/smoke-claude.md
@@ -15,7 +15,7 @@ permissions:
   pull-requests: read
 
 name: Smoke Claude
-max-turns: 2
+max-turns: 5
 engine:
   id: claude
   model: claude-haiku-4-5

diff --git a/containers/api-proxy/Dockerfile b/containers/api-proxy/Dockerfile
@@ -29,7 +29,7 @@ COPY server.js logging.js metrics.js rate-limiter.js \
      deprecated-header-tracker.js billing-headers.js upstream-response.js \
      anthropic-cache.js otel.js otel-exporters.js otel-serialization.js \
      token-budget-log.js blocked-request-diagnostics.js \
-     provider-env-constants.js ./
+     provider-env-constants.js provider-names.js ./
 COPY guards/ ./guards/
 COPY providers/ ./providers/
 COPY transforms/ ./transforms/

diff --git a/containers/api-proxy/guards/ai-credits-guard.js b/containers/api-proxy/guards/ai-credits-guard.js
@@ -4,6 +4,7 @@ const { logRequest, sanitizeForLog } = require('../logging');
 const pricingByModel = require('../ai-credits-pricing');
 const { resolveCatalogModel } = require('../models-dev-catalog');
 const { parsePositiveNumber } = require('./guard-utils');
+const { PROVIDER_ANTHROPIC, PROVIDER_COPILOT } = require('../provider-names');
 
 const TOKENS_PER_MILLION = 1_000_000;
 const DOLLARS_PER_CREDIT = 0.01;
@@ -165,17 +166,24 @@ function checkUnknownModelRejection(model) {
   };
 }
 
-function calculateAiCredits(normalizedUsage, model, state = aiCreditsState) {
+function calculateAiCredits(normalizedUsage, model, state = aiCreditsState, provider = undefined) {
   const pricing = resolveModelPricing(model, state);
   if (!pricing) return null;
 
-  // Both Anthropic and OpenAI report input_tokens as the TOTAL input including
-  // cache_read and cache_creation tokens. To avoid double-counting, subtract
-  // cached portions before applying the full input rate.
-  const totalInput = normalizedUsage.input_tokens || 0;
+  // input_tokens semantics differ by provider:
+  //  - Anthropic and Copilot report input_tokens as the NON-cached input only;
+  //    cache_read_input_tokens and cache_creation_input_tokens are reported
+  //    separately and are ADDITIVE to input_tokens. Subtracting them here would
+  //    over-subtract and undercount the genuinely-fresh input tokens.
+  //  - OpenAI (and OpenAI-compatible providers) report prompt_tokens/input_tokens
+  //    as the TOTAL input, with cached tokens being a SUBSET. Those must be
+  //    subtracted before applying the full input rate to avoid double-counting.
+  const reportedInput = normalizedUsage.input_tokens || 0;
   const cacheReadTokens = normalizedUsage.cache_read_tokens || 0;
   const cacheWriteTokens = normalizedUsage.cache_write_tokens || 0;
-  const nonCachedInput = Math.max(0, totalInput - cacheReadTokens - cacheWriteTokens);
+  const nonCachedInput = provider === PROVIDER_ANTHROPIC || provider === PROVIDER_COPILOT
+    ? reportedInput
+    : Math.max(0, reportedInput - cacheReadTokens - cacheWriteTokens);
 
   const inputCredits = (nonCachedInput * pricing.input) / CREDIT_DENOMINATOR;
   const cachedInputCredits = (cacheReadTokens * pricing.cachedInput) / CREDIT_DENOMINATOR;
@@ -194,10 +202,10 @@ function calculateAiCredits(normalizedUsage, model, state = aiCreditsState) {
   };
 }
 
-function applyAiCreditsUsage(normalizedUsage, model) {
+function applyAiCreditsUsage(normalizedUsage, model, provider = undefined) {
   if (!normalizedUsage) return null;
   const safeModel = model || 'unknown';
-  const calc = calculateAiCredits(normalizedUsage, safeModel);
+  const calc = calculateAiCredits(normalizedUsage, safeModel, aiCreditsState, provider);
   if (!calc) return null;
 
   if (!Object.hasOwn(aiCreditsState.byModel, safeModel)) {

diff --git a/containers/api-proxy/guards/ai-credits-guard.test.js b/containers/api-proxy/guards/ai-credits-guard.test.js
@@ -84,30 +84,94 @@ describe('ai-credits-guard', () => {
     expect(getAiCreditsReflectState().by_model['claude-sonnet-4-6-20260601'].total).toBeCloseTo(0.5175, 10);
   });
 
-  it('does not double-count cached tokens (cache_read included in input_tokens)', () => {
-    // Simulates: 3M total input, 2.9M from cache, 0.1M new input
-    // This is how Anthropic reports: input_tokens is the total (includes cache hits)
+  it('does not double-count cached tokens when input_tokens is total-inclusive (OpenAI-style)', () => {
+    // OpenAI (Chat Completions and Responses API) reports prompt_tokens/input_tokens
+    // as the TOTAL input, with cached tokens being a subset. When no provider is
+    // passed, the calculation defaults to this total-inclusive interpretation.
+    // Simulates: 3M total input, 2.9M from cache, 0.1M new input.
     const usage = applyAiCreditsUsage({
       input_tokens: 3_000_000,
       cache_read_tokens: 2_900_000,
       output_tokens: 50_000,
-    }, 'claude-sonnet-4-6');
+    }, 'gpt-5.4');
 
     // nonCached = 3M - 2.9M = 100K
-    // inputCredits = 100_000 × $3.00 / 10000 = 30
-    // cachedInputCredits = 2_900_000 × $0.30 / 10000 = 87
+    // inputCredits = 100_000 × $2.50 / 10000 = 25
+    // cachedInputCredits = 2_900_000 × $0.25 / 10000 = 72.5
     // outputCredits = 50_000 × $15.00 / 10000 = 75
-    // total = 192 AIC
-    expect(usage.inputCreditsThisResponse).toBeCloseTo(30, 5);
-    expect(usage.cachedInputCreditsThisResponse).toBeCloseTo(87, 5);
+    // total = 172.5 AIC
+    expect(usage.inputCreditsThisResponse).toBeCloseTo(25, 5);
+    expect(usage.cachedInputCreditsThisResponse).toBeCloseTo(72.5, 5);
     expect(usage.outputCreditsThisResponse).toBeCloseTo(75, 5);
-    expect(usage.aiCreditsThisResponse).toBeCloseTo(192, 5);
+    expect(usage.aiCreditsThisResponse).toBeCloseTo(172.5, 5);
 
-    // BUG (before fix): would have been 30 + 87 + 75 + (2.9M × $3 / 10000) = 192 + 870 = 1062
-    // i.e., cached tokens counted at full price AND cache rate
+    // BUG (before fix): would have been 25 + 72.5 + 75 + (2.9M × $2.50 / 10000) = 172.5 + 725 = 897.5
+    // i.e., cached tokens counted at full price AND cache rate.
     expect(usage.aiCreditsThisResponse).toBeLessThan(250);
   });
 
+  it('treats Anthropic input_tokens as non-cached (additive cache), not total-inclusive', () => {
+    // Anthropic reports input_tokens as the NON-cached input only;
+    // cache_read_input_tokens and cache_creation_input_tokens are reported
+    // separately and are ADDITIVE. The fresh input tokens must therefore be
+    // charged in full and NOT subtracted from cache totals.
+    const usage = applyAiCreditsUsage({
+      input_tokens: 2000,
+      cache_read_tokens: 10_000,
+      output_tokens: 100,
+    }, 'claude-sonnet-4-6', 'anthropic');
+
+    // nonCached = 2000 (NOT 2000 - 10000 clamped to 0)
+    // inputCredits = 2000 × $3.00 / 10000 = 0.6
+    // cachedInputCredits = 10_000 × $0.30 / 10000 = 0.3
+    // outputCredits = 100 × $15.00 / 10000 = 0.15
+    // total = 1.05 AIC
+    expect(usage.inputCreditsThisResponse).toBeCloseTo(0.6, 10);
+    expect(usage.cachedInputCreditsThisResponse).toBeCloseTo(0.3, 10);
+    expect(usage.outputCreditsThisResponse).toBeCloseTo(0.15, 10);
+    expect(usage.aiCreditsThisResponse).toBeCloseTo(1.05, 10);
+
+    // BUG (before fix): nonCached = max(0, 2000 - 10000) = 0, undercounting the
+    // 2000 fresh input tokens → total would have been 0.45 instead of 1.05.
+    expect(usage.aiCreditsThisResponse).toBeGreaterThan(1.0);
+  });
+
+  it('charges Anthropic fresh input even when cache totals exceed input_tokens', () => {
+    // Reproduces the observed smoke-claude record: tiny fresh input alongside
+    // large cache read/write. Previously nonCached clamped to 0, dropping the
+    // fresh input charge entirely.
+    const usage = applyAiCreditsUsage({
+      input_tokens: 5,
+      cache_read_tokens: 38_673,
+      cache_write_tokens: 21_060,
+      output_tokens: 205,
+    }, 'claude-opus-4-7', 'anthropic');
+
+    // nonCached = 5 (Anthropic: additive, not subtracted)
+    // inputCredits  = 5      × $5.00  / 10000 = 0.0025
+    // cachedInput   = 38_673 × $0.50  / 10000 = 1.93365
+    // cacheWrite    = 21_060 × $6.25  / 10000 = 13.1625
+    // outputCredits = 205    × $25.00 / 10000 = 0.5125
+    // total = 15.61115 (exact sum; a truncated 15.6111 sits on toBeCloseTo's 5e-5 tolerance edge)
+    expect(usage.inputCreditsThisResponse).toBeCloseTo(0.0025, 10);
+    expect(usage.aiCreditsThisResponse).toBeCloseTo(15.61115, 4);
+  });
+
+  it('treats Copilot input_tokens as non-cached when provider is copilot', () => {
+    const usage = applyAiCreditsUsage({
+      input_tokens: 100,
+      cache_read_tokens: 10_000,
+      output_tokens: 0,
+    }, 'gpt-5.4', 'copilot');
+
+    // inputCredits = 100 × $2.50 / 10000 = 0.025
+    // cachedInputCredits = 10_000 × $0.25 / 10000 = 0.25
+    // total = 0.275
+    expect(usage.inputCreditsThisResponse).toBeCloseTo(0.025, 10);
+    expect(usage.cachedInputCreditsThisResponse).toBeCloseTo(0.25, 10);
+    expect(usage.aiCreditsThisResponse).toBeCloseTo(0.275, 10);
+  });
+
   it('warns and skips usage for unknown models', () => {
     const { lines } = collectLogOutput();
     const usage = applyAiCreditsUsage({ input_tokens: 100 }, 'unknown-model');

diff --git a/containers/api-proxy/guards/common-guard-checks.js b/containers/api-proxy/guards/common-guard-checks.js
@@ -65,7 +65,11 @@ function buildCommonGuardChecks(deps, model) {
     {
       block: getEffectiveTokenBlockState(),
       isBlocked: block => block && block.maxExceeded,
-      statusCode: 429,
+      // Terminal hard cap: returning 429 would make LLM SDK clients treat this
+      // as a transient rate-limit and retry-storm against a limit that never
+      // recovers, burning the budget until the step times out. 403 is
+      // non-retryable, so the agent stops cleanly.
+      statusCode: 403,
       eventName: 'effective_tokens_limit_exceeded',
       buildError: buildEffectiveTokenLimitError,
       buildLogFields: block => ({
@@ -76,7 +80,8 @@ function buildCommonGuardChecks(deps, model) {
     {
       block: getMaxRunsBlockState(),
       isBlocked: block => block && block.maxExceeded,
-      statusCode: 429,
+      // Terminal hard cap — non-retryable (see effective-tokens guard above).
+      statusCode: 403,
       eventName: 'max_runs_exceeded',
       buildError: buildMaxRunsExceededError,
       buildLogFields: block => ({
@@ -87,7 +92,8 @@ function buildCommonGuardChecks(deps, model) {
     {
       block: getMaxCacheMissesBlockState(),
       isBlocked: block => block && block.maxExceeded,
-      statusCode: 429,
+      // Terminal hard cap — non-retryable (see effective-tokens guard above).
+      statusCode: 403,
       eventName: 'max_cache_misses_exceeded',
       buildError: buildMaxCacheMissesExceededError,
       buildLogFields: block => ({
@@ -109,7 +115,8 @@ function buildCommonGuardChecks(deps, model) {
     {
       block: getAiCreditsBlockState(),
       isBlocked: block => block && block.maxExceeded,
-      statusCode: 429,
+      // Terminal hard cap — non-retryable (see effective-tokens guard above).
+      statusCode: 403,
       eventName: 'ai_credits_limit_exceeded',
       buildError: buildAiCreditsLimitError,
       buildLogFields: block => ({

diff --git a/containers/api-proxy/provider-names.js b/containers/api-proxy/provider-names.js
@@ -0,0 +1,23 @@
+'use strict';
+
+/**
+ * Centralized provider name constants.
+ *
+ * Use these instead of bare string literals when comparing provider names so
+ * that provider checks are spelling-safe and easy to find/refactor. The export
+ * object is frozen so a consumer cannot accidentally reassign a shared name.
+ * NB: this module is intentionally named `provider-names` rather than
+ * `providers` to avoid colliding with the `providers/` directory (the upstream
+ * adapter registry resolved via `require('./providers')`).
+ */
+const PROVIDER_ANTHROPIC = 'anthropic';
+const PROVIDER_OPENAI = 'openai';
+const PROVIDER_COPILOT = 'copilot';
+const PROVIDER_GEMINI = 'gemini';
+
+module.exports = Object.freeze({
+  PROVIDER_ANTHROPIC,
+  PROVIDER_OPENAI,
+  PROVIDER_COPILOT,
+  PROVIDER_GEMINI,
+});
diff --git a/containers/api-proxy/server.token-guards.test.js b/containers/api-proxy/server.token-guards.test.js
@@ -1,6 +1,7 @@
 /**
- * Tests for proxyRequest guards: effective token limit (429) and
- * max-runs limit (429).
+ * Tests for proxyRequest token and permission guard behavior, including
+ * effective-token, max-runs, max-cache-misses, AI-credits, and
+ * permission-denied enforcement paths.
  *
  * Extracted from server.proxy.test.js.
  */
@@ -60,7 +61,7 @@ describe('proxyRequest effective token guard', () => {
     jest.restoreAllMocks();
   });
 
-  it('returns 429 with structured payload when effective token limit is reached', async () => {
+  it('returns 403 with structured payload when effective token limit is reached', async () => {
     const cycle = createMockUpstreamCycle(https);
 
     const req1 = makeReq();
@@ -81,7 +82,7 @@ describe('proxyRequest effective token guard', () => {
     await flushPromises();
 
     expect(cycle.spy).toHaveBeenCalledTimes(1);
-    expect(res2.writeHead).toHaveBeenCalledWith(429, expect.objectContaining({
+    expect(res2.writeHead).toHaveBeenCalledWith(403, expect.objectContaining({
       'Content-Type': 'application/json',
     }));
     const payload = JSON.parse(res2.end.mock.calls[0][0]);
@@ -148,7 +149,7 @@ describe('proxyRequest max-runs guard', () => {
       jest.restoreAllMocks();
     });
 
-    it('returns 429 after max consecutive cache misses with non-zero input tokens', async () => {
+    it('returns 403 after max consecutive cache misses with non-zero input tokens', async () => {
       const cycle = createMockUpstreamCycle(https);
 
       const req1 = makeReq();
@@ -178,7 +179,7 @@ describe('proxyRequest max-runs guard', () => {
       await flushPromises();
 
       expect(cycle.spy).toHaveBeenCalledTimes(2);
-      expect(res3.writeHead).toHaveBeenCalledWith(429, expect.objectContaining({
+      expect(res3.writeHead).toHaveBeenCalledWith(403, expect.objectContaining({
         'Content-Type': 'application/json',
       }));
       const payload = JSON.parse(res3.end.mock.calls[0][0]);
@@ -220,7 +221,7 @@ describe('proxyRequest max-runs guard', () => {
       await flushPromises();
 
       expect(cycle.spy).toHaveBeenCalledTimes(3);
-      expect(res3.writeHead).not.toHaveBeenCalledWith(429, expect.anything());
+      expect(res3.writeHead).not.toHaveBeenCalledWith(403, expect.anything());
     });
   });
 
@@ -230,7 +231,7 @@ describe('proxyRequest max-runs guard', () => {
     jest.restoreAllMocks();
   });
 
-  it('returns 429 with structured payload when max runs limit is exceeded', async () => {
+  it('returns 403 with structured payload when max runs limit is exceeded', async () => {
     const cycle = createMockUpstreamCycle(https);
 
     // First request completes successfully — consumes the single allowed run
@@ -250,7 +251,7 @@ describe('proxyRequest max-runs guard', () => {
     await flushPromises();
 
     expect(cycle.spy).toHaveBeenCalledTimes(1);
-    expect(res2.writeHead).toHaveBeenCalledWith(429, expect.objectContaining({
+    expect(res2.writeHead).toHaveBeenCalledWith(403, expect.objectContaining({
       'Content-Type': 'application/json',
     }));
     const payload = JSON.parse(res2.end.mock.calls[0][0]);
@@ -273,7 +274,7 @@ describe('proxyRequest max-runs guard', () => {
     await flushPromises();
 
     expect(httpsRequestSpy).toHaveBeenCalledTimes(1);
-    expect(res.writeHead).not.toHaveBeenCalledWith(429, expect.anything());
+    expect(res.writeHead).not.toHaveBeenCalledWith(403, expect.anything());
   });
 });
 
@@ -296,7 +297,7 @@ describe('proxyRequest max-ai-credits guard', () => {
     jest.restoreAllMocks();
   });
 
-  it('returns 429 with structured payload when ai credits limit is reached', async () => {
+  it('returns 403 with structured payload when ai credits limit is reached', async () => {
     const cycle = createMockUpstreamCycle(https);
 
     const req1 = makeReq();
@@ -317,7 +318,7 @@ describe('proxyRequest max-ai-credits guard', () => {
     await flushPromises();
 
     expect(cycle.spy).toHaveBeenCalledTimes(1);
-    expect(res2.writeHead).toHaveBeenCalledWith(429, expect.objectContaining({
+    expect(res2.writeHead).toHaveBeenCalledWith(403, expect.objectContaining({
       'Content-Type': 'application/json',
     }));
     const payload = JSON.parse(res2.end.mock.calls[0][0]);