Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/smoke-claude.lock.yml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion .github/workflows/smoke-claude.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ permissions:
pull-requests: read

name: Smoke Claude
max-turns: 2
max-turns: 5
engine:
id: claude
model: claude-haiku-4-5
Expand Down
2 changes: 1 addition & 1 deletion containers/api-proxy/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ COPY server.js logging.js metrics.js rate-limiter.js \
deprecated-header-tracker.js billing-headers.js upstream-response.js \
anthropic-cache.js otel.js otel-exporters.js otel-serialization.js \
token-budget-log.js blocked-request-diagnostics.js \
provider-env-constants.js ./
provider-env-constants.js provider-names.js ./
COPY guards/ ./guards/
COPY providers/ ./providers/
COPY transforms/ ./transforms/
Expand Down
24 changes: 16 additions & 8 deletions containers/api-proxy/guards/ai-credits-guard.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ const { logRequest, sanitizeForLog } = require('../logging');
const pricingByModel = require('../ai-credits-pricing');
const { resolveCatalogModel } = require('../models-dev-catalog');
const { parsePositiveNumber } = require('./guard-utils');
const { PROVIDER_ANTHROPIC, PROVIDER_COPILOT } = require('../provider-names');

const TOKENS_PER_MILLION = 1_000_000;
const DOLLARS_PER_CREDIT = 0.01;
Expand Down Expand Up @@ -165,17 +166,24 @@ function checkUnknownModelRejection(model) {
};
}

function calculateAiCredits(normalizedUsage, model, state = aiCreditsState) {
function calculateAiCredits(normalizedUsage, model, state = aiCreditsState, provider = undefined) {
const pricing = resolveModelPricing(model, state);
if (!pricing) return null;

// Both Anthropic and OpenAI report input_tokens as the TOTAL input including
// cache_read and cache_creation tokens. To avoid double-counting, subtract
// cached portions before applying the full input rate.
const totalInput = normalizedUsage.input_tokens || 0;
// input_tokens semantics differ by provider:
// - Anthropic and Copilot report input_tokens as the NON-cached input only;
// cache_read_input_tokens and cache_creation_input_tokens are reported
// separately and are ADDITIVE to input_tokens. Subtracting them here would
// over-subtract and undercount the genuinely-fresh input tokens.
// - OpenAI (and OpenAI-compatible providers) report prompt_tokens/input_tokens
// as the TOTAL input, with cached tokens being a SUBSET. Those must be
// subtracted before applying the full input rate to avoid double-counting.
const reportedInput = normalizedUsage.input_tokens || 0;
const cacheReadTokens = normalizedUsage.cache_read_tokens || 0;
const cacheWriteTokens = normalizedUsage.cache_write_tokens || 0;
const nonCachedInput = Math.max(0, totalInput - cacheReadTokens - cacheWriteTokens);
const nonCachedInput = provider === PROVIDER_ANTHROPIC || provider === PROVIDER_COPILOT
? reportedInput
: Math.max(0, reportedInput - cacheReadTokens - cacheWriteTokens);

const inputCredits = (nonCachedInput * pricing.input) / CREDIT_DENOMINATOR;
const cachedInputCredits = (cacheReadTokens * pricing.cachedInput) / CREDIT_DENOMINATOR;
Expand All @@ -194,10 +202,10 @@ function calculateAiCredits(normalizedUsage, model, state = aiCreditsState) {
};
}

function applyAiCreditsUsage(normalizedUsage, model) {
function applyAiCreditsUsage(normalizedUsage, model, provider = undefined) {
if (!normalizedUsage) return null;
const safeModel = model || 'unknown';
const calc = calculateAiCredits(normalizedUsage, safeModel);
const calc = calculateAiCredits(normalizedUsage, safeModel, aiCreditsState, provider);
if (!calc) return null;

if (!Object.hasOwn(aiCreditsState.byModel, safeModel)) {
Expand Down
88 changes: 76 additions & 12 deletions containers/api-proxy/guards/ai-credits-guard.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -84,30 +84,94 @@ describe('ai-credits-guard', () => {
expect(getAiCreditsReflectState().by_model['claude-sonnet-4-6-20260601'].total).toBeCloseTo(0.5175, 10);
});

it('does not double-count cached tokens (cache_read included in input_tokens)', () => {
// Simulates: 3M total input, 2.9M from cache, 0.1M new input
// This is how Anthropic reports: input_tokens is the total (includes cache hits)
it('does not double-count cached tokens when input_tokens is total-inclusive (OpenAI-style)', () => {
// OpenAI (Chat Completions and Responses API) reports prompt_tokens/input_tokens
// as the TOTAL input, with cached tokens being a subset. When no provider is
// passed, the calculation defaults to this total-inclusive interpretation.
// Simulates: 3M total input, 2.9M from cache, 0.1M new input.
const usage = applyAiCreditsUsage({
input_tokens: 3_000_000,
cache_read_tokens: 2_900_000,
output_tokens: 50_000,
}, 'claude-sonnet-4-6');
}, 'gpt-5.4');

// nonCached = 3M - 2.9M = 100K
// inputCredits = 100_000 × $3.00 / 10000 = 30
// cachedInputCredits = 2_900_000 × $0.30 / 10000 = 87
// inputCredits = 100_000 × $2.50 / 10000 = 25
// cachedInputCredits = 2_900_000 × $0.25 / 10000 = 72.5
// outputCredits = 50_000 × $15.00 / 10000 = 75
// total = 192 AIC
expect(usage.inputCreditsThisResponse).toBeCloseTo(30, 5);
expect(usage.cachedInputCreditsThisResponse).toBeCloseTo(87, 5);
// total = 172.5 AIC
expect(usage.inputCreditsThisResponse).toBeCloseTo(25, 5);
expect(usage.cachedInputCreditsThisResponse).toBeCloseTo(72.5, 5);
expect(usage.outputCreditsThisResponse).toBeCloseTo(75, 5);
expect(usage.aiCreditsThisResponse).toBeCloseTo(192, 5);
expect(usage.aiCreditsThisResponse).toBeCloseTo(172.5, 5);

// BUG (before fix): would have been 30 + 87 + 75 + (2.9M × $3 / 10000) = 192 + 870 = 1062
// i.e., cached tokens counted at full price AND cache rate
// BUG (before fix): would have been 25 + 72.5 + 75 + (2.9M × $2.50 / 10000) = 172.5 + 725
// i.e., cached tokens counted at full price AND cache rate.
expect(usage.aiCreditsThisResponse).toBeLessThan(250);
});

it('treats Anthropic input_tokens as non-cached (additive cache), not total-inclusive', () => {
  // For Anthropic, input_tokens covers only the NON-cached portion of the
  // prompt. Cache reads and cache creation arrive in their own counters and
  // sit ON TOP of input_tokens, so the fresh tokens are billed in full and
  // must never be reduced by the cache counters.
  const anthropicUsage = {
    input_tokens: 2000,
    cache_read_tokens: 10_000,
    output_tokens: 100,
  };
  const credits = applyAiCreditsUsage(anthropicUsage, 'claude-sonnet-4-6', 'anthropic');

  // Expected breakdown:
  //   fresh input  : 2000   × $3.00  / 10000 = 0.6   (not max(0, 2000 - 10000) = 0)
  //   cached input : 10_000 × $0.30  / 10000 = 0.3
  //   output       : 100    × $15.00 / 10000 = 0.15
  //   total                                  = 1.05 AIC
  expect(credits.inputCreditsThisResponse).toBeCloseTo(0.6, 10);
  expect(credits.cachedInputCreditsThisResponse).toBeCloseTo(0.3, 10);
  expect(credits.outputCreditsThisResponse).toBeCloseTo(0.15, 10);
  expect(credits.aiCreditsThisResponse).toBeCloseTo(1.05, 10);

  // Regression guard: the old subtraction clamped the fresh input to 0, which
  // dropped the 2000 fresh tokens and produced 0.45 instead of 1.05.
  expect(credits.aiCreditsThisResponse).toBeGreaterThan(1.0);
});

it('charges Anthropic fresh input even when cache totals exceed input_tokens', () => {
  // Reproduces the observed smoke-claude record: tiny fresh input alongside
  // large cache read/write. Previously nonCached clamped to 0, dropping the
  // fresh input charge entirely.
  const usage = applyAiCreditsUsage({
    input_tokens: 5,
    cache_read_tokens: 38_673,
    cache_write_tokens: 21_060,
    output_tokens: 205,
  }, 'claude-opus-4-7', 'anthropic');

  // nonCached = 5 (Anthropic: additive, not subtracted)
  // inputCredits = 5 × $5.00 / 10000 = 0.0025
  // cachedInput = 38_673 × $0.50 / 10000 = 1.93365
  // cacheWrite = 21_060 × $6.25 / 10000 = 13.1625
  // outputCredits = 205 × $25.00 / 10000 = 0.5125
  // total = 0.0025 + 1.93365 + 13.1625 + 0.5125 = 15.61115
  expect(usage.inputCreditsThisResponse).toBeCloseTo(0.0025, 10);
  // Compare against the unrounded sum. toBeCloseTo(x, 4) passes only when
  // |actual - x| < 0.00005; a truncated expectation of 15.6111 sits exactly
  // 0.00005 away from 15.61115, leaving the outcome to floating-point rounding.
  expect(usage.aiCreditsThisResponse).toBeCloseTo(15.61115, 4);
});

it('treats Copilot input_tokens as non-cached when provider is copilot', () => {
  // With provider 'copilot', cache reads are additive: the 100 fresh input
  // tokens are billed in full next to the 10_000 cached tokens.
  const copilotUsage = {
    input_tokens: 100,
    cache_read_tokens: 10_000,
    output_tokens: 0,
  };
  const credits = applyAiCreditsUsage(copilotUsage, 'gpt-5.4', 'copilot');

  // Expected breakdown:
  //   fresh input  : 100    × $2.50 / 10000 = 0.025
  //   cached input : 10_000 × $0.25 / 10000 = 0.25
  //   total                                 = 0.275
  expect(credits.inputCreditsThisResponse).toBeCloseTo(0.025, 10);
  expect(credits.cachedInputCreditsThisResponse).toBeCloseTo(0.25, 10);
  expect(credits.aiCreditsThisResponse).toBeCloseTo(0.275, 10);
});

it('warns and skips usage for unknown models', () => {
const { lines } = collectLogOutput();
const usage = applyAiCreditsUsage({ input_tokens: 100 }, 'unknown-model');
Expand Down
15 changes: 11 additions & 4 deletions containers/api-proxy/guards/common-guard-checks.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,11 @@ function buildCommonGuardChecks(deps, model) {
{
block: getEffectiveTokenBlockState(),
isBlocked: block => block && block.maxExceeded,
statusCode: 429,
// Terminal hard cap: returning 429 would make LLM SDK clients treat this
// as a transient rate-limit and retry-storm against a limit that never
// recovers, burning the budget until the step times out. 403 is
// non-retryable, so the agent stops cleanly.
statusCode: 403,
eventName: 'effective_tokens_limit_exceeded',
buildError: buildEffectiveTokenLimitError,
buildLogFields: block => ({
Expand All @@ -76,7 +80,8 @@ function buildCommonGuardChecks(deps, model) {
{
block: getMaxRunsBlockState(),
isBlocked: block => block && block.maxExceeded,
statusCode: 429,
// Terminal hard cap — non-retryable (see effective-tokens guard above).
statusCode: 403,
eventName: 'max_runs_exceeded',
buildError: buildMaxRunsExceededError,
buildLogFields: block => ({
Expand All @@ -87,7 +92,8 @@ function buildCommonGuardChecks(deps, model) {
{
block: getMaxCacheMissesBlockState(),
isBlocked: block => block && block.maxExceeded,
statusCode: 429,
// Terminal hard cap — non-retryable (see effective-tokens guard above).
statusCode: 403,
eventName: 'max_cache_misses_exceeded',
buildError: buildMaxCacheMissesExceededError,
buildLogFields: block => ({
Expand All @@ -109,7 +115,8 @@ function buildCommonGuardChecks(deps, model) {
{
block: getAiCreditsBlockState(),
isBlocked: block => block && block.maxExceeded,
statusCode: 429,
// Terminal hard cap — non-retryable (see effective-tokens guard above).
statusCode: 403,
eventName: 'ai_credits_limit_exceeded',
buildError: buildAiCreditsLimitError,
buildLogFields: block => ({
Expand Down
23 changes: 23 additions & 0 deletions containers/api-proxy/provider-names.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
'use strict';

/**
* Centralized provider name constants.
*
* Use these instead of bare string literals when comparing provider names so
* that provider checks are spelling-safe and easy to find/refactor.
*
* NB: this module is intentionally named `provider-names` rather than
* `providers` to avoid colliding with the `providers/` directory (the upstream
* adapter registry resolved via `require('./providers')`).
*/
const PROVIDER_ANTHROPIC = 'anthropic';
const PROVIDER_OPENAI = 'openai';
const PROVIDER_COPILOT = 'copilot';
const PROVIDER_GEMINI = 'gemini';

module.exports = {
PROVIDER_ANTHROPIC,
PROVIDER_OPENAI,
PROVIDER_COPILOT,
PROVIDER_GEMINI,
};
25 changes: 13 additions & 12 deletions containers/api-proxy/server.token-guards.test.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/**
* Tests for proxyRequest guards: effective token limit (429) and
* max-runs limit (429).
* Tests for proxyRequest token and permission guard behavior, including
* effective-token, max-runs, max-cache-misses, AI-credits, and
* permission-denied enforcement paths.
*
* Extracted from server.proxy.test.js.
Comment on lines 1 to 6
*/
Expand Down Expand Up @@ -60,7 +61,7 @@ describe('proxyRequest effective token guard', () => {
jest.restoreAllMocks();
});

it('returns 429 with structured payload when effective token limit is reached', async () => {
it('returns 403 with structured payload when effective token limit is reached', async () => {
const cycle = createMockUpstreamCycle(https);

const req1 = makeReq();
Expand All @@ -81,7 +82,7 @@ describe('proxyRequest effective token guard', () => {
await flushPromises();

expect(cycle.spy).toHaveBeenCalledTimes(1);
expect(res2.writeHead).toHaveBeenCalledWith(429, expect.objectContaining({
expect(res2.writeHead).toHaveBeenCalledWith(403, expect.objectContaining({
'Content-Type': 'application/json',
}));
const payload = JSON.parse(res2.end.mock.calls[0][0]);
Expand Down Expand Up @@ -148,7 +149,7 @@ describe('proxyRequest max-runs guard', () => {
jest.restoreAllMocks();
});

it('returns 429 after max consecutive cache misses with non-zero input tokens', async () => {
it('returns 403 after max consecutive cache misses with non-zero input tokens', async () => {
const cycle = createMockUpstreamCycle(https);

const req1 = makeReq();
Expand Down Expand Up @@ -178,7 +179,7 @@ describe('proxyRequest max-runs guard', () => {
await flushPromises();

expect(cycle.spy).toHaveBeenCalledTimes(2);
expect(res3.writeHead).toHaveBeenCalledWith(429, expect.objectContaining({
expect(res3.writeHead).toHaveBeenCalledWith(403, expect.objectContaining({
'Content-Type': 'application/json',
}));
const payload = JSON.parse(res3.end.mock.calls[0][0]);
Expand Down Expand Up @@ -220,7 +221,7 @@ describe('proxyRequest max-runs guard', () => {
await flushPromises();

expect(cycle.spy).toHaveBeenCalledTimes(3);
expect(res3.writeHead).not.toHaveBeenCalledWith(429, expect.anything());
expect(res3.writeHead).not.toHaveBeenCalledWith(403, expect.anything());
});
});

Expand All @@ -230,7 +231,7 @@ describe('proxyRequest max-runs guard', () => {
jest.restoreAllMocks();
});

it('returns 429 with structured payload when max runs limit is exceeded', async () => {
it('returns 403 with structured payload when max runs limit is exceeded', async () => {
const cycle = createMockUpstreamCycle(https);

// First request completes successfully — consumes the single allowed run
Expand All @@ -250,7 +251,7 @@ describe('proxyRequest max-runs guard', () => {
await flushPromises();

expect(cycle.spy).toHaveBeenCalledTimes(1);
expect(res2.writeHead).toHaveBeenCalledWith(429, expect.objectContaining({
expect(res2.writeHead).toHaveBeenCalledWith(403, expect.objectContaining({
'Content-Type': 'application/json',
}));
const payload = JSON.parse(res2.end.mock.calls[0][0]);
Expand All @@ -273,7 +274,7 @@ describe('proxyRequest max-runs guard', () => {
await flushPromises();

expect(httpsRequestSpy).toHaveBeenCalledTimes(1);
expect(res.writeHead).not.toHaveBeenCalledWith(429, expect.anything());
expect(res.writeHead).not.toHaveBeenCalledWith(403, expect.anything());
});
});

Expand All @@ -296,7 +297,7 @@ describe('proxyRequest max-ai-credits guard', () => {
jest.restoreAllMocks();
});

it('returns 429 with structured payload when ai credits limit is reached', async () => {
it('returns 403 with structured payload when ai credits limit is reached', async () => {
const cycle = createMockUpstreamCycle(https);

const req1 = makeReq();
Expand All @@ -317,7 +318,7 @@ describe('proxyRequest max-ai-credits guard', () => {
await flushPromises();

expect(cycle.spy).toHaveBeenCalledTimes(1);
expect(res2.writeHead).toHaveBeenCalledWith(429, expect.objectContaining({
expect(res2.writeHead).toHaveBeenCalledWith(403, expect.objectContaining({
'Content-Type': 'application/json',
}));
const payload = JSON.parse(res2.end.mock.calls[0][0]);
Expand Down
Loading
Loading