From 2df70a27c21c8b6a82219148f541167a8d29aee0 Mon Sep 17 00:00:00 2001
From: Landon Cox <landon.cox@microsoft.com>
Date: Thu, 18 Jun 2026 19:39:48 -0700
Subject: [PATCH 1/3] fix(api-proxy): 403 for terminal caps; fix Anthropic
 input credits

Two related token-budget fixes:

1. Terminal hard caps (effective_tokens, max_runs, max_cache_misses,
   ai_credits) now reject with HTTP 403 instead of 429. LLM SDK clients
   treat 429 as a transient rate-limit and retry-storm against a cap that
   never recovers, exhausting the run budget until the step times out.
   403 is non-retryable, so the agent stops cleanly. The per-IP rate
   limiter keeps returning 429 (with Retry-After) since it is recoverable.

2. AI-credit calculation is now provider-aware. Anthropic reports
   input_tokens as the NON-cached input only (cache_read/cache_creation
   are additive), whereas OpenAI reports it as the TOTAL with cache as a
   subset. The old code always subtracted cache from input, which for
   Anthropic over-subtracted (clamping non-cached input to 0) and so
   under-counted fresh input. The provider is now threaded through
   computeTokenBudgetUsage -> applyAiCreditsUsage -> calculateAiCredits.

Provider string literals in the new code use centralized constants from
the new provider-names module (named to avoid colliding with the
providers/ adapter directory).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 containers/api-proxy/Dockerfile               |  2 +-
 .../api-proxy/guards/ai-credits-guard.js      | 24 ++++--
 .../api-proxy/guards/ai-credits-guard.test.js | 73 ++++++++++++++++---
 .../api-proxy/guards/common-guard-checks.js   | 15 +++-
 containers/api-proxy/provider-names.js        | 23 ++++++
 .../api-proxy/server.token-guards.test.js     | 24 +++---
 containers/api-proxy/server.websocket.test.js | 16 ++--
 containers/api-proxy/token-budget-log.js      |  2 +-
 docs/api-proxy-sidecar.md                     | 16 ++--
 docs/awf-config-spec.md                       | 12 +--
 docs/awf-config.schema.json                   |  8 +-
 src/awf-config-schema.json                    |  8 +-
 12 files changed, 155 insertions(+), 68 deletions(-)
 create mode 100644 containers/api-proxy/provider-names.js

diff --git a/containers/api-proxy/Dockerfile b/containers/api-proxy/Dockerfile
index 5d77393ee..8be0a2e4c 100644
--- a/containers/api-proxy/Dockerfile
+++ b/containers/api-proxy/Dockerfile
@@ -29,7 +29,7 @@ COPY server.js logging.js metrics.js rate-limiter.js \
      deprecated-header-tracker.js billing-headers.js upstream-response.js \
      anthropic-cache.js otel.js otel-exporters.js otel-serialization.js \
      token-budget-log.js blocked-request-diagnostics.js \
-     provider-env-constants.js ./
+     provider-env-constants.js provider-names.js ./
 COPY guards/ ./guards/
 COPY providers/ ./providers/
 COPY transforms/ ./transforms/
diff --git a/containers/api-proxy/guards/ai-credits-guard.js b/containers/api-proxy/guards/ai-credits-guard.js
index e1c3758e5..daf67760c 100644
--- a/containers/api-proxy/guards/ai-credits-guard.js
+++ b/containers/api-proxy/guards/ai-credits-guard.js
@@ -4,6 +4,7 @@ const { logRequest, sanitizeForLog } = require('../logging');
 const pricingByModel = require('../ai-credits-pricing');
 const { resolveCatalogModel } = require('../models-dev-catalog');
 const { parsePositiveNumber } = require('./guard-utils');
+const { PROVIDER_ANTHROPIC } = require('../provider-names');
 
 const TOKENS_PER_MILLION = 1_000_000;
 const DOLLARS_PER_CREDIT = 0.01;
@@ -165,17 +166,24 @@ function checkUnknownModelRejection(model) {
   };
 }
 
-function calculateAiCredits(normalizedUsage, model, state = aiCreditsState) {
+function calculateAiCredits(normalizedUsage, model, state = aiCreditsState, provider = undefined) {
   const pricing = resolveModelPricing(model, state);
   if (!pricing) return null;
 
-  // Both Anthropic and OpenAI report input_tokens as the TOTAL input including
-  // cache_read and cache_creation tokens. To avoid double-counting, subtract
-  // cached portions before applying the full input rate.
-  const totalInput = normalizedUsage.input_tokens || 0;
+  // input_tokens semantics differ by provider:
+  //  - Anthropic reports input_tokens as the NON-cached input only;
+  //    cache_read_input_tokens and cache_creation_input_tokens are reported
+  //    separately and are ADDITIVE to input_tokens. Subtracting them here would
+  //    over-subtract and undercount the genuinely-fresh input tokens.
+  //  - OpenAI (and OpenAI-compatible providers) report prompt_tokens/input_tokens
+  //    as the TOTAL input, with cached tokens being a SUBSET. Those must be
+  //    subtracted before applying the full input rate to avoid double-counting.
+  const reportedInput = normalizedUsage.input_tokens || 0;
   const cacheReadTokens = normalizedUsage.cache_read_tokens || 0;
   const cacheWriteTokens = normalizedUsage.cache_write_tokens || 0;
-  const nonCachedInput = Math.max(0, totalInput - cacheReadTokens - cacheWriteTokens);
+  const nonCachedInput = provider === PROVIDER_ANTHROPIC
+    ? reportedInput
+    : Math.max(0, reportedInput - cacheReadTokens - cacheWriteTokens);
 
   const inputCredits = (nonCachedInput * pricing.input) / CREDIT_DENOMINATOR;
   const cachedInputCredits = (cacheReadTokens * pricing.cachedInput) / CREDIT_DENOMINATOR;
@@ -194,10 +202,10 @@ function calculateAiCredits(normalizedUsage, model, state = aiCreditsState) {
   };
 }
 
-function applyAiCreditsUsage(normalizedUsage, model) {
+function applyAiCreditsUsage(normalizedUsage, model, provider = undefined) {
   if (!normalizedUsage) return null;
   const safeModel = model || 'unknown';
-  const calc = calculateAiCredits(normalizedUsage, safeModel);
+  const calc = calculateAiCredits(normalizedUsage, safeModel, aiCreditsState, provider);
   if (!calc) return null;
 
   if (!Object.hasOwn(aiCreditsState.byModel, safeModel)) {
diff --git a/containers/api-proxy/guards/ai-credits-guard.test.js b/containers/api-proxy/guards/ai-credits-guard.test.js
index 6dc486ddb..379742482 100644
--- a/containers/api-proxy/guards/ai-credits-guard.test.js
+++ b/containers/api-proxy/guards/ai-credits-guard.test.js
@@ -84,30 +84,79 @@ describe('ai-credits-guard', () => {
     expect(getAiCreditsReflectState().by_model['claude-sonnet-4-6-20260601'].total).toBeCloseTo(0.5175, 10);
   });
 
-  it('does not double-count cached tokens (cache_read included in input_tokens)', () => {
-    // Simulates: 3M total input, 2.9M from cache, 0.1M new input
-    // This is how Anthropic reports: input_tokens is the total (includes cache hits)
+  it('does not double-count cached tokens when input_tokens is total-inclusive (OpenAI-style)', () => {
+    // OpenAI (Chat Completions and Responses API) reports prompt_tokens/input_tokens
+    // as the TOTAL input, with cached tokens being a subset. When no provider is
+    // passed, the calculation defaults to this total-inclusive interpretation.
+    // Simulates: 3M total input, 2.9M from cache, 0.1M new input.
     const usage = applyAiCreditsUsage({
       input_tokens: 3_000_000,
       cache_read_tokens: 2_900_000,
       output_tokens: 50_000,
-    }, 'claude-sonnet-4-6');
+    }, 'gpt-5.4');
 
     // nonCached = 3M - 2.9M = 100K
-    // inputCredits = 100_000 × $3.00 / 10000 = 30
-    // cachedInputCredits = 2_900_000 × $0.30 / 10000 = 87
+    // inputCredits = 100_000 × $2.50 / 10000 = 25
+    // cachedInputCredits = 2_900_000 × $0.25 / 10000 = 72.5
     // outputCredits = 50_000 × $15.00 / 10000 = 75
-    // total = 192 AIC
-    expect(usage.inputCreditsThisResponse).toBeCloseTo(30, 5);
-    expect(usage.cachedInputCreditsThisResponse).toBeCloseTo(87, 5);
+    // total = 172.5 AIC
+    expect(usage.inputCreditsThisResponse).toBeCloseTo(25, 5);
+    expect(usage.cachedInputCreditsThisResponse).toBeCloseTo(72.5, 5);
     expect(usage.outputCreditsThisResponse).toBeCloseTo(75, 5);
-    expect(usage.aiCreditsThisResponse).toBeCloseTo(192, 5);
+    expect(usage.aiCreditsThisResponse).toBeCloseTo(172.5, 5);
 
-    // BUG (before fix): would have been 30 + 87 + 75 + (2.9M × $3 / 10000) = 192 + 870 = 1062
-    // i.e., cached tokens counted at full price AND cache rate
+    // BUG (before fix): would have been 25 + 72.5 + 75 + (2.9M × $2.50 / 10000) = 172.5 + 725 = 897.5
+    // i.e., cached tokens counted at full price AND cache rate.
     expect(usage.aiCreditsThisResponse).toBeLessThan(250);
   });
 
+  it('treats Anthropic input_tokens as non-cached (additive cache), not total-inclusive', () => {
+    // Anthropic reports input_tokens as the NON-cached input only;
+    // cache_read_input_tokens and cache_creation_input_tokens are reported
+    // separately and are ADDITIVE. The fresh input tokens must therefore be
+    // charged in full and NOT subtracted from cache totals.
+    const usage = applyAiCreditsUsage({
+      input_tokens: 2000,
+      cache_read_tokens: 10_000,
+      output_tokens: 100,
+    }, 'claude-sonnet-4-6', 'anthropic');
+
+    // nonCached = 2000 (NOT 2000 - 10000 clamped to 0)
+    // inputCredits = 2000 × $3.00 / 10000 = 0.6
+    // cachedInputCredits = 10_000 × $0.30 / 10000 = 0.3
+    // outputCredits = 100 × $15.00 / 10000 = 0.15
+    // total = 1.05 AIC
+    expect(usage.inputCreditsThisResponse).toBeCloseTo(0.6, 10);
+    expect(usage.cachedInputCreditsThisResponse).toBeCloseTo(0.3, 10);
+    expect(usage.outputCreditsThisResponse).toBeCloseTo(0.15, 10);
+    expect(usage.aiCreditsThisResponse).toBeCloseTo(1.05, 10);
+
+    // BUG (before fix): nonCached = max(0, 2000 - 10000) = 0, undercounting the
+    // 2000 fresh input tokens → total would have been 0.45 instead of 1.05.
+    expect(usage.aiCreditsThisResponse).toBeGreaterThan(1.0);
+  });
+
+  it('charges Anthropic fresh input even when cache totals exceed input_tokens', () => {
+    // Reproduces the observed smoke-claude record: tiny fresh input alongside
+    // large cache read/write. Previously nonCached clamped to 0, dropping the
+    // fresh input charge entirely.
+    const usage = applyAiCreditsUsage({
+      input_tokens: 5,
+      cache_read_tokens: 38_673,
+      cache_write_tokens: 21_060,
+      output_tokens: 205,
+    }, 'claude-opus-4-7', 'anthropic');
+
+    // nonCached = 5 (Anthropic: additive, not subtracted)
+    // inputCredits  = 5      × $5.00  / 10000 = 0.0025
+    // cachedInput   = 38_673 × $0.50  / 10000 = 1.93365
+    // cacheWrite    = 21_060 × $6.25  / 10000 = 13.1625
+    // outputCredits = 205    × $25.00 / 10000 = 0.5125
+    // total = 15.61115 (exact sum; a truncated 15.6111 sits on the 4-digit toBeCloseTo tolerance edge)
+    expect(usage.inputCreditsThisResponse).toBeCloseTo(0.0025, 10);
+    expect(usage.aiCreditsThisResponse).toBeCloseTo(15.61115, 4);
+  });
+
   it('warns and skips usage for unknown models', () => {
     const { lines } = collectLogOutput();
     const usage = applyAiCreditsUsage({ input_tokens: 100 }, 'unknown-model');
diff --git a/containers/api-proxy/guards/common-guard-checks.js b/containers/api-proxy/guards/common-guard-checks.js
index d3d92626d..b989ad6b1 100644
--- a/containers/api-proxy/guards/common-guard-checks.js
+++ b/containers/api-proxy/guards/common-guard-checks.js
@@ -65,7 +65,11 @@ function buildCommonGuardChecks(deps, model) {
     {
       block: getEffectiveTokenBlockState(),
       isBlocked: block => block && block.maxExceeded,
-      statusCode: 429,
+      // Terminal hard cap: returning 429 would make LLM SDK clients treat this
+      // as a transient rate-limit and retry-storm against a limit that never
+      // recovers, burning the budget until the step times out. 403 is
+      // non-retryable, so the agent stops cleanly.
+      statusCode: 403,
       eventName: 'effective_tokens_limit_exceeded',
       buildError: buildEffectiveTokenLimitError,
       buildLogFields: block => ({
@@ -76,7 +80,8 @@ function buildCommonGuardChecks(deps, model) {
     {
       block: getMaxRunsBlockState(),
       isBlocked: block => block && block.maxExceeded,
-      statusCode: 429,
+      // Terminal hard cap — non-retryable (see effective-tokens guard above).
+      statusCode: 403,
       eventName: 'max_runs_exceeded',
       buildError: buildMaxRunsExceededError,
       buildLogFields: block => ({
@@ -87,7 +92,8 @@ function buildCommonGuardChecks(deps, model) {
     {
       block: getMaxCacheMissesBlockState(),
       isBlocked: block => block && block.maxExceeded,
-      statusCode: 429,
+      // Terminal hard cap — non-retryable (see effective-tokens guard above).
+      statusCode: 403,
       eventName: 'max_cache_misses_exceeded',
       buildError: buildMaxCacheMissesExceededError,
       buildLogFields: block => ({
@@ -109,7 +115,8 @@ function buildCommonGuardChecks(deps, model) {
     {
       block: getAiCreditsBlockState(),
       isBlocked: block => block && block.maxExceeded,
-      statusCode: 429,
+      // Terminal hard cap — non-retryable (see effective-tokens guard above).
+      statusCode: 403,
       eventName: 'ai_credits_limit_exceeded',
       buildError: buildAiCreditsLimitError,
       buildLogFields: block => ({
diff --git a/containers/api-proxy/provider-names.js b/containers/api-proxy/provider-names.js
new file mode 100644
index 000000000..ddc3e967d
--- /dev/null
+++ b/containers/api-proxy/provider-names.js
@@ -0,0 +1,23 @@
+'use strict';
+
+/**
+ * Centralized provider name constants.
+ *
+ * Use these instead of bare string literals when comparing provider names so
+ * that provider checks are spelling-safe and easy to find/refactor.
+ *
+ * NB: this module is intentionally named `provider-names` rather than
+ * `providers` to avoid colliding with the `providers/` directory (the upstream
+ * adapter registry resolved via `require('./providers')`).
+ */
+const PROVIDER_ANTHROPIC = 'anthropic';
+const PROVIDER_OPENAI = 'openai';
+const PROVIDER_COPILOT = 'copilot';
+const PROVIDER_GEMINI = 'gemini';
+
+module.exports = {
+  PROVIDER_ANTHROPIC,
+  PROVIDER_OPENAI,
+  PROVIDER_COPILOT,
+  PROVIDER_GEMINI,
+};
diff --git a/containers/api-proxy/server.token-guards.test.js b/containers/api-proxy/server.token-guards.test.js
index 383487d12..cde83dc6f 100644
--- a/containers/api-proxy/server.token-guards.test.js
+++ b/containers/api-proxy/server.token-guards.test.js
@@ -1,6 +1,6 @@
 /**
- * Tests for proxyRequest guards: effective token limit (429) and
- * max-runs limit (429).
+ * Tests for proxyRequest guards: effective token limit (403) and
+ * max-runs limit (403).
  *
  * Extracted from server.proxy.test.js.
  */
@@ -60,7 +60,7 @@ describe('proxyRequest effective token guard', () => {
     jest.restoreAllMocks();
   });
 
-  it('returns 429 with structured payload when effective token limit is reached', async () => {
+  it('returns 403 with structured payload when effective token limit is reached', async () => {
     const cycle = createMockUpstreamCycle(https);
 
     const req1 = makeReq();
@@ -81,7 +81,7 @@ describe('proxyRequest effective token guard', () => {
     await flushPromises();
 
     expect(cycle.spy).toHaveBeenCalledTimes(1);
-    expect(res2.writeHead).toHaveBeenCalledWith(429, expect.objectContaining({
+    expect(res2.writeHead).toHaveBeenCalledWith(403, expect.objectContaining({
       'Content-Type': 'application/json',
     }));
     const payload = JSON.parse(res2.end.mock.calls[0][0]);
@@ -148,7 +148,7 @@ describe('proxyRequest max-runs guard', () => {
       jest.restoreAllMocks();
     });
 
-    it('returns 429 after max consecutive cache misses with non-zero input tokens', async () => {
+    it('returns 403 after max consecutive cache misses with non-zero input tokens', async () => {
       const cycle = createMockUpstreamCycle(https);
 
       const req1 = makeReq();
@@ -178,7 +178,7 @@ describe('proxyRequest max-runs guard', () => {
       await flushPromises();
 
       expect(cycle.spy).toHaveBeenCalledTimes(2);
-      expect(res3.writeHead).toHaveBeenCalledWith(429, expect.objectContaining({
+      expect(res3.writeHead).toHaveBeenCalledWith(403, expect.objectContaining({
         'Content-Type': 'application/json',
       }));
       const payload = JSON.parse(res3.end.mock.calls[0][0]);
@@ -220,7 +220,7 @@ describe('proxyRequest max-runs guard', () => {
       await flushPromises();
 
       expect(cycle.spy).toHaveBeenCalledTimes(3);
-      expect(res3.writeHead).not.toHaveBeenCalledWith(429, expect.anything());
+      expect(res3.writeHead).not.toHaveBeenCalledWith(403, expect.anything());
     });
   });
 
@@ -230,7 +230,7 @@ describe('proxyRequest max-runs guard', () => {
     jest.restoreAllMocks();
   });
 
-  it('returns 429 with structured payload when max runs limit is exceeded', async () => {
+  it('returns 403 with structured payload when max runs limit is exceeded', async () => {
     const cycle = createMockUpstreamCycle(https);
 
     // First request completes successfully — consumes the single allowed run
@@ -250,7 +250,7 @@ describe('proxyRequest max-runs guard', () => {
     await flushPromises();
 
     expect(cycle.spy).toHaveBeenCalledTimes(1);
-    expect(res2.writeHead).toHaveBeenCalledWith(429, expect.objectContaining({
+    expect(res2.writeHead).toHaveBeenCalledWith(403, expect.objectContaining({
       'Content-Type': 'application/json',
     }));
     const payload = JSON.parse(res2.end.mock.calls[0][0]);
@@ -273,7 +273,7 @@ describe('proxyRequest max-runs guard', () => {
     await flushPromises();
 
     expect(httpsRequestSpy).toHaveBeenCalledTimes(1);
-    expect(res.writeHead).not.toHaveBeenCalledWith(429, expect.anything());
+    expect(res.writeHead).not.toHaveBeenCalledWith(403, expect.anything());
   });
 });
 
@@ -296,7 +296,7 @@ describe('proxyRequest max-ai-credits guard', () => {
     jest.restoreAllMocks();
   });
 
-  it('returns 429 with structured payload when ai credits limit is reached', async () => {
+  it('returns 403 with structured payload when ai credits limit is reached', async () => {
     const cycle = createMockUpstreamCycle(https);
 
     const req1 = makeReq();
@@ -317,7 +317,7 @@ describe('proxyRequest max-ai-credits guard', () => {
     await flushPromises();
 
     expect(cycle.spy).toHaveBeenCalledTimes(1);
-    expect(res2.writeHead).toHaveBeenCalledWith(429, expect.objectContaining({
+    expect(res2.writeHead).toHaveBeenCalledWith(403, expect.objectContaining({
       'Content-Type': 'application/json',
     }));
     const payload = JSON.parse(res2.end.mock.calls[0][0]);
diff --git a/containers/api-proxy/server.websocket.test.js b/containers/api-proxy/server.websocket.test.js
index a42c6e44f..7dc86b242 100644
--- a/containers/api-proxy/server.websocket.test.js
+++ b/containers/api-proxy/server.websocket.test.js
@@ -323,38 +323,38 @@ describe('proxyWebSocket security guards', () => {
     jest.restoreAllMocks();
   });
 
-  it('blocks with 429 when max-runs limit is exceeded', () => {
+  it('blocks with 403 when max-runs limit is exceeded', () => {
     process.env.AWF_MAX_RUNS = '1';
     applyMaxRunsInvocation(); // consume the single allowed run
 
     const socket = makeMockSocket();
     wsProxy(makeUpgradeReq(), socket, Buffer.alloc(0), 'api.openai.com', {}, 'openai');
 
-    expect(socket.write).toHaveBeenCalledWith(expect.stringContaining('HTTP/1.1 429 Too Many Requests'));
+    expect(socket.write).toHaveBeenCalledWith(expect.stringContaining('HTTP/1.1 403 Forbidden'));
     expect(socket.write).toHaveBeenCalledWith(expect.stringContaining('"max_runs_exceeded"'));
     expect(socket.destroy).toHaveBeenCalled();
   });
 
-  it('blocks with 429 when effective-token limit is exceeded', () => {
+  it('blocks with 403 when effective-token limit is exceeded', () => {
     process.env.AWF_MAX_EFFECTIVE_TOKENS = '1';
     applyEffectiveTokenUsage({ output_tokens: 5 }, 'gpt-4o'); // exceeds cap of 1
 
     const socket = makeMockSocket();
     wsProxy(makeUpgradeReq(), socket, Buffer.alloc(0), 'api.openai.com', {}, 'openai');
 
-    expect(socket.write).toHaveBeenCalledWith(expect.stringContaining('HTTP/1.1 429 Too Many Requests'));
+    expect(socket.write).toHaveBeenCalledWith(expect.stringContaining('HTTP/1.1 403 Forbidden'));
     expect(socket.write).toHaveBeenCalledWith(expect.stringContaining('"effective_tokens_limit_exceeded"'));
     expect(socket.destroy).toHaveBeenCalled();
   });
 
-  it('blocks with 429 when max-cache-misses limit is exceeded', () => {
+  it('blocks with 403 when max-cache-misses limit is exceeded', () => {
     process.env.AWF_MAX_CACHE_MISSES = '1';
     applyMaxCacheMissesUsage({ input_tokens: 100, cache_read_tokens: 0 });
 
     const socket = makeMockSocket();
     wsProxy(makeUpgradeReq(), socket, Buffer.alloc(0), 'api.openai.com', {}, 'openai');
 
-    expect(socket.write).toHaveBeenCalledWith(expect.stringContaining('HTTP/1.1 429 Too Many Requests'));
+    expect(socket.write).toHaveBeenCalledWith(expect.stringContaining('HTTP/1.1 403 Forbidden'));
     expect(socket.write).toHaveBeenCalledWith(expect.stringContaining('"max_cache_misses_exceeded"'));
     expect(socket.destroy).toHaveBeenCalled();
   });
@@ -371,14 +371,14 @@ describe('proxyWebSocket security guards', () => {
     expect(socket.destroy).toHaveBeenCalled();
   });
 
-  it('blocks with 429 when ai-credits limit is exceeded', () => {
+  it('blocks with 403 when ai-credits limit is exceeded', () => {
     process.env.AWF_MAX_AI_CREDITS = '0.000001'; // tiny cap — any real usage will exceed it
     applyAiCreditsUsage({ input_tokens: 1_000_000, output_tokens: 1_000_000 }, 'gpt-4o');
 
     const socket = makeMockSocket();
     wsProxy(makeUpgradeReq(), socket, Buffer.alloc(0), 'api.openai.com', {}, 'openai');
 
-    expect(socket.write).toHaveBeenCalledWith(expect.stringContaining('HTTP/1.1 429 Too Many Requests'));
+    expect(socket.write).toHaveBeenCalledWith(expect.stringContaining('HTTP/1.1 403 Forbidden'));
     expect(socket.write).toHaveBeenCalledWith(expect.stringContaining('"ai_credits_limit_exceeded"'));
     expect(socket.destroy).toHaveBeenCalled();
   });
diff --git a/containers/api-proxy/token-budget-log.js b/containers/api-proxy/token-budget-log.js
index 06e090893..74d9a1849 100644
--- a/containers/api-proxy/token-budget-log.js
+++ b/containers/api-proxy/token-budget-log.js
@@ -19,7 +19,7 @@ const { applyMaxCacheMissesUsage } = require('./guards/max-cache-misses-guard');
  */
 function computeTokenBudgetUsage({ logRequest, requestId, provider }, normalizedUsage, model) {
   const effectiveTokenUsage = applyEffectiveTokenUsage(normalizedUsage, model);
-  const aiCreditsUsage = applyAiCreditsUsage(normalizedUsage, model);
+  const aiCreditsUsage = applyAiCreditsUsage(normalizedUsage, model, provider);
   applyMaxCacheMissesUsage(normalizedUsage);
   if (aiCreditsUsage) {
     logRequest('info', 'token_budget_usage', {
diff --git a/docs/api-proxy-sidecar.md b/docs/api-proxy-sidecar.md
index 320fa2061..6e17770ba 100644
--- a/docs/api-proxy-sidecar.md
+++ b/docs/api-proxy-sidecar.md
@@ -877,7 +877,7 @@ After each successful upstream response, the proxy accumulates the effective tok
 
 - **Under budget**: Request is forwarded normally.
 - **Budget reached or exceeded**: Request is rejected immediately with:
-  - **HTTP `429 Too Many Requests`**
+  - **HTTP `403 Forbidden`**
   - **Error body**:
 
     ```json
@@ -891,10 +891,10 @@ After each successful upstream response, the proxy accumulates the effective tok
     }
     ```
 
-WebSocket upgrade requests are also rejected with `429` when the budget is reached or exceeded.
+WebSocket upgrade requests are also rejected with `403` when the budget is reached or exceeded.
 
 :::caution
-Once the budget is reached or exceeded, **all subsequent requests in the run are rejected**. The budget is not recoverable — there is no way to "free up" tokens within a single run.
+Once the budget is reached or exceeded, **all subsequent requests in the run are rejected**. The budget is not recoverable — there is no way to "free up" tokens within a single run. The rejection uses **HTTP `403`** (not `429`) precisely because the limit is terminal: a `429` would invite LLM SDK clients to retry with backoff against a cap that never recovers, burning the remaining run budget until the step times out.
 :::
 
 ### Threshold tracking and token steering
@@ -971,10 +971,10 @@ The response includes:
 
 ### Detecting budget exhaustion
 
-Agents and orchestrators should detect the `429` response and the `effective_tokens_limit_exceeded` error type. The error body is structured JSON and can be parsed programmatically:
+Agents and orchestrators should detect the `403` response and the `effective_tokens_limit_exceeded` error type. The error body is structured JSON and can be parsed programmatically:
 
 ```javascript
-if (response.status === 429) {
+if (response.status === 403) {
   const body = await response.json();
   if (body.error?.type === 'effective_tokens_limit_exceeded') {
     // Budget exhausted — stop making API calls
@@ -1005,7 +1005,7 @@ Before forwarding each request to the upstream provider, the proxy checks the in
 
 - **Under limit**: Request is forwarded normally.
 - **Limit reached or exceeded**: Request is rejected immediately with:
-  - **HTTP `429 Too Many Requests`**
+  - **HTTP `403 Forbidden`**
   - **Error body**:
 
     ```json
@@ -1019,7 +1019,7 @@ Before forwarding each request to the upstream provider, the proxy checks the in
     }
     ```
 
-WebSocket upgrade requests are also rejected with `429` when the limit is reached.
+WebSocket upgrade requests are also rejected with `403` when the limit is reached.
 
 :::caution
 Once the limit is reached, **all subsequent requests in the run are rejected**. The counter is not recoverable within a single run.
@@ -1045,7 +1045,7 @@ When `maxTurns` is not configured, `enabled` is `false` and `max_runs`/`remainin
 ### Detecting the limit
 
 ```javascript
-if (response.status === 429) {
+if (response.status === 403) {
   const body = await response.json();
   if (body.error?.type === 'max_runs_exceeded') {
     console.log(`Run limit exceeded: ${body.error.invocation_count} / ${body.error.max_runs}`);
diff --git a/docs/awf-config-spec.md b/docs/awf-config-spec.md
index af67fbb53..bf39cf622 100644
--- a/docs/awf-config-spec.md
+++ b/docs/awf-config-spec.md
@@ -630,7 +630,7 @@ The API proxy MUST enforce the budget as follows:
 
 3. **Rejection**: When the budget is reached or exceeded, the proxy MUST reject the
    request with:
-   - **HTTP status**: `429 Too Many Requests`
+   - **HTTP status**: `403 Forbidden`
    - **Content-Type**: `application/json`
    - **Response body**:
      ```json
@@ -645,7 +645,7 @@ The API proxy MUST enforce the budget as follows:
      ```
 
 4. **WebSocket rejection**: For WebSocket upgrade requests, the proxy MUST
-   reject with `HTTP/1.1 429 Too Many Requests` and include the same JSON
+   reject with `HTTP/1.1 403 Forbidden` and include the same JSON
    error body before destroying the socket.
 
 5. **Finality**: Once the budget is reached or exceeded, all subsequent requests in
@@ -730,12 +730,12 @@ container.
 
 When configured, the proxy MUST enforce this budget in addition to any
 configured `maxEffectiveTokens` budget. Once cumulative AI credits reach or
-exceed `maxAiCredits`, subsequent requests MUST be rejected with HTTP `429`
+exceed `maxAiCredits`, subsequent requests MUST be rejected with HTTP `403`
 and error type `ai_credits_limit_exceeded`.
 
 Regardless of `maxAiCredits` configuration, AWF also enforces a non-overridable
 hard cap of **10,000 AI credits**. When cumulative AI credits reach this hard
-cap, subsequent requests MUST be rejected with HTTP `429` and error type
+cap, subsequent requests MUST be rejected with HTTP `403` and error type
 `ai_credits_limit_exceeded`, and the error/log payload MUST include
 `hard_cap: true`.
 
@@ -837,7 +837,7 @@ The API proxy MUST enforce the max-runs limit as follows:
 
 2. **Rejection**: When the limit is reached or exceeded, the proxy MUST reject
    the request with:
-   - **HTTP status**: `429 Too Many Requests`
+   - **HTTP status**: `403 Forbidden`
    - **Content-Type**: `application/json`
    - **Response body**:
      ```json
@@ -852,7 +852,7 @@ The API proxy MUST enforce the max-runs limit as follows:
      ```
 
 3. **WebSocket rejection**: For WebSocket upgrade requests, the proxy MUST
-   reject with `HTTP/1.1 429 Too Many Requests` and include the same JSON
+   reject with `HTTP/1.1 403 Forbidden` and include the same JSON
    error body before destroying the socket.
 
 4. **Finality**: Once the limit is reached, all subsequent requests in the
diff --git a/docs/awf-config.schema.json b/docs/awf-config.schema.json
index 5a05365ae..74a713cc4 100644
--- a/docs/awf-config.schema.json
+++ b/docs/awf-config.schema.json
@@ -70,12 +70,12 @@
         "maxEffectiveTokens": {
           "type": "integer",
           "minimum": 1,
-          "description": "Maximum cumulative effective tokens allowed for a run. When reached, the API proxy rejects subsequent requests with HTTP 429 and error type 'effective_tokens_limit_exceeded'. Tokens are weighted: input ×1, cache-read ×0.1, output ×4, reasoning ×4. See spec §10."
+          "description": "Maximum cumulative effective tokens allowed for a run. When reached, the API proxy rejects subsequent requests with HTTP 403 and error type 'effective_tokens_limit_exceeded'. Tokens are weighted: input ×1, cache-read ×0.1, output ×4, reasoning ×4. See spec §10."
         },
         "maxAiCredits": {
           "type": "number",
           "exclusiveMinimum": 0,
-          "description": "Maximum cumulative AI credits allowed for a run. When reached, the API proxy rejects subsequent requests with HTTP 429 and error type 'ai_credits_limit_exceeded'. AWF also enforces a non-overridable hard cap of 10,000 AI credits; values above 10,000 are effectively clamped."
+          "description": "Maximum cumulative AI credits allowed for a run. When reached, the API proxy rejects subsequent requests with HTTP 403 and error type 'ai_credits_limit_exceeded'. AWF also enforces a non-overridable hard cap of 10,000 AI credits; values above 10,000 are effectively clamped."
         },
         "defaultAiCreditsPricing": {
           "type": "object",
@@ -132,7 +132,7 @@
         "maxTurns": {
           "type": "integer",
           "minimum": 1,
-          "description": "Maximum number of LLM invocations allowed for a run. When reached, the API proxy rejects subsequent requests with HTTP 429 and error type 'max_runs_exceeded'. See spec §11."
+          "description": "Maximum number of LLM invocations allowed for a run. When reached, the API proxy rejects subsequent requests with HTTP 403 and error type 'max_runs_exceeded'. See spec §11."
         },
         "maxRuns": {
           "type": "integer",
@@ -147,7 +147,7 @@
         "maxCacheMisses": {
           "type": "integer",
           "minimum": 1,
-          "description": "Maximum number of consecutive cache misses allowed per run. A miss is counted only for successful responses with non-zero input_tokens and zero cache_read_tokens. Responses with cache_read_tokens > 0 reset the streak. When reached, the API proxy rejects subsequent requests with HTTP 429 and error type 'max_cache_misses_exceeded'."
+          "description": "Maximum number of consecutive cache misses allowed per run. A miss is counted only for successful responses with non-zero input_tokens and zero cache_read_tokens. Responses with cache_read_tokens > 0 reset the streak. When reached, the API proxy rejects subsequent requests with HTTP 403 and error type 'max_cache_misses_exceeded'."
         },
         "requestedModel": {
           "type": "string",
diff --git a/src/awf-config-schema.json b/src/awf-config-schema.json
index 5a05365ae..74a713cc4 100644
--- a/src/awf-config-schema.json
+++ b/src/awf-config-schema.json
@@ -70,12 +70,12 @@
         "maxEffectiveTokens": {
           "type": "integer",
           "minimum": 1,
-          "description": "Maximum cumulative effective tokens allowed for a run. When reached, the API proxy rejects subsequent requests with HTTP 429 and error type 'effective_tokens_limit_exceeded'. Tokens are weighted: input ×1, cache-read ×0.1, output ×4, reasoning ×4. See spec §10."
+          "description": "Maximum cumulative effective tokens allowed for a run. When reached, the API proxy rejects subsequent requests with HTTP 403 and error type 'effective_tokens_limit_exceeded'. Tokens are weighted: input ×1, cache-read ×0.1, output ×4, reasoning ×4. See spec §10."
         },
         "maxAiCredits": {
           "type": "number",
           "exclusiveMinimum": 0,
-          "description": "Maximum cumulative AI credits allowed for a run. When reached, the API proxy rejects subsequent requests with HTTP 429 and error type 'ai_credits_limit_exceeded'. AWF also enforces a non-overridable hard cap of 10,000 AI credits; values above 10,000 are effectively clamped."
+          "description": "Maximum cumulative AI credits allowed for a run. When reached, the API proxy rejects subsequent requests with HTTP 403 and error type 'ai_credits_limit_exceeded'. AWF also enforces a non-overridable hard cap of 10,000 AI credits; values above 10,000 are effectively clamped."
         },
         "defaultAiCreditsPricing": {
           "type": "object",
@@ -132,7 +132,7 @@
         "maxTurns": {
           "type": "integer",
           "minimum": 1,
-          "description": "Maximum number of LLM invocations allowed for a run. When reached, the API proxy rejects subsequent requests with HTTP 429 and error type 'max_runs_exceeded'. See spec §11."
+          "description": "Maximum number of LLM invocations allowed for a run. When reached, the API proxy rejects subsequent requests with HTTP 403 and error type 'max_runs_exceeded'. See spec §11."
         },
         "maxRuns": {
           "type": "integer",
@@ -147,7 +147,7 @@
         "maxCacheMisses": {
           "type": "integer",
           "minimum": 1,
-          "description": "Maximum number of consecutive cache misses allowed per run. A miss is counted only for successful responses with non-zero input_tokens and zero cache_read_tokens. Responses with cache_read_tokens > 0 reset the streak. When reached, the API proxy rejects subsequent requests with HTTP 429 and error type 'max_cache_misses_exceeded'."
+          "description": "Maximum number of consecutive cache misses allowed per run. A miss is counted only for successful responses with non-zero input_tokens and zero cache_read_tokens. Responses with cache_read_tokens > 0 reset the streak. When reached, the API proxy rejects subsequent requests with HTTP 403 and error type 'max_cache_misses_exceeded'."
         },
         "requestedModel": {
           "type": "string",

From 8b0ee058f22d2a9ae3a1bdac84589a50d85ed002 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 19 Jun 2026 02:54:09 +0000
Subject: [PATCH 2/3] fix(api-proxy): handle copilot fresh input in ai credits

---
 containers/api-proxy/guards/ai-credits-guard.js   |  6 +++---
 .../api-proxy/guards/ai-credits-guard.test.js     | 15 +++++++++++++++
 containers/api-proxy/server.token-guards.test.js  |  5 +++--
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/containers/api-proxy/guards/ai-credits-guard.js b/containers/api-proxy/guards/ai-credits-guard.js
index daf67760c..10c28a293 100644
--- a/containers/api-proxy/guards/ai-credits-guard.js
+++ b/containers/api-proxy/guards/ai-credits-guard.js
@@ -4,7 +4,7 @@ const { logRequest, sanitizeForLog } = require('../logging');
 const pricingByModel = require('../ai-credits-pricing');
 const { resolveCatalogModel } = require('../models-dev-catalog');
 const { parsePositiveNumber } = require('./guard-utils');
-const { PROVIDER_ANTHROPIC } = require('../provider-names');
+const { PROVIDER_ANTHROPIC, PROVIDER_COPILOT } = require('../provider-names');
 
 const TOKENS_PER_MILLION = 1_000_000;
 const DOLLARS_PER_CREDIT = 0.01;
@@ -171,7 +171,7 @@ function calculateAiCredits(normalizedUsage, model, state = aiCreditsState, prov
   if (!pricing) return null;
 
   // input_tokens semantics differ by provider:
-  //  - Anthropic reports input_tokens as the NON-cached input only;
+  //  - Anthropic and Copilot report input_tokens as the NON-cached input only;
   //    cache_read_input_tokens and cache_creation_input_tokens are reported
   //    separately and are ADDITIVE to input_tokens. Subtracting them here would
   //    over-subtract and undercount the genuinely-fresh input tokens.
@@ -181,7 +181,7 @@ function calculateAiCredits(normalizedUsage, model, state = aiCreditsState, prov
   const reportedInput = normalizedUsage.input_tokens || 0;
   const cacheReadTokens = normalizedUsage.cache_read_tokens || 0;
   const cacheWriteTokens = normalizedUsage.cache_write_tokens || 0;
-  const nonCachedInput = provider === PROVIDER_ANTHROPIC
+  const nonCachedInput = provider === PROVIDER_ANTHROPIC || provider === PROVIDER_COPILOT
     ? reportedInput
     : Math.max(0, reportedInput - cacheReadTokens - cacheWriteTokens);
 
diff --git a/containers/api-proxy/guards/ai-credits-guard.test.js b/containers/api-proxy/guards/ai-credits-guard.test.js
index 379742482..4b50bd1dd 100644
--- a/containers/api-proxy/guards/ai-credits-guard.test.js
+++ b/containers/api-proxy/guards/ai-credits-guard.test.js
@@ -157,6 +157,21 @@ describe('ai-credits-guard', () => {
     expect(usage.aiCreditsThisResponse).toBeCloseTo(15.6111, 4);
   });
 
+  it('treats Copilot input_tokens as non-cached when provider is copilot', () => {
+    const usage = applyAiCreditsUsage({
+      input_tokens: 100,
+      cache_read_tokens: 10_000,
+      output_tokens: 0,
+    }, 'gpt-5.4', 'copilot');
+
+    // Copilot input_tokens is already cache-exclusive, so all 100 bill as fresh input.
+    // credits = tokens × ($ per 1M tokens) / 10000, i.e. / 1_000_000 tokens / $0.01 per credit:
+    // input 100 × $2.50 → 0.025; cached 10_000 × $0.25 → 0.25; total 0.275
+    expect(usage.inputCreditsThisResponse).toBeCloseTo(0.025, 10);
+    expect(usage.cachedInputCreditsThisResponse).toBeCloseTo(0.25, 10);
+    expect(usage.aiCreditsThisResponse).toBeCloseTo(0.275, 10);
+  });
+
   it('warns and skips usage for unknown models', () => {
     const { lines } = collectLogOutput();
     const usage = applyAiCreditsUsage({ input_tokens: 100 }, 'unknown-model');
diff --git a/containers/api-proxy/server.token-guards.test.js b/containers/api-proxy/server.token-guards.test.js
index cde83dc6f..cc0209f2f 100644
--- a/containers/api-proxy/server.token-guards.test.js
+++ b/containers/api-proxy/server.token-guards.test.js
@@ -1,6 +1,7 @@
 /**
- * Tests for proxyRequest guards: effective token limit (403) and
- * max-runs limit (403).
+ * Tests for proxyRequest token and permission guard behavior, including
+ * effective-token, max-runs, max-cache-misses, AI-credits, and
+ * permission-denied enforcement paths.
  *
  * Extracted from server.proxy.test.js.
  */

From 3b46ba93e77a11473c5b6a2076f682465f20702e Mon Sep 17 00:00:00 2001
From: Landon Cox <landon.cox@microsoft.com>
Date: Thu, 18 Jun 2026 20:12:24 -0700
Subject: [PATCH 3/3] test(smoke-claude): raise max-turns/maxRuns from 2 to 5

The maxRuns:2 cap was too tight for the smoke prompt: the agent
routinely burns its 2 invocations on a planning turn plus a parallel
capability-probe before emitting its safe output, then hits the cap and
fails. Bump max-turns (which drives apiProxy.maxRuns) to 5 so the smoke
test has headroom to complete. Recompiled the lock file and updated the
workflow test assertions accordingly.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .github/workflows/smoke-claude.lock.yml  | 8 ++++----
 .github/workflows/smoke-claude.md        | 2 +-
 scripts/ci/smoke-claude-workflow.test.ts | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/smoke-claude.lock.yml b/.github/workflows/smoke-claude.lock.yml
index 497086848..77736fdfb 100644
--- a/.github/workflows/smoke-claude.lock.yml
+++ b/.github/workflows/smoke-claude.lock.yml
@@ -1,4 +1,4 @@
-# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"1931d05a82aa65b2b1d5af50c9dcde1453044c61ac1c0718031eb2eca5c6b046","body_hash":"61fdfb929477edfef0935407ef5e3016122fdda0a2bc1fb9e82755c7dbbeb886","compiler_version":"v0.79.6","agent_id":"claude","agent_model":"claude-haiku-4-5","engine_versions":{"claude":"2.1.168"}}
+# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"56c81b9f5ed7b54bd55d1b6a753f29387b124ed89f38bff44c19925629b1468f","body_hash":"61fdfb929477edfef0935407ef5e3016122fdda0a2bc1fb9e82755c7dbbeb886","compiler_version":"v0.79.6","agent_id":"claude","agent_model":"claude-haiku-4-5","engine_versions":{"claude":"2.1.168"}}
 # gh-aw-manifest: {"version":1,"secrets":["ANTHROPIC_API_KEY","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GITHUB_TOKEN"],"actions":[{"repo":"actions/checkout","sha":"df4cb1c069e1874edd31b4311f1884172cec0e10","version":"v6.0.3"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9.0.0"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"},{"repo":"github/gh-aw-actions/setup","sha":"5c2fe865bb4dc46e1450f6ee0d0541d759aea73a","version":"v0.79.6"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.27.2","digest":"sha256:f88e5b17b6b7a600117bc121114d6ce2155c88c983c0c939c5df884f730fa1d6","pinned_image":"ghcr.io/github/gh-aw-firewall/agent:0.27.2@sha256:f88e5b17b6b7a600117bc121114d6ce2155c88c983c0c939c5df884f730fa1d6"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.27.2","digest":"sha256:ee39841d980878ebbb87592903b06d31a1af500c71525c9616f7e8e2a27041a4","pinned_image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.27.2@sha256:ee39841d980878ebbb87592903b06d31a1af500c71525c9616f7e8e2a27041a4"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.27.2","digest":"sha256:2e3a717e5f19a654cd9a2263beb52012b56bcb68562ec5ae2e42f9d156b49591","pinned_image":"ghcr.io/github/gh-aw-firewall/squid:0.27.2@sha256:2e3a717e5f19a654cd9a2263beb52012b56bcb68562ec5ae2e42f9d156b49591"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.1","digest":"sha256:287fad0236959f3b3d9936ea1ef8d5b4f135ef2a5f5789713495cbbef191e60c","pinned_image":"ghcr.io/github/gh-aw-mcpg:v0.3.1@sha256:287fad0236959f3b3d9936ea1ef8d5b4f135ef2a5f5789713495cbbef191e60c"}]}
 #    ___                   _   _      
 #   / _ \                 | | (_)     
@@ -804,7 +804,7 @@ jobs:
           touch /tmp/gh-aw/agent-step-summary.md
           (umask 177 && touch /tmp/gh-aw/agent-stdio.log)
           GH_AW_MAX_AI_CREDITS="${{ vars.GH_AW_DEFAULT_MAX_AI_CREDITS || '1000' }}"
-          printf '%s\n' "{\"\$schema\":\"https://github.com/github/gh-aw-firewall/releases/download/v0.27.2/awf-config.schema.json\",\"network\":{\"allowDomains\":[\"*.githubusercontent.com\",\"anthropic.com\",\"api.anthropic.com\",\"api.github.com\",\"api.snapcraft.io\",\"archive.ubuntu.com\",\"azure.archive.ubuntu.com\",\"cdn.playwright.dev\",\"codeload.github.com\",\"crl.geotrust.com\",\"crl.globalsign.com\",\"crl.identrust.com\",\"crl.sectigo.com\",\"crl.thawte.com\",\"crl.usertrust.com\",\"crl.verisign.com\",\"crl3.digicert.com\",\"crl4.digicert.com\",\"crls.ssl.com\",\"files.pythonhosted.org\",\"ghcr.io\",\"github-cloud.githubusercontent.com\",\"github-cloud.s3.amazonaws.com\",\"github.com\",\"host.docker.internal\",\"json-schema.org\",\"json.schemastore.org\",\"keyserver.ubuntu.com\",\"lfs.github.com\",\"objects.githubusercontent.com\",\"ocsp.digicert.com\",\"ocsp.geotrust.com\",\"ocsp.globalsign.com\",\"ocsp.identrust.com\",\"ocsp.sectigo.com\",\"ocsp.ssl.com\",\"ocsp.thawte.com\",\"ocsp.usertrust.com\",\"ocsp.verisign.com\",\"packagecloud.io\",\"packages.cloud.google.com\",\"packages.microsoft.com\",\"playwright.download.prss.microsoft.com\",\"ppa.launchpad.net\",\"pypi.org\",\"raw.githubusercontent.com\",\"registry.npmjs.org\",\"s.symcb.com\",\"s.symcd.com\",\"security.ubuntu.com\",\"sentry.io\",\"statsig.anthropic.com\",\"ts-crl.ws.symantec.com\",\"ts-ocsp.ws.symantec.com\",\"www.googleapis.com\"]},\"apiProxy\":{\"enabled\":true,\"enableTokenSteering\":true,\"maxRuns\":2,\"maxAiCredits\":${GH_AW_MAX_AI_CREDITS},\"models\":{\"agent\":[\"sonnet-6x\",\"gpt-5.4\",\"gpt-5.3\",\"gemini-pro\",\"any\"],\"antigravity\":[\"copilot/antigravity*\",\"google/antigravity*\",\"gemini/antigravity*\"],\"any\":[\"copilot/*\",\"anthropic/*\",\"openai/*\",\"google/*\",\"gemini/*\"],\"claude\":[\"agent\"],\"codex\":[\"agent\"],\"coding\":[\"copilot/gpt-5*codex*\",\"openai/gpt-5*codex*\",\"gpt-5-codex\"],\"computer-use\":[\"copilot/*computer-use*\",\"google/*computer-use*\",\"
gemini/*computer-use*\",\"openai/*computer-use*\"],\"copilot\":[\"agent\"],\"deep-research\":[\"copilot/deep-research*\",\"copilot/o3-deep-research*\",\"copilot/o4-mini-deep-research*\",\"google/deep-research*\",\"gemini/deep-research*\",\"openai/o3-deep-research*\",\"openai/o4-mini-deep-research*\"],\"gemini\":[\"agent\"],\"gemini-3-flash\":[\"copilot/gemini-3*flash*\",\"google/gemini-3*flash*\",\"gemini/gemini-3*flash*\"],\"gemini-3-pro\":[\"copilot/gemini-3*pro*\",\"google/gemini-3*pro*\",\"google/nano-banana*\",\"gemini/gemini-3*pro*\"],\"gemini-3.1-flash\":[\"copilot/gemini-3.1*flash*\",\"google/gemini-3.1*flash*\",\"gemini/gemini-3.1*flash*\"],\"gemini-3.1-pro\":[\"copilot/gemini-3.1*pro*\",\"google/gemini-3.1*pro*\",\"gemini/gemini-3.1*pro*\"],\"gemini-3.5-flash\":[\"copilot/gemini-3.5*flash*\",\"google/gemini-3.5*flash*\",\"gemini/gemini-3.5*flash*\"],\"gemini-flash\":[\"copilot/gemini-*flash*\",\"google/gemini-*flash*\",\"gemini/gemini-*flash*\"],\"gemini-flash-lite\":[\"copilot/gemini-*flash*lite*\",\"google/gemini-*flash*lite*\",\"gemini/gemini-*flash*lite*\"],\"gemini-pro\":[\"copilot/gemini-*pro*\",\"google/gemini-*pro*\",\"gemini/gemini-*pro*\"],\"gemma\":[\"copilot/gemma*\",\"google/gemma*\",\"gemini/gemma*\"],\"gpt-5\":[\"copilot/gpt-5*\",\"openai/gpt-5*\"],\"gpt-5-codex\":[\"copilot/gpt-5*codex*\",\"openai/gpt-5*codex*\"],\"gpt-5-mini\":[\"copilot/gpt-5*mini*\",\"openai/gpt-5*mini*\"],\"gpt-5-nano\":[\"copilot/gpt-5*nano*\",\"openai/gpt-5*nano*\"],\"gpt-5-pro\":[\"copilot/gpt-5*pro*\",\"openai/gpt-5*pro*\"],\"gpt-5.2\":[\"copilot/gpt-5.2*\",\"openai/gpt-5.2*\"],\"gpt-5.3\":[\"copilot/gpt-5.3*\",\"openai/gpt-5.3*\"],\"gpt-5.4\":[\"copilot/gpt-5.4*\",\"openai/gpt-5.4*\"],\"gpt-5.5\":[\"copilot/gpt-5.5*\",\"openai/gpt-5.5*\"],\"haiku\":[\"copilot/*haiku*\",\"anthropic/*haiku*\"],\"large\":[\"sonnet\",\"gpt-5-pro\",\"gpt-5\",\"gemini-pro\"],\"mai-code\":[\"copilot/MAI-Code*\",\"copilot/mai-code*\",\"openai/MAI-Code*\"],\"mini\":[\"haiku\",\"gpt-5-mini\"
,\"gpt-5-nano\",\"gemini-flash-lite\"],\"nano-banana\":[\"copilot/nano-banana*\",\"google/nano-banana*\",\"gemini/nano-banana*\"],\"opus\":[\"copilot/*opus*\",\"anthropic/*opus*\"],\"opusplan\":[\"opus?effort=high\"],\"reasoning\":[\"copilot/o1*\",\"copilot/o3*\",\"copilot/o4*\",\"openai/o1*\",\"openai/o3*\",\"openai/o4*\"],\"robotics\":[\"copilot/*robotics*\",\"google/*robotics*\",\"gemini/*robotics*\"],\"small\":[\"mini\"],\"small-agent\":[\"haiku\",\"gpt-5-mini\",\"gemini-flash\"],\"sonnet\":[\"copilot/*sonnet*\",\"anthropic/*sonnet*\"],\"sonnet-6x\":[\"copilot/*sonnet-4.5*\",\"copilot/*sonnet-4.6*\",\"copilot/*sonnet-4-5-*\",\"anthropic/*sonnet-4-5-*\",\"copilot/*sonnet-4-6*\",\"anthropic/*sonnet-4-6*\"],\"summarization\":[\"haiku\",\"gpt-5-mini\",\"gemini-flash-lite\",\"mini\"],\"vision\":[\"copilot/gemini-*image*\",\"gemini/gemini-*image*\",\"copilot/gemini-*flash*\",\"gemini/gemini-*flash*\"]}},\"container\":{\"imageTag\":\"0.27.2,squid=sha256:2e3a717e5f19a654cd9a2263beb52012b56bcb68562ec5ae2e42f9d156b49591,agent=sha256:f88e5b17b6b7a600117bc121114d6ce2155c88c983c0c939c5df884f730fa1d6,api-proxy=sha256:ee39841d980878ebbb87592903b06d31a1af500c71525c9616f7e8e2a27041a4,cli-proxy=sha256:02f3ec08f32dc26c5427920c6a2e2f3036238fce44802f2f11ef49ed8621b5d0\"}}" > "${RUNNER_TEMP}/gh-aw/awf-config.json"
+          printf '%s\n' "{\"\$schema\":\"https://github.com/github/gh-aw-firewall/releases/download/v0.27.2/awf-config.schema.json\",\"network\":{\"allowDomains\":[\"*.githubusercontent.com\",\"anthropic.com\",\"api.anthropic.com\",\"api.github.com\",\"api.snapcraft.io\",\"archive.ubuntu.com\",\"azure.archive.ubuntu.com\",\"cdn.playwright.dev\",\"codeload.github.com\",\"crl.geotrust.com\",\"crl.globalsign.com\",\"crl.identrust.com\",\"crl.sectigo.com\",\"crl.thawte.com\",\"crl.usertrust.com\",\"crl.verisign.com\",\"crl3.digicert.com\",\"crl4.digicert.com\",\"crls.ssl.com\",\"files.pythonhosted.org\",\"ghcr.io\",\"github-cloud.githubusercontent.com\",\"github-cloud.s3.amazonaws.com\",\"github.com\",\"host.docker.internal\",\"json-schema.org\",\"json.schemastore.org\",\"keyserver.ubuntu.com\",\"lfs.github.com\",\"objects.githubusercontent.com\",\"ocsp.digicert.com\",\"ocsp.geotrust.com\",\"ocsp.globalsign.com\",\"ocsp.identrust.com\",\"ocsp.sectigo.com\",\"ocsp.ssl.com\",\"ocsp.thawte.com\",\"ocsp.usertrust.com\",\"ocsp.verisign.com\",\"packagecloud.io\",\"packages.cloud.google.com\",\"packages.microsoft.com\",\"playwright.download.prss.microsoft.com\",\"ppa.launchpad.net\",\"pypi.org\",\"raw.githubusercontent.com\",\"registry.npmjs.org\",\"s.symcb.com\",\"s.symcd.com\",\"security.ubuntu.com\",\"sentry.io\",\"statsig.anthropic.com\",\"ts-crl.ws.symantec.com\",\"ts-ocsp.ws.symantec.com\",\"www.googleapis.com\"]},\"apiProxy\":{\"enabled\":true,\"enableTokenSteering\":true,\"maxRuns\":5,\"maxAiCredits\":${GH_AW_MAX_AI_CREDITS},\"models\":{\"agent\":[\"sonnet-6x\",\"gpt-5.4\",\"gpt-5.3\",\"gemini-pro\",\"any\"],\"antigravity\":[\"copilot/antigravity*\",\"google/antigravity*\",\"gemini/antigravity*\"],\"any\":[\"copilot/*\",\"anthropic/*\",\"openai/*\",\"google/*\",\"gemini/*\"],\"claude\":[\"agent\"],\"codex\":[\"agent\"],\"coding\":[\"copilot/gpt-5*codex*\",\"openai/gpt-5*codex*\",\"gpt-5-codex\"],\"computer-use\":[\"copilot/*computer-use*\",\"google/*computer-use*\",\"
gemini/*computer-use*\",\"openai/*computer-use*\"],\"copilot\":[\"agent\"],\"deep-research\":[\"copilot/deep-research*\",\"copilot/o3-deep-research*\",\"copilot/o4-mini-deep-research*\",\"google/deep-research*\",\"gemini/deep-research*\",\"openai/o3-deep-research*\",\"openai/o4-mini-deep-research*\"],\"gemini\":[\"agent\"],\"gemini-3-flash\":[\"copilot/gemini-3*flash*\",\"google/gemini-3*flash*\",\"gemini/gemini-3*flash*\"],\"gemini-3-pro\":[\"copilot/gemini-3*pro*\",\"google/gemini-3*pro*\",\"google/nano-banana*\",\"gemini/gemini-3*pro*\"],\"gemini-3.1-flash\":[\"copilot/gemini-3.1*flash*\",\"google/gemini-3.1*flash*\",\"gemini/gemini-3.1*flash*\"],\"gemini-3.1-pro\":[\"copilot/gemini-3.1*pro*\",\"google/gemini-3.1*pro*\",\"gemini/gemini-3.1*pro*\"],\"gemini-3.5-flash\":[\"copilot/gemini-3.5*flash*\",\"google/gemini-3.5*flash*\",\"gemini/gemini-3.5*flash*\"],\"gemini-flash\":[\"copilot/gemini-*flash*\",\"google/gemini-*flash*\",\"gemini/gemini-*flash*\"],\"gemini-flash-lite\":[\"copilot/gemini-*flash*lite*\",\"google/gemini-*flash*lite*\",\"gemini/gemini-*flash*lite*\"],\"gemini-pro\":[\"copilot/gemini-*pro*\",\"google/gemini-*pro*\",\"gemini/gemini-*pro*\"],\"gemma\":[\"copilot/gemma*\",\"google/gemma*\",\"gemini/gemma*\"],\"gpt-5\":[\"copilot/gpt-5*\",\"openai/gpt-5*\"],\"gpt-5-codex\":[\"copilot/gpt-5*codex*\",\"openai/gpt-5*codex*\"],\"gpt-5-mini\":[\"copilot/gpt-5*mini*\",\"openai/gpt-5*mini*\"],\"gpt-5-nano\":[\"copilot/gpt-5*nano*\",\"openai/gpt-5*nano*\"],\"gpt-5-pro\":[\"copilot/gpt-5*pro*\",\"openai/gpt-5*pro*\"],\"gpt-5.2\":[\"copilot/gpt-5.2*\",\"openai/gpt-5.2*\"],\"gpt-5.3\":[\"copilot/gpt-5.3*\",\"openai/gpt-5.3*\"],\"gpt-5.4\":[\"copilot/gpt-5.4*\",\"openai/gpt-5.4*\"],\"gpt-5.5\":[\"copilot/gpt-5.5*\",\"openai/gpt-5.5*\"],\"haiku\":[\"copilot/*haiku*\",\"anthropic/*haiku*\"],\"large\":[\"sonnet\",\"gpt-5-pro\",\"gpt-5\",\"gemini-pro\"],\"mai-code\":[\"copilot/MAI-Code*\",\"copilot/mai-code*\",\"openai/MAI-Code*\"],\"mini\":[\"haiku\",\"gpt-5-mini\"
,\"gpt-5-nano\",\"gemini-flash-lite\"],\"nano-banana\":[\"copilot/nano-banana*\",\"google/nano-banana*\",\"gemini/nano-banana*\"],\"opus\":[\"copilot/*opus*\",\"anthropic/*opus*\"],\"opusplan\":[\"opus?effort=high\"],\"reasoning\":[\"copilot/o1*\",\"copilot/o3*\",\"copilot/o4*\",\"openai/o1*\",\"openai/o3*\",\"openai/o4*\"],\"robotics\":[\"copilot/*robotics*\",\"google/*robotics*\",\"gemini/*robotics*\"],\"small\":[\"mini\"],\"small-agent\":[\"haiku\",\"gpt-5-mini\",\"gemini-flash\"],\"sonnet\":[\"copilot/*sonnet*\",\"anthropic/*sonnet*\"],\"sonnet-6x\":[\"copilot/*sonnet-4.5*\",\"copilot/*sonnet-4.6*\",\"copilot/*sonnet-4-5-*\",\"anthropic/*sonnet-4-5-*\",\"copilot/*sonnet-4-6*\",\"anthropic/*sonnet-4-6*\"],\"summarization\":[\"haiku\",\"gpt-5-mini\",\"gemini-flash-lite\",\"mini\"],\"vision\":[\"copilot/gemini-*image*\",\"gemini/gemini-*image*\",\"copilot/gemini-*flash*\",\"gemini/gemini-*flash*\"]}},\"container\":{\"imageTag\":\"0.27.2,squid=sha256:2e3a717e5f19a654cd9a2263beb52012b56bcb68562ec5ae2e42f9d156b49591,agent=sha256:f88e5b17b6b7a600117bc121114d6ce2155c88c983c0c939c5df884f730fa1d6,api-proxy=sha256:ee39841d980878ebbb87592903b06d31a1af500c71525c9616f7e8e2a27041a4,cli-proxy=sha256:02f3ec08f32dc26c5427920c6a2e2f3036238fce44802f2f11ef49ed8621b5d0\"}}" > "${RUNNER_TEMP}/gh-aw/awf-config.json"
           GH_AW_MODEL_MULTIPLIERS_PATH="/tmp/gh-aw/model_multipliers.json" node "${RUNNER_TEMP}/gh-aw/actions/merge_awf_model_multipliers.cjs"
           cp "${RUNNER_TEMP}/gh-aw/awf-config.json" /tmp/gh-aw/awf-config.json
           export GH_AW_MODELS_JSON_PATH="/tmp/gh-aw/models.json"
@@ -823,7 +823,7 @@ jobs:
           fi
           # shellcheck disable=SC1003
           sudo -E awf --config "${RUNNER_TEMP}/gh-aw/awf-config.json" --container-workdir "${GITHUB_WORKSPACE}" --mount "${RUNNER_TEMP}/gh-aw:${RUNNER_TEMP}/gh-aw:ro" --mount "${RUNNER_TEMP}/gh-aw:/host${RUNNER_TEMP}/gh-aw:ro" ${GH_AW_TOOL_CACHE_MOUNT:+--mount "$GH_AW_TOOL_CACHE_MOUNT"} ${GH_AW_DOCKER_HOST_PATH_PREFIX_ARGS} --tty --env-all --exclude-env ANTHROPIC_API_KEY --exclude-env MCP_GATEWAY_API_KEY --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --audit-dir /tmp/gh-aw/sandbox/firewall/audit --session-state-dir /tmp/gh-aw/sandbox/agent/session-state --enable-host-access --allow-host-ports 80,443,8080 --build-local \
-            -- /bin/bash -c 'set +o histexpand; export PATH="${RUNNER_TEMP}/gh-aw/mcp-cli/bin:$PATH" && GH_AW_TOOL_CACHE="${RUNNER_TOOL_CACHE:-/opt/hostedtoolcache}"; export PATH="$(find "$GH_AW_TOOL_CACHE" /opt/hostedtoolcache /home/runner/work/_tool -maxdepth 5 -type d -name bin 2>/dev/null | tr '\''\n'\'' '\'':'\'')$PATH"; [ -n "$GOROOT" ] && export PATH="$GOROOT/bin:$PATH" || true && GH_AW_NODE_EXEC="${GH_AW_NODE_BIN:-}"; if [ -z "$GH_AW_NODE_EXEC" ] || [ ! -x "$GH_AW_NODE_EXEC" ]; then GH_AW_NODE_EXEC="$(command -v node 2>/dev/null || true)"; fi; if [ -z "$GH_AW_NODE_EXEC" ]; then echo "node runtime missing on this runner — check runtimes.node in workflow YAML" >&2; exit 127; fi; GH_AW_NPM_GLOBAL_ROOT="$(npm root -g 2>/dev/null || true)"; if [ -n "$GH_AW_NPM_GLOBAL_ROOT" ]; then export NODE_PATH="${GH_AW_NPM_GLOBAL_ROOT}${NODE_PATH:+:${NODE_PATH}}"; fi; "$GH_AW_NODE_EXEC" ${RUNNER_TEMP}/gh-aw/actions/claude_harness.cjs claude --print --no-chrome --max-turns 2 --allowed-tools '\''Bash(bash),Bash(cat),Bash(date),Bash(echo),Bash(grep),Bash(head),Bash(ls),Bash(printf),Bash(pwd),Bash(safeoutputs:*),Bash(sort),Bash(tail),Bash(uniq),Bash(wc),Bash(yq),BashOutput,Edit,Edit(/tmp/*),Edit(/tmp/gh-aw/agent/*),ExitPlanMode,Glob,Grep,KillBash,LS,MultiEdit,MultiEdit(/tmp/*),MultiEdit(/tmp/gh-aw/agent/*),NotebookEdit,NotebookRead,Read,Read(/tmp/*),Read(/tmp/gh-aw/agent/*),Task,TodoWrite,Write,Write(/tmp/*),Write(/tmp/gh-aw/agent/*),mcp__safeoutputs'\'' --debug-file /tmp/gh-aw/agent-stdio.log --verbose --permission-mode acceptEdits --output-format stream-json --mcp-config "${RUNNER_TEMP}/gh-aw/mcp-config/mcp-servers.json" --prompt-file /tmp/gh-aw/aw-prompts/prompt.txt' 2>&1 | tee -a /tmp/gh-aw/agent-stdio.log
+            -- /bin/bash -c 'set +o histexpand; export PATH="${RUNNER_TEMP}/gh-aw/mcp-cli/bin:$PATH" && GH_AW_TOOL_CACHE="${RUNNER_TOOL_CACHE:-/opt/hostedtoolcache}"; export PATH="$(find "$GH_AW_TOOL_CACHE" /opt/hostedtoolcache /home/runner/work/_tool -maxdepth 5 -type d -name bin 2>/dev/null | tr '\''\n'\'' '\'':'\'')$PATH"; [ -n "$GOROOT" ] && export PATH="$GOROOT/bin:$PATH" || true && GH_AW_NODE_EXEC="${GH_AW_NODE_BIN:-}"; if [ -z "$GH_AW_NODE_EXEC" ] || [ ! -x "$GH_AW_NODE_EXEC" ]; then GH_AW_NODE_EXEC="$(command -v node 2>/dev/null || true)"; fi; if [ -z "$GH_AW_NODE_EXEC" ]; then echo "node runtime missing on this runner — check runtimes.node in workflow YAML" >&2; exit 127; fi; GH_AW_NPM_GLOBAL_ROOT="$(npm root -g 2>/dev/null || true)"; if [ -n "$GH_AW_NPM_GLOBAL_ROOT" ]; then export NODE_PATH="${GH_AW_NPM_GLOBAL_ROOT}${NODE_PATH:+:${NODE_PATH}}"; fi; "$GH_AW_NODE_EXEC" ${RUNNER_TEMP}/gh-aw/actions/claude_harness.cjs claude --print --no-chrome --max-turns 5 --allowed-tools '\''Bash(bash),Bash(cat),Bash(date),Bash(echo),Bash(grep),Bash(head),Bash(ls),Bash(printf),Bash(pwd),Bash(safeoutputs:*),Bash(sort),Bash(tail),Bash(uniq),Bash(wc),Bash(yq),BashOutput,Edit,Edit(/tmp/*),Edit(/tmp/gh-aw/agent/*),ExitPlanMode,Glob,Grep,KillBash,LS,MultiEdit,MultiEdit(/tmp/*),MultiEdit(/tmp/gh-aw/agent/*),NotebookEdit,NotebookRead,Read,Read(/tmp/*),Read(/tmp/gh-aw/agent/*),Task,TodoWrite,Write,Write(/tmp/*),Write(/tmp/gh-aw/agent/*),mcp__safeoutputs'\'' --debug-file /tmp/gh-aw/agent-stdio.log --verbose --permission-mode acceptEdits --output-format stream-json --mcp-config "${RUNNER_TEMP}/gh-aw/mcp-config/mcp-servers.json" --prompt-file /tmp/gh-aw/aw-prompts/prompt.txt' 2>&1 | tee -a /tmp/gh-aw/agent-stdio.log
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           ANTHROPIC_MODEL: claude-haiku-4-5
@@ -833,7 +833,7 @@ jobs:
           DISABLE_BUG_COMMAND: 1
           DISABLE_ERROR_REPORTING: 1
           DISABLE_TELEMETRY: 1
-          GH_AW_MAX_TURNS: 2
+          GH_AW_MAX_TURNS: 5
           GH_AW_MCP_CONFIG: ${{ runner.temp }}/gh-aw/mcp-config/mcp-servers.json
           GH_AW_PHASE: agent
           GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
diff --git a/.github/workflows/smoke-claude.md b/.github/workflows/smoke-claude.md
index 5b1b1c538..fd7dd4cf4 100644
--- a/.github/workflows/smoke-claude.md
+++ b/.github/workflows/smoke-claude.md
@@ -15,7 +15,7 @@ permissions:
   pull-requests: read
   
 name: Smoke Claude
-max-turns: 2
+max-turns: 5
 engine:
   id: claude
   model: claude-haiku-4-5
diff --git a/scripts/ci/smoke-claude-workflow.test.ts b/scripts/ci/smoke-claude-workflow.test.ts
index 235ec431a..98d9861c2 100644
--- a/scripts/ci/smoke-claude-workflow.test.ts
+++ b/scripts/ci/smoke-claude-workflow.test.ts
@@ -9,7 +9,7 @@ describe('smoke claude workflow optimization config', () => {
   it('uses pre-computed result step and minimal turn budget in source workflow', () => {
     const source = fs.readFileSync(smokeClaudeSourcePath, 'utf-8');
 
-    expect(source).toContain('max-turns: 2');
+    expect(source).toContain('max-turns: 5');
     expect(source).toContain('Check GitHub.com reachability');
     expect(source).toContain('/tmp/gh-aw/agent/smoke-context.txt');
     expect(source).toContain('curl -fsSL --max-time 15 https://github.com');
@@ -35,10 +35,10 @@ describe('smoke claude workflow optimization config', () => {
     expect(source).not.toContain('safeoutputs add_labels . < /tmp/gh-aw/agent/labels.json');
   });
 
-  it('compiles the workflow without playwright tools and with max-turns 2', () => {
+  it('compiles the workflow without playwright tools and with max-turns 5', () => {
     const lock = fs.readFileSync(smokeClaudeLockPath, 'utf-8');
 
-    expect(lock).toContain('--max-turns 2');
+    expect(lock).toContain('--max-turns 5');
     expect(lock).toContain('Check GitHub.com reachability');
     expect(lock).toContain('playwright_check=✅ PASS');
     expect(lock).toContain('Compute final smoke result');