From 474d3838fe815b8a78e40d6da4aa092307a8c78e Mon Sep 17 00:00:00 2001
From: Landon Cox <landon.cox@microsoft.com>
Date: Thu, 18 Jun 2026 12:56:24 -0700
Subject: [PATCH 1/3] fix(api-proxy): map OpenAI Responses API cached tokens to
 cache_read

The token normalizer recognized cached prompt tokens from the OpenAI Chat
Completions API (usage.prompt_tokens_details.cached_tokens) and from
Anthropic (usage.cache_read_input_tokens), but not from the OpenAI
Responses API (/responses), which reports them as the cached_tokens
property of the usage.input_tokens_details object.

Because extractCacheReadTokens only treated input_tokens_details as a
token-entry array, Responses API cache reads silently fell through and were
recorded as cache_read_tokens: 0. Agents using the /responses endpoint
(e.g. codex) with heavy automatic prompt caching had their cache hits
completely unreported, which also skews AI-credits accounting since the
guard prices the non-cached input as input_tokens - cache_read_tokens.

Fix extractCacheReadTokens to read input_tokens_details.cached_tokens
directly. This covers both the buffered JSON and SSE streaming paths
(both route through extractCacheReadTokens). Adds regression tests for the
JSON, streaming, and normalizeUsage paths using the real Responses API
usage shape.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 containers/api-proxy/token-parsers.js         | 16 +++-
 .../api-proxy/token-tracker.parsing.test.js   | 83 +++++++++++++++++++
 2 files changed, 97 insertions(+), 2 deletions(-)
diff --git a/containers/api-proxy/token-parsers.js b/containers/api-proxy/token-parsers.js
index 6cc72174..56d3691d 100644
--- a/containers/api-proxy/token-parsers.js
+++ b/containers/api-proxy/token-parsers.js
@@ -61,7 +61,8 @@ function extractReasoningTokens(usage) {
  *
  * Supports:
  *  - Anthropic: usage.cache_read_input_tokens
- *  - OpenAI/Copilot: usage.prompt_tokens_details.cached_tokens
+ *  - OpenAI Chat Completions / Copilot: usage.prompt_tokens_details.cached_tokens
+ *  - OpenAI Responses API: usage.input_tokens_details.cached_tokens
  *  - Token-entry arrays containing { token_type: "cache_read", token_count: <n> }
  */
 function extractCacheReadTokens(usage) {
@@ -75,6 +76,15 @@ function extractCacheReadTokens(usage) {
     return usage.prompt_tokens_details.cached_tokens;
   }
 
+  // OpenAI Responses API (/responses) reports cached prompt tokens under
+  // `input_tokens_details.cached_tokens` (an object), rather than the Chat
+  // Completions `prompt_tokens_details.cached_tokens`. Without this branch the
+  // value falls through to the array loop below, which only handles token-entry
+  // arrays, so cache reads are silently dropped (reported as 0).
+  if (usage.input_tokens_details && typeof usage.input_tokens_details.cached_tokens === 'number') {
+    return usage.input_tokens_details.cached_tokens;
+  }
+
   const tokenContainers = [
     usage.prompt_tokens_details,
     usage.input_tokens_details,
@@ -392,7 +402,9 @@ function parseSseDataLines(text) {
  * Output fields:
  *   - input_tokens: number (from Anthropic input_tokens or OpenAI prompt_tokens)
  *   - output_tokens: number (from Anthropic output_tokens or OpenAI completion_tokens)
- *   - cache_read_tokens: number (from Anthropic cache_read_input_tokens or OpenAI prompt_tokens_details.cached_tokens)
+ *   - cache_read_tokens: number (from Anthropic cache_read_input_tokens,
+ *       OpenAI Chat Completions prompt_tokens_details.cached_tokens, or
+ *       OpenAI Responses API input_tokens_details.cached_tokens)
  *   - cache_write_tokens: number (Anthropic cache_creation_input_tokens or
  *       Copilot copilot_usage cache_write; not available in flattened OpenAI usage)
  */
diff --git a/containers/api-proxy/token-tracker.parsing.test.js b/containers/api-proxy/token-tracker.parsing.test.js
index f9e728e9..c8849812 100644
--- a/containers/api-proxy/token-tracker.parsing.test.js
+++ b/containers/api-proxy/token-tracker.parsing.test.js
@@ -211,6 +211,39 @@ describe('extractUsageFromJson', () => {
       cache_read_input_tokens: 77,
     });
   });
+
+  test('extracts OpenAI Responses API cached tokens from input_tokens_details.cached_tokens', () => {
+    // The real /responses endpoint (used by codex) reports cached prompt tokens
+    // under `input_tokens_details.cached_tokens`, not `prompt_tokens_details`.
+    const body = Buffer.from(JSON.stringify({
+      type: 'response.completed',
+      response: {
+        id: 'resp_responses_cache',
+        model: 'gpt-5.4-mini',
+        usage: {
+          input_tokens: 707301,
+          output_tokens: 12096,
+          total_tokens: 719397,
+          input_tokens_details: {
+            cached_tokens: 672256,
+          },
+          output_tokens_details: {
+            reasoning_tokens: 7715,
+          },
+        },
+      },
+    }));
+
+    const result = extractUsageFromJson(body);
+    expect(result.model).toBe('gpt-5.4-mini');
+    expect(result.usage).toEqual({
+      input_tokens: 707301,
+      output_tokens: 12096,
+      total_tokens: 719397,
+      reasoning_tokens: 7715,
+      cache_read_input_tokens: 672256,
+    });
+  });
 });
 
 // ── extractUsageFromSseLine ───────────────────────────────────────────
@@ -343,6 +376,37 @@ describe('extractUsageFromSseLine', () => {
     });
   });
 
+  test('extracts cache tokens from OpenAI Responses API input_tokens_details (streaming)', () => {
+    // Real /responses streaming final event: cached tokens live under
+    // input_tokens_details.cached_tokens (object), not prompt_tokens_details.
+    const line = JSON.stringify({
+      type: 'response.completed',
+      response: {
+        model: 'gpt-5.4-mini',
+        usage: {
+          input_tokens: 37484,
+          output_tokens: 619,
+          total_tokens: 38103,
+          input_tokens_details: {
+            cached_tokens: 34816,
+          },
+          output_tokens_details: {
+            reasoning_tokens: 128,
+          },
+        },
+      },
+    });
+
+    const result = extractUsageFromSseLine(line);
+    expect(result.usage).toEqual({
+      input_tokens: 37484,
+      output_tokens: 619,
+      total_tokens: 38103,
+      reasoning_tokens: 128,
+      cache_read_input_tokens: 34816,
+    });
+  });
+
   test('returns null for [DONE]', () => {
     const result = extractUsageFromSseLine('[DONE]');
     expect(result.usage).toBeNull();
@@ -523,6 +587,25 @@ describe('normalizeUsage', () => {
       reasoning_tokens: 0,
     });
   });
+
+  test('normalizes OpenAI Responses API cached_tokens via input_tokens_details.cached_tokens', () => {
+    const result = normalizeUsage({
+      input_tokens: 707301,
+      output_tokens: 12096,
+      total_tokens: 719397,
+      input_tokens_details: {
+        cached_tokens: 672256,
+      },
+      reasoning_tokens: 7715,
+    });
+    expect(result).toEqual({
+      input_tokens: 707301,
+      output_tokens: 12096,
+      cache_read_tokens: 672256,
+      cache_write_tokens: 0,
+      reasoning_tokens: 7715,
+    });
+  });
 });
 
 // ── Copilot copilot_usage.token_details breakdown ─────────────────────

From d8ce3806352fea4c9f1628a19f772aeddd372fc5 Mon Sep 17 00:00:00 2001
From: Landon Cox <landon.cox@microsoft.com>
Date: Thu, 18 Jun 2026 13:06:47 -0700
Subject: [PATCH 2/3] test(api-proxy): cover Copilot /responses streaming cache
 reads

Add a regression test reproducing the exact final-chunk shape from
gh-aw run 27784259295: a Copilot `/responses` streaming response that
arrives as a chat.completion.chunk carrying both
prompt_tokens_details.cached_tokens and the authoritative per-type split
in copilot_usage.token_details. That run reported cache_read_tokens: 0
despite ~1.43M cached reads across 28 requests; the new test pins down
that the copilot_usage breakdown drives the exact input/cache_read split.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../api-proxy/token-tracker.parsing.test.js   | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/containers/api-proxy/token-tracker.parsing.test.js b/containers/api-proxy/token-tracker.parsing.test.js
index c8849812..1bce6729 100644
--- a/containers/api-proxy/token-tracker.parsing.test.js
+++ b/containers/api-proxy/token-tracker.parsing.test.js
@@ -454,6 +454,43 @@ describe('extractUsageFromSseLine', () => {
       cache_read_input_tokens: 43894,
     });
   });
+
+  test('extracts Copilot /responses cache reads from copilot_usage.token_details (streaming)', () => {
+    // Regression for gh-aw run 27784259295: the Copilot /responses endpoint
+    // streams a chat.completion-shaped final chunk that carries both
+    // prompt_tokens_details.cached_tokens AND the authoritative per-type split
+    // in copilot_usage.token_details. The copilot_usage breakdown must win so
+    // the input/cache_read split (and cache_write, when present) is exact —
+    // the run had reported cache_read_tokens: 0 despite ~10.7K cached reads.
+    const line = JSON.stringify({
+      object: 'chat.completion.chunk',
+      model: 'gpt-5.4-2026-03-05',
+      choices: [{ index: 0, delta: {}, finish_reason: 'stop' }],
+      usage: {
+        completion_tokens: 124,
+        prompt_tokens: 16601,
+        total_tokens: 16725,
+        prompt_tokens_details: { cached_tokens: 10752 },
+        completion_tokens_details: { reasoning_tokens: 14 },
+      },
+      copilot_usage: {
+        token_details: [
+          { token_count: 5849, token_type: 'input' },
+          { token_count: 10752, token_type: 'cache_read' },
+          { token_count: 124, token_type: 'output' },
+        ],
+      },
+    });
+
+    const normalized = normalizeUsage(extractUsageFromSseLine(line).usage);
+    expect(normalized).toEqual({
+      input_tokens: 5849,
+      output_tokens: 124,
+      cache_read_tokens: 10752,
+      cache_write_tokens: 0,
+      reasoning_tokens: 14,
+    });
+  });
 });
 
 // ── parseSseDataLines ─────────────────────────────────────────────────

From d540f9acae1bf252c5e38e25e764f8a788b86b4e Mon Sep 17 00:00:00 2001
From: Landon Cox <landon.cox@microsoft.com>
Date: Thu, 18 Jun 2026 13:16:33 -0700
Subject: [PATCH 3/3] test(api-proxy): data-driven Copilot /responses
 cache-read replay

Replace the single Copilot /responses regression sample with a
data-driven test.each over all 28 real requests captured from gh-aw run
27784259295 (chronological; cache reads grow as the prompt is re-sent).
For each request, the test asserts the exact input/cache_read/output
split from the upstream copilot_usage.token_details, and that input +
cache_read adds back up to the lumped prompt_tokens. A final aggregate
test confirms the parser recovers the full 1,426,432 cache-read tokens
that the run had reported as 0.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../api-proxy/token-tracker.parsing.test.js   | 102 ++++++++++++++----
 1 file changed, 79 insertions(+), 23 deletions(-)

diff --git a/containers/api-proxy/token-tracker.parsing.test.js b/containers/api-proxy/token-tracker.parsing.test.js
index 1bce6729..a52e8054 100644
--- a/containers/api-proxy/token-tracker.parsing.test.js
+++ b/containers/api-proxy/token-tracker.parsing.test.js
@@ -455,40 +455,96 @@ describe('extractUsageFromSseLine', () => {
     });
   });
 
-  test('extracts Copilot /responses cache reads from copilot_usage.token_details (streaming)', () => {
-    // Regression for gh-aw run 27784259295: the Copilot /responses endpoint
-    // streams a chat.completion-shaped final chunk that carries both
-    // prompt_tokens_details.cached_tokens AND the authoritative per-type split
-    // in copilot_usage.token_details. The copilot_usage breakdown must win so
-    // the input/cache_read split (and cache_write, when present) is exact —
-    // the run had reported cache_read_tokens: 0 despite ~10.7K cached reads.
-    const line = JSON.stringify({
+  // Regression for gh-aw run 27784259295: the Copilot /responses endpoint
+  // streams a chat.completion-shaped final chunk that carries both
+  // prompt_tokens_details.cached_tokens AND the authoritative per-type split
+  // in copilot_usage.token_details. The copilot_usage breakdown must win so
+  // the input/cache_read split is exact. That run reported cache_read_tokens: 0
+  // on every request despite ~1.43M cached reads in aggregate.
+  //
+  // Each fixture below is a real request captured from the agent's process log
+  // for that run, in chronological order (cache reads grow as the prompt is
+  // re-sent). `input` + `cacheRead` === `promptTokens` for every entry.
+  describe('Copilot /responses streaming cache reads (run 27784259295)', () => {
+    const REQUESTS = [
+      { promptTokens: 19158, completionTokens: 1304, cachedTokens: 0, reasoningTokens: 516, input: 19158, cacheRead: 0, output: 1304 },
+      { promptTokens: 10852, completionTokens: 168, cachedTokens: 0, reasoningTokens: 94, input: 10852, cacheRead: 0, output: 168 },
+      { promptTokens: 16601, completionTokens: 124, cachedTokens: 10752, reasoningTokens: 14, input: 5849, cacheRead: 10752, output: 124 },
+      { promptTokens: 23055, completionTokens: 559, cachedTokens: 18944, reasoningTokens: 516, input: 4111, cacheRead: 18944, output: 559 },
+      { promptTokens: 24429, completionTokens: 978, cachedTokens: 22528, reasoningTokens: 455, input: 1901, cacheRead: 22528, output: 978 },
+      { promptTokens: 26055, completionTokens: 1405, cachedTokens: 24064, reasoningTokens: 904, input: 1991, cacheRead: 24064, output: 1405 },
+      { promptTokens: 28551, completionTokens: 1306, cachedTokens: 25600, reasoningTokens: 941, input: 2951, cacheRead: 25600, output: 1306 },
+      { promptTokens: 33145, completionTokens: 1636, cachedTokens: 28160, reasoningTokens: 938, input: 4985, cacheRead: 28160, output: 1636 },
+      { promptTokens: 39144, completionTokens: 921, cachedTokens: 32768, reasoningTokens: 595, input: 6376, cacheRead: 32768, output: 921 },
+      { promptTokens: 41728, completionTokens: 372, cachedTokens: 38912, reasoningTokens: 193, input: 2816, cacheRead: 38912, output: 372 },
+      { promptTokens: 44382, completionTokens: 735, cachedTokens: 41472, reasoningTokens: 488, input: 2910, cacheRead: 41472, output: 735 },
+      { promptTokens: 45677, completionTokens: 335, cachedTokens: 44032, reasoningTokens: 83, input: 1645, cacheRead: 44032, output: 335 },
+      { promptTokens: 46386, completionTokens: 363, cachedTokens: 45568, reasoningTokens: 119, input: 818, cacheRead: 45568, output: 363 },
+      { promptTokens: 48174, completionTokens: 376, cachedTokens: 46080, reasoningTokens: 139, input: 2094, cacheRead: 46080, output: 376 },
+      { promptTokens: 48980, completionTokens: 211, cachedTokens: 47616, reasoningTokens: 62, input: 1364, cacheRead: 47616, output: 211 },
+      { promptTokens: 65247, completionTokens: 424, cachedTokens: 48640, reasoningTokens: 313, input: 16607, cacheRead: 48640, output: 424 },
+      { promptTokens: 68930, completionTokens: 267, cachedTokens: 65024, reasoningTokens: 114, input: 3906, cacheRead: 65024, output: 267 },
+      { promptTokens: 69642, completionTokens: 138, cachedTokens: 68608, reasoningTokens: 24, input: 1034, cacheRead: 68608, output: 138 },
+      { promptTokens: 75433, completionTokens: 138, cachedTokens: 69120, reasoningTokens: 22, input: 6313, cacheRead: 69120, output: 138 },
+      { promptTokens: 78451, completionTokens: 131, cachedTokens: 75264, reasoningTokens: 73, input: 3187, cacheRead: 75264, output: 131 },
+      { promptTokens: 78808, completionTokens: 56, cachedTokens: 78336, reasoningTokens: 0, input: 472, cacheRead: 78336, output: 56 },
+      { promptTokens: 79128, completionTokens: 56, cachedTokens: 78336, reasoningTokens: 0, input: 792, cacheRead: 78336, output: 56 },
+      { promptTokens: 79320, completionTokens: 2799, cachedTokens: 78848, reasoningTokens: 2522, input: 472, cacheRead: 78848, output: 2799 },
+      { promptTokens: 82221, completionTokens: 3408, cachedTokens: 78848, reasoningTokens: 2243, input: 3373, cacheRead: 78848, output: 3408 },
+      { promptTokens: 91547, completionTokens: 1400, cachedTokens: 81920, reasoningTokens: 1333, input: 9627, cacheRead: 81920, output: 1400 },
+      { promptTokens: 93125, completionTokens: 201, cachedTokens: 91136, reasoningTokens: 113, input: 1989, cacheRead: 91136, output: 201 },
+      { promptTokens: 93675, completionTokens: 423, cachedTokens: 92672, reasoningTokens: 366, input: 1003, cacheRead: 92672, output: 423 },
+      { promptTokens: 94114, completionTokens: 161, cachedTokens: 93184, reasoningTokens: 60, input: 930, cacheRead: 93184, output: 161 },
+    ];
+
+    const buildChunk = (r) => JSON.stringify({
       object: 'chat.completion.chunk',
       model: 'gpt-5.4-2026-03-05',
       choices: [{ index: 0, delta: {}, finish_reason: 'stop' }],
       usage: {
-        completion_tokens: 124,
-        prompt_tokens: 16601,
-        total_tokens: 16725,
-        prompt_tokens_details: { cached_tokens: 10752 },
-        completion_tokens_details: { reasoning_tokens: 14 },
+        completion_tokens: r.completionTokens,
+        prompt_tokens: r.promptTokens,
+        total_tokens: r.promptTokens + r.completionTokens,
+        prompt_tokens_details: { cached_tokens: r.cachedTokens },
+        completion_tokens_details: { reasoning_tokens: r.reasoningTokens },
       },
       copilot_usage: {
         token_details: [
-          { token_count: 5849, token_type: 'input' },
-          { token_count: 10752, token_type: 'cache_read' },
-          { token_count: 124, token_type: 'output' },
+          { token_count: r.input, token_type: 'input' },
+          { token_count: r.cacheRead, token_type: 'cache_read' },
+          { token_count: r.output, token_type: 'output' },
         ],
       },
     });
 
-    const normalized = normalizeUsage(extractUsageFromSseLine(line).usage);
-    expect(normalized).toEqual({
-      input_tokens: 5849,
-      output_tokens: 124,
-      cache_read_tokens: 10752,
-      cache_write_tokens: 0,
-      reasoning_tokens: 14,
+    test.each(REQUESTS)(
+      'request prompt=$promptTokens recovers cache_read=$cacheRead',
+      (r) => {
+        const normalized = normalizeUsage(extractUsageFromSseLine(buildChunk(r)).usage);
+        expect(normalized).toEqual({
+          input_tokens: r.input,
+          output_tokens: r.output,
+          cache_read_tokens: r.cacheRead,
+          cache_write_tokens: 0,
+          reasoning_tokens: r.reasoningTokens,
+        });
+        // The copilot_usage split must reconstruct the lumped prompt_tokens.
+        expect(normalized.input_tokens + normalized.cache_read_tokens).toBe(r.promptTokens);
+      },
+    );
+
+    test('recovers the full aggregate cache-read total the run reported as 0', () => {
+      const totals = REQUESTS.reduce(
+        (acc, r) => {
+          const n = normalizeUsage(extractUsageFromSseLine(buildChunk(r)).usage);
+          acc.cacheRead += n.cache_read_tokens;
+          acc.input += n.input_tokens;
+          return acc;
+        },
+        { cacheRead: 0, input: 0 },
+      );
+      expect(totals.cacheRead).toBe(1426432);
+      expect(totals.input).toBe(119526);
     });
   });
 });