From 474d3838fe815b8a78e40d6da4aa092307a8c78e Mon Sep 17 00:00:00 2001 From: Landon Cox Date: Thu, 18 Jun 2026 12:56:24 -0700 Subject: [PATCH 1/3] fix(api-proxy): map OpenAI Responses API cached tokens to cache_read The token normalizer recognized cached prompt tokens from the Chat Completions API (usage.prompt_tokens_details.cached_tokens) and Anthropic (cache_read_input_tokens), but not the OpenAI Responses API (/responses), which reports them under usage.input_tokens_details.cached_tokens as an object property. Because extractCacheReadTokens only treated input_tokens_details as a token-entry array, Responses API cache reads silently fell through and were recorded as cache_read_tokens: 0. Agents using the /responses endpoint (e.g. codex) with heavy automatic prompt caching had their cache hits completely unreported, which also skews AI-credits accounting since the guard prices the non-cached input as input_tokens - cache_read_tokens. Fix extractCacheReadTokens to read input_tokens_details.cached_tokens directly. This covers both the buffered JSON and SSE streaming paths (both route through extractCacheReadTokens). Adds regression tests for the JSON, streaming, and normalizeUsage paths using the real Responses API usage shape. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- containers/api-proxy/token-parsers.js | 16 +++- .../api-proxy/token-tracker.parsing.test.js | 83 +++++++++++++++++++ 2 files changed, 97 insertions(+), 2 deletions(-) diff --git a/containers/api-proxy/token-parsers.js b/containers/api-proxy/token-parsers.js index 6cc72174..56d3691d 100644 --- a/containers/api-proxy/token-parsers.js +++ b/containers/api-proxy/token-parsers.js @@ -61,7 +61,8 @@ function extractReasoningTokens(usage) { * * Supports: * - Anthropic: usage.cache_read_input_tokens - * - OpenAI/Copilot: usage.prompt_tokens_details.cached_tokens + * - OpenAI Chat Completions / Copilot: usage.prompt_tokens_details.cached_tokens + * - OpenAI Responses API: usage.input_tokens_details.cached_tokens * - Token-entry arrays containing { token_type: "cache_read", token_count: } */ function extractCacheReadTokens(usage) { @@ -75,6 +76,15 @@ function extractCacheReadTokens(usage) { return usage.prompt_tokens_details.cached_tokens; } + // OpenAI Responses API (/responses) reports cached prompt tokens under + // `input_tokens_details.cached_tokens` (an object), rather than the Chat + // Completions `prompt_tokens_details.cached_tokens`. Without this branch the + // value falls through to the array loop below, which only handles token-entry + // arrays, so cache reads are silently dropped (reported as 0). + if (usage.input_tokens_details && typeof usage.input_tokens_details.cached_tokens === 'number') { + return usage.input_tokens_details.cached_tokens; + } + const tokenContainers = [ usage.prompt_tokens_details, usage.input_tokens_details, @@ -392,7 +402,9 @@ function parseSseDataLines(text) { * Output fields: * - input_tokens: number (from Anthropic input_tokens or OpenAI prompt_tokens) * - output_tokens: number (from Anthropic output_tokens or OpenAI completion_tokens) - * - cache_read_tokens: number (from Anthropic cache_read_input_tokens or OpenAI prompt_tokens_details.cached_tokens) + * - cache_read_tokens: number (from Anthropic cache_read_input_tokens, + * OpenAI Chat Completions prompt_tokens_details.cached_tokens, or + * OpenAI Responses API input_tokens_details.cached_tokens) * - cache_write_tokens: number (Anthropic cache_creation_input_tokens or * Copilot copilot_usage cache_write; not available in flattened OpenAI usage) */ diff --git a/containers/api-proxy/token-tracker.parsing.test.js b/containers/api-proxy/token-tracker.parsing.test.js index f9e728e9..c8849812 100644 --- a/containers/api-proxy/token-tracker.parsing.test.js +++ b/containers/api-proxy/token-tracker.parsing.test.js @@ -211,6 +211,39 @@ describe('extractUsageFromJson', () => { cache_read_input_tokens: 77, }); }); + + test('extracts OpenAI Responses API cached tokens from input_tokens_details.cached_tokens', () => { + // The real /responses endpoint (used by codex) reports cached prompt tokens + // under `input_tokens_details.cached_tokens`, not `prompt_tokens_details`. + const body = Buffer.from(JSON.stringify({ + type: 'response.completed', + response: { + id: 'resp_responses_cache', + model: 'gpt-5.4-mini', + usage: { + input_tokens: 707301, + output_tokens: 12096, + total_tokens: 719397, + input_tokens_details: { + cached_tokens: 672256, + }, + output_tokens_details: { + reasoning_tokens: 7715, + }, + }, + }, + })); + + const result = extractUsageFromJson(body); + expect(result.model).toBe('gpt-5.4-mini'); + expect(result.usage).toEqual({ + input_tokens: 707301, + output_tokens: 12096, + total_tokens: 719397, + reasoning_tokens: 7715, + cache_read_input_tokens: 672256, + }); + }); }); // ── extractUsageFromSseLine ─────────────────────────────────────────── @@ -343,6 +376,37 @@ describe('extractUsageFromSseLine', () => { }); }); + test('extracts cache tokens from OpenAI Responses API input_tokens_details (streaming)', () => { + // Real /responses streaming final event: cached tokens live under + // input_tokens_details.cached_tokens (object), not prompt_tokens_details. + const line = JSON.stringify({ + type: 'response.completed', + response: { + model: 'gpt-5.4-mini', + usage: { + input_tokens: 37484, + output_tokens: 619, + total_tokens: 38103, + input_tokens_details: { + cached_tokens: 34816, + }, + output_tokens_details: { + reasoning_tokens: 128, + }, + }, + }, + }); + + const result = extractUsageFromSseLine(line); + expect(result.usage).toEqual({ + input_tokens: 37484, + output_tokens: 619, + total_tokens: 38103, + reasoning_tokens: 128, + cache_read_input_tokens: 34816, + }); + }); + test('returns null for [DONE]', () => { const result = extractUsageFromSseLine('[DONE]'); expect(result.usage).toBeNull(); @@ -523,6 +587,25 @@ describe('normalizeUsage', () => { reasoning_tokens: 0, }); }); + + test('normalizes OpenAI Responses API cached_tokens via input_tokens_details.cached_tokens', () => { + const result = normalizeUsage({ + input_tokens: 707301, + output_tokens: 12096, + total_tokens: 719397, + input_tokens_details: { + cached_tokens: 672256, + }, + reasoning_tokens: 7715, + }); + expect(result).toEqual({ + input_tokens: 707301, + output_tokens: 12096, + cache_read_tokens: 672256, + cache_write_tokens: 0, + reasoning_tokens: 7715, + }); + }); }); // ── Copilot copilot_usage.token_details breakdown ───────────────────── From d8ce3806352fea4c9f1628a19f772aeddd372fc5 Mon Sep 17 00:00:00 2001 From: Landon Cox Date: Thu, 18 Jun 2026 13:06:47 -0700 Subject: [PATCH 2/3] test(api-proxy): cover Copilot /responses streaming cache reads Add a regression test reproducing the exact final-chunk shape from gh-aw run 27784259295: a Copilot `/responses` streaming response that arrives as a chat.completion.chunk carrying both prompt_tokens_details.cached_tokens and the authoritative per-type split in copilot_usage.token_details. That run reported cache_read_tokens: 0 despite ~1.43M cached reads across 28 requests; this locks in that the copilot_usage breakdown drives the exact input/cache_read split. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../api-proxy/token-tracker.parsing.test.js | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/containers/api-proxy/token-tracker.parsing.test.js b/containers/api-proxy/token-tracker.parsing.test.js index c8849812..1bce6729 100644 --- a/containers/api-proxy/token-tracker.parsing.test.js +++ b/containers/api-proxy/token-tracker.parsing.test.js @@ -454,6 +454,43 @@ describe('extractUsageFromSseLine', () => { cache_read_input_tokens: 43894, }); }); + + test('extracts Copilot /responses cache reads from copilot_usage.token_details (streaming)', () => { + // Regression for gh-aw run 27784259295: the Copilot /responses endpoint + // streams a chat.completion-shaped final chunk that carries both + // prompt_tokens_details.cached_tokens AND the authoritative per-type split + // in copilot_usage.token_details. The copilot_usage breakdown must win so + // the input/cache_read split (and cache_write, when present) is exact — + // the run had reported cache_read_tokens: 0 despite ~10.7K cached reads. + const line = JSON.stringify({ + object: 'chat.completion.chunk', + model: 'gpt-5.4-2026-03-05', + choices: [{ index: 0, delta: {}, finish_reason: 'stop' }], + usage: { + completion_tokens: 124, + prompt_tokens: 16601, + total_tokens: 16725, + prompt_tokens_details: { cached_tokens: 10752 }, + completion_tokens_details: { reasoning_tokens: 14 }, + }, + copilot_usage: { + token_details: [ + { token_count: 5849, token_type: 'input' }, + { token_count: 10752, token_type: 'cache_read' }, + { token_count: 124, token_type: 'output' }, + ], + }, + }); + + const normalized = normalizeUsage(extractUsageFromSseLine(line).usage); + expect(normalized).toEqual({ + input_tokens: 5849, + output_tokens: 124, + cache_read_tokens: 10752, + cache_write_tokens: 0, + reasoning_tokens: 14, + }); + }); }); // ── parseSseDataLines ───────────────────────────────────────────────── From d540f9acae1bf252c5e38e25e764f8a788b86b4e Mon Sep 17 00:00:00 2001 From: Landon Cox Date: Thu, 18 Jun 2026 13:16:33 -0700 Subject: [PATCH 3/3] test(api-proxy): data-driven Copilot /responses cache-read replay Replace the single Copilot /responses regression sample with a data-driven test.each over all 28 real requests captured from gh-aw run 27784259295 (chronological; cache reads grow as the prompt is re-sent). Each request asserts the exact input/cache_read/output split from the upstream copilot_usage.token_details, and that input + cache_read reconstructs the lumped prompt_tokens. A final aggregate test confirms the parser recovers the full 1,426,432 cache-read tokens that the run had reported as 0. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../api-proxy/token-tracker.parsing.test.js | 102 ++++++++++++++---- 1 file changed, 79 insertions(+), 23 deletions(-) diff --git a/containers/api-proxy/token-tracker.parsing.test.js b/containers/api-proxy/token-tracker.parsing.test.js index 1bce6729..a52e8054 100644 --- a/containers/api-proxy/token-tracker.parsing.test.js +++ b/containers/api-proxy/token-tracker.parsing.test.js @@ -455,40 +455,96 @@ describe('extractUsageFromSseLine', () => { }); }); - test('extracts Copilot /responses cache reads from copilot_usage.token_details (streaming)', () => { - // Regression for gh-aw run 27784259295: the Copilot /responses endpoint - // streams a chat.completion-shaped final chunk that carries both - // prompt_tokens_details.cached_tokens AND the authoritative per-type split - // in copilot_usage.token_details. The copilot_usage breakdown must win so - // the input/cache_read split (and cache_write, when present) is exact — - // the run had reported cache_read_tokens: 0 despite ~10.7K cached reads. - const line = JSON.stringify({ + // Regression for gh-aw run 27784259295: the Copilot /responses endpoint + // streams a chat.completion-shaped final chunk that carries both + // prompt_tokens_details.cached_tokens AND the authoritative per-type split + // in copilot_usage.token_details. The copilot_usage breakdown must win so + // the input/cache_read split is exact. That run reported cache_read_tokens: 0 + // on every request despite ~1.43M cached reads in aggregate. + // + // Each fixture below is a real request captured from the agent's process log + // for that run, in chronological order (cache reads grow as the prompt is + // re-sent). `input` + `cacheRead` === `promptTokens` for every entry. + describe('Copilot /responses streaming cache reads (run 27784259295)', () => { + const REQUESTS = [ + { promptTokens: 19158, completionTokens: 1304, cachedTokens: 0, reasoningTokens: 516, input: 19158, cacheRead: 0, output: 1304 }, + { promptTokens: 10852, completionTokens: 168, cachedTokens: 0, reasoningTokens: 94, input: 10852, cacheRead: 0, output: 168 }, + { promptTokens: 16601, completionTokens: 124, cachedTokens: 10752, reasoningTokens: 14, input: 5849, cacheRead: 10752, output: 124 }, + { promptTokens: 23055, completionTokens: 559, cachedTokens: 18944, reasoningTokens: 516, input: 4111, cacheRead: 18944, output: 559 }, + { promptTokens: 24429, completionTokens: 978, cachedTokens: 22528, reasoningTokens: 455, input: 1901, cacheRead: 22528, output: 978 }, + { promptTokens: 26055, completionTokens: 1405, cachedTokens: 24064, reasoningTokens: 904, input: 1991, cacheRead: 24064, output: 1405 }, + { promptTokens: 28551, completionTokens: 1306, cachedTokens: 25600, reasoningTokens: 941, input: 2951, cacheRead: 25600, output: 1306 }, + { promptTokens: 33145, completionTokens: 1636, cachedTokens: 28160, reasoningTokens: 938, input: 4985, cacheRead: 28160, output: 1636 }, + { promptTokens: 39144, completionTokens: 921, cachedTokens: 32768, reasoningTokens: 595, input: 6376, cacheRead: 32768, output: 921 }, + { promptTokens: 41728, completionTokens: 372, cachedTokens: 38912, reasoningTokens: 193, input: 2816, cacheRead: 38912, output: 372 }, + { promptTokens: 44382, completionTokens: 735, cachedTokens: 41472, reasoningTokens: 488, input: 2910, cacheRead: 41472, output: 735 }, + { promptTokens: 45677, completionTokens: 335, cachedTokens: 44032, reasoningTokens: 83, input: 1645, cacheRead: 44032, output: 335 }, + { promptTokens: 46386, completionTokens: 363, cachedTokens: 45568, reasoningTokens: 119, input: 818, cacheRead: 45568, output: 363 }, + { promptTokens: 48174, completionTokens: 376, cachedTokens: 46080, reasoningTokens: 139, input: 2094, cacheRead: 46080, output: 376 }, + { promptTokens: 48980, completionTokens: 211, cachedTokens: 47616, reasoningTokens: 62, input: 1364, cacheRead: 47616, output: 211 }, + { promptTokens: 65247, completionTokens: 424, cachedTokens: 48640, reasoningTokens: 313, input: 16607, cacheRead: 48640, output: 424 }, + { promptTokens: 68930, completionTokens: 267, cachedTokens: 65024, reasoningTokens: 114, input: 3906, cacheRead: 65024, output: 267 }, + { promptTokens: 69642, completionTokens: 138, cachedTokens: 68608, reasoningTokens: 24, input: 1034, cacheRead: 68608, output: 138 }, + { promptTokens: 75433, completionTokens: 138, cachedTokens: 69120, reasoningTokens: 22, input: 6313, cacheRead: 69120, output: 138 }, + { promptTokens: 78451, completionTokens: 131, cachedTokens: 75264, reasoningTokens: 73, input: 3187, cacheRead: 75264, output: 131 }, + { promptTokens: 78808, completionTokens: 56, cachedTokens: 78336, reasoningTokens: 0, input: 472, cacheRead: 78336, output: 56 }, + { promptTokens: 79128, completionTokens: 56, cachedTokens: 78336, reasoningTokens: 0, input: 792, cacheRead: 78336, output: 56 }, + { promptTokens: 79320, completionTokens: 2799, cachedTokens: 78848, reasoningTokens: 2522, input: 472, cacheRead: 78848, output: 2799 }, + { promptTokens: 82221, completionTokens: 3408, cachedTokens: 78848, reasoningTokens: 2243, input: 3373, cacheRead: 78848, output: 3408 }, + { promptTokens: 91547, completionTokens: 1400, cachedTokens: 81920, reasoningTokens: 1333, input: 9627, cacheRead: 81920, output: 1400 }, + { promptTokens: 93125, completionTokens: 201, cachedTokens: 91136, reasoningTokens: 113, input: 1989, cacheRead: 91136, output: 201 }, + { promptTokens: 93675, completionTokens: 423, cachedTokens: 92672, reasoningTokens: 366, input: 1003, cacheRead: 92672, output: 423 }, + { promptTokens: 94114, completionTokens: 161, cachedTokens: 93184, reasoningTokens: 60, input: 930, cacheRead: 93184, output: 161 }, + ]; + + const buildChunk = (r) => JSON.stringify({ object: 'chat.completion.chunk', model: 'gpt-5.4-2026-03-05', choices: [{ index: 0, delta: {}, finish_reason: 'stop' }], usage: { - completion_tokens: 124, - prompt_tokens: 16601, - total_tokens: 16725, - prompt_tokens_details: { cached_tokens: 10752 }, - completion_tokens_details: { reasoning_tokens: 14 }, + completion_tokens: r.completionTokens, + prompt_tokens: r.promptTokens, + total_tokens: r.promptTokens + r.completionTokens, + prompt_tokens_details: { cached_tokens: r.cachedTokens }, + completion_tokens_details: { reasoning_tokens: r.reasoningTokens }, }, copilot_usage: { token_details: [ - { token_count: 5849, token_type: 'input' }, - { token_count: 10752, token_type: 'cache_read' }, - { token_count: 124, token_type: 'output' }, + { token_count: r.input, token_type: 'input' }, + { token_count: r.cacheRead, token_type: 'cache_read' }, + { token_count: r.output, token_type: 'output' }, ], }, }); - const normalized = normalizeUsage(extractUsageFromSseLine(line).usage); - expect(normalized).toEqual({ - input_tokens: 5849, - output_tokens: 124, - cache_read_tokens: 10752, - cache_write_tokens: 0, - reasoning_tokens: 14, + test.each(REQUESTS)( + 'request prompt=$promptTokens recovers cache_read=$cacheRead', + (r) => { + const normalized = normalizeUsage(extractUsageFromSseLine(buildChunk(r)).usage); + expect(normalized).toEqual({ + input_tokens: r.input, + output_tokens: r.output, + cache_read_tokens: r.cacheRead, + cache_write_tokens: 0, + reasoning_tokens: r.reasoningTokens, + }); + // The copilot_usage split must reconstruct the lumped prompt_tokens. + expect(normalized.input_tokens + normalized.cache_read_tokens).toBe(r.promptTokens); + }, + ); + + test('recovers the full aggregate cache-read total the run reported as 0', () => { + const totals = REQUESTS.reduce( + (acc, r) => { + const n = normalizeUsage(extractUsageFromSseLine(buildChunk(r)).usage); + acc.cacheRead += n.cache_read_tokens; + acc.input += n.input_tokens; + return acc; + }, + { cacheRead: 0, input: 0 }, + ); + expect(totals.cacheRead).toBe(1426432); + expect(totals.input).toBe(119526); }); }); });