/**
 * Maps each Copilot `token_type` to the Anthropic-normalized usage field it
 * is accumulated into. A Map (rather than a plain object) is used so that an
 * unexpected `token_type` such as "constructor" or "__proto__" can never
 * match an inherited property.
 */
const COPILOT_TOKEN_TYPE_FIELDS = new Map([
  ['input', 'input_tokens'],
  ['output', 'output_tokens'],
  ['cache_read', 'cache_read_input_tokens'],
  ['cache_write', 'cache_creation_input_tokens'],
]);

/**
 * Extract the authoritative per-type token breakdown from a Copilot
 * `copilot_usage.token_details` array.
 *
 * The GitHub Copilot OpenAI-compatible endpoint reports a flattened
 * `usage` object where `prompt_tokens` lumps fresh input together with
 * cache-write tokens, and `prompt_tokens_details.cached_tokens` only
 * carries cache-read. The true split (input / cache_read / cache_write /
 * output), which is billed at distinct rates, is only available in the
 * sibling `copilot_usage.token_details` array, e.g.:
 *
 *   copilot_usage: { token_details: [
 *     { token_type: "input",       token_count: 3857 },
 *     { token_type: "cache_read",  token_count: 0 },
 *     { token_type: "cache_write", token_count: 12539 },
 *     { token_type: "output",      token_count: 362 },
 *   ] }
 *
 * `copilot_usage` is read from the top level of `json`, falling back to
 * `json.response.copilot_usage` when the top-level value is not an object.
 * Repeated token types are summed. Entries that are not objects, carry an
 * unrecognized `token_type`, or whose `token_count` is not a finite,
 * non-negative number are ignored, so a single malformed entry cannot turn a
 * total into NaN or push it below zero.
 *
 * @param {object} json - Parsed response JSON (or SSE event object)
 * @returns {object|null} Anthropic-normalized usage fields (any subset of
 *   input_tokens, output_tokens, cache_read_input_tokens,
 *   cache_creation_input_tokens), or null when no recognizable
 *   token_details are present.
 */
function extractCopilotUsageBreakdown(json) {
  if (!json || typeof json !== 'object') return null;

  const isObject = (value) => value !== null && typeof value === 'object';

  let copilotUsage = null;
  if (isObject(json.copilot_usage)) {
    copilotUsage = json.copilot_usage;
  } else if (json.response && isObject(json.response.copilot_usage)) {
    // Some payloads nest the usage under a `response` envelope.
    copilotUsage = json.response.copilot_usage;
  }
  if (!copilotUsage || !Array.isArray(copilotUsage.token_details)) return null;

  const breakdown = {};
  for (const entry of copilotUsage.token_details) {
    if (!isObject(entry)) continue;

    const field = COPILOT_TOKEN_TYPE_FIELDS.get(entry.token_type);
    if (field === undefined) continue;

    const count = entry.token_count;
    // Reject NaN / Infinity / negatives / non-numbers: token counts are
    // non-negative quantities and one bad value would corrupt the whole sum.
    if (typeof count !== 'number' || !Number.isFinite(count) || count < 0) continue;

    breakdown[field] = (breakdown[field] || 0) + count;
  }

  // A field is only ever set for a recognized, valid entry, so an empty
  // object means nothing usable was found.
  return Object.keys(breakdown).length > 0 ? breakdown : null;
}
+ merged.input_tokens = Math.max(0, merged.prompt_tokens - copilotBreakdown.cache_creation_input_tokens); + delete merged.prompt_tokens; + } + result.usage = merged; + } + return result; } catch { return { usage: null, model: null }; @@ -260,6 +345,20 @@ function extractUsageFromSseLine(line) { } const cacheReadTokens = extractCacheReadTokens(json.usage); if (typeof cacheReadTokens === 'number') result.usage.cache_read_input_tokens = cacheReadTokens; + const copilotBreakdown = extractCopilotUsageBreakdown(json); + if (copilotBreakdown) { + result.usage = { ...result.usage, ...copilotBreakdown }; + if (copilotBreakdown.input_tokens !== undefined) { + // Copilot gave us a precise input split: drop the lumped prompt_tokens. + delete result.usage.prompt_tokens; + } else if (copilotBreakdown.cache_creation_input_tokens !== undefined + && typeof result.usage.prompt_tokens === 'number') { + // cache_write present but input absent: infer input = prompt_tokens - cache_write + // to avoid double-counting cache_write in normalizeUsage. 
+ result.usage.input_tokens = Math.max(0, result.usage.prompt_tokens - copilotBreakdown.cache_creation_input_tokens); + delete result.usage.prompt_tokens; + } + } return result; } @@ -294,7 +393,8 @@ function parseSseDataLines(text) { * - input_tokens: number (from Anthropic input_tokens or OpenAI prompt_tokens) * - output_tokens: number (from Anthropic output_tokens or OpenAI completion_tokens) * - cache_read_tokens: number (from Anthropic cache_read_input_tokens or OpenAI prompt_tokens_details.cached_tokens) - * - cache_write_tokens: number (Anthropic cache_creation_input_tokens; not available in OpenAI format) + * - cache_write_tokens: number (Anthropic cache_creation_input_tokens or + * Copilot copilot_usage cache_write; not available in flattened OpenAI usage) */ function normalizeUsage(usage) { if (!usage) return null; @@ -314,6 +414,7 @@ module.exports = { createDecompressor, extractReasoningTokens, extractCacheReadTokens, + extractCopilotUsageBreakdown, extractUsageFromJson, extractUsageFromSseLine, parseSseDataLines, diff --git a/containers/api-proxy/token-tracker.js b/containers/api-proxy/token-tracker.js index 859f13d5..69583225 100644 --- a/containers/api-proxy/token-tracker.js +++ b/containers/api-proxy/token-tracker.js @@ -25,6 +25,7 @@ const { normalizeUsage, isStreamingResponse, isCompressedResponse, + extractCopilotUsageBreakdown, } = require('./token-parsers'); module.exports = { @@ -39,6 +40,7 @@ module.exports = { normalizeUsage, isStreamingResponse, isCompressedResponse, + extractCopilotUsageBreakdown, validateTokenUsageRecord, writeTokenUsage, TOKEN_LOG_FILE, diff --git a/containers/api-proxy/token-tracker.parsing.test.js b/containers/api-proxy/token-tracker.parsing.test.js index 79aa7a24..f9e728e9 100644 --- a/containers/api-proxy/token-tracker.parsing.test.js +++ b/containers/api-proxy/token-tracker.parsing.test.js @@ -7,6 +7,7 @@ const { extractUsageFromSseLine, parseSseDataLines, normalizeUsage, + extractCopilotUsageBreakdown, } = 
require('./token-tracker'); // ── extractUsageFromJson ────────────────────────────────────────────── @@ -523,3 +524,231 @@ describe('normalizeUsage', () => { }); }); }); + +// ── Copilot copilot_usage.token_details breakdown ───────────────────── + +describe('extractCopilotUsageBreakdown', () => { + test('returns null when copilot_usage is absent', () => { + expect(extractCopilotUsageBreakdown({ usage: { prompt_tokens: 10 } })).toBeNull(); + }); + + test('returns null when token_details is not an array', () => { + expect(extractCopilotUsageBreakdown({ copilot_usage: { token_details: {} } })).toBeNull(); + }); + + test('returns null when no recognizable token types are present', () => { + expect(extractCopilotUsageBreakdown({ + copilot_usage: { token_details: [{ token_type: 'mystery', token_count: 5 }] }, + })).toBeNull(); + }); + + test('extracts the full input/cache_read/cache_write/output split', () => { + const result = extractCopilotUsageBreakdown({ + copilot_usage: { + token_details: [ + { token_type: 'input', token_count: 3857 }, + { token_type: 'cache_read', token_count: 0 }, + { token_type: 'cache_write', token_count: 12539 }, + { token_type: 'output', token_count: 362 }, + ], + }, + }); + expect(result).toEqual({ + input_tokens: 3857, + cache_read_input_tokens: 0, + cache_creation_input_tokens: 12539, + output_tokens: 362, + }); + }); + + test('reads copilot_usage nested under a response object', () => { + const result = extractCopilotUsageBreakdown({ + response: { + copilot_usage: { token_details: [{ token_type: 'input', token_count: 7 }] }, + }, + }); + expect(result).toEqual({ input_tokens: 7 }); + }); + + test('sums repeated token types and ignores malformed entries', () => { + const result = extractCopilotUsageBreakdown({ + copilot_usage: { + token_details: [ + { token_type: 'input', token_count: 100 }, + { token_type: 'input', token_count: 50 }, + { token_type: 'output', token_count: 'nope' }, + null, + { token_type: 'cache_write' }, + ], + }, + }); 
+ expect(result).toEqual({ input_tokens: 150 }); + }); +}); + +// ── extractUsageFromJson + Copilot breakdown integration ────────────── + +describe('extractUsageFromJson with copilot_usage', () => { + // Real Claude-via-Copilot response shape: flattened usage.prompt_tokens + // lumps fresh input (3857) with cache_write (12539); the authoritative + // split lives only in copilot_usage.token_details. + const copilotBody = () => Buffer.from(JSON.stringify({ + id: 'e6925ddf', + model: 'claude-sonnet-4.6', + choices: [{ message: { role: 'assistant', content: 'hi' } }], + usage: { + completion_tokens: 362, + prompt_tokens: 16396, + prompt_tokens_details: { cached_tokens: 0 }, + total_tokens: 16758, + }, + copilot_usage: { + token_details: [ + { token_type: 'input', token_count: 3857 }, + { token_type: 'cache_read', token_count: 0 }, + { token_type: 'cache_write', token_count: 12539 }, + { token_type: 'output', token_count: 362 }, + ], + total_nano_aiu: 6402225000, + }, + })); + + test('prefers the copilot_usage split over the lumped prompt_tokens', () => { + const { usage, model } = extractUsageFromJson(copilotBody()); + expect(model).toBe('claude-sonnet-4.6'); + expect(usage.input_tokens).toBe(3857); + expect(usage.cache_creation_input_tokens).toBe(12539); + expect(usage.cache_read_input_tokens).toBe(0); + expect(usage.output_tokens).toBe(362); + // The lumped prompt_tokens is dropped so normalization uses input_tokens. 
+ expect(usage.prompt_tokens).toBeUndefined(); + }); + + test('normalizes to the correct cache_write split', () => { + const { usage } = extractUsageFromJson(copilotBody()); + expect(normalizeUsage(usage)).toEqual({ + input_tokens: 3857, + output_tokens: 362, + cache_read_tokens: 0, + cache_write_tokens: 12539, + reasoning_tokens: 0, + }); + }); + + test('does not affect plain OpenAI responses without copilot_usage', () => { + const body = Buffer.from(JSON.stringify({ + model: 'gpt-5', + usage: { + prompt_tokens: 100, + completion_tokens: 20, + total_tokens: 120, + prompt_tokens_details: { cached_tokens: 30 }, + }, + })); + expect(normalizeUsage(extractUsageFromJson(body).usage)).toEqual({ + input_tokens: 100, + output_tokens: 20, + cache_read_tokens: 30, + cache_write_tokens: 0, + reasoning_tokens: 0, + }); + }); + + test('uses copilot_usage even when the flattened usage object is absent', () => { + const body = Buffer.from(JSON.stringify({ + model: 'claude-sonnet-4.6', + copilot_usage: { + token_details: [ + { token_type: 'input', token_count: 200 }, + { token_type: 'output', token_count: 10 }, + { token_type: 'cache_write', token_count: 99 }, + ], + }, + })); + expect(normalizeUsage(extractUsageFromJson(body).usage)).toEqual({ + input_tokens: 200, + output_tokens: 10, + cache_read_tokens: 0, + cache_write_tokens: 99, + reasoning_tokens: 0, + }); + }); + + test('infers input_tokens from prompt_tokens when copilot_usage has cache_write but no input', () => { + // Edge case: token_details provides cache_write but omits input. + // prompt_tokens = input + cache_write, so input must be inferred to avoid + // double-counting cache_write in normalizeUsage. 
+ const body = Buffer.from(JSON.stringify({ + model: 'claude-sonnet-4.6', + usage: { + prompt_tokens: 500, + completion_tokens: 50, + total_tokens: 550, + }, + copilot_usage: { + token_details: [ + { token_type: 'cache_write', token_count: 300 }, + { token_type: 'output', token_count: 50 }, + ], + }, + })); + const { usage } = extractUsageFromJson(body); + // prompt_tokens should be removed; input_tokens inferred as 500 - 300 = 200 + expect(usage.prompt_tokens).toBeUndefined(); + expect(usage.input_tokens).toBe(200); + expect(usage.cache_creation_input_tokens).toBe(300); + expect(normalizeUsage(usage)).toEqual({ + input_tokens: 200, + output_tokens: 50, + cache_read_tokens: 0, + cache_write_tokens: 300, + reasoning_tokens: 0, + }); + }); +}); + +describe('extractUsageFromSseLine with copilot_usage', () => { + test('applies the copilot_usage split in a streaming final chunk', () => { + const line = JSON.stringify({ + model: 'claude-sonnet-4.6', + usage: { prompt_tokens: 16396, completion_tokens: 362, total_tokens: 16758 }, + copilot_usage: { + token_details: [ + { token_type: 'input', token_count: 3857 }, + { token_type: 'cache_write', token_count: 12539 }, + { token_type: 'output', token_count: 362 }, + ], + }, + }); + const { usage } = extractUsageFromSseLine(line); + expect(usage.input_tokens).toBe(3857); + expect(usage.cache_creation_input_tokens).toBe(12539); + expect(usage.prompt_tokens).toBeUndefined(); + }); + + test('infers input_tokens from prompt_tokens when streaming copilot_usage has cache_write but no input', () => { + // Same double-count guard as non-streaming: if token_details omits input but + // provides cache_write, prompt_tokens must not survive alongside cache_creation_input_tokens. 
+ const line = JSON.stringify({ + model: 'claude-sonnet-4.6', + usage: { prompt_tokens: 500, completion_tokens: 50, total_tokens: 550 }, + copilot_usage: { + token_details: [ + { token_type: 'cache_write', token_count: 300 }, + { token_type: 'output', token_count: 50 }, + ], + }, + }); + const { usage } = extractUsageFromSseLine(line); + expect(usage.prompt_tokens).toBeUndefined(); + expect(usage.input_tokens).toBe(200); + expect(usage.cache_creation_input_tokens).toBe(300); + expect(normalizeUsage(usage)).toEqual({ + input_tokens: 200, + output_tokens: 50, + cache_read_tokens: 0, + cache_write_tokens: 300, + reasoning_tokens: 0, + }); + }); +});