Skip to content

Commit 60bd716

Browse files
rajbosCopilot
andcommitted
fix: use real outputTokens from assistant.message to estimate CLI session tokens
Sessions without session.shutdown events were massively undercounting tokens because character-based estimation ignores context accumulation in agent sessions. Investigation found: - Real input:output ratio in completed sessions: ~100-200x - Cache reads ≈ input tokens (prompt caching caches full context) - A session with 387 tool calls showed 92K real output tokens but only 13K estimated, with real total likely 10-25M Changes: 1. assistant.message events carry real API-reported outputTokens per response. Capture these directly instead of estimating from text. 2. When no session.shutdown exists but we have real output tokens, estimate input using observed ratios from completed sessions: - >20 tool calls: 130x (heavy agent) - >5 tool calls: 50x (moderate) - <=5 tool calls: 10x (light/chat) Cache reads estimated as equal to input (empirically ~50% of total). 3. Also fix tool.result -> tool.execution_complete event type (wrong event name meant tool outputs were silently skipped in fallback). 4. Count tool.execution_start to drive the ratio selection. Result: sessions without shutdown now estimate correctly in the right order of magnitude instead of being off by 100x. Sessions WITH session.shutdown are unaffected (still use exact API data). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 1ade50a commit 60bd716

3 files changed

Lines changed: 42 additions & 8 deletions

File tree

.github/skills/copilot-log-analysis/SKILL.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,14 +179,18 @@ See the **Executable Scripts** section for available utilities:
179179
```jsonl
180180
{"type": "user.message", "data": {"content": "..."}, "model": "gpt-4o"}
181181
{"type": "assistant.message", "data": {"content": "..."}}
182-
{"type": "tool.result", "data": {"output": "..."}}
182+
{"type": "tool.execution_start", "data": {"toolCallId": "...", "toolName": "...", "arguments": {}}}
183+
{"type": "tool.execution_complete", "data": {"toolCallId": "...", "success": true, "result": {"content": "...", "detailedContent": "..."}}}
184+
{"type": "session.shutdown", "data": {"modelMetrics": {"claude-opus-4.6": {"usage": {"inputTokens": 0, "outputTokens": 0, "cacheReadTokens": 0, "cacheWriteTokens": 0}}}}}
183185
```
184186

185187
**Key fields:**
186188
- Event type: `type`
187189
- User input: `data.content` (when `type: 'user.message'`)
188190
- Assistant output: `data.content` (when `type: 'assistant.message'`)
189-
- Tool output: `data.output` (when `type: 'tool.result'`)
191+
- Tool output: `data.result.content` or `data.result.detailedContent` (when `type: 'tool.execution_complete'`)
192+
- Tool name for counting: `data.toolName` (when `type: 'tool.execution_start'`)
193+
- API-accurate token counts: `data.modelMetrics[model].usage` (when `type: 'session.shutdown'`) — always prefer these over estimates when available
190194
- Model: `model` (optional, defaults to `gpt-4o`)
191195

192196
## JSONL File Structure (JetBrains IDE)

.github/skills/copilot-log-analysis/session-file-discovery.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ function getCopilotSessionFiles() {
147147
}
148148

149149
// OpenCode session files (XDG data directory)
150-
const xdgDataHome = (platform === 'win32')
150+
const xdgDataHome = (os.platform() === 'win32')
151151
? path.join(homedir, '.local', 'share')
152152
: (process.env.XDG_DATA_HOME || path.join(homedir, '.local', 'share'));
153153
const openCodeSessionDir = path.join(xdgDataHome, 'opencode', 'storage', 'session');

vscode-extension/src/usageAnalysis.ts

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1499,6 +1499,9 @@ export async function getModelUsageFromSession(deps: Pick<UsageAnalysisDeps, 'wa
14991499
let isDeltaBased = false;
15001500
// For CLI (non-delta) sessions: capture exact per-model usage from session.shutdown
15011501
let cliShutdownModelUsage: ModelUsage | null = null;
1502+
// Real outputTokens from assistant.message events (used when session.shutdown is absent)
1503+
let cliRealOutputByModel: { [model: string]: number } | null = null;
1504+
let totalCliToolCalls = 0;
15021505

15031506
for (const line of lines) {
15041507
if (!line.trim()) { continue; }
@@ -1579,11 +1582,20 @@ export async function getModelUsageFromSession(deps: Pick<UsageAnalysisDeps, 'wa
15791582
}
15801583
} else if (event.type === 'user.message' && event.data?.content) {
15811584
modelUsage[model].inputTokens += estimateTokensFromText(event.data.content, model, deps.tokenEstimators);
1582-
} else if (event.type === 'assistant.message' && event.data?.content) {
1583-
modelUsage[model].outputTokens += estimateTokensFromText(event.data.content, model, deps.tokenEstimators);
1584-
} else if (event.type === 'tool.result' && event.data?.output) {
1585-
// Tool outputs are typically input context
1586-
modelUsage[model].inputTokens += estimateTokensFromText(event.data.output, model, deps.tokenEstimators);
1585+
} else if (event.type === 'assistant.message') {
1586+
const realOutput = typeof event.data?.outputTokens === 'number' ? event.data.outputTokens : 0;
1587+
if (realOutput > 0) {
1588+
if (!cliRealOutputByModel) { cliRealOutputByModel = {}; }
1589+
cliRealOutputByModel[model] = (cliRealOutputByModel[model] ?? 0) + realOutput;
1590+
} else if (event.data?.content) {
1591+
modelUsage[model].outputTokens += estimateTokensFromText(event.data.content, model, deps.tokenEstimators);
1592+
}
1593+
} else if (event.type === 'tool.execution_start') {
1594+
totalCliToolCalls++;
1595+
} else if (event.type === 'tool.execution_complete' && (event.data?.result?.content || event.data?.result?.detailedContent)) {
1596+
// Tool outputs are fed back as input context in the next turn
1597+
const toolContent = event.data.result.content || event.data.result.detailedContent;
1598+
modelUsage[model].inputTokens += estimateTokensFromText(String(toolContent), model, deps.tokenEstimators);
15871599
}
15881600
}
15891601
} catch (e) {
@@ -1596,6 +1608,24 @@ export async function getModelUsageFromSession(deps: Pick<UsageAnalysisDeps, 'wa
15961608
return cliShutdownModelUsage;
15971609
}
15981610

1611+
// No session.shutdown: assistant.message events carry real per-response outputTokens from the API.
1612+
// Estimate input using observed ratio from completed agent sessions (~130x output for heavy
1613+
// agent sessions with >20 tool calls, ~50x for moderate, ~10x for light/chat-only).
1614+
// Cache reads empirically equal ~input tokens (prompt caching caches the full context).
1615+
if (!isDeltaBased && cliRealOutputByModel) {
1616+
const inputOutputRatio = totalCliToolCalls > 20 ? 130 : totalCliToolCalls > 5 ? 50 : 10;
1617+
const estimatedUsage: ModelUsage = {};
1618+
for (const [m, realOutput] of Object.entries(cliRealOutputByModel)) {
1619+
const estimatedInput = Math.round(realOutput * inputOutputRatio);
1620+
estimatedUsage[m] = {
1621+
inputTokens: estimatedInput,
1622+
outputTokens: realOutput,
1623+
cachedReadTokens: estimatedInput,
1624+
};
1625+
}
1626+
return estimatedUsage;
1627+
}
1628+
15991629
// For delta-based formats, extract actual usage from reconstructed state
16001630
if (isDeltaBased && sessionState.requests && Array.isArray(sessionState.requests)) {
16011631
for (const request of sessionState.requests) {

0 commit comments

Comments
 (0)