Commit 16a33d0

feat(benchmark): add per-test token tracking and stream usage reporting
- Add stream_options: { include_usage: true } for OpenAI API token reporting
- Fall back to chunk-counted completion tokens for local llama-server
- Track per-test tokens via _currentTestTokens accumulator
- Include token data in test results, log output, and emitted events
- Auto-detect LLM-only mode when no VLM URL is provided
- Use max_completion_tokens for cloud APIs (GPT-5.4+), max_tokens for local
- Always exit 0 in Aegis skill mode regardless of test failures
1 parent 7e3450c commit 16a33d0
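A note on the max_completion_tokens bullet: the diff below only shows the ...maxTokensParam spread, so the selection logic itself is not visible in this commit. A hypothetical sketch of what such a switch could look like (maxTokensParamFor, baseUrl, and the detection rule are illustrative assumptions, not the script's actual code):

// Hypothetical: pick the parameter name by target API. Newer OpenAI cloud
// models reject max_tokens in favor of max_completion_tokens, while local
// servers such as llama-server still expect max_tokens.
function maxTokensParamFor(baseUrl, limit) {
  const isCloud = baseUrl.startsWith('https://api.openai.com');
  return isCloud ? { max_completion_tokens: limit } : { max_tokens: limit };
}

const maxTokensParam = maxTokensParamFor('http://127.0.0.1:8080/v1', 1024);
// → { max_tokens: 1024 }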

1 file changed: skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
Lines changed: 32 additions & 8 deletions
@@ -224,6 +224,8 @@ async function llmCall(messages, opts = {}) {
   const params = {
     messages,
     stream: true,
+    // Request token usage in streaming response (supported by OpenAI, some local servers)
+    stream_options: { include_usage: true },
     ...(model && { model }),
     ...(opts.temperature !== undefined && { temperature: opts.temperature }),
     ...maxTokensParam,
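For context on the stream_options change: with include_usage set, OpenAI-compatible servers append one final chunk whose choices array is empty and whose usage field carries the token counts. A minimal sketch of consuming that chunk with the official openai npm client (model name and prompt are placeholders; the benchmark's llmCall implements its own streaming loop rather than using this client):

const OpenAI = require('openai');

async function main() {
  const client = new OpenAI(); // reads OPENAI_API_KEY from the environment
  const stream = await client.chat.completions.create({
    model: 'gpt-4o-mini', // placeholder model name
    messages: [{ role: 'user', content: 'ping' }],
    stream: true,
    stream_options: { include_usage: true },
  });

  let usage = null;
  for await (const chunk of stream) {
    const delta = chunk.choices[0]?.delta?.content;
    if (delta) process.stdout.write(delta);
    if (chunk.usage) usage = chunk.usage; // only present on the final chunk
  }
  console.log('\ntokens:', usage); // { prompt_tokens, completion_tokens, total_tokens, ... }
}

main().catch(console.error);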
@@ -358,10 +360,24 @@ async function llmCall(messages, opts = {}) {
       content = reasoningContent;
     }
 
-    // Track token totals
-    results.tokenTotals.prompt += usage.prompt_tokens || 0;
-    results.tokenTotals.completion += usage.completion_tokens || 0;
-    results.tokenTotals.total += usage.total_tokens || 0;
+    // Build per-call token data:
+    // Prefer server-reported usage; fall back to chunk-counted completion tokens
+    const promptTokens = usage.prompt_tokens || 0;
+    const completionTokens = usage.completion_tokens || tokenCount; // tokenCount = chunks with content/reasoning
+    const totalTokens = usage.total_tokens || (promptTokens + completionTokens);
+    const callTokens = { prompt: promptTokens, completion: completionTokens, total: totalTokens };
+
+    // Track global token totals
+    results.tokenTotals.prompt += callTokens.prompt;
+    results.tokenTotals.completion += callTokens.completion;
+    results.tokenTotals.total += callTokens.total;
+
+    // Track per-test tokens (accumulated across multiple llmCall invocations within one test)
+    if (_currentTestTokens) {
+      _currentTestTokens.prompt += callTokens.prompt;
+      _currentTestTokens.completion += callTokens.completion;
+      _currentTestTokens.total += callTokens.total;
+    }
 
     // Capture model name from first response
     if (opts.vlm) {
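The fallback rule in the new block can be stated standalone. A hypothetical restatement with the same field names as the diff (buildCallTokens is not a function in the script; it exists only for illustration):

function buildCallTokens(usage, tokenCount) {
  const prompt = usage.prompt_tokens || 0;
  // Server-reported completion count wins; otherwise fall back to the number
  // of streamed chunks that carried content or reasoning (a rough proxy,
  // since most servers emit about one token per delta chunk).
  const completion = usage.completion_tokens || tokenCount;
  return { prompt, completion, total: usage.total_tokens || (prompt + completion) };
}

// OpenAI reports usage, so the server numbers are used verbatim:
console.log(buildCallTokens({ prompt_tokens: 120, completion_tokens: 45, total_tokens: 165 }, 43));
// → { prompt: 120, completion: 45, total: 165 }

// A local llama-server without usage reporting falls back to chunk counting
// (prompt tokens are then unknown and stay at 0):
console.log(buildCallTokens({}, 43));
// → { prompt: 0, completion: 43, total: 43 }

One quirk of the || fallback: a legitimate zero-token completion would also trigger the chunk-count path, which is harmless here because tokenCount would be 0 as well.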
@@ -370,7 +386,7 @@ async function llmCall(messages, opts = {}) {
       if (!results.model.name && model) results.model.name = model;
     }
 
-    return { content, toolCalls, usage, model };
+    return { content, toolCalls, usage: callTokens, model };
   } finally {
     clearTimeout(idleTimer);
   }
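This return-value change means every caller of llmCall now sees the normalized token fields rather than whatever shape the server reported. A sketch of the call-site effect, with llmCall stubbed (the real one lives in run-benchmark.cjs):

// Stub standing in for the real llmCall; only the return shape matters here.
async function llmCall() {
  return { content: 'ok', toolCalls: [], usage: { prompt: 12, completion: 3, total: 15 }, model: 'stub' };
}

(async () => {
  const { usage } = await llmCall();
  // prompt/completion/total are always numbers now, so callers no longer
  // need to guard against a missing usage.total_tokens from servers that
  // omit usage entirely.
  console.log(`call cost: ${usage.total} tokens (${usage.prompt} in, ${usage.completion} out)`);
})();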
@@ -449,25 +465,33 @@ async function runSuites() {
   }
 }
 
+// ─── Per-test token accumulator (set by test(), read by llmCall) ──────────────
+let _currentTestTokens = null;
+
 async function test(name, fn) {
-  const testResult = { name, status: 'pass', timeMs: 0, detail: '', tokens: {} };
+  const testResult = { name, status: 'pass', timeMs: 0, detail: '', tokens: { prompt: 0, completion: 0, total: 0 } };
+  _currentTestTokens = { prompt: 0, completion: 0, total: 0 };
   const start = Date.now();
   try {
     const detail = await fn();
     testResult.timeMs = Date.now() - start;
     testResult.detail = detail || '';
+    testResult.tokens = { ..._currentTestTokens };
     currentSuite.passed++;
-    log(` ✅ ${name} (${testResult.timeMs}ms)${detail ? ` — ${detail}` : ''}`);
+    const tokInfo = _currentTestTokens.total > 0 ? `, ${_currentTestTokens.total} tok` : '';
+    log(` ✅ ${name} (${testResult.timeMs}ms${tokInfo})${detail ? ` — ${detail}` : ''}`);
   } catch (err) {
     testResult.timeMs = Date.now() - start;
     testResult.status = 'fail';
     testResult.detail = err.message;
+    testResult.tokens = { ..._currentTestTokens };
     currentSuite.failed++;
     log(` ❌ ${name} (${testResult.timeMs}ms) — ${err.message}`);
   }
+  _currentTestTokens = null;
   currentSuite.timeMs += testResult.timeMs;
   currentSuite.tests.push(testResult);
-  emit({ event: 'test_result', suite: currentSuite.name, test: name, status: testResult.status, timeMs: testResult.timeMs, detail: testResult.detail.slice(0, 120) });
+  emit({ event: 'test_result', suite: currentSuite.name, test: name, status: testResult.status, timeMs: testResult.timeMs, detail: testResult.detail.slice(0, 120), tokens: testResult.tokens });
 }
 
 function skip(name, reason) {
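The module-level accumulator is the glue between test() and llmCall. A standalone sketch of the pattern (names mirror the diff, but recordCall is hypothetical; it stands in for the per-test tracking block inside llmCall):

let _currentTestTokens = null;

function recordCall(callTokens) {
  // Calls made outside a test (accumulator is null) are simply not attributed.
  if (_currentTestTokens) {
    _currentTestTokens.prompt += callTokens.prompt;
    _currentTestTokens.completion += callTokens.completion;
    _currentTestTokens.total += callTokens.total;
  }
}

async function test(name, fn) {
  _currentTestTokens = { prompt: 0, completion: 0, total: 0 };
  try {
    await fn();
  } finally {
    console.log(`${name}: ${_currentTestTokens.total} tok`);
    _currentTestTokens = null; // reset so stray calls between tests are ignored
  }
}

// Two llmCall-style invocations inside one test accumulate into one counter:
test('example', async () => {
  recordCall({ prompt: 100, completion: 20, total: 120 });
  recordCall({ prompt: 80, completion: 10, total: 90 });
}); // logs: example: 210 tok

This design works because the suite runs tests sequentially; if tests ever ran concurrently, the shared accumulator would interleave counts across tests.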
