diff --git a/.github/workflows/test-gvisor-firewall-comparison.yml b/.github/workflows/test-gvisor-firewall-comparison.yml index 7a9203792..6f2e64862 100644 --- a/.github/workflows/test-gvisor-firewall-comparison.yml +++ b/.github/workflows/test-gvisor-firewall-comparison.yml @@ -59,7 +59,22 @@ jobs: -v /tmp/squid.conf:/etc/squid/squid.conf:ro \ ubuntu/squid:latest - sleep 3 + # Wait for Squid to be ready (up to 30 seconds) + echo "Waiting for Squid to be ready..." + for i in {1..30}; do + if docker run --rm --network awf-test curlimages/curl:latest \ + curl -s -o /dev/null -w "%{http_code}" \ + -x http://172.30.0.10:3128 http://example.com | grep -q "200"; then + echo "✅ Squid is ready" + break + fi + if [ $i -eq 30 ]; then + echo "❌ Squid failed to start within 30 seconds" + docker logs squid-test + exit 1 + fi + sleep 1 + done # Start agent with iptables DNAT docker run --rm --name agent-test \ @@ -182,7 +197,22 @@ jobs: -v /tmp/squid.conf:/etc/squid/squid.conf:ro \ ubuntu/squid:latest - sleep 3 + # Wait for Squid to be ready (up to 30 seconds) + echo "Waiting for Squid to be ready..." + for i in {1..30}; do + if docker run --rm --network awf-test-gvisor curlimages/curl:latest \ + curl -s -o /dev/null -w "%{http_code}" \ + -x http://172.30.0.10:3128 http://example.com | grep -q "200"; then + echo "✅ Squid is ready" + break + fi + if [ $i -eq 30 ]; then + echo "❌ Squid failed to start within 30 seconds" + docker logs squid-gvisor + exit 1 + fi + sleep 1 + done # Agent on gVisor with iptables docker run --rm --name agent-gvisor \ @@ -318,7 +348,21 @@ jobs: envoyproxy/envoy:v1.28-latest \ -c /etc/envoy/envoy.yaml - sleep 3 + # Wait for Envoy to be ready (check admin port up to 30 seconds) + echo "Waiting for Envoy to be ready..." + for i in {1..30}; do + if docker run --rm --network envoy-test curlimages/curl:latest \ + curl -s -o /dev/null -w "%{http_code}" http://172.30.0.10:9901/ready | grep -q "200"; then + echo "✅ Envoy is ready" + break + fi + if [ $i -eq 30 ]; then + echo "❌ Envoy failed to start within 30 seconds" + docker logs envoy-test + exit 1 + fi + sleep 1 + done # Agent with iptables redirect to Envoy docker run --rm --name agent-envoy \ @@ -443,7 +487,21 @@ jobs: -v /tmp/envoy.yaml:/etc/envoy/envoy.yaml:ro \ envoyproxy/envoy:v1.28-latest -c /etc/envoy/envoy.yaml - sleep 3 + # Wait for Envoy to be ready (check admin port up to 30 seconds) + echo "Waiting for Envoy to be ready..." + for i in {1..30}; do + if docker run --rm --network envoy-gvisor curlimages/curl:latest \ + curl -s -o /dev/null -w "%{http_code}" http://172.30.0.10:9901/ready | grep -q "200"; then + echo "✅ Envoy is ready" + break + fi + if [ $i -eq 30 ]; then + echo "❌ Envoy failed to start within 30 seconds" + docker logs envoy-gvisor + exit 1 + fi + sleep 1 + done # Agent on gVisor docker run --rm --runtime=runsc \ @@ -507,7 +565,22 @@ jobs: -v /tmp/squid.conf:/etc/squid/squid.conf:ro \ ubuntu/squid:latest - sleep 3 + # Wait for Squid to be ready (up to 30 seconds) + echo "Waiting for Squid to be ready..." + for i in {1..30}; do + if docker run --rm --network squid-perf curlimages/curl:latest \ + curl -s -o /dev/null -w "%{http_code}" \ + -x http://squid-perf:3128 http://example.com | grep -q "200"; then + echo "✅ Squid is ready" + break + fi + if [ $i -eq 30 ]; then + echo "❌ Squid failed to start within 30 seconds" + docker logs squid-perf + exit 1 + fi + sleep 1 + done # Run 100 requests and measure latency AVG=$(docker run --rm --network squid-perf \ @@ -588,7 +661,21 @@ jobs: -v /tmp/envoy.yaml:/etc/envoy/envoy.yaml:ro \ envoyproxy/envoy:v1.28-latest -c /etc/envoy/envoy.yaml - sleep 3 + # Wait for Envoy to be ready (check admin port up to 30 seconds) + echo "Waiting for Envoy to be ready..." + for i in {1..30}; do + if docker run --rm --network envoy-perf curlimages/curl:latest \ + curl -s -o /dev/null -w "%{http_code}" http://envoy-perf:9901/ready | grep -q "200"; then + echo "✅ Envoy is ready" + break + fi + if [ $i -eq 30 ]; then + echo "❌ Envoy failed to start within 30 seconds" + docker logs envoy-perf + exit 1 + fi + sleep 1 + done # Run 100 requests and measure latency AVG=$(docker run --rm --network envoy-perf \ diff --git a/containers/api-proxy/token-parsers.js b/containers/api-proxy/token-parsers.js index 276f4e419..636c5744c 100644 --- a/containers/api-proxy/token-parsers.js +++ b/containers/api-proxy/token-parsers.js @@ -18,6 +18,35 @@ function isStreamingResponse(headers) { return ct.includes('text/event-stream'); } +/** + * Heuristically determine whether a request path targets an LLM + * completion-style endpoint that is expected to report token usage. + * + * Used to decide whether a successful (2xx) response that yielded no + * extractable usage should still produce a token-usage record (marked + * `usage_missing`) rather than being silently dropped. Non-completion + * traffic (e.g. `/models`, health checks) is excluded to avoid noise. + * + * Matches: OpenAI/Copilot chat + responses + legacy completions, Anthropic + * messages, and Gemini generateContent/streamGenerateContent — across + * optional version prefixes and query strings. + * + * @param {string} path - Request path (may include a query string) + * @returns {boolean} + */ +function looksLikeCompletionRequest(path) { + if (typeof path !== 'string' || path.length === 0) return false; + const pathOnly = path.split('?')[0].toLowerCase(); + return ( + pathOnly.includes('/chat/completions') || + pathOnly.includes('/responses') || + pathOnly.includes('/messages') || + pathOnly.includes('/completions') || + pathOnly.includes(':generatecontent') || + pathOnly.includes(':streamgeneratecontent') + ); +} + /** * Check if a response is gzip or deflate compressed. */ @@ -310,6 +339,7 @@ function normalizeUsage(usage) { module.exports = { isStreamingResponse, + looksLikeCompletionRequest, isCompressedResponse, createDecompressor, extractReasoningTokens, diff --git a/containers/api-proxy/token-tracker-http.js b/containers/api-proxy/token-tracker-http.js index f3fdbf104..fd85e17e7 100644 --- a/containers/api-proxy/token-tracker-http.js +++ b/containers/api-proxy/token-tracker-http.js @@ -21,6 +21,7 @@ const { logRequest } = require('./logging'); const { isStreamingResponse, + looksLikeCompletionRequest, isCompressedResponse, createDecompressor, parseSseDataLines, @@ -268,6 +269,59 @@ function buildAndWriteTokenRecord(normalized, { requestId, provider, model, reqP }); } +/** + * Persist a placeholder token-usage record for a successful completion-style + * response from which no usage could be extracted. + * + * Without this, a 2xx LLM response whose body omits a usage payload (observed + * with some Copilot streaming responses) produces NO line in token-usage.jsonl + * at all — the request becomes invisible to downstream consumers (step + * summaries, OTEL fan-out, AI-credit aggregation). Writing a zeroed record + * flagged with `usage_missing: true` keeps the request observable and makes the + * extraction gap diagnosable, while clearly signalling that the token counts + * are not real measured values. + * + * @param {object} params + * @param {string} params.requestId + * @param {string} params.provider + * @param {string|null} params.model + * @param {string} params.reqPath + * @param {number} params.status + * @param {boolean} params.streaming + * @param {number} params.duration + * @param {number} params.responseBytes + */ +function writeMissingUsageRecord({ requestId, provider, model, reqPath, status, streaming, duration, responseBytes }) { + const zeroUsage = { + input_tokens: 0, + output_tokens: 0, + cache_read_tokens: 0, + cache_write_tokens: 0, + reasoning_tokens: 0, + }; + const record = buildTokenUsageRecord(zeroUsage, { + requestId, + provider, + model, + reqPath, + status, + streaming, + duration, + responseBytes, + }); + record.usage_missing = true; + writeTokenUsage(record); + + logRequest('warn', 'token_usage_missing', { + request_id: requestId, + provider, + model: model || 'unknown', + path: reqPath, + streaming, + status, + }); +} + /** * Finalize token tracking for an HTTP response. * @@ -313,6 +367,23 @@ function finalizeHttpTracking(state, proxyRes, opts) { const normalized = normalizeUsage(usage); if (!normalized) { + // No usage payload was extracted from a successful response. For + // completion-style endpoints (where usage IS expected) write a placeholder + // record so the request stays visible downstream and the extraction gap is + // diagnosable, instead of silently dropping it. Non-completion traffic + // (e.g. /models, health checks) is skipped to avoid noise. + if (looksLikeCompletionRequest(reqPath)) { + writeMissingUsageRecord({ + requestId, + provider, + model: model || requestModel || provider, + reqPath, + status: proxyRes.statusCode, + streaming, + duration, + responseBytes: state.totalBytes, + }); + } if (typeof onSpanEnd === 'function') onSpanEnd(proxyRes.statusCode); return; } diff --git a/containers/api-proxy/token-tracker-http.unit.test.js b/containers/api-proxy/token-tracker-http.unit.test.js index cc1d8e50c..574a0c69f 100644 --- a/containers/api-proxy/token-tracker-http.unit.test.js +++ b/containers/api-proxy/token-tracker-http.unit.test.js @@ -349,6 +349,114 @@ describe('finalizeHttpTracking', () => { expect(() => finalizeHttpTracking(state, makeProxyRes(200), opts)).not.toThrow(); }); + + // ── missing-usage placeholder records ─────────────────────────────── + + describe('completion responses with no extractable usage', () => { + let mockStream; + let mkdirSyncSpy; + let createWriteStreamSpy; + + function makeMockStream() { + const chunks = []; + const stream = { + writableEnded: false, + write: jest.fn((chunk) => { chunks.push(chunk); return true; }), + end: jest.fn((cb) => { stream.writableEnded = true; if (cb) cb(); }), + on: jest.fn(), + get writtenRecords() { + return chunks.map((c) => JSON.parse(c.trim())); + }, + }; + return stream; + } + + beforeEach(async () => { + await closeLogStream(); + mockStream = makeMockStream(); + mkdirSyncSpy = jest.spyOn(fs, 'mkdirSync').mockReturnValue(undefined); + createWriteStreamSpy = jest.spyOn(fs, 'createWriteStream').mockReturnValue(mockStream); + }); + + afterEach(async () => { + mkdirSyncSpy.mockRestore(); + createWriteStreamSpy.mockRestore(); + await closeLogStream(); + }); + + test('writes a usage_missing record for a completion path', () => { + const opts = makeOpts({ provider: 'copilot', path: '/chat/completions', requestModel: 'gpt-4o' }); + const body = JSON.stringify({ id: 'x', choices: [{ delta: {}, finish_reason: 'stop' }] }); // no usage + const state = makeState({ + chunks: [Buffer.from(body)], + bufferedBytes: body.length, + totalBytes: body.length, + }); + + finalizeHttpTracking(state, makeProxyRes(200), opts); + + const records = mockStream.writtenRecords; + expect(records).toHaveLength(1); + expect(records[0]).toMatchObject({ + event: 'token_usage', + provider: 'copilot', + model: 'gpt-4o', + path: '/chat/completions', + status: 200, + usage_missing: true, + input_tokens: 0, + output_tokens: 0, + }); + // No real usage was measured, so metrics must not be incremented. + expect(opts.metrics.increment).not.toHaveBeenCalled(); + }); + + test('writes a usage_missing record for a streaming completion response', () => { + const opts = makeOpts({ provider: 'copilot', path: '/chat/completions', requestModel: 'gpt-4o' }); + const state = makeState({ + streaming: true, + contentType: 'text/event-stream', + streamingUsage: {}, + streamingModel: null, + }); + + finalizeHttpTracking(state, makeProxyRes(200), opts); + + const records = mockStream.writtenRecords; + expect(records).toHaveLength(1); + expect(records[0]).toMatchObject({ usage_missing: true, streaming: true, provider: 'copilot' }); + }); + + test('does NOT write a record for a non-completion path (e.g. /models)', () => { + const opts = makeOpts({ provider: 'copilot', path: '/v1/models' }); + const body = JSON.stringify({ data: [] }); // no usage, not a completion endpoint + const state = makeState({ + chunks: [Buffer.from(body)], + bufferedBytes: body.length, + totalBytes: body.length, + }); + + finalizeHttpTracking(state, makeProxyRes(200), opts); + + expect(mockStream.writtenRecords).toHaveLength(0); + }); + + test('still calls onSpanEnd after writing the placeholder record', () => { + const onSpanEnd = jest.fn(); + const opts = makeOpts({ provider: 'copilot', path: '/responses', onSpanEnd }); + const body = JSON.stringify({ type: 'response.created' }); // no usage + const state = makeState({ + chunks: [Buffer.from(body)], + bufferedBytes: body.length, + totalBytes: body.length, + }); + + finalizeHttpTracking(state, makeProxyRes(200), opts); + + expect(onSpanEnd).toHaveBeenCalledWith(200); + expect(mockStream.writtenRecords).toHaveLength(1); + }); + }); }); // ── extractUsageFromTrackedState ────────────────────────────────────── diff --git a/containers/api-proxy/token-tracker.js b/containers/api-proxy/token-tracker.js index 859f13d5a..45e4e680e 100644 --- a/containers/api-proxy/token-tracker.js +++ b/containers/api-proxy/token-tracker.js @@ -24,6 +24,7 @@ const { parseSseDataLines, normalizeUsage, isStreamingResponse, + looksLikeCompletionRequest, isCompressedResponse, } = require('./token-parsers'); @@ -38,6 +39,7 @@ module.exports = { parseWebSocketFrames, normalizeUsage, isStreamingResponse, + looksLikeCompletionRequest, isCompressedResponse, validateTokenUsageRecord, writeTokenUsage, diff --git a/containers/api-proxy/token-tracker.parsing.test.js b/containers/api-proxy/token-tracker.parsing.test.js index 79aa7a24f..dbf0f6059 100644 --- a/containers/api-proxy/token-tracker.parsing.test.js +++ b/containers/api-proxy/token-tracker.parsing.test.js @@ -7,6 +7,7 @@ const { extractUsageFromSseLine, parseSseDataLines, normalizeUsage, + looksLikeCompletionRequest, } = require('./token-tracker'); // ── extractUsageFromJson ────────────────────────────────────────────── @@ -523,3 +524,33 @@ describe('normalizeUsage', () => { }); }); }); + +// ── looksLikeCompletionRequest ──────────────────────────────────────── + +describe('looksLikeCompletionRequest', () => { + test.each([ + '/chat/completions', + '/v1/chat/completions', + '/responses', + '/v1/responses', + '/v1/messages', + '/v1/completions', + '/v1beta/models/gemini-pro:generateContent', + '/v1beta/models/gemini-pro:streamGenerateContent', + '/v1/chat/completions?foo=bar', + ])('returns true for completion endpoint %s', (path) => { + expect(looksLikeCompletionRequest(path)).toBe(true); + }); + + test.each([ + '/v1/models', + '/models', + '/health', + '/', + '', + null, + undefined, + ])('returns false for non-completion path %s', (path) => { + expect(looksLikeCompletionRequest(path)).toBe(false); + }); +}); diff --git a/schemas/token-usage.schema.json b/schemas/token-usage.schema.json index b48603bb0..d04c5ca24 100644 --- a/schemas/token-usage.schema.json +++ b/schemas/token-usage.schema.json @@ -101,6 +101,10 @@ "minimum": 0, "description": "Total number of bytes in the response body (optional, omitted for WebSocket upgrades)." }, + "usage_missing": { + "type": "boolean", + "description": "Present and true when no token usage could be extracted from an otherwise successful (2xx) completion response. Token counts in such records are placeholder zeros, not measured values; consumers should treat them as 'request occurred, usage unknown' rather than as a real zero-cost call." + }, "x_initiator": { "type": "string", "enum": ["agent", "user"],