constructive-io
diff --git a/‎.gitignore‎
Lines changed: 5 additions & 1 deletion b/‎.gitignore‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎LLM_METADATA_DECISIONS.md‎
Lines changed: 37 additions & 0 deletions b/‎LLM_METADATA_DECISIONS.md‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎package.json‎
Lines changed: 7 additions & 0 deletions b/‎package.json‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎packages/agent/__tests__/agent.live.test.ts‎
Lines changed: 123 additions & 0 deletions b/‎packages/agent/__tests__/agent.live.test.ts‎
Lines changed: 123 additions & 0 deletions
diff --git a/‎packages/agent/jest.config.js‎
Lines changed: 2 additions & 0 deletions b/‎packages/agent/jest.config.js‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎packages/agent/package.json‎
Lines changed: 7 additions & 1 deletion b/‎packages/agent/package.json‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎packages/agent/scripts/run-live-tests.js‎
Lines changed: 59 additions & 0 deletions b/‎packages/agent/scripts/run-live-tests.js‎
Lines changed: 59 additions & 0 deletions
@@ -4,4 +4,8 @@
 **/yarn-error.log
 lerna-debug.log
 **/src/*.js
-**/src/*.d.ts
+**/src/*.d.ts
+.env
+.env.local
+**/.env
+**/.env.local
@@ -141,3 +141,40 @@ Tracking issue: [constructive-planning #907](https://github.com/constructive-io/
     unconditionally for every new request — including decision-resume
     requests via `respondWithDecision`. Mirrors the agent-side rule from
     decision #6 (reset on each new request, not on `continue()`).
+
+19. **Live provider eval suites are opt-in, `.env`-loaded, excluded from
+    default `pnpm test` via `testPathIgnorePatterns`, and never run in CI.**
+    Three suites land: `packages/openai/__tests__/openai.live.test.ts`,
+    `packages/ollama/__tests__/ollama.live.test.ts` (extended with a new
+    `Ollama live token-usage audit` block), and
+    `packages/agent/__tests__/agent.live.test.ts`. Each suite is gated by
+    `<NAMESPACE>_LIVE_SUITE=smoke|extended` (e.g. `OPENAI_LIVE_SUITE`); the
+    `pnpm test:live:<provider>{,:smoke,:extended}` runners set
+    `*_LIVE_READY=1` which both un-ignores the file in Jest config and
+    disables the `global.fetch = jest.fn()` mock in `openai/jest.setup.js`.
+    A shared `tools/test/load-env.js` walks up to find a workspace `.env`
+    and is silent if absent, so CI is unaffected. Why: empirical wire-shape
+    verification is the only way to confirm load-bearing claims like
+    "`completion_tokens` already includes `reasoning_tokens`" — but live
+    suites are expensive (real tokens) and require secrets, so they must
+    stay out of the default loop. How to apply: when changing usage
+    extraction, header construction, or any wire-shape detail, run the
+    matching `pnpm test:live:*:extended` locally before merging. The
+    `.gitignore` was updated to cover `.env` / `.env.local` to close a
+    secrets-leak gap.
+
+20. **Adapter-default `compat` must be the base layer of `createModel`'s
+    merge, not the override layer.** The original spread order was
+    `{ ...builtIn.compat, ...this.compat, ...overrides.compat }`, which
+    silently clobbered model-specific settings (notably
+    `maxTokensField: 'max_completion_tokens'` for reasoning-capable models)
+    with the adapter's generic default (`'max_tokens'`). OpenAI returned
+    400 (`Unsupported parameter: 'max_tokens'`) for `gpt-5.4-nano`. The
+    mock-mode unit tests didn't catch it because the mocked `fetch` never
+    validated the body. The live smoke test caught it on the very first
+    real call. Why: model-specific knowledge in the built-in catalog is
+    more authoritative than weak adapter defaults; user-provided overrides
+    are most authoritative of all. How to apply: spread order is now
+    `{ ...this.compat, ...builtIn.compat, ...overrides.compat }`, and the
+    same rule applies to `headers`. Apply the same precedence whenever a
+    new merge of compat-like fields is introduced.
@@ -22,6 +22,12 @@
     "typecheck": "node ./scripts/typecheck.js",
     "test:live:ollama": "pnpm --filter @agentic-kit/ollama run test:live:smoke",
     "test:live:ollama:extended": "pnpm --filter @agentic-kit/ollama run test:live:extended",
+    "test:live:openai": "pnpm --filter @agentic-kit/openai run test:live:smoke",
+    "test:live:openai:smoke": "pnpm --filter @agentic-kit/openai run test:live:smoke",
+    "test:live:openai:extended": "pnpm --filter @agentic-kit/openai run test:live:extended",
+    "test:live:agent": "pnpm --filter @agentic-kit/agent run test:live:smoke",
+    "test:live:agent:smoke": "pnpm --filter @agentic-kit/agent run test:live:smoke",
+    "test:live:agent:extended": "pnpm --filter @agentic-kit/agent run test:live:extended",
     "lint": "pnpm -r run lint",
     "internal:deps": "makage update-workspace",
     "deps": "pnpm up -r -i -L"
@@ -32,6 +38,7 @@
     "@types/node": "^20.12.7",
     "@typescript-eslint/eslint-plugin": "^8.58.2",
     "@typescript-eslint/parser": "^8.58.2",
+    "dotenv": "^16.4.5",
     "eslint": "^9.39.2",
     "eslint-config-prettier": "^10.1.8",
     "eslint-plugin-simple-import-sort": "^12.1.0",
 
@@ -0,0 +1,123 @@
+import { OpenAIAdapter } from '@agentic-kit/openai';
+import { createUserMessage, type AssistantMessage } from 'agentic-kit';
+
+import { Agent } from '../src';
+
+// Live-run configuration, read once from the environment when the file loads.
+const apiKey = process.env.OPENAI_API_KEY;
+const modelId = process.env.OPENAI_LIVE_MODEL ?? 'gpt-5.4-nano';
+
+// Fail at load time: every test below talks to the real API and needs a key.
+if (!apiKey) {
+  throw new Error('Missing required env var: OPENAI_API_KEY');
+}
+
+// AGENT_LIVE_SUITE selects the tier. 'extended' also enables the smoke tier;
+// any value other than 'smoke' or 'extended' skips both describe blocks.
+const requestedTier = process.env.AGENT_LIVE_SUITE ?? 'smoke';
+const wantsExtended = requestedTier === 'extended';
+const wantsSmoke = wantsExtended || requestedTier === 'smoke';
+const describeExtended = wantsExtended ? describe : describe.skip;
+const describeSmoke = wantsSmoke ? describe : describe.skip;
+
+describeSmoke('Agent live smoke', () => {
+  // The timeout is passed to the test itself rather than via jest.setTimeout():
+  // that call sets the default for the whole test file, so calling it from two
+  // describe blocks does not give each block its own limit.
+  const SMOKE_TIMEOUT_MS = 60_000;
+
+  it(
+    'single turn populates state.totalUsage from the assistant message',
+    async () => {
+      const adapter = new OpenAIAdapter({ apiKey });
+      const model = adapter.createModel(modelId);
+      const agent = new Agent({ initialState: { model }, streamFn: adapter.stream.bind(adapter) });
+
+      await agent.prompt('Reply with the single word PONG.');
+
+      // A real call must report non-zero usage and a non-zero cost.
+      expect(agent.state.totalUsage.input).toBeGreaterThan(0);
+      expect(agent.state.totalUsage.output).toBeGreaterThan(0);
+      expect(agent.state.totalUsage.totalTokens).toBeGreaterThan(0);
+      expect(agent.state.totalUsage.cost.total).toBeGreaterThan(0);
+
+      const lastAssistant = agent.state.messages
+        .filter((m): m is AssistantMessage => m.role === 'assistant')
+        .at(-1)!;
+
+      // Report a missing assistant message as an assertion failure instead of
+      // a TypeError on the property reads below.
+      expect(lastAssistant).toBeDefined();
+
+      // Single turn: the per-message usage IS the cumulative total.
+      expect(agent.state.totalUsage.input).toBe(lastAssistant.usage.input);
+      expect(agent.state.totalUsage.output).toBe(lastAssistant.usage.output);
+      expect(agent.state.totalUsage.reasoning).toBe(lastAssistant.usage.reasoning);
+      expect(agent.state.totalUsage.cacheRead).toBe(lastAssistant.usage.cacheRead);
+      expect(agent.state.totalUsage.cacheWrite).toBe(lastAssistant.usage.cacheWrite);
+      expect(agent.state.totalUsage.totalTokens).toBe(lastAssistant.usage.totalTokens);
+    },
+    SMOKE_TIMEOUT_MS
+  );
+});
+
+describeExtended('Agent live extended', () => {
+  // Passed to each test as its own timeout. jest.setTimeout() sets the default
+  // for the whole test file, so it cannot scope a limit to one describe block.
+  const EXTENDED_TIMEOUT_MS = 120_000;
+
+  /** Build a fresh agent wired to the live OpenAI adapter. */
+  const createLiveAgent = () => {
+    const adapter = new OpenAIAdapter({ apiKey });
+    const model = adapter.createModel(modelId);
+    return new Agent({ initialState: { model }, streamFn: adapter.stream.bind(adapter) });
+  };
+
+  /** Copy the running totals (including nested cost) so a later turn cannot alter the snapshot. */
+  const snapshotTotals = (agent: Agent) => ({
+    ...agent.state.totalUsage,
+    cost: { ...agent.state.totalUsage.cost },
+  });
+
+  /** Most recent assistant message in the agent's message list. */
+  const lastAssistantOf = (agent: Agent) =>
+    agent.state.messages.filter((m): m is AssistantMessage => m.role === 'assistant').at(-1)!;
+
+  it(
+    'state.totalUsage equals field-wise sum across two turns',
+    async () => {
+      const agent = createLiveAgent();
+
+      await agent.prompt('What is 2 + 2? Reply with just the number.');
+      const t1Usage = snapshotTotals(agent);
+
+      // continue() does not accept text; append the follow-up user message first.
+      agent.appendMessage(createUserMessage('Now what is that doubled? Reply with just the number.'));
+      await agent.continue();
+
+      const lastAssistant = lastAssistantOf(agent);
+
+      // Token counts are integers, so the sums must match exactly.
+      expect(agent.state.totalUsage.input).toBe(t1Usage.input + lastAssistant.usage.input);
+      expect(agent.state.totalUsage.output).toBe(t1Usage.output + lastAssistant.usage.output);
+      expect(agent.state.totalUsage.reasoning).toBe(t1Usage.reasoning + lastAssistant.usage.reasoning);
+      expect(agent.state.totalUsage.cacheRead).toBe(t1Usage.cacheRead + lastAssistant.usage.cacheRead);
+      expect(agent.state.totalUsage.cacheWrite).toBe(t1Usage.cacheWrite + lastAssistant.usage.cacheWrite);
+      expect(agent.state.totalUsage.totalTokens).toBe(t1Usage.totalTokens + lastAssistant.usage.totalTokens);
+
+      // Costs are floating point, so compare with a tolerance.
+      expect(agent.state.totalUsage.cost.input).toBeCloseTo(
+        t1Usage.cost.input + lastAssistant.usage.cost.input,
+        10
+      );
+      expect(agent.state.totalUsage.cost.output).toBeCloseTo(
+        t1Usage.cost.output + lastAssistant.usage.cost.output,
+        10
+      );
+      expect(agent.state.totalUsage.cost.total).toBeCloseTo(
+        t1Usage.cost.total + lastAssistant.usage.cost.total,
+        10
+      );
+    },
+    EXTENDED_TIMEOUT_MS
+  );
+
+  it(
+    'prompt() resets totalUsage; continue() preserves it',
+    async () => {
+      const agent = createLiveAgent();
+
+      await agent.prompt('Reply with the single word A.');
+      const firstTotals = snapshotTotals(agent);
+      expect(firstTotals.input).toBeGreaterThan(0);
+
+      agent.appendMessage(createUserMessage('Reply with the single word B.'));
+      await agent.continue();
+      const secondAssistant = lastAssistantOf(agent);
+      const secondTotals = snapshotTotals(agent);
+
+      // continue() must not reset. Comparing only against the first turn's
+      // totals is not enough: the second request presumably resends the first
+      // turn, so its own input count can exceed the first turn's even after a
+      // reset. Assert the exact additive relation instead.
+      expect(secondTotals.input).toBe(firstTotals.input + secondAssistant.usage.input);
+      expect(secondTotals.output).toBe(firstTotals.output + secondAssistant.usage.output);
+      expect(secondTotals.totalTokens).toBe(firstTotals.totalTokens + secondAssistant.usage.totalTokens);
+      expect(secondTotals.totalTokens).toBeGreaterThan(firstTotals.totalTokens);
+
+      await agent.prompt('Reply with the single word C.');
+      const thirdAssistant = lastAssistantOf(agent);
+
+      // prompt() resets: the totals must equal the third turn's own usage, not
+      // the cumulative usage of all three turns. The absolute token counts
+      // vary between runs, so they are compared to the message's reported
+      // usage rather than to fixed numbers.
+      expect(agent.state.totalUsage.input).toBe(thirdAssistant.usage.input);
+      expect(agent.state.totalUsage.output).toBe(thirdAssistant.usage.output);
+      expect(agent.state.totalUsage.totalTokens).toBe(thirdAssistant.usage.totalTokens);
+
+      // Carrying the previous totals over would give exactly this sum.
+      expect(agent.state.totalUsage.input).toBeLessThan(secondTotals.input + thirdAssistant.usage.input);
+    },
+    EXTENDED_TIMEOUT_MS
+  );
+});
@@ -15,10 +15,12 @@ module.exports = {
   testRegex: '(/__tests__/.*|(\\.|/)(test|spec))\\.(jsx?|tsx?)$',
   moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
   modulePathIgnorePatterns: ['dist/*'],
+  testPathIgnorePatterns: process.env.AGENT_LIVE_READY === '1' ? [] : ['\\.live\\.test\\.ts$'],
   moduleNameMapper: {
     '^(\\.{1,2}/.*)\\.js$': '$1',
     '^@test/(.*)$': '<rootDir>/../../tools/test/$1',
     '^agentic-kit$': '<rootDir>/../agentic-kit/src',
     '^@agentic-kit/(.*)$': '<rootDir>/../$1/src',
   },
+  setupFiles: ['<rootDir>/../../tools/test/load-env.js'],
 };
@@ -34,10 +34,16 @@
     "build:dev": "makage build --dev",
     "lint": "eslint . --fix",
     "test": "jest",
-    "test:watch": "jest --watch"
+    "test:watch": "jest --watch",
+    "test:live": "node ./scripts/run-live-tests.js smoke",
+    "test:live:smoke": "node ./scripts/run-live-tests.js smoke",
+    "test:live:extended": "node ./scripts/run-live-tests.js extended"
   },
   "dependencies": {
     "agentic-kit": "workspace:*"
   },
+  "devDependencies": {
+    "@agentic-kit/openai": "workspace:*"
+  },
   "keywords": []
 }
@@ -0,0 +1,59 @@
+#!/usr/bin/env node
+
+const { spawnSync } = require('node:child_process');
+const { existsSync } = require('node:fs');
+const { dirname, join } = require('node:path');
+
+function findEnvFile(start) {
+  let dir = start;
+  while (true) {
+    const candidate = join(dir, '.env');
+    if (existsSync(candidate)) return candidate;
+    if (existsSync(join(dir, 'pnpm-workspace.yaml'))) return null;
+    const parent = dirname(dir);
+    if (parent === dir) return null;
+    dir = parent;
+  }
+}
+
+const envPath = findEnvFile(__dirname);
+if (envPath) {
+  require('dotenv').config({ path: envPath });
+}
+
+const requestedSuite = process.argv[2] || process.env.AGENT_LIVE_SUITE || 'smoke';
+const validSuites = new Set(['smoke', 'extended']);
+
+if (!validSuites.has(requestedSuite)) {
+  console.error(
+    `[agent-live] invalid suite '${requestedSuite}'. Use one of: ${Array.from(validSuites).join(', ')}`
+  );
+  process.exit(1);
+}
+
+if (!process.env.OPENAI_API_KEY) {
+  console.log('[agent-live] skipping live tests: OPENAI_API_KEY is not set');
+  process.exit(0);
+}
+
+console.log(`[agent-live] running ${requestedSuite} live tests against the OpenAI API`);
+
+const pnpmCommand = process.platform === 'win32' ? 'pnpm.cmd' : 'pnpm';
+const result = spawnSync(
+  pnpmCommand,
+  ['exec', 'jest', '--runInBand', '--runTestsByPath', '__tests__/agent.live.test.ts', '--verbose', '--forceExit'],
+  {
+    stdio: 'inherit',
+    env: {
+      ...process.env,
+      AGENT_LIVE_READY: '1',
+      AGENT_LIVE_SUITE: requestedSuite,
+    },
+  }
+);
+
+if (result.error) {
+  throw result.error;
+}
+
+process.exit(result.status ?? 1);