Skip to content

Commit 46a08cf

Browse files
committed
Fix model matrix capability handling
1 parent 4d30cdf commit 46a08cf

3 files changed

Lines changed: 219 additions & 13 deletions

File tree

scripts/test-model-matrix.js

Lines changed: 87 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ function printUsage() {
110110
" --plugin=dist|package Load plugin from local dist URI or package name (default: dist)",
111111
" --max-cases=N Hard cap number of cases per scenario",
112112
" --report-json=PATH Write JSON report to PATH (relative to repo root)",
113+
" --strict-capabilities Fail unsupported account/model capabilities instead of skipping them",
113114
" --no-restore Keep generated local config files after run",
114115
" -h, --help Show help",
115116
].join("\n"),
@@ -180,18 +181,67 @@ export function resolveMatrixTimeoutMs(smoke = false) {
180181
return parsedTimeout;
181182
}
182183

183-
function hasCompletedSuccessfully(output, token) {
184+
function parseNdjsonEvents(output) {
185+
const events = [];
186+
for (const line of output.split(/\r?\n/)) {
187+
const trimmed = line.trim();
188+
if (!trimmed.startsWith("{")) {
189+
continue;
190+
}
191+
try {
192+
events.push(JSON.parse(trimmed));
193+
} catch {
194+
// Ignore wrapper noise and partial lines.
195+
}
196+
}
197+
return events;
198+
}
199+
200+
function findLastIndex(items, predicate) {
201+
for (let index = items.length - 1; index >= 0; index -= 1) {
202+
if (predicate(items[index], index)) {
203+
return index;
204+
}
205+
}
206+
return -1;
207+
}
208+
209+
function hasTerminalFailure(events) {
210+
const lastCompletedIndex = findLastIndex(
211+
events,
212+
(event) =>
213+
event?.type === "turn.completed" || event?.type === "response.completed",
214+
);
184215
return (
185-
output.includes(token) ||
186-
output.includes('"type":"turn.completed"') ||
187-
output.includes('"type":"response.completed"')
216+
findLastIndex(
217+
events,
218+
(event, index) =>
219+
index > lastCompletedIndex &&
220+
(event?.type === "error" ||
221+
event?.type === "turn.failed" ||
222+
event?.type === "response.failed" ||
223+
event?.type === "response.error" ||
224+
event?.type === "response.incomplete"),
225+
) >= 0
188226
);
189227
}
190228

191-
function getSmokeSkipReason(exitCode, output) {
192-
if (exitCode === 124) {
193-
return "timed-out";
229+
function hasCompletedSuccessfully(output, token) {
230+
const events = parseNdjsonEvents(output);
231+
if (events.length > 0) {
232+
if (hasTerminalFailure(events)) {
233+
return false;
234+
}
235+
return events.some(
236+
(event) =>
237+
event?.type === "turn.completed" ||
238+
event?.type === "response.completed",
239+
);
194240
}
241+
return output.includes(token);
242+
}
243+
244+
function getCapabilitySkipReason(exitCode, output, smoke) {
195245
if (/not supported when using codex with a chatgpt account/i.test(output)) {
196246
return "unsupported-model";
197247
}
@@ -201,14 +251,26 @@ function getSmokeSkipReason(exitCode, output) {
201251
) {
202252
return "unsupported-reasoning";
203253
}
254+
if (smoke && exitCode === 124) {
255+
return "timed-out";
256+
}
204257
return null;
205258
}
206259

207-
function finalizeModelCaseResult(caseInfo, exitCode, output, token, smoke) {
260+
function finalizeModelCaseResult(
261+
caseInfo,
262+
exitCode,
263+
output,
264+
token,
265+
{ smoke, strictCapabilities } = {},
266+
) {
208267
const hasToken = output.includes(token);
209268
const completed = hasCompletedSuccessfully(output, token);
210269
const ok = exitCode === 0 && completed;
211-
const skipReason = !ok && smoke ? getSmokeSkipReason(exitCode, output) : null;
270+
const skipReason =
271+
!ok && strictCapabilities !== true
272+
? getCapabilitySkipReason(exitCode, output, smoke === true)
273+
: null;
212274

213275
return {
214276
...caseInfo,
@@ -228,8 +290,12 @@ export function __finalizeModelCaseResultForTests(
228290
output,
229291
token,
230292
smoke = false,
293+
strictCapabilities = false,
231294
) {
232-
return finalizeModelCaseResult(caseInfo, exitCode, output, token, smoke);
295+
return finalizeModelCaseResult(caseInfo, exitCode, output, token, {
296+
smoke,
297+
strictCapabilities,
298+
});
233299
}
234300

235301
function stopCodexServersInternal() {
@@ -361,7 +427,10 @@ function executeModelCase(caseInfo, index) {
361427
124,
362428
`Timed out after ${timeoutMs}ms`,
363429
token,
364-
caseInfo.smoke === true,
430+
{
431+
smoke: caseInfo.smoke === true,
432+
strictCapabilities: caseInfo.strictCapabilities === true,
433+
},
365434
);
366435
}
367436

@@ -373,7 +442,10 @@ function executeModelCase(caseInfo, index) {
373442
exitCode,
374443
combinedOutput,
375444
token,
376-
caseInfo.smoke === true,
445+
{
446+
smoke: caseInfo.smoke === true,
447+
strictCapabilities: caseInfo.strictCapabilities === true,
448+
},
377449
);
378450
}
379451

@@ -451,6 +523,7 @@ async function runScenario(scenario, options) {
451523
(caseInfo) => ({
452524
...caseInfo,
453525
smoke: options.smoke,
526+
strictCapabilities: options.strictCapabilities,
454527
}),
455528
);
456529
console.log(`\n=== ${scenario.toUpperCase()} (${cases.length} cases) ===`);
@@ -492,6 +565,7 @@ async function main() {
492565
const scenarioValue =
493566
parseArgValue(args, "--scenario") ?? (smoke ? "modern" : "all");
494567
const pluginMode = parseArgValue(args, "--plugin") ?? "dist";
568+
const strictCapabilities = args.includes("--strict-capabilities");
495569
const noRestore = args.includes("--no-restore");
496570
const maxCasesRaw = parseArgValue(args, "--max-cases");
497571
const maxCases = maxCasesRaw ? Number.parseInt(maxCasesRaw, 10) : 0;
@@ -539,6 +613,7 @@ async function main() {
539613
smoke,
540614
maxCases,
541615
pluginRef,
616+
strictCapabilities,
542617
});
543618
allResults.push(
544619
...scenarioResults.map((item) => ({ ...item, scenario })),

test/model-capability-matrix.test.ts

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { describe, expect, it } from "vitest";
22
import { CapabilityPolicyStore } from "../lib/capability-policy.js";
3+
import { resolveEntitlementAccountKey } from "../lib/entitlement-cache.js";
34
import { buildModelCapabilityMatrix } from "../lib/model-capability-matrix.js";
45
import type { AccountStorageV3 } from "../lib/storage.js";
56

@@ -21,6 +22,18 @@ function storage(): AccountStorageV3 {
2122
}
2223

2324
describe("model capability matrix", () => {
25+
it("returns default normalized models without entries when storage is missing", () => {
26+
const matrix = buildModelCapabilityMatrix({
27+
storage: null,
28+
models: [],
29+
now: 100,
30+
});
31+
32+
expect(matrix.generatedAt).toBe(100);
33+
expect(matrix.models.length).toBeGreaterThan(0);
34+
expect(matrix.entries).toEqual([]);
35+
});
36+
2437
it("builds model/account availability from existing model profiles", () => {
2538
const matrix = buildModelCapabilityMatrix({
2639
storage: storage(),
@@ -69,5 +82,46 @@ describe("model capability matrix", () => {
6982
);
7083
expect(matrix.entries[0]?.reasons).toContain("quota cache is rate-limited");
7184
});
85+
86+
it("marks disabled and entitlement-blocked accounts unavailable", () => {
87+
const baseStorage = storage();
88+
baseStorage.accounts[0] = {
89+
...baseStorage.accounts[0]!,
90+
enabled: false,
91+
};
92+
const entitlementKey = resolveEntitlementAccountKey({
93+
accountId: "acct_1",
94+
email: "owner@example.com",
95+
index: 0,
96+
});
97+
const matrix = buildModelCapabilityMatrix({
98+
storage: baseStorage,
99+
models: ["gpt-5.3-codex"],
100+
entitlements: {
101+
accounts: {
102+
[entitlementKey]: [
103+
{
104+
model: "gpt-5.3-codex",
105+
blockedUntil: 200,
106+
reason: "plan-entitlement",
107+
updatedAt: 100,
108+
},
109+
],
110+
},
111+
},
112+
now: 100,
113+
});
114+
115+
expect(matrix.entries[0]).toMatchObject({
116+
available: false,
117+
entitlementBlocked: true,
118+
entitlementReason: "plan-entitlement",
119+
entitlementWaitMs: 100,
120+
});
121+
expect(matrix.entries[0]?.reasons).toContain("account disabled");
122+
expect(matrix.entries[0]?.reasons).toContain(
123+
"entitlement blocked: plan-entitlement",
124+
);
125+
});
72126
});
73127

test/test-model-matrix-script.test.ts

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,33 @@ describe("test-model-matrix script helpers", () => {
150150
);
151151
});
152152

153-
it("downgrades unsupported smoke failures to skipped cases", async () => {
153+
it("does not treat echoed prompt tokens as success after a terminal JSON failure", async () => {
154+
const mod = await import("../scripts/test-model-matrix.js");
155+
expect(
156+
mod.__finalizeModelCaseResultForTests(
157+
{ model: "gpt-5.2" },
158+
0,
159+
[
160+
'{"type":"thread.started"}',
161+
'{"type":"turn.started"}',
162+
"MODEL_MATRIX_OK_8",
163+
'{"type":"turn.failed","error":{"message":"model execution failed"}}',
164+
].join("\n"),
165+
"MODEL_MATRIX_OK_8",
166+
false,
167+
true,
168+
),
169+
).toEqual(
170+
expect.objectContaining({
171+
ok: false,
172+
hasToken: true,
173+
completed: false,
174+
skipped: false,
175+
}),
176+
);
177+
});
178+
179+
it("downgrades unsupported capability failures to skipped cases", async () => {
154180
const mod = await import("../scripts/test-model-matrix.js");
155181
expect(
156182
mod.__finalizeModelCaseResultForTests(
@@ -168,6 +194,21 @@ describe("test-model-matrix script helpers", () => {
168194
}),
169195
);
170196

197+
expect(
198+
mod.__finalizeModelCaseResultForTests(
199+
{ model: "gpt-5.5-pro" },
200+
1,
201+
"{\"type\":\"turn.failed\",\"error\":{\"message\":\"The 'gpt-5.5-pro' model is not supported when using Codex with a ChatGPT account.\"}}",
202+
"MODEL_MATRIX_OK_12",
203+
),
204+
).toEqual(
205+
expect.objectContaining({
206+
ok: false,
207+
skipped: true,
208+
skipReason: "unsupported-model",
209+
}),
210+
);
211+
171212
expect(
172213
mod.__finalizeModelCaseResultForTests(
173214
{ model: "gpt-5.2" },
@@ -185,6 +226,42 @@ describe("test-model-matrix script helpers", () => {
185226
);
186227
});
187228

229+
it("keeps strict capability and full timeout failures red", async () => {
230+
const mod = await import("../scripts/test-model-matrix.js");
231+
232+
expect(
233+
mod.__finalizeModelCaseResultForTests(
234+
{ model: "gpt-5.5-pro" },
235+
1,
236+
"{\"type\":\"turn.failed\",\"error\":{\"message\":\"The 'gpt-5.5-pro' model is not supported when using Codex with a ChatGPT account.\"}}",
237+
"MODEL_MATRIX_OK_13",
238+
false,
239+
true,
240+
),
241+
).toEqual(
242+
expect.objectContaining({
243+
ok: false,
244+
skipped: false,
245+
skipReason: null,
246+
}),
247+
);
248+
249+
expect(
250+
mod.__finalizeModelCaseResultForTests(
251+
{ model: "gpt-5.2" },
252+
124,
253+
"Timed out after 120000ms",
254+
"MODEL_MATRIX_OK_14",
255+
),
256+
).toEqual(
257+
expect.objectContaining({
258+
ok: false,
259+
skipped: false,
260+
skipReason: null,
261+
}),
262+
);
263+
});
264+
188265
it("filters non-path where output on Windows", async () => {
189266
const platformSpy = vi
190267
.spyOn(process, "platform", "get")

0 commit comments

Comments
 (0)