Skip to content

Commit 8e8e83e

Browse files
samiyac and Samiya Caur authored
test: update all the eval scenarios to work with experimentalPageIdRouting flag (#2085)
Refs #2052. This change updates all the eval scenarios so that they work both with and without the experimentalPageIdRouting flag. Co-authored-by: Samiya Caur <samiyac@chromium.org>
1 parent 6992106 commit 8e8e83e

21 files changed

Lines changed: 362 additions & 196 deletions

scripts/eval_gemini.ts

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,23 +19,10 @@ const ROOT_DIR = path.resolve(import.meta.dirname, '..');
1919
const SCENARIOS_DIR = path.join(import.meta.dirname, 'eval_scenarios');
2020
const SKILL_PATH = path.join(ROOT_DIR, 'skills', 'chrome-devtools', 'SKILL.md');
2121

22-
// Define schema for our test scenarios
23-
export interface CapturedFunctionCall {
24-
name: string;
25-
args: Record<string, unknown>;
26-
}
27-
28-
export interface TestScenario {
29-
prompt: string;
30-
maxTurns: number;
31-
expectations: (calls: CapturedFunctionCall[]) => void;
32-
htmlRoute?: {
33-
path: string;
34-
htmlContent: string;
35-
};
36-
/** Extra CLI flags passed to the MCP server (e.g. '--experimental-page-id-routing'). */
37-
serverArgs?: string[];
38-
}
22+
import type {CapturedFunctionCall, TestScenario} from './eval_result.ts';
23+
import {Result} from './eval_result.ts';
24+
export type {CapturedFunctionCall, TestScenario};
25+
export {Result};
3926

4027
async function loadScenario(scenarioPath: string): Promise<TestScenario> {
4128
const module = await import(pathToFileURL(scenarioPath).href);
@@ -54,6 +41,7 @@ async function runSingleScenario(
5441
modelId: string,
5542
debug: boolean,
5643
includeSkill: boolean,
44+
extraServerArgs: string[] = [],
5745
): Promise<void> {
5846
const debugLog = (...args: unknown[]) => {
5947
if (debug) {
@@ -125,6 +113,9 @@ async function runSingleScenario(
125113
if (scenario.serverArgs) {
126114
args.push(...scenario.serverArgs);
127115
}
116+
if (extraServerArgs.length > 0) {
117+
args.push(...extraServerArgs);
118+
}
128119

129120
transport = new StdioClientTransport({
130121
command: 'node',
@@ -175,7 +166,7 @@ async function runSingleScenario(
175166
debugLog(`\n--- Response ---\n${result.text}`);
176167

177168
debugLog('\nVerifying expectations...');
178-
scenario.expectations(allCalls);
169+
scenario.expectations(new Result(allCalls, args));
179170
} finally {
180171
try {
181172
await client?.close();
@@ -195,7 +186,7 @@ async function main() {
195186
options: {
196187
model: {
197188
type: 'string',
198-
default: 'gemini-2.5-flash',
189+
default: 'gemini-3-flash-preview',
199190
},
200191
debug: {
201192
type: 'boolean',
@@ -209,6 +200,9 @@ async function main() {
209200
type: 'boolean',
210201
default: false,
211202
},
203+
'server-args': {
204+
type: 'string',
205+
},
212206
},
213207
allowPositionals: true,
214208
});
@@ -217,6 +211,9 @@ async function main() {
217211
const debug = values.debug;
218212
const repeat = values.repeat;
219213
const includeSkill = values['include-skill'];
214+
const extraServerArgs = values['server-args']
215+
? values['server-args'].split(/\s+/)
216+
: [];
220217

221218
const scenarioFiles =
222219
positionals.length > 0
@@ -248,6 +245,7 @@ async function main() {
248245
modelId,
249246
debug,
250247
includeSkill,
248+
extraServerArgs,
251249
);
252250
console.log(`✔ ${path.relative(ROOT_DIR, scenarioPath)} (Run ${i})`);
253251
successCount++;

scripts/eval_result.ts

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
import assert from 'node:assert';
8+
9+
/** A single function (tool) call recorded while a scenario ran. */
export interface CapturedFunctionCall {
  /** Name of the called function, e.g. 'navigate_page'. */
  name: string;
  /** Arguments the function was called with, keyed by parameter name. */
  args: Record<string, unknown>;
}

/**
 * Cursor over the calls captured during a scenario run.
 *
 * Scenario expectations consume calls front to back via
 * {@link Result.consumePageNavigation} and {@link Result.assertNextCall};
 * {@link Result.remainingCalls} exposes whatever has not been consumed yet.
 */
export class Result {
  /** Index into `calls` of the next call that has not been consumed. */
  private nextCallIndex = 0;
  public readonly calls: CapturedFunctionCall[];
  public readonly serverArgs: string[];

  /**
   * @param calls All captured calls, in the order they were made.
   * @param serverArgs CLI arguments the server was started with.
   */
  constructor(calls: CapturedFunctionCall[], serverArgs: string[]) {
    this.calls = calls;
    this.serverArgs = serverArgs;
  }

  /** True when the server was started with '--experimental-page-id-routing'. */
  get hasPageIdRouting(): boolean {
    return this.serverArgs.includes('--experimental-page-id-routing');
  }

  /** A fresh array containing the calls that have not been consumed yet. */
  get remainingCalls(): CapturedFunctionCall[] {
    return this.calls.slice(this.nextCallIndex);
  }

  /**
   * Consumes initial page navigation/setup boilerplate.
   * - Skips every leading list_pages call (zero or more).
   * - Asserts that the next call is new_page or navigate_page and consumes it.
   *
   * @returns The expected active pageId when page-id routing is enabled,
   *     otherwise `undefined`.
   * @throws AssertionError if no navigation call follows the list_pages calls.
   */
  consumePageNavigation(): number | undefined {
    // A loop rather than a single check: more than one list_pages call may
    // precede the navigation, and all of them are setup boilerplate.
    while (this.calls[this.nextCallIndex]?.name === 'list_pages') {
      this.nextCallIndex++;
    }

    const navCall = this.calls[this.nextCallIndex];
    assert.ok(
      navCall &&
        (navCall.name === 'new_page' || navCall.name === 'navigate_page'),
      `Expected navigation call (new_page or navigate_page), but got: ${navCall?.name ?? 'none'}`,
    );
    this.nextCallIndex++;

    if (!this.hasPageIdRouting) {
      return undefined;
    }
    // NOTE(review): hard-coded ids; presumably the pre-existing page is 1 and
    // the first page opened via new_page is 2 - confirm against the server.
    return navCall.name === 'new_page' ? 2 : 1;
  }

  /**
   * Asserts that the next unconsumed call has the given name and, for every
   * key in `expectedArgs`, a deeply-equal argument value. Keys not listed in
   * `expectedArgs` are not checked. An expected value of `undefined` matches
   * an absent argument.
   *
   * The call is consumed only when all assertions pass.
   *
   * @param name Expected function name.
   * @param expectedArgs Subset of arguments to verify.
   * @returns The matched call, so callers can inspect further arguments.
   * @throws AssertionError if there is no next call or it does not match.
   */
  assertNextCall(
    name: string,
    expectedArgs?: Record<string, unknown>,
  ): CapturedFunctionCall {
    const call = this.calls[this.nextCallIndex];
    assert.ok(
      call,
      `Expected call at index ${this.nextCallIndex} (name: '${name}') to exist`,
    );
    assert.strictEqual(
      call.name,
      name,
      `Expected call at index ${this.nextCallIndex} to be '${name}', but got '${call.name}'`,
    );

    if (expectedArgs) {
      for (const [key, value] of Object.entries(expectedArgs)) {
        assert.deepStrictEqual(
          call.args[key],
          value,
          `Expected argument '${key}' on call '${name}' to be ${JSON.stringify(value)}, got ${JSON.stringify(call.args[key])}`,
        );
      }
    }

    this.nextCallIndex++;
    return call;
  }
}

/** Definition of one eval scenario. */
export interface TestScenario {
  /** Prompt for the scenario. */
  prompt: string;
  /** Turn limit for the scenario. */
  maxTurns: number;
  /** Verifies the captured calls; expected to throw on a failed expectation. */
  expectations: (result: Result) => void;
  /** Optional HTML page for the scenario: a route path and its content. */
  htmlRoute?: {
    path: string;
    htmlContent: string;
  };
  /** Extra CLI flags passed to the MCP server (e.g. '--experimental-page-id-routing'). */
  serverArgs?: string[];
}

scripts/eval_scenarios/console_test.ts

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import type {TestScenario} from '../eval_gemini.ts';
1010

1111
export const scenario: TestScenario = {
1212
prompt: 'Navigate to <TEST_URL> and check the console messages.',
13-
maxTurns: 2,
13+
maxTurns: 3,
1414
htmlRoute: {
1515
path: '/console_test.html',
1616
htmlContent: `
@@ -20,16 +20,12 @@ export const scenario: TestScenario = {
2020
</script>
2121
`,
2222
},
23-
expectations: calls => {
24-
assert.strictEqual(calls.length, 2);
25-
assert.ok(
26-
calls[0].name === 'navigate_page' || calls[0].name === 'new_page',
27-
'First call should be navigation',
28-
);
29-
assert.strictEqual(
30-
calls[1].name,
23+
expectations: result => {
24+
const pageId = result.consumePageNavigation();
25+
assert.strictEqual(result.remainingCalls.length, 1);
26+
result.assertNextCall(
3127
'list_console_messages',
32-
'Second call should be list_console_messages',
28+
result.hasPageIdRouting ? {pageId} : undefined,
3329
);
3430
},
3531
};

scripts/eval_scenarios/emulation_test.ts

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@ import type {TestScenario} from '../eval_gemini.ts';
1111
export const scenario: TestScenario = {
1212
prompt: 'Emulate offline network conditions.',
1313
maxTurns: 2,
14-
expectations: calls => {
15-
assert.strictEqual(calls.length, 1);
16-
assert.strictEqual(calls[0].name, 'emulate');
17-
assert.strictEqual(calls[0].args.networkConditions, 'Offline');
14+
expectations: result => {
15+
assert.ok(result.remainingCalls.length >= 1);
16+
result.assertNextCall('list_pages');
17+
result.assertNextCall('emulate', {
18+
networkConditions: 'Offline',
19+
pageId: result.hasPageIdRouting ? 1 : undefined,
20+
});
1821
},
1922
};

scripts/eval_scenarios/emulation_userAgent_test.ts

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,20 @@ import assert from 'node:assert';
99
import type {TestScenario} from '../eval_gemini.ts';
1010

1111
export const scenario: TestScenario = {
12-
prompt: 'Emulate iPhone 14 user agent',
12+
prompt: 'Emulate current page with iPhone 14 user agent',
1313
maxTurns: 2,
14-
expectations: calls => {
15-
assert.strictEqual(calls.length, 1);
16-
assert.strictEqual(calls[0].name, 'emulate');
17-
assert.deepStrictEqual(
18-
calls[0].args.userAgent,
19-
'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1',
20-
);
14+
expectations: result => {
15+
assert.ok(result.remainingCalls.length >= 1);
16+
if (
17+
result.hasPageIdRouting ||
18+
result.remainingCalls[0]?.name === 'list_pages'
19+
) {
20+
result.assertNextCall('list_pages');
21+
}
22+
result.assertNextCall('emulate', {
23+
userAgent:
24+
'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1',
25+
pageId: result.hasPageIdRouting ? 1 : undefined,
26+
});
2127
},
2228
};

scripts/eval_scenarios/emulation_viewport_test.ts

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,19 @@ import assert from 'node:assert';
99
import type {TestScenario} from '../eval_gemini.ts';
1010

1111
export const scenario: TestScenario = {
12-
prompt: 'Emulate iPhone 14 viewport',
12+
prompt: 'Emulate current page with iPhone 14 viewport',
1313
maxTurns: 2,
14-
expectations: calls => {
15-
assert.strictEqual(calls.length, 1);
16-
assert.strictEqual(calls[0].name, 'emulate');
17-
assert.deepStrictEqual(calls[0].args.viewport, '390x844x3,mobile,touch');
14+
expectations: result => {
15+
assert.ok(result.remainingCalls.length >= 1);
16+
if (
17+
result.hasPageIdRouting ||
18+
result.remainingCalls[0]?.name === 'list_pages'
19+
) {
20+
result.assertNextCall('list_pages');
21+
}
22+
result.assertNextCall('emulate', {
23+
viewport: '390x844x3,mobile,touch',
24+
pageId: result.hasPageIdRouting ? 1 : undefined,
25+
});
1826
},
1927
};

scripts/eval_scenarios/fill_select_and_checkboxes_test.ts

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import type {TestScenario} from '../eval_gemini.ts';
1111
export const scenario: TestScenario = {
1212
prompt:
1313
'Go to <TEST_URL>, fill the form with size = 2 CPUs and components = [docker, nginx].',
14-
maxTurns: 4, // allow for at least one extra turn to verify there are no extra clicks after fill_form
14+
maxTurns: 5, // allow for at least one extra turn to verify there are no extra clicks after fill_form
1515
htmlRoute: {
1616
path: '/input_test.html',
1717
htmlContent: `
@@ -40,32 +40,42 @@ export const scenario: TestScenario = {
4040
</form>
4141
`,
4242
},
43-
expectations: calls => {
44-
assert.ok(calls.length >= 3, 'Not enough calls made');
45-
assert.ok(
46-
calls[0].name === 'navigate_page' || calls[0].name === 'new_page',
43+
expectations: result => {
44+
const pageId = result.consumePageNavigation();
45+
assert.ok(result.remainingCalls.length >= 2, 'Not enough calls made');
46+
result.assertNextCall(
47+
'take_snapshot',
48+
result.hasPageIdRouting ? {pageId} : undefined,
49+
);
50+
const fillFormCall = result.assertNextCall(
51+
'fill_form',
52+
result.hasPageIdRouting ? {pageId} : undefined,
4753
);
48-
assert.strictEqual(calls[1].name, 'take_snapshot');
49-
assert.strictEqual(calls[2].name, 'fill_form');
5054

51-
const elements = calls[2].args.elements as Array<{
52-
uid: string;
53-
value: string;
54-
}>;
55+
const elements = fillFormCall.args.elements;
56+
assert.ok(Array.isArray(elements), 'elements should be an array');
5557
assert.strictEqual(
5658
elements.length,
5759
3,
5860
'fill_form should be used with all form elements at once',
5961
);
6062

61-
const uids = new Set(elements.map(e => e.uid));
63+
const typedElements = elements.map(e => {
64+
assert.ok(e && typeof e === 'object' && 'uid' in e && 'value' in e);
65+
return {
66+
uid: String(e.uid),
67+
value: String(e.value),
68+
};
69+
});
70+
71+
const uids = new Set(typedElements.map(e => e.uid));
6272
assert.strictEqual(
6373
uids.size,
6474
3,
6575
'fill_form should target three distinct elements',
6676
);
6777

68-
const values = elements.map(e => e.value).sort();
78+
const values = typedElements.map(e => e.value).sort();
6979
assert.deepStrictEqual(
7080
values,
7181
['2 vCPU, 4GB RAM', 'true', 'true'],
@@ -74,11 +84,9 @@ export const scenario: TestScenario = {
7484

7585
const submitUid = '1_15';
7686

77-
const extraFormInteractions = calls
78-
.slice(3)
79-
.filter(
80-
c => ['fill', 'click'].includes(c.name) && c.args.uid !== submitUid,
81-
);
87+
const extraFormInteractions = result.remainingCalls.filter(
88+
c => ['fill', 'click'].includes(c.name) && c.args.uid !== submitUid,
89+
);
8290
assert.deepEqual(
8391
extraFormInteractions.length,
8492
0,

0 commit comments

Comments
 (0)