Skip to content

Commit faac61d

Browse files
zyzyzyryxy and Piotr Paulski authored
test: fix fill_form eval (#2011)
Improve fill_form eval by checking if fill_form was the only tool called for the task and provide more descriptive failure messages.

Co-authored-by: Piotr Paulski <piotrpaulski@chromium.org>
1 parent a0d6ea1 commit faac61d

1 file changed

Lines changed: 25 additions & 4 deletions

File tree

scripts/eval_scenarios/fill_select_and_checkboxes_test.ts

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import type {TestScenario} from '../eval_gemini.ts';
1111
export const scenario: TestScenario = {
1212
prompt:
1313
'Go to <TEST_URL>, fill the form with size = 2 CPUs and components = [docker, nginx].',
14-
maxTurns: 3,
14+
maxTurns: 4, // allow for at least one extra turn to verify there are no extra clicks after fill_form
1515
htmlRoute: {
1616
path: '/input_test.html',
1717
htmlContent: `
@@ -41,7 +41,7 @@ export const scenario: TestScenario = {
4141
`,
4242
},
4343
expectations: calls => {
44-
assert.strictEqual(calls.length, 3);
44+
assert.ok(calls.length >= 3, 'Not enough calls made');
4545
assert.ok(
4646
calls[0].name === 'navigate_page' || calls[0].name === 'new_page',
4747
);
@@ -52,7 +52,11 @@ export const scenario: TestScenario = {
5252
uid: string;
5353
value: string;
5454
}>;
55-
assert.strictEqual(elements.length, 3);
55+
assert.strictEqual(
56+
elements.length,
57+
3,
58+
'fill_form should be used with all form elements at once',
59+
);
5660

5761
const uids = new Set(elements.map(e => e.uid));
5862
assert.strictEqual(
@@ -62,6 +66,23 @@ export const scenario: TestScenario = {
6266
);
6367

6468
const values = elements.map(e => e.value).sort();
65-
assert.deepStrictEqual(values, ['2 vCPU, 4GB RAM', 'true', 'true']);
69+
assert.deepStrictEqual(
70+
values,
71+
['2 vCPU, 4GB RAM', 'true', 'true'],
72+
'fill_form should be used with correct values',
73+
);
74+
75+
const submitUid = '1_15';
76+
77+
const extraFormInteractions = calls
78+
.slice(3)
79+
.filter(
80+
c => ['fill', 'click'].includes(c.name) && c.args.uid !== submitUid,
81+
);
82+
assert.deepEqual(
83+
extraFormInteractions.length,
84+
0,
85+
'No extra clicks and fills after fill_form',
86+
);
6687
},
6788
};

0 commit comments

Comments
 (0)