-
Notifications
You must be signed in to change notification settings - Fork 253
Expand file tree
/
Copy pathgemini-plan-execute.eval.ts
More file actions
81 lines (69 loc) · 2.58 KB
/
gemini-plan-execute.eval.ts
File metadata and controls
81 lines (69 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import { describe, expect, it, vi } from 'vitest';
import { TestRig } from './test-rig';
import { mkdirSync, copyFileSync, readFileSync } from 'node:fs';
import { join } from 'node:path';
/**
 * Shape of a single entry in the plan-execution eval dataset
 * (`data/gemini-plan-execute.json`).
 */
interface ExecutionCase {
  /** Case identifier; interpolated into the test title and the rig name. */
  id: string;
  /** Key/value inputs handed to `rig.run` as its second argument. */
  inputs: Record<string, string>;
  /**
   * Tool/action names of which at least one must show up in the tool logs.
   * An empty list disables that check.
   */
  expected_tools: Array<string>;
  /** Keywords searched for, case-insensitively, in the run's stdout. */
  expected_plan_keywords: Array<string>;
}
/**
 * Parses the raw dataset text and verifies the parts of its shape that the
 * suite depends on structurally, so that a malformed dataset fails at load
 * time with a message naming the file and the offending case instead of a
 * `TypeError` deep inside a test body.
 *
 * Only the top-level array and the two array-valued fields are checked (the
 * suite calls array methods on them); `id` and `inputs` are not verified.
 *
 * @param raw - Contents of the dataset file.
 * @param source - Path of the dataset file, used only in error messages.
 * @returns The parsed cases.
 * @throws SyntaxError if `raw` is not valid JSON.
 * @throws Error if the top level is not an array, a case is not an object,
 *   or `expected_tools` / `expected_plan_keywords` is not an array.
 */
function parseExecutionCases(raw: string, source: string): ExecutionCase[] {
  // Keep the result as `unknown` until it has been narrowed; JSON.parse
  // itself returns `any`, which would bypass type checking entirely.
  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error(`${source}: expected a JSON array of eval cases`);
  }
  parsed.forEach((entry: unknown, index: number) => {
    if (typeof entry !== 'object' || entry === null) {
      throw new Error(`${source}: case #${index} is not an object`);
    }
    const fields = entry as Record<string, unknown>;
    for (const key of ['expected_tools', 'expected_plan_keywords']) {
      if (!Array.isArray(fields[key])) {
        throw new Error(
          `${source}: case #${index} must define "${key}" as an array`,
        );
      }
    }
  });
  // The structural checks above cover what the suite dereferences; the
  // remaining fields are trusted to match the interface.
  return parsed as ExecutionCase[];
}

// Location of the eval dataset, resolved relative to this file.
const datasetPath = join(__dirname, 'data/gemini-plan-execute.json');
// Cases are loaded eagerly at module load so tests can be registered per case.
const dataset: ExecutionCase[] = parseExecutionCases(
  readFileSync(datasetPath, 'utf-8'),
  datasetPath,
);
/**
 * Eval suite for the `/gemini-plan-execute` command.
 *
 * One concurrent test is registered per dataset case. Each test:
 *   - creates a `TestRig`, initialises git and the mock MCP setup on it,
 *   - copies the command TOML into the rig's `.gemini/commands` directory,
 *   - runs the rig with the `/gemini-plan-execute` prompt and the case inputs,
 *   - waits a fixed delay, then reads the rig's tool logs,
 *   - asserts that at least one expected tool was observed (skipped when the
 *     case lists no expected tools),
 *   - warns (without failing) when no expected keyword appears in stdout,
 *   - asserts that stdout is non-empty,
 *   - always calls `rig.cleanup()`, even when an assertion fails.
 */
describe('Gemini Plan Execution Workflow', () => {
  // Fixed wait after the run so telemetry logs have a chance to be flushed.
  const TELEMETRY_FLUSH_DELAY_MS = 2000;
  const sleep = (ms: number) =>
    new Promise<void>((resolve) => setTimeout(resolve, ms));

  dataset.forEach((item) => {
    it.concurrent(`should execute a specific plan: ${item.id}`, async () => {
      const rig = new TestRig(`plan-execute-${item.id}`);
      try {
        rig.initGit();
        rig.setupMockMcp();

        // Install the command definition under the rig's working directory.
        const commandsDir = join(rig.testDir, '.gemini/commands');
        mkdirSync(commandsDir, { recursive: true });
        copyFileSync(
          '.github/commands/gemini-plan-execute.toml',
          join(commandsDir, 'gemini-plan-execute.toml'),
        );

        const stdout = await rig.run(
          ['--prompt', '/gemini-plan-execute', '--yolo'],
          item.inputs,
        );

        await sleep(TELEMETRY_FLUSH_DELAY_MS);

        const invocations = rig.readToolLogs();
        const invokedNames = invocations.map((call) => call.name);

        // 1. Structural check: an expected action counts as observed when it
        //    equals a tool name (with any leading `mcp_github_` removed) or
        //    when the `args` of a `run_shell_command` call include it.
        const bareNames = invokedNames.map((name) =>
          name.replace(/^mcp_github_/, ''),
        );
        const wasObserved = (action: string) =>
          bareNames.includes(action) ||
          invocations.some(
            (call) =>
              call.name === 'run_shell_command' && call.args.includes(action),
          );
        const sawExpectedTool =
          item.expected_tools.length === 0 ||
          item.expected_tools.some((action) => wasObserved(action));

        if (!sawExpectedTool) {
          console.error(
            `Expected some of ${item.expected_tools} but got tools:`,
            invokedNames,
          );
        }
        expect(sawExpectedTool).toBe(true);

        // 2. Content check (plan relevance): a missing keyword only produces
        //    a warning; the hard requirement is merely non-empty output.
        const haystack = stdout.toLowerCase();
        const mentioned = item.expected_plan_keywords
          .map((keyword) => keyword.toLowerCase())
          .filter((needle) => haystack.includes(needle));

        if (mentioned.length === 0) {
          console.warn(
            `Plan execution for ${item.id} didn't mention expected keywords in response. Output:`,
            stdout,
          );
        }
        expect(stdout.length).toBeGreaterThan(0);
      } finally {
        rig.cleanup();
      }
    });
  });
});