run-gemini-cli/evals/gemini-plan-execute.eval.ts at 40c1dde21a5149023cdc2842796d7118c983d892 · google-github-actions/run-gemini-cli · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import { describe, expect, it, vi } from 'vitest';
import { TestRig } from './test-rig';
import { mkdirSync, copyFileSync, readFileSync } from 'node:fs';
import { join } from 'node:path';

/**
 * One evaluation case loaded from the plan-execute JSON dataset.
 */
interface ExecutionCase {
  /** Case identifier; used in the test title and in the `TestRig` name. */
  id: string;

  /** String key/value pairs handed to `rig.run` as its second argument. */
  inputs: Record<string, string>;

  /**
   * Tools of which at least one must show up in the tool logs, either as a
   * logged tool name (an `mcp_github_` prefix is ignored) or inside the
   * arguments of a `run_shell_command` call. An empty list skips the check.
   */
  expected_tools: string[];

  /**
   * Keywords searched for (case-insensitively) in the CLI output. A miss only
   * produces a warning; it does not fail the test.
   */
  expected_plan_keywords: string[];
}

// The dataset is a JSON fixture resolved relative to this file's directory.
const datasetPath = join(__dirname, 'data/gemini-plan-execute.json');
// Read the fixture eagerly at module load so the test list can be built from it.
const rawDataset = readFileSync(datasetPath, { encoding: 'utf-8' });
// NOTE(review): the parsed JSON is trusted to match ExecutionCase[]; it is not validated.
const dataset: ExecutionCase[] = JSON.parse(rawDataset);

/**
 * Eval suite for the `/gemini-plan-execute` custom command.
 *
 * One concurrent test is registered per entry in `dataset`. Each test:
 *  1. creates its own `TestRig` and calls `initGit()` and `setupMockMcp()`,
 *  2. copies the command definition into the rig's `.gemini/commands` folder,
 *  3. runs the CLI with `--prompt /gemini-plan-execute --yolo` and the case
 *     inputs,
 *  4. asserts that at least one expected tool was invoked,
 *  5. warns (without failing) when no expected keyword appears in the output,
 *  6. asserts the output is non-empty, and always calls `rig.cleanup()`.
 */
describe('Gemini Plan Execution Workflow', () => {
  // Prefix removed from logged tool names before comparing them with the
  // dataset's expected tool names. Hoisted because it is loop-invariant.
  const mcpGithubPrefix = /^mcp_github_/;

  // Fixed wait after the run so telemetry logs are flushed before reading them.
  const telemetryFlushDelayMs = 2000;

  for (const item of dataset) {
    // `expect` is taken from the per-test context instead of the module-level
    // import: vitest requires this for concurrent tests so that assertions are
    // attributed to the test that actually made them.
    it.concurrent(
      `should execute a specific plan: ${item.id}`,
      async ({ expect }) => {
        const rig = new TestRig(`plan-execute-${item.id}`);
        try {
          rig.initGit();
          rig.setupMockMcp();

          // Install the custom command into the rig's working directory.
          // The source path is relative, so it resolves against the process cwd.
          mkdirSync(join(rig.testDir, '.gemini/commands'), { recursive: true });
          copyFileSync(
            '.github/commands/gemini-plan-execute.toml',
            join(rig.testDir, '.gemini/commands/gemini-plan-execute.toml'),
          );

          const stdout = await rig.run(
            ['--prompt', '/gemini-plan-execute', '--yolo'],
            item.inputs,
          );

          // Add a small delay to ensure telemetry logs are flushed
          await new Promise((resolve) =>
            setTimeout(resolve, telemetryFlushDelayMs),
          );

          const toolCalls = rig.readToolLogs();
          const toolNames = toolCalls.map((c) => c.name);

          // 1. Structural check: at least one expected tool was used, either
          //    directly (by name, ignoring the `mcp_github_` prefix) or via a
          //    `run_shell_command` call whose args mention it. An empty
          //    expectation list passes unconditionally.
          const toolNamesStripped = toolNames.map((name) =>
            name.replace(mcpGithubPrefix, ''),
          );
          const hasSomeExpectedToolCalls =
            item.expected_tools.length === 0 ||
            item.expected_tools.some(
              (action) =>
                toolNamesStripped.includes(action) ||
                toolCalls.some(
                  (c) =>
                    c.name === 'run_shell_command' && c.args.includes(action),
                ),
            );

          if (!hasSomeExpectedToolCalls) {
            // Log what was actually called so a failure can be diagnosed.
            console.error(
              `Expected some of ${item.expected_tools} but got tools:`,
              toolNames,
            );
          }
          expect(hasSomeExpectedToolCalls).toBe(true);

          // 2. Content check (plan relevance): soft check only. Missing
          //    keywords produce a warning rather than a failure.
          const outputLower = stdout.toLowerCase();
          const foundKeywords = item.expected_plan_keywords.filter((kw) =>
            outputLower.includes(kw.toLowerCase()),
          );

          if (foundKeywords.length === 0) {
            console.warn(
              `Plan execution for ${item.id} didn't mention expected keywords in response. Output:`,
              stdout,
            );
          }

          expect(stdout.length).toBeGreaterThan(0);
        } finally {
          // Always clean up the rig, even when the run or an assertion throws.
          rig.cleanup();
        }
      },
    );
  }
});