|
| 1 | +package cmd |
| 2 | + |
| 3 | +import ( |
| 4 | + "context" |
| 5 | + "encoding/json" |
| 6 | + "fmt" |
| 7 | + "time" |
| 8 | + |
| 9 | + hawkconfig "github.com/GrayCodeAI/hawk/internal/config" |
| 10 | + "github.com/GrayCodeAI/hawk/internal/feature/eval" |
| 11 | + "github.com/GrayCodeAI/hawk/internal/types" |
| 12 | + "github.com/spf13/cobra" |
| 13 | +) |
| 14 | + |
| 15 | +var evalToolsOutput string |
| 16 | + |
| 17 | +func init() { |
| 18 | + evalToolsCmd.Flags().StringVarP(&evalToolsOutput, "output", "o", "markdown", "Output format: markdown, json") |
| 19 | + evalCmd.AddCommand(evalToolsCmd) |
| 20 | +} |
| 21 | + |
| 22 | +var evalToolsCmd = &cobra.Command{ |
| 23 | + Use: "tools", |
| 24 | + Short: "Evaluate tool selection: trigger confusion matrix + payload accuracy", |
| 25 | + Long: "Run a model-in-the-loop tool-use evaluation. Each case is a prompt with an " + |
| 26 | + "expected tool (or none). Triggering (did the model call a tool when it should) " + |
| 27 | + "is scored as a confusion matrix, separately from payload accuracy (right tool + args).", |
| 28 | + RunE: runEvalTools, |
| 29 | +} |
| 30 | + |
| 31 | +// defaultToolUseCases is a small built-in set exercising clear positive and |
| 32 | +// negative tool-trigger situations against hawk's standard tools. |
| 33 | +func defaultToolUseCases() []eval.ToolUseCase { |
| 34 | + return []eval.ToolUseCase{ |
| 35 | + { |
| 36 | + ID: "read-existing-file", |
| 37 | + Prompt: "Show me the contents of go.mod.", |
| 38 | + Expected: &eval.ExpectedCall{Tool: "Read"}, |
| 39 | + }, |
| 40 | + { |
| 41 | + ID: "list-directory", |
| 42 | + Prompt: "What files are in the cmd directory?", |
| 43 | + Expected: &eval.ExpectedCall{Tool: "LS"}, |
| 44 | + }, |
| 45 | + { |
| 46 | + ID: "run-command", |
| 47 | + Prompt: "Run the test suite for this project.", |
| 48 | + Expected: &eval.ExpectedCall{Tool: "Bash"}, |
| 49 | + }, |
| 50 | + { |
| 51 | + ID: "search-code", |
| 52 | + Prompt: "Find every place that defines an http handler in this repo.", |
| 53 | + Expected: &eval.ExpectedCall{Tool: "Grep"}, |
| 54 | + }, |
| 55 | + { |
| 56 | + // Negative case: a pure-knowledge question needs no tool. |
| 57 | + ID: "no-tool-trivia", |
| 58 | + Prompt: "In one sentence, what does the SOLID 'S' stand for?", |
| 59 | + Expected: nil, |
| 60 | + }, |
| 61 | + { |
| 62 | + // Negative case: a greeting needs no tool. |
| 63 | + ID: "no-tool-greeting", |
| 64 | + Prompt: "Say hello.", |
| 65 | + Expected: nil, |
| 66 | + }, |
| 67 | + } |
| 68 | +} |
| 69 | + |
| 70 | +func runEvalTools(cmd *cobra.Command, _ []string) error { |
| 71 | + settings := hawkconfig.LoadSettings() |
| 72 | + |
| 73 | + registry, err := defaultRegistry(settings) |
| 74 | + if err != nil { |
| 75 | + return fmt.Errorf("building tool registry: %w", err) |
| 76 | + } |
| 77 | + systemPrompt, err := buildSystemPrompt() |
| 78 | + if err != nil { |
| 79 | + return err |
| 80 | + } |
| 81 | + model, provider := effectiveModelAndProvider(settings) |
| 82 | + sess := newHawkSession(settings, provider, model, systemPrompt, registry) |
| 83 | + if err := configureSession(sess, settings); err != nil { |
| 84 | + return err |
| 85 | + } |
| 86 | + |
| 87 | + tools := registry.EyrieTools() |
| 88 | + |
| 89 | + // caller performs one tool-aware turn and reports the first tool the model |
| 90 | + // chose (if any). It does not execute the tool — we are scoring selection, |
| 91 | + // not effects. |
| 92 | + caller := func(ctx context.Context, c eval.ToolUseCase) (eval.ObservedCall, error) { |
| 93 | + resp, err := sess.Chat(ctx, []types.EyrieMessage{ |
| 94 | + {Role: "user", Content: c.Prompt}, |
| 95 | + }, types.ChatOptions{Model: model, Tools: tools}) |
| 96 | + if err != nil { |
| 97 | + return eval.ObservedCall{}, err |
| 98 | + } |
| 99 | + if resp == nil || len(resp.ToolCalls) == 0 { |
| 100 | + return eval.ObservedCall{}, nil // no tool called |
| 101 | + } |
| 102 | + tc := resp.ToolCalls[0] |
| 103 | + return eval.ObservedCall{Tool: tc.Name, Args: tc.Arguments}, nil |
| 104 | + } |
| 105 | + |
| 106 | + ctx, cancel := context.WithTimeout(cmd.Context(), 10*time.Minute) |
| 107 | + defer cancel() |
| 108 | + |
| 109 | + cmd.Printf("Evaluating tool selection on %d cases with model %s...\n", len(defaultToolUseCases()), model) |
| 110 | + report := eval.ScoreToolUse(ctx, defaultToolUseCases(), caller) |
| 111 | + |
| 112 | + switch evalToolsOutput { |
| 113 | + case "json": |
| 114 | + data, _ := json.MarshalIndent(report, "", " ") |
| 115 | + cmd.Println(string(data)) |
| 116 | + default: |
| 117 | + cmd.Println(report.Markdown()) |
| 118 | + } |
| 119 | + return nil |
| 120 | +} |
0 commit comments