From 0052ed43d8b3a152bc7d4f5c37cf32df6ebd1448 Mon Sep 17 00:00:00 2001 From: Patel230 Date: Sun, 7 Jun 2026 23:53:58 +0530 Subject: [PATCH 1/3] feat: adopt MiniMax-AI OSS patterns into hawk Verified-net-new improvements identified by scanning MiniMax-AI's open-source repos and adversarially mapping ideas onto hawk's actual source. Patterns only, no code copied (all sources MIT). - exit-code taxonomy: internal/hawkerr/exitcode.go classifies errors into stable codes (auth/network/ratelimit/timeout/...) so callers can branch on the failure reason instead of a bare 1; wired into cmd/hawk/main.go - IsTerminal color gate: suppress ANSI/Unicode when stdout is piped (non-TTY), fixing ANSI leaking into captured/JSON output; FORCE_COLOR still overrides - MCP client honors isError: parseToolCallResult surfaces remote tool failures as errors the agent can self-correct on - batch web search/browse: queries[]/urls[] concurrent fan-out (bounded) in WebSearch and AgenticFetch; single query/url still works; adds a relevance-refusal contract to the AgenticFetch sub-agent prompt - RiskLevel -> MCP annotations: ToolAnnotations (readOnly/destructive hints) on the MCP server's tools/list so clients can self-throttle - hawk mcp serve / hawk mcp config: run hawk as an MCP server (machinery existed but had no entrypoint) and emit the client registration JSON block - tool-call confusion-matrix eval: internal/feature/eval/toolmatrix.go scores trigger (TP/FN/FP/TN, precision/recall/F1) separately from payload accuracy; exposed via `hawk eval tools` All new code is unit-tested; touched packages build, vet, and test clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/eval_tools.go | 120 ++++++++++++ cmd/formatter.go | 28 ++- cmd/formatter_test.go | 38 ++++ cmd/hawk/main.go | 7 +- cmd/mcp_serve.go | 106 +++++++++++ cmd/mcp_serve_test.go | 73 ++++++++ cmd/root.go | 5 +- internal/feature/eval/toolmatrix.go | 226 +++++++++++++++++++++++ internal/feature/eval/toolmatrix_test.go | 139 ++++++++++++++ internal/hawkerr/exitcode.go | 122 ++++++++++++ internal/hawkerr/exitcode_test.go | 54 ++++++ internal/mcp/mcp.go | 19 ++ internal/mcp/server.go | 17 ++ internal/mcp/server_test.go | 69 +++++++ internal/mcp/server_tools.go | 30 ++- internal/mcp/toolresult_test.go | 64 +++++++ internal/tool/agentic_fetch.go | 86 +++++++-- internal/tool/agentic_fetch_test.go | 87 +++++++++ internal/tool/web_search.go | 92 +++++++-- internal/tool/web_test.go | 4 +- 20 files changed, 1348 insertions(+), 38 deletions(-) create mode 100644 cmd/eval_tools.go create mode 100644 cmd/mcp_serve.go create mode 100644 cmd/mcp_serve_test.go create mode 100644 internal/feature/eval/toolmatrix.go create mode 100644 internal/feature/eval/toolmatrix_test.go create mode 100644 internal/hawkerr/exitcode.go create mode 100644 internal/hawkerr/exitcode_test.go create mode 100644 internal/mcp/toolresult_test.go create mode 100644 internal/tool/agentic_fetch_test.go diff --git a/cmd/eval_tools.go b/cmd/eval_tools.go new file mode 100644 index 00000000..9a207816 --- /dev/null +++ b/cmd/eval_tools.go @@ -0,0 +1,120 @@ +package cmd + +import ( + "context" + "encoding/json" + "fmt" + "time" + + hawkconfig "github.com/GrayCodeAI/hawk/internal/config" + "github.com/GrayCodeAI/hawk/internal/feature/eval" + "github.com/GrayCodeAI/hawk/internal/types" + "github.com/spf13/cobra" +) + +var evalToolsOutput string + +func init() { + evalToolsCmd.Flags().StringVarP(&evalToolsOutput, "output", "o", "markdown", "Output format: markdown, json") + evalCmd.AddCommand(evalToolsCmd) +} + +var evalToolsCmd = &cobra.Command{ + Use: "tools", + Short: "Evaluate tool selection: trigger confusion matrix + payload accuracy", + Long: "Run a model-in-the-loop tool-use evaluation. Each case is a prompt with an " + + "expected tool (or none). Triggering (did the model call a tool when it should) " + + "is scored as a confusion matrix, separately from payload accuracy (right tool + args).", + RunE: runEvalTools, +} + +// defaultToolUseCases is a small built-in set exercising clear positive and +// negative tool-trigger situations against hawk's standard tools. +func defaultToolUseCases() []eval.ToolUseCase { + return []eval.ToolUseCase{ + { + ID: "read-existing-file", + Prompt: "Show me the contents of go.mod.", + Expected: &eval.ExpectedCall{Tool: "Read"}, + }, + { + ID: "list-directory", + Prompt: "What files are in the cmd directory?", + Expected: &eval.ExpectedCall{Tool: "LS"}, + }, + { + ID: "run-command", + Prompt: "Run the test suite for this project.", + Expected: &eval.ExpectedCall{Tool: "Bash"}, + }, + { + ID: "search-code", + Prompt: "Find every place that defines an http handler in this repo.", + Expected: &eval.ExpectedCall{Tool: "Grep"}, + }, + { + // Negative case: a pure-knowledge question needs no tool. + ID: "no-tool-trivia", + Prompt: "In one sentence, what does the SOLID 'S' stand for?", + Expected: nil, + }, + { + // Negative case: a greeting needs no tool. + ID: "no-tool-greeting", + Prompt: "Say hello.", + Expected: nil, + }, + } +} + +func runEvalTools(cmd *cobra.Command, _ []string) error { + settings := hawkconfig.LoadSettings() + + registry, err := defaultRegistry(settings) + if err != nil { + return fmt.Errorf("building tool registry: %w", err) + } + systemPrompt, err := buildSystemPrompt() + if err != nil { + return err + } + model, provider := effectiveModelAndProvider(settings) + sess := newHawkSession(settings, provider, model, systemPrompt, registry) + if err := configureSession(sess, settings); err != nil { + return err + } + + tools := registry.EyrieTools() + + // caller performs one tool-aware turn and reports the first tool the model + // chose (if any). It does not execute the tool — we are scoring selection, + // not effects. + caller := func(ctx context.Context, c eval.ToolUseCase) (eval.ObservedCall, error) { + resp, err := sess.Chat(ctx, []types.EyrieMessage{ + {Role: "user", Content: c.Prompt}, + }, types.ChatOptions{Model: model, Tools: tools}) + if err != nil { + return eval.ObservedCall{}, err + } + if resp == nil || len(resp.ToolCalls) == 0 { + return eval.ObservedCall{}, nil // no tool called + } + tc := resp.ToolCalls[0] + return eval.ObservedCall{Tool: tc.Name, Args: tc.Arguments}, nil + } + + ctx, cancel := context.WithTimeout(cmd.Context(), 10*time.Minute) + defer cancel() + + cmd.Printf("Evaluating tool selection on %d cases with model %s...\n", len(defaultToolUseCases()), model) + report := eval.ScoreToolUse(ctx, defaultToolUseCases(), caller) + + switch evalToolsOutput { + case "json": + data, _ := json.MarshalIndent(report, "", " ") + cmd.Println(string(data)) + default: + cmd.Println(report.Markdown()) + } + return nil +} diff --git a/cmd/formatter.go b/cmd/formatter.go index 17499057..a582061f 100644 --- a/cmd/formatter.go +++ b/cmd/formatter.go @@ -6,8 +6,19 @@ import ( "strings" "sync" "time" + + "golang.org/x/term" ) +// stdoutIsTerminal reports whether stdout is connected to a terminal (TTY). +// When stdout is a pipe or file — which is exactly the case when an agent or +// shell script captures hawk's output — this is false, and color/Unicode +// chrome must be suppressed so the payload stays clean. It is a var so tests +// can override it. +var stdoutIsTerminal = func() bool { + return term.IsTerminal(int(os.Stdout.Fd())) +} + // TreeNode represents a node in a tree structure for FormatTree. type TreeNode struct { Name string @@ -544,9 +555,16 @@ func DetectColorSupport() bool { return false } + // Stdout is not a TTY (piped to an agent, file, or another process): + // suppress ANSI so the captured output is clean. An explicit FORCE_COLOR + // above already overrode this for callers that pipe but still want color. + if !stdoutIsTerminal() { + return false + } + // Check TERM - term := os.Getenv("TERM") - if term == "dumb" || term == "" { + t := os.Getenv("TERM") + if t == "dumb" || t == "" { return false } @@ -555,6 +573,12 @@ func DetectColorSupport() bool { // DetectUnicodeSupport checks if the terminal supports Unicode characters. func DetectUnicodeSupport() bool { + // Non-TTY stdout: emit ASCII so box-drawing/glyphs don't corrupt captured + // output. FORCE_COLOR is a color signal only, so it does not override here. + if !stdoutIsTerminal() { + return false + } + lang := os.Getenv("LANG") lcAll := os.Getenv("LC_ALL") lcCtype := os.Getenv("LC_CTYPE") diff --git a/cmd/formatter_test.go b/cmd/formatter_test.go index 01f29df8..a4f86aa6 100644 --- a/cmd/formatter_test.go +++ b/cmd/formatter_test.go @@ -7,6 +7,38 @@ import ( "time" ) +func TestDetectColorSupport_NonTTYStdout(t *testing.T) { + orig := stdoutIsTerminal + defer func() { stdoutIsTerminal = orig }() + + t.Setenv("NO_COLOR", "") + t.Setenv("FORCE_COLOR", "") + t.Setenv("TERM", "xterm-256color") + + // Piped stdout (not a TTY) with a normal TERM must still disable color — + // this is the "agent captured ANSI escapes in its JSON" regression. + stdoutIsTerminal = func() bool { return false } + if DetectColorSupport() { + t.Error("DetectColorSupport() = true for non-TTY stdout; want false") + } + if DetectUnicodeSupport() { + t.Error("DetectUnicodeSupport() = true for non-TTY stdout; want false") + } + + // A TTY with a good TERM keeps color. + stdoutIsTerminal = func() bool { return true } + if !DetectColorSupport() { + t.Error("DetectColorSupport() = false for TTY stdout; want true") + } + + // FORCE_COLOR overrides the non-TTY gate (deliberate piped color). + stdoutIsTerminal = func() bool { return false } + t.Setenv("FORCE_COLOR", "1") + if !DetectColorSupport() { + t.Error("DetectColorSupport() = false with FORCE_COLOR over a pipe; want true") + } +} + func newTestFormatter(color, unicode bool, width int) *OutputFormatter { theme := OutputTheme{} if color { @@ -552,6 +584,12 @@ func TestDetectColorSupport(t *testing.T) { } func TestDetectUnicodeSupport(t *testing.T) { + // These subtests exercise the locale-env logic, so pin stdout to a TTY; + // the non-TTY suppression gate is covered by TestDetectColorSupport_NonTTYStdout. + origIsTTY := stdoutIsTerminal + stdoutIsTerminal = func() bool { return true } + defer func() { stdoutIsTerminal = origIsTTY }() + t.Run("UTF-8 lang", func(t *testing.T) { origLang := os.Getenv("LANG") origLcAll := os.Getenv("LC_ALL") diff --git a/cmd/hawk/main.go b/cmd/hawk/main.go index 46cec807..dd5bd0c7 100644 --- a/cmd/hawk/main.go +++ b/cmd/hawk/main.go @@ -7,6 +7,7 @@ import ( "github.com/GrayCodeAI/hawk/cmd" "github.com/GrayCodeAI/hawk/internal/api" + "github.com/GrayCodeAI/hawk/internal/hawkerr" "github.com/GrayCodeAI/hawk/internal/mcp" "github.com/GrayCodeAI/hawk/internal/sandbox" ) @@ -42,10 +43,14 @@ func main() { if err := cmd.Execute(); err != nil { fmt.Fprintln(os.Stderr, err) + // An explicit ExitCodeError (e.g. a wrapped Bash exit status) wins — + // it already carries the intended code. Otherwise classify the failure + // into the stable exit-code taxonomy so callers can branch on the + // reason (auth vs rate-limit vs network) instead of seeing a bare 1. var exitErr *cmd.ExitCodeError if errors.As(err, &exitErr) { os.Exit(exitErr.Code) } - os.Exit(1) + os.Exit(hawkerr.ClassifyExitCode(err)) } } diff --git a/cmd/mcp_serve.go b/cmd/mcp_serve.go new file mode 100644 index 00000000..8997af38 --- /dev/null +++ b/cmd/mcp_serve.go @@ -0,0 +1,106 @@ +package cmd + +import ( + "encoding/json" + "os" + "os/signal" + "syscall" + + hawkconfig "github.com/GrayCodeAI/hawk/internal/config" + "github.com/GrayCodeAI/hawk/internal/mcp" + "github.com/spf13/cobra" +) + +var mcpConfigWrite bool + +func init() { + mcpConfigCmd.Flags().BoolVar(&mcpConfigWrite, "write", false, + "also print the well-known client config paths to paste the block into") + mcpCmd.AddCommand(mcpServeCmd) + mcpCmd.AddCommand(mcpConfigCmd) +} + +// mcpServeCmd runs hawk itself as an MCP server over stdio, exposing hawk's +// capabilities (chat, search, memory, review, scan, compress) to MCP clients +// such as Claude Desktop, Cursor, and Windsurf. +var mcpServeCmd = &cobra.Command{ + Use: "serve", + Short: "Run hawk as an MCP server over stdio", + Long: "Run hawk as a Model Context Protocol server over stdio (JSON-RPC 2.0), " + + "exposing hawk's tools to MCP clients like Claude Desktop, Cursor, and Windsurf.\n\n" + + "Use `hawk mcp config` to print the JSON block that registers this command in a client.", + RunE: runMCPServe, +} + +func runMCPServe(cmd *cobra.Command, _ []string) error { + settings := hawkconfig.LoadSettings() + + serverVersion := version + if serverVersion == "" { + serverVersion = "dev" + } + server := mcp.NewMCPServer(mcp.ServerInfo{Name: "hawk", Version: serverVersion}) + + // Wire hawk's tool registry in as the executor so delegating tools run for + // real; a registry build failure degrades to not-configured rather than + // aborting (the server still answers initialize/tools/list). + registry, err := defaultRegistry(settings) + if err == nil { + mcp.RegisterDefaultTools(server, registry.Execute) + } else { + mcp.RegisterDefaultTools(server, nil) + } + + ctx, stop := signal.NotifyContext(cmd.Context(), os.Interrupt, syscall.SIGTERM) + defer stop() + return server.ServeStdio(ctx) +} + +// mcpConfigCmd emits the JSON block that registers hawk as an MCP server in a +// client's config file, so users don't hand-edit JSON. +var mcpConfigCmd = &cobra.Command{ + Use: "config", + Short: "Print the MCP-server config block to register hawk in a client", + Long: "Print the JSON block that registers hawk as an MCP server (pointing at " + + "`hawk mcp serve`) for clients like Claude Desktop, Cursor, and Windsurf.\n\n" + + "Pipe it to the client's config file, e.g.:\n" + + " hawk mcp config >> ~/Library/Application Support/Claude/claude_desktop_config.json", + RunE: runMCPConfig, +} + +func runMCPConfig(cmd *cobra.Command, _ []string) error { + exe := hawkExecutablePath() + + block := map[string]any{ + "mcpServers": map[string]any{ + "hawk": map[string]any{ + "command": exe, + "args": []string{"mcp", "serve"}, + }, + }, + } + out, err := json.MarshalIndent(block, "", " ") + if err != nil { + return err + } + + if mcpConfigWrite { + cmd.Println("# Add the \"hawk\" entry below into the \"mcpServers\" object of your client config:") + cmd.Println("# Claude Desktop (macOS): ~/Library/Application Support/Claude/claude_desktop_config.json") + cmd.Println("# Cursor: ~/.cursor/mcp.json") + cmd.Println("# Windsurf: ~/.codeium/windsurf/mcp_config.json") + cmd.Println() + } + cmd.Println(string(out)) + return nil +} + +// hawkExecutablePath returns the absolute path to the running hawk binary, or +// the bare name "hawk" if it cannot be resolved (e.g. during `go run`), so the +// emitted config is still copy-pasteable. +func hawkExecutablePath() string { + if exe, err := os.Executable(); err == nil && exe != "" { + return exe + } + return "hawk" +} diff --git a/cmd/mcp_serve_test.go b/cmd/mcp_serve_test.go new file mode 100644 index 00000000..8b37d46b --- /dev/null +++ b/cmd/mcp_serve_test.go @@ -0,0 +1,73 @@ +package cmd + +import ( + "bytes" + "encoding/json" + "strings" + "testing" + + "github.com/spf13/cobra" +) + +func TestMCPConfigEmitsValidServerBlock(t *testing.T) { + var buf bytes.Buffer + c := &cobra.Command{} + c.SetOut(&buf) + c.SetErr(&buf) + + if err := runMCPConfig(c, nil); err != nil { + t.Fatalf("runMCPConfig: %v", err) + } + + // Output should be a single JSON object (no leading comment lines without --write). + var cfg struct { + MCPServers map[string]struct { + Command string `json:"command"` + Args []string `json:"args"` + } `json:"mcpServers"` + } + if err := json.Unmarshal(buf.Bytes(), &cfg); err != nil { + t.Fatalf("emitted config is not valid JSON: %v\n%s", err, buf.String()) + } + + hawk, ok := cfg.MCPServers["hawk"] + if !ok { + t.Fatal(`config missing "hawk" server entry`) + } + if hawk.Command == "" { + t.Error("hawk server entry missing command") + } + if len(hawk.Args) != 2 || hawk.Args[0] != "mcp" || hawk.Args[1] != "serve" { + t.Errorf(`args = %v, want ["mcp" "serve"]`, hawk.Args) + } +} + +func TestMCPConfigWriteAddsPathHints(t *testing.T) { + orig := mcpConfigWrite + defer func() { mcpConfigWrite = orig }() + mcpConfigWrite = true + + var buf bytes.Buffer + c := &cobra.Command{} + c.SetOut(&buf) + c.SetErr(&buf) + + if err := runMCPConfig(c, nil); err != nil { + t.Fatalf("runMCPConfig: %v", err) + } + out := buf.String() + for _, want := range []string{"claude_desktop_config.json", "cursor", "windsurf"} { + if !strings.Contains(strings.ToLower(out), strings.ToLower(want)) { + t.Errorf("--write output missing hint %q\n%s", want, out) + } + } + // The JSON block must still be present and parseable after the comments. + brace := strings.Index(out, "{") + if brace < 0 { + t.Fatal("no JSON block found in --write output") + } + var anyObj map[string]any + if err := json.Unmarshal([]byte(out[brace:]), &anyObj); err != nil { + t.Fatalf("JSON block after hints is invalid: %v", err) + } +} diff --git a/cmd/root.go b/cmd/root.go index d8e1f60f..5d6b7ba3 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -466,7 +466,10 @@ var configCmd = &cobra.Command{ var mcpCmd = &cobra.Command{ Use: "mcp", - Short: "Show MCP server configuration", + Short: "Show MCP configuration; run or register hawk as an MCP server", + Long: "With no subcommand, summarizes the MCP servers hawk connects to (consumes).\n" + + " hawk mcp serve — run hawk itself as an MCP server over stdio\n" + + " hawk mcp config — print the JSON block to register hawk in Claude Desktop/Cursor/Windsurf", RunE: func(cmd *cobra.Command, args []string) error { settings, err := loadEffectiveSettings() if err != nil { diff --git a/internal/feature/eval/toolmatrix.go b/internal/feature/eval/toolmatrix.go new file mode 100644 index 00000000..f1d7a225 --- /dev/null +++ b/internal/feature/eval/toolmatrix.go @@ -0,0 +1,226 @@ +package eval + +import ( + "context" + "fmt" + "sort" +) + +// Tool-use evaluation. +// +// A coding agent's tool behavior has two independent failure modes that a +// single pass/fail score conflates: +// +// 1. Triggering — did the agent invoke a tool when it should have (and refrain +// when it should not)? This is a binary classification problem, scored as a +// confusion matrix (TP/FN/FP/TN) with precision/recall. +// 2. Well-formedness — given that a tool was triggered, did the agent pick the +// right tool and format the arguments correctly? Scored separately, and only +// over cases where a trigger actually fired. +// +// With 40+ tools and model-swapping via eyrie, "picked the wrong moment to call +// a tool" and "called the right tool with bad args" are distinct regressions. +// Scoring them apart makes which one regressed legible. + +// ExpectedCall describes the tool behavior a case expects. A nil ExpectedCall +// means "the agent should NOT call any tool" (a negative case). +type ExpectedCall struct { + Tool string // expected tool name + // ArgsMatch optionally validates the arguments the model supplied. If nil, + // only the tool name is checked for payload accuracy. + ArgsMatch func(args map[string]any) bool +} + +// ToolUseCase is a single model-in-the-loop tool-selection test. +type ToolUseCase struct { + ID string + Prompt string + // Expected is the tool the model should call, or nil if it should not call + // any tool for this prompt. + Expected *ExpectedCall +} + +// ObservedCall is what the model actually did for a case: the tool it invoked +// (empty Tool means it invoked none) and the arguments it passed. +type ObservedCall struct { + Tool string + Args map[string]any +} + +// ToolCaller runs one case against the model under test and reports what tool +// (if any) the model chose to call. Injecting this keeps scoring testable +// without a live model. +type ToolCaller func(ctx context.Context, c ToolUseCase) (ObservedCall, error) + +// TriggerMatrix is the confusion matrix for the binary "should a tool fire?" +// decision, aggregated across cases. +type TriggerMatrix struct { + TP int // expected a tool and the model called one + FN int // expected a tool but the model called none + FP int // expected no tool but the model called one + TN int // expected no tool and the model called none +} + +// Precision = TP / (TP+FP); 1.0 when there are no positive predictions. +func (m TriggerMatrix) Precision() float64 { + d := m.TP + m.FP + if d == 0 { + return 1.0 + } + return float64(m.TP) / float64(d) +} + +// Recall = TP / (TP+FN); 1.0 when there are no positive expectations. +func (m TriggerMatrix) Recall() float64 { + d := m.TP + m.FN + if d == 0 { + return 1.0 + } + return float64(m.TP) / float64(d) +} + +// F1 is the harmonic mean of precision and recall. +func (m TriggerMatrix) F1() float64 { + p, r := m.Precision(), m.Recall() + if p+r == 0 { + return 0 + } + return 2 * p * r / (p + r) +} + +// PayloadAccuracy scores well-formedness, conditioned on a trigger firing. +type PayloadAccuracy struct { + Evaluated int // cases where a tool was both expected and called (TP) + CorrectTool int // of those, the model picked the expected tool + CorrectArgs int // of those with the right tool, args also matched +} + +// ToolName returns the fraction of triggered cases with the correct tool name. +func (p PayloadAccuracy) ToolNameRate() float64 { + if p.Evaluated == 0 { + return 1.0 + } + return float64(p.CorrectTool) / float64(p.Evaluated) +} + +// ArgsRate returns the fraction of triggered cases with correct tool AND args. +func (p PayloadAccuracy) ArgsRate() float64 { + if p.Evaluated == 0 { + return 1.0 + } + return float64(p.CorrectArgs) / float64(p.Evaluated) +} + +// CaseOutcome records the per-case verdict, useful for reporting which prompts +// regressed. +type CaseOutcome struct { + ID string + ExpectedTool string // "" means no tool expected + ObservedTool string // "" means no tool called + TriggerResult string // "TP" | "FN" | "FP" | "TN" + CorrectTool bool // only meaningful when TriggerResult == "TP" + CorrectArgs bool // only meaningful when CorrectTool + Err string +} + +// ToolUseReport is the full result of scoring a set of cases. +type ToolUseReport struct { + Matrix TriggerMatrix + Payload PayloadAccuracy + Outcomes []CaseOutcome +} + +// ScoreToolUse runs each case through the caller and produces the trigger +// confusion matrix and payload accuracy. A caller error is recorded on the +// outcome and treated as "no tool called" for trigger purposes, so a model that +// errors when it should have acted is counted as a false negative rather than +// silently dropped. +func ScoreToolUse(ctx context.Context, cases []ToolUseCase, caller ToolCaller) ToolUseReport { + rep := ToolUseReport{Outcomes: make([]CaseOutcome, 0, len(cases))} + + for _, c := range cases { + obs, err := caller(ctx, c) + oc := CaseOutcome{ID: c.ID, ObservedTool: obs.Tool} + if err != nil { + oc.Err = err.Error() + obs.Tool = "" // an error means no usable tool call + } + + expectsTool := c.Expected != nil + calledTool := obs.Tool != "" + if expectsTool { + oc.ExpectedTool = c.Expected.Tool + } + + switch { + case expectsTool && calledTool: + rep.Matrix.TP++ + oc.TriggerResult = "TP" + rep.Payload.Evaluated++ + if obs.Tool == c.Expected.Tool { + oc.CorrectTool = true + rep.Payload.CorrectTool++ + if c.Expected.ArgsMatch == nil || c.Expected.ArgsMatch(obs.Args) { + oc.CorrectArgs = true + rep.Payload.CorrectArgs++ + } + } + case expectsTool && !calledTool: + rep.Matrix.FN++ + oc.TriggerResult = "FN" + case !expectsTool && calledTool: + rep.Matrix.FP++ + oc.TriggerResult = "FP" + default: + rep.Matrix.TN++ + oc.TriggerResult = "TN" + } + + rep.Outcomes = append(rep.Outcomes, oc) + } + + return rep +} + +// Markdown renders the report as a compact, human-readable summary. +func (r ToolUseReport) Markdown() string { + m := r.Matrix + out := "## Tool-use evaluation\n\n" + out += "### Trigger confusion matrix\n\n" + out += "| | tool called | no tool called |\n" + out += "|---|---|---|\n" + out += fmt.Sprintf("| **tool expected** | %d (TP) | %d (FN) |\n", m.TP, m.FN) + out += fmt.Sprintf("| **no tool expected** | %d (FP) | %d (TN) |\n\n", m.FP, m.TN) + out += fmt.Sprintf("- precision: %.2f recall: %.2f F1: %.2f\n\n", m.Precision(), m.Recall(), m.F1()) + out += "### Payload accuracy (over triggered cases)\n\n" + out += fmt.Sprintf("- correct tool: %.2f (%d/%d)\n", r.Payload.ToolNameRate(), r.Payload.CorrectTool, r.Payload.Evaluated) + out += fmt.Sprintf("- correct tool + args: %.2f (%d/%d)\n", r.Payload.ArgsRate(), r.Payload.CorrectArgs, r.Payload.Evaluated) + + // List failures (deterministic order) so a regression is actionable. + var failures []CaseOutcome + for _, oc := range r.Outcomes { + if oc.TriggerResult == "FN" || oc.TriggerResult == "FP" || (oc.TriggerResult == "TP" && !oc.CorrectArgs) { + failures = append(failures, oc) + } + } + if len(failures) > 0 { + sort.Slice(failures, func(i, j int) bool { return failures[i].ID < failures[j].ID }) + out += "\n### Failures\n\n" + for _, oc := range failures { + detail := oc.TriggerResult + if oc.TriggerResult == "TP" { + detail = "wrong tool/args" + } + out += fmt.Sprintf("- %s: %s (expected %q, got %q)\n", + oc.ID, detail, orNone(oc.ExpectedTool), orNone(oc.ObservedTool)) + } + } + return out +} + +func orNone(s string) string { + if s == "" { + return "" + } + return s +} diff --git a/internal/feature/eval/toolmatrix_test.go b/internal/feature/eval/toolmatrix_test.go new file mode 100644 index 00000000..fb52addb --- /dev/null +++ b/internal/feature/eval/toolmatrix_test.go @@ -0,0 +1,139 @@ +package eval + +import ( + "context" + "errors" + "math" + "strings" + "testing" +) + +// scriptedCaller returns observations keyed by case ID. +func scriptedCaller(script map[string]ObservedCall, errs map[string]error) ToolCaller { + return func(_ context.Context, c ToolUseCase) (ObservedCall, error) { + if errs != nil { + if e, ok := errs[c.ID]; ok { + return ObservedCall{}, e + } + } + return script[c.ID], nil + } +} + +func approx(a, b float64) bool { return math.Abs(a-b) < 1e-9 } + +func TestScoreToolUse_AllMatrixCells(t *testing.T) { + readFile := &ExpectedCall{Tool: "read_file"} + cases := []ToolUseCase{ + {ID: "tp", Prompt: "show me main.go", Expected: readFile}, // expects tool, calls it + {ID: "fn", Prompt: "open config.yaml", Expected: readFile}, // expects tool, calls none + {ID: "fp", Prompt: "what is 2+2?", Expected: nil}, // expects none, calls one + {ID: "tn", Prompt: "say hello", Expected: nil}, // expects none, calls none + } + script := map[string]ObservedCall{ + "tp": {Tool: "read_file", Args: map[string]any{"path": "main.go"}}, + "fn": {Tool: ""}, + "fp": {Tool: "bash", Args: map[string]any{"cmd": "echo 4"}}, + "tn": {Tool: ""}, + } + + rep := ScoreToolUse(context.Background(), cases, scriptedCaller(script, nil)) + m := rep.Matrix + if m.TP != 1 || m.FN != 1 || m.FP != 1 || m.TN != 1 { + t.Fatalf("matrix = %+v, want one of each", m) + } + if !approx(m.Precision(), 0.5) { + t.Errorf("precision = %v, want 0.5", m.Precision()) + } + if !approx(m.Recall(), 0.5) { + t.Errorf("recall = %v, want 0.5", m.Recall()) + } + if !approx(m.F1(), 0.5) { + t.Errorf("F1 = %v, want 0.5", m.F1()) + } +} + +func TestScoreToolUse_PayloadAccuracy(t *testing.T) { + exp := &ExpectedCall{ + Tool: "read_file", + ArgsMatch: func(a map[string]any) bool { return a["path"] == "main.go" }, + } + cases := []ToolUseCase{ + {ID: "good", Expected: exp}, // right tool, right args + {ID: "badargs", Expected: exp}, // right tool, wrong args + {ID: "wrongtool", Expected: exp}, // wrong tool + } + script := map[string]ObservedCall{ + "good": {Tool: "read_file", Args: map[string]any{"path": "main.go"}}, + "badargs": {Tool: "read_file", Args: map[string]any{"path": "other.go"}}, + "wrongtool": {Tool: "bash", Args: map[string]any{"cmd": "cat main.go"}}, + } + + rep := ScoreToolUse(context.Background(), cases, scriptedCaller(script, nil)) + if rep.Matrix.TP != 3 { + t.Fatalf("all three triggered, TP = %d", rep.Matrix.TP) + } + p := rep.Payload + if p.Evaluated != 3 { + t.Errorf("Evaluated = %d, want 3", p.Evaluated) + } + if p.CorrectTool != 2 { // good + badargs picked read_file + t.Errorf("CorrectTool = %d, want 2", p.CorrectTool) + } + if p.CorrectArgs != 1 { // only good matched args + t.Errorf("CorrectArgs = %d, want 1", p.CorrectArgs) + } + if !approx(p.ToolNameRate(), 2.0/3.0) { + t.Errorf("ToolNameRate = %v", p.ToolNameRate()) + } + if !approx(p.ArgsRate(), 1.0/3.0) { + t.Errorf("ArgsRate = %v", p.ArgsRate()) + } +} + +func TestScoreToolUse_ErrorCountsAsNoCall(t *testing.T) { + cases := []ToolUseCase{ + {ID: "boom", Prompt: "do a thing", Expected: &ExpectedCall{Tool: "read_file"}}, + } + errs := map[string]error{"boom": errors.New("provider exploded")} + rep := ScoreToolUse(context.Background(), cases, scriptedCaller(nil, errs)) + + // Expected a tool, model errored (=no call) → false negative, not a crash. + if rep.Matrix.FN != 1 || rep.Matrix.TP != 0 { + t.Errorf("matrix = %+v, want FN=1", rep.Matrix) + } + if rep.Outcomes[0].Err == "" { + t.Error("outcome should record the error") + } +} + +func TestTriggerMatrix_EmptyDivisors(t *testing.T) { + // No positive predictions / expectations → precision & recall default to 1. + m := TriggerMatrix{TN: 5} + if !approx(m.Precision(), 1.0) || !approx(m.Recall(), 1.0) { + t.Errorf("empty matrix precision/recall = %v/%v, want 1/1", m.Precision(), m.Recall()) + } +} + +func TestToolUseReport_MarkdownListsFailures(t *testing.T) { + cases := []ToolUseCase{ + {ID: "z-miss", Expected: &ExpectedCall{Tool: "read_file"}}, + {ID: "a-spurious", Expected: nil}, + } + script := map[string]ObservedCall{ + "z-miss": {Tool: ""}, // FN + "a-spurious": {Tool: "bash"}, // FP + } + rep := ScoreToolUse(context.Background(), cases, scriptedCaller(script, nil)) + md := rep.Markdown() + if !strings.Contains(md, "confusion matrix") { + t.Error("markdown missing matrix section") + } + if !strings.Contains(md, "Failures") { + t.Error("markdown missing failures section") + } + // Failures must be sorted by ID: a-spurious before z-miss. + if strings.Index(md, "a-spurious") > strings.Index(md, "z-miss") { + t.Error("failures not sorted by ID") + } +} diff --git a/internal/hawkerr/exitcode.go b/internal/hawkerr/exitcode.go new file mode 100644 index 00000000..cd224eda --- /dev/null +++ b/internal/hawkerr/exitcode.go @@ -0,0 +1,122 @@ +package hawkerr + +import "strings" + +// Exit-code taxonomy. +// +// hawk historically collapsed every failure to exit code 1, which gives an +// agent or shell script driving `hawk --print` no way to branch on *why* the +// run failed without scraping stderr. The codes below assign a stable, +// documented small-integer meaning to each failure class so a caller can, for +// example, retry on a rate-limit (5) but stop immediately on bad credentials +// (3). +// +// The numbers are part of hawk's public CLI contract — append new codes, do +// not renumber existing ones. Codes are kept below 64 to avoid colliding with +// the shell's 128+signal convention and 126/127 (not-executable / not-found). +const ( + ExitOK = 0 // success + ExitGeneral = 1 // unclassified / general failure + ExitUsage = 2 // bad flags or arguments (reserved for the CLI layer) + ExitAuth = 3 // missing/invalid/expired credentials, 401/403 + ExitNetwork = 4 // DNS, connection refused/reset, provider 5xx, TLS + ExitRateLimit = 5 // 429 / quota / insufficient credits + ExitTimeout = 6 // request timeout, deadline exceeded, cancellation + ExitToolFailure = 7 // a tool execution failed or timed out + ExitPolicyBlock = 8 // permission/guardrail/policy denial + ExitConfig = 9 // malformed settings/config + ExitContextLimit = 10 // prompt exceeds the model's context window + ExitNotFound = 11 // model/endpoint/resource not found (404) + ExitDiskFull = 12 // out of disk space or quota +) + +// ClassifyExitCode maps an error to a stable exit code from the taxonomy above. +// +// It deliberately mirrors the textual classification already performed by the +// CLI's friendlyError (cmd/errors.go): the same provider/network/auth signals +// that produce a friendly message here produce a stable exit code, so the two +// never disagree about what kind of failure occurred. Order matters — the most +// specific and most actionable classes are checked first. +// +// A nil error yields ExitOK; an unrecognized error yields ExitGeneral. +func ClassifyExitCode(err error) int { + if err == nil { + return ExitOK + } + low := strings.ToLower(err.Error()) + + contains := func(subs ...string) bool { + for _, s := range subs { + if strings.Contains(low, s) { + return true + } + } + return false + } + + switch { + // Rate limiting / quota / credits — checked before generic auth because a + // 429 is retriable whereas a 401 is not. + case contains("429", "rate limit", "rate_limit", "too many requests", + "insufficient credits", "insufficient balance", "out of credits", + "requires more credits", "can only afford", "quota exceeded"): + return ExitRateLimit + + // Authentication / authorization — bad or missing key, 401/403. + case contains("401", "unauthorized", "invalid api key", "invalid_api_key", + "authentication", "api key is missing", "403", "forbidden", + "access denied", "payment required"): + return ExitAuth + + // Context window overflow — distinct from a generic 400 so callers can + // react by compacting rather than aborting. + case contains("context length", "context_length", "context window", + "token limit", "too many tokens", "maximum context", + "max_tokens exceeded", "max tokens exceeded", "prompt is too long"): + return ExitContextLimit + + // Policy / permission / guardrail denial. + case contains("permission denied", "guardrail", "policy", "blocked by", + "approval denied", "not permitted", "operation not allowed"): + return ExitPolicyBlock + + // Tool execution failures and tool timeouts. + case contains("tool timeout", "tool_timeout", "tool execution", + "tool failed", "tool error"): + return ExitToolFailure + + // Disk space / quota. + case contains("no space left", "disk full", "not enough space", "disk quota"): + return ExitDiskFull + + // Malformed configuration. + case contains("invalid json in config") || + (contains("settings", "config") && contains("unmarshal", "syntax error", "invalid character")): + return ExitConfig + + // Not found — model/endpoint/resource (404). + case contains("model not found", "model_not_found", "unknown model", + "invalid model", "404", "no such host"): + // "no such host" is a DNS failure, not a 404 — route it to network. + if contains("no such host") { + return ExitNetwork + } + return ExitNotFound + + // Network errors — DNS, refused/reset connections, provider 5xx, TLS. + case contains("network is unreachable", "network unreachable", + "connection refused", "connection reset", "broken pipe", + "dns", "lookup", "500", "502", "503", "504", + "internal server error", "bad gateway", "service unavailable", + "gateway timeout", "certificate", "tls", "x509"): + return ExitNetwork + + // Timeouts / cancellation — checked after the 5xx gateway-timeout cases so + // "504 gateway timeout" stays a network error. + case contains("timeout", "timed out", "deadline exceeded", "context canceled"): + return ExitTimeout + + default: + return ExitGeneral + } +} diff --git a/internal/hawkerr/exitcode_test.go b/internal/hawkerr/exitcode_test.go new file mode 100644 index 00000000..1d8d6230 --- /dev/null +++ b/internal/hawkerr/exitcode_test.go @@ -0,0 +1,54 @@ +package hawkerr + +import ( + "errors" + "testing" +) + +func TestClassifyExitCode(t *testing.T) { + cases := []struct { + name string + err error + want int + }{ + {"nil", nil, ExitOK}, + {"rate limit 429", errors.New("HTTP 429 Too Many Requests"), ExitRateLimit}, + {"rate limit phrase", errors.New("you have hit the rate limit"), ExitRateLimit}, + {"insufficient credits", errors.New("requires more credits to run"), ExitRateLimit}, + {"auth 401", errors.New("401 Unauthorized"), ExitAuth}, + {"invalid api key", errors.New("invalid_api_key provided"), ExitAuth}, + {"forbidden 403", errors.New("403 Forbidden: access denied"), ExitAuth}, + {"context window", errors.New("prompt is too long for the context window"), ExitContextLimit}, + {"token limit", errors.New("maximum context length exceeded"), ExitContextLimit}, + {"policy block", errors.New("operation not allowed by guardrail policy"), ExitPolicyBlock}, + {"permission denied", errors.New("permission denied writing file"), ExitPolicyBlock}, + {"tool timeout", errors.New("tool timeout after 120s"), ExitToolFailure}, + {"disk full", errors.New("write failed: no space left on device"), ExitDiskFull}, + {"bad config", errors.New("failed to parse settings: invalid character '}'"), ExitConfig}, + {"model not found", errors.New("model_not_found: gpt-5-ultra"), ExitNotFound}, + {"404 model", errors.New("404 unknown model"), ExitNotFound}, + {"dns no such host", errors.New("dial tcp: lookup api.example.com: no such host"), ExitNetwork}, + {"connection refused", errors.New("connection refused"), ExitNetwork}, + {"provider 503", errors.New("503 Service Unavailable"), ExitNetwork}, + {"tls", errors.New("x509: certificate signed by unknown authority"), ExitNetwork}, + {"timeout", errors.New("context deadline exceeded"), ExitTimeout}, + {"plain timeout", errors.New("request timed out"), ExitTimeout}, + {"unclassified", errors.New("something weird happened"), ExitGeneral}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := ClassifyExitCode(tc.err); got != tc.want { + t.Errorf("ClassifyExitCode(%q) = %d, want %d", tc.err, got, tc.want) + } + }) + } +} + +// Wrapped errors must classify by their flattened message too. +func TestClassifyExitCode_Wrapped(t *testing.T) { + wrapped := errors.New("call failed: 429 rate_limit") + if got := ClassifyExitCode(wrapped); got != ExitRateLimit { + t.Errorf("wrapped rate-limit = %d, want %d", got, ExitRateLimit) + } +} diff --git a/internal/mcp/mcp.go b/internal/mcp/mcp.go index a542938d..25c20ea1 100644 --- a/internal/mcp/mcp.go +++ b/internal/mcp/mcp.go @@ -231,11 +231,24 @@ func (s *Server) CallTool(ctx context.Context, name string, args map[string]inte if err != nil { return "", err } + return parseToolCallResult(result) +} + +// parseToolCallResult decodes an MCP tools/call result into flattened text. +// +// It honors the spec's isError flag: when the remote tool reports a failure +// (isError:true), the content carries the failure detail and this returns it as +// a Go error so the agent loop surfaces it to the model — which can then +// self-correct — instead of mistaking a failure for a successful result. +// hawk's own MCP server sets this flag (internal/mcp/server.go), so the client +// must read it for symmetry. An undecodable result falls back to the raw bytes. +func parseToolCallResult(result json.RawMessage) (string, error) { var resp struct { Content []struct { Type string `json:"type"` Text string `json:"text"` } `json:"content"` + IsError bool `json:"isError"` } if err := json.Unmarshal(result, &resp); err != nil { return string(result), nil @@ -246,6 +259,12 @@ func (s *Server) CallTool(ctx context.Context, name string, args map[string]inte text += c.Text } } + if resp.IsError { + if text == "" { + text = "remote MCP tool reported an error" + } + return text, fmt.Errorf("MCP tool error: %s", text) + } return text, nil } diff --git a/internal/mcp/server.go b/internal/mcp/server.go index 8cc64e4b..fb1532ef 100644 --- a/internal/mcp/server.go +++ b/internal/mcp/server.go @@ -62,9 +62,24 @@ type MCPToolHandler struct { Name string Description string InputSchema map[string]interface{} + // Annotations carry behavioral hints (read-only, destructive, …) so a + // connecting client can self-throttle — e.g. ask the user before a tool + // that may run shell commands or edit files. Nil means "no hints". + Annotations *ToolAnnotations Handler func(ctx context.Context, params json.RawMessage) (string, error) } +// ToolAnnotations are the optional MCP tool behavior hints from the spec. +// All fields are advisory and pointer-typed so an unset hint is omitted from +// the wire payload rather than sent as a misleading false. +type ToolAnnotations struct { + Title string `json:"title,omitempty"` + ReadOnlyHint *bool `json:"readOnlyHint,omitempty"` + DestructiveHint *bool `json:"destructiveHint,omitempty"` + IdempotentHint *bool `json:"idempotentHint,omitempty"` + OpenWorldHint *bool `json:"openWorldHint,omitempty"` +} + // JSON-RPC 2.0 request from a client. type JSONRPCRequest struct { JSONRPC string `json:"jsonrpc"` @@ -251,6 +266,7 @@ func (s *MCPServer) handleToolsList(req *JSONRPCRequest) *JSONRPCResponse { Name string `json:"name"` Description string `json:"description"` InputSchema map[string]interface{} `json:"inputSchema"` + Annotations *ToolAnnotations `json:"annotations,omitempty"` } tools := make([]toolSchema, 0, len(s.tools)) @@ -259,6 +275,7 @@ func (s *MCPServer) handleToolsList(req *JSONRPCRequest) *JSONRPCResponse { Name: t.Name, Description: t.Description, InputSchema: t.InputSchema, + Annotations: t.Annotations, }) } diff --git a/internal/mcp/server_test.go b/internal/mcp/server_test.go index 2d8ea2e8..69d7cb8f 100644 --- a/internal/mcp/server_test.go +++ b/internal/mcp/server_test.go @@ -143,6 +143,75 @@ func TestToolsList(t *testing.T) { if toolMap["inputSchema"] == nil { t.Error("tool missing inputSchema") } + // Neither alpha nor beta declared annotations, so the field must be + // omitted entirely (omitempty) rather than sent as null. + if _, present := toolMap["annotations"]; present { + t.Errorf("tool %v should omit annotations when unset", toolMap["name"]) + } + } +} + +func TestToolsListAnnotations(t *testing.T) { + server := NewMCPServer(ServerInfo{Name: "hawk", Version: "1.0.0"}) + server.RegisterTool(MCPToolHandler{ + Name: "danger", + Description: "Runs an agent", + InputSchema: map[string]interface{}{"type": "object"}, + Annotations: &ToolAnnotations{ + Title: "Run agent", + ReadOnlyHint: boolPtr(false), + DestructiveHint: boolPtr(true), + }, + Handler: func(ctx context.Context, params json.RawMessage) (string, error) { return "ok", nil }, + }) + + resp := sendRequest(t, server, `{"jsonrpc":"2.0","id":7,"method":"tools/list"}`+"\n") + if resp.Error != nil { + t.Fatalf("unexpected error: %+v", resp.Error) + } + tools := resp.Result.(map[string]interface{})["tools"].([]interface{}) + if len(tools) != 1 { + t.Fatalf("expected 1 tool, got %d", len(tools)) + } + ann, ok := tools[0].(map[string]interface{})["annotations"].(map[string]interface{}) + if !ok { + t.Fatalf("expected annotations map, got %T", tools[0].(map[string]interface{})["annotations"]) + } + if ann["destructiveHint"] != true { + t.Errorf("destructiveHint = %v, want true", ann["destructiveHint"]) + } + if ann["readOnlyHint"] != false { + t.Errorf("readOnlyHint = %v, want false", ann["readOnlyHint"]) + } + if ann["title"] != "Run agent" { + t.Errorf("title = %v, want %q", ann["title"], "Run agent") + } + // idempotentHint was never set, so it must be omitted. + if _, present := ann["idempotentHint"]; present { + t.Error("idempotentHint should be omitted when unset") + } +} + +// TestDefaultToolsHaveAnnotations guards that every default-registered tool +// carries a risk hint, so a future tool addition can't silently ship without +// one for connecting clients to self-throttle on. +func TestDefaultToolsHaveAnnotations(t *testing.T) { + server := NewMCPServer(ServerInfo{Name: "hawk", Version: "1.0.0"}) + RegisterDefaultTools(server, nil) + + resp := sendRequest(t, server, `{"jsonrpc":"2.0","id":8,"method":"tools/list"}`+"\n") + if resp.Error != nil { + t.Fatalf("unexpected error: %+v", resp.Error) + } + tools := resp.Result.(map[string]interface{})["tools"].([]interface{}) + if len(tools) == 0 { + t.Fatal("no default tools registered") + } + for _, raw := range tools { + tm := raw.(map[string]interface{}) + if _, ok := tm["annotations"].(map[string]interface{}); !ok { + t.Errorf("default tool %v missing annotations", tm["name"]) + } } } diff --git a/internal/mcp/server_tools.go b/internal/mcp/server_tools.go index 2594df40..db535a87 100644 --- a/internal/mcp/server_tools.go +++ b/internal/mcp/server_tools.go @@ -10,6 +10,15 @@ import ( // This avoids importing the tool package (which already imports mcp). type ToolExecutor func(ctx context.Context, name string, input json.RawMessage) (string, error) +// boolPtr returns a pointer to b, for the pointer-typed MCP annotation hints. +func boolPtr(b bool) *bool { return &b } + +// readOnlyAnnotations marks a tool that only reads/inspects and never mutates +// the workspace, so a client can run it without prompting. +func readOnlyAnnotations(title string) *ToolAnnotations { + return &ToolAnnotations{Title: title, ReadOnlyHint: boolPtr(true), DestructiveHint: boolPtr(false)} +} + // RegisterDefaultTools registers hawk's standard capabilities as MCP tools. // If executor is non-nil, tools that delegate to hawk's tool registry will // use it for execution; otherwise those tools return a not-configured error. @@ -26,8 +35,15 @@ func RegisterDefaultTools(server *MCPServer, executor ToolExecutor) { // hawkChatTool sends a prompt to hawk and returns the response. func hawkChatTool(executor ToolExecutor) MCPToolHandler { return MCPToolHandler{ - Name: "hawk_chat", - Description: "Send a prompt to the hawk AI coding agent and receive a response.", + Name: "hawk_chat", + Description: "Send a prompt to the hawk AI coding agent and receive a response. " + + "WARNING: this runs an autonomous agent that may execute shell commands and modify files.", + Annotations: &ToolAnnotations{ + Title: "Run hawk agent", + ReadOnlyHint: boolPtr(false), + DestructiveHint: boolPtr(true), + OpenWorldHint: boolPtr(true), + }, InputSchema: map[string]interface{}{ "type": "object", "properties": map[string]interface{}{ @@ -58,6 +74,7 @@ func hawkSearchTool(executor ToolExecutor) MCPToolHandler { return MCPToolHandler{ Name: "hawk_search", Description: "Search across hawk sessions and conversation history.", + Annotations: readOnlyAnnotations("Search hawk sessions"), InputSchema: map[string]interface{}{ "type": "object", "properties": map[string]interface{}{ @@ -93,6 +110,7 @@ func hawkMemoryRecallTool(executor ToolExecutor) MCPToolHandler { return MCPToolHandler{ Name: "hawk_memory_recall", Description: "Recall stored information from hawk's persistent memory (yaad).", + Annotations: readOnlyAnnotations("Recall from hawk memory"), InputSchema: map[string]interface{}{ "type": "object", "properties": map[string]interface{}{ @@ -128,6 +146,11 @@ func hawkMemoryStoreTool(executor ToolExecutor) MCPToolHandler { return MCPToolHandler{ Name: "hawk_memory_store", Description: "Store information in hawk's persistent memory (yaad) for future recall.", + Annotations: &ToolAnnotations{ + Title: "Store in hawk memory", + ReadOnlyHint: boolPtr(false), + DestructiveHint: boolPtr(false), // additive write, does not destroy existing data + }, InputSchema: map[string]interface{}{ "type": "object", "properties": map[string]interface{}{ @@ -168,6 +191,7 @@ func hawkReviewTool(executor ToolExecutor) MCPToolHandler { return MCPToolHandler{ Name: "hawk_review", Description: "Trigger a code review using hawk's sight module. Analyzes code for quality, style, and potential issues.", + Annotations: readOnlyAnnotations("Review code (sight)"), InputSchema: map[string]interface{}{ "type": "object", "properties": map[string]interface{}{ @@ -203,6 +227,7 @@ func hawkScanTool(executor ToolExecutor) MCPToolHandler { return MCPToolHandler{ Name: "hawk_scan", Description: "Trigger a security scan using hawk's inspect module. Identifies vulnerabilities and security issues.", + Annotations: readOnlyAnnotations("Security scan (inspect)"), InputSchema: map[string]interface{}{ "type": "object", "properties": map[string]interface{}{ @@ -239,6 +264,7 @@ func hawkCompressTool(executor ToolExecutor) MCPToolHandler { return MCPToolHandler{ Name: "hawk_compress", Description: "Compress text using hawk's tok module to reduce token usage while preserving meaning.", + Annotations: readOnlyAnnotations("Compress text (tok)"), InputSchema: map[string]interface{}{ "type": "object", "properties": map[string]interface{}{ diff --git a/internal/mcp/toolresult_test.go b/internal/mcp/toolresult_test.go new file mode 100644 index 00000000..3cda8d2b --- /dev/null +++ b/internal/mcp/toolresult_test.go @@ -0,0 +1,64 @@ +package mcp + +import ( + "encoding/json" + "testing" +) + +func TestParseToolCallResult(t *testing.T) { + tests := []struct { + name string + raw string + wantText string + wantErr bool + }{ + { + name: "success flattens text content", + raw: `{"content":[{"type":"text","text":"hello "},{"type":"text","text":"world"}]}`, + wantText: "hello world", + wantErr: false, + }, + { + name: "isError surfaces content as error", + raw: `{"content":[{"type":"text","text":"file not found"}],"isError":true}`, + wantText: "file not found", + wantErr: true, + }, + { + name: "isError with empty content gets a default message", + raw: `{"content":[],"isError":true}`, + wantText: "remote MCP tool reported an error", + wantErr: true, + }, + { + name: "isError false is a normal result", + raw: `{"content":[{"type":"text","text":"ok"}],"isError":false}`, + wantText: "ok", + wantErr: false, + }, + { + name: "non-text content is skipped", + raw: `{"content":[{"type":"image","text":""},{"type":"text","text":"caption"}]}`, + wantText: "caption", + wantErr: false, + }, + { + name: "undecodable result falls back to raw bytes", + raw: `not json`, + wantText: "not json", + wantErr: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + text, err := parseToolCallResult(json.RawMessage(tc.raw)) + if (err != nil) != tc.wantErr { + t.Fatalf("err = %v, wantErr = %v", err, tc.wantErr) + } + if text != tc.wantText { + t.Errorf("text = %q, want %q", text, tc.wantText) + } + }) + } +} diff --git a/internal/tool/agentic_fetch.go b/internal/tool/agentic_fetch.go index 25a2680f..a16cbb04 100644 --- a/internal/tool/agentic_fetch.go +++ b/internal/tool/agentic_fetch.go @@ -4,8 +4,15 @@ import ( "context" "encoding/json" "fmt" + "strings" + "sync" ) +// maxAgenticFetchConcurrency bounds how many pages are fetched and summarized +// in parallel when a batch of URLs is supplied, matching the WebSearch cap so +// the SSRF-aware fetch client does not face a connection storm. +const maxAgenticFetchConcurrency = 4 + // AgenticFetchTool spawns a sub-agent to fetch, process, and summarize web content. type AgenticFetchTool struct{} @@ -20,22 +27,33 @@ func (AgenticFetchTool) Parameters() map[string]interface{} { return map[string]interface{}{ "type": "object", "properties": map[string]interface{}{ - "url": map[string]interface{}{"type": "string", "description": "URL to fetch and analyze"}, - "query": map[string]interface{}{"type": "string", "description": "What to look for or extract from the page"}, + "url": map[string]interface{}{"type": "string", "description": "URL to fetch and analyze. Provide this OR urls (not both)."}, + "urls": map[string]interface{}{ + "type": "array", + "items": map[string]interface{}{"type": "string"}, + "description": "Multiple URLs to fetch and summarize concurrently against the same query, in a single call.", + }, + "query": map[string]interface{}{"type": "string", "description": "What to look for or extract from the page(s)"}, }, } } -func (AgenticFetchTool) Execute(ctx context.Context, input json.RawMessage) (string, error) { +func (t AgenticFetchTool) Execute(ctx context.Context, input json.RawMessage) (string, error) { var p struct { - URL string `json:"url"` - Query string `json:"query"` + URL string `json:"url"` + URLs []string `json:"urls"` + Query string `json:"query"` } if err := json.Unmarshal(input, &p); err != nil { return "", err } - if p.URL == "" { - return "", fmt.Errorf("url is required") + + urls := p.URLs + if p.URL != "" { + urls = append(urls, p.URL) + } + if len(urls) == 0 { + return "", fmt.Errorf("url or urls is required") } if p.Query == "" { p.Query = "Extract the key information from this page." @@ -43,17 +61,55 @@ func (AgenticFetchTool) Execute(ctx context.Context, input json.RawMessage) (str tc := GetToolContext(ctx) if tc == nil || tc.AgentSpawnFn == nil { - // Fallback: do a direct fetch if no sub-agent available. - return (WebFetchTool{}).Execute(ctx, input) + // Fallback: do a direct fetch if no sub-agent available. Only the + // single-URL case maps onto WebFetch's input shape. + if len(urls) == 1 { + fetchInput, _ := json.Marshal(map[string]string{"url": urls[0]}) + return (WebFetchTool{}).Execute(ctx, fetchInput) + } + return "", fmt.Errorf("batch agentic fetch requires a sub-agent, which is unavailable in this context") + } + + if len(urls) == 1 { + return tc.AgentSpawnFn(ctx, t.researchPrompt(urls[0], p.Query)) } - prompt := fmt.Sprintf(`You are a web research agent. Your task: + // Batch: summarize each URL concurrently, bounded, with labeled sections. + outputs := make([]string, len(urls)) + var wg sync.WaitGroup + sem := make(chan struct{}, maxAgenticFetchConcurrency) + for i, u := range urls { + wg.Add(1) + go func(idx int, u string) { + defer wg.Done() + sem <- struct{}{} + defer func() { <-sem }() + out, err := tc.AgentSpawnFn(ctx, t.researchPrompt(u, p.Query)) + if err != nil { + outputs[idx] = fmt.Sprintf("## %s\n\nError: %v", u, err) + return + } + outputs[idx] = fmt.Sprintf("## %s\n\n%s", u, out) + }(i, u) + } + wg.Wait() + + return strings.Join(outputs, "\n\n---\n\n"), nil +} + +// researchPrompt builds the sub-agent instruction for one URL. It includes an +// explicit relevance-refusal contract: if the page does not address the query, +// the sub-agent returns a short "NO_RELEVANT_INFORMATION" signal instead of +// padding a vague summary, keeping the caller's context clean. +func (AgenticFetchTool) researchPrompt(url, query string) string { + return fmt.Sprintf(`You are a web research agent. Your task: 1. Fetch the URL: %s 2. Extract information relevant to: %s -3. Return a concise, well-structured summary of the relevant content. -4. Include specific details, code examples, or data points — not vague descriptions. - -Use the WebFetch tool to retrieve the page content, then analyze and summarize it.`, p.URL, p.Query) +3. If the page genuinely does not contain information relevant to the query, + respond with exactly "NO_RELEVANT_INFORMATION" and one short sentence on why + — do not invent or pad a summary. +4. Otherwise, return a concise, well-structured summary of the relevant content, + including specific details, code examples, or data points — not vague descriptions. - return tc.AgentSpawnFn(ctx, prompt) +Use the WebFetch tool to retrieve the page content, then analyze and summarize it.`, url, query) } diff --git a/internal/tool/agentic_fetch_test.go b/internal/tool/agentic_fetch_test.go new file mode 100644 index 00000000..c630ace4 --- /dev/null +++ b/internal/tool/agentic_fetch_test.go @@ -0,0 +1,87 @@ +package tool + +import ( + "context" + "encoding/json" + "strings" + "sync/atomic" + "testing" +) + +func TestAgenticFetch_BatchFanOut(t *testing.T) { + var calls int32 + tc := &ToolContext{ + AgentSpawnFn: func(_ context.Context, prompt string) (string, error) { + atomic.AddInt32(&calls, 1) + // Echo back which URL this invocation was for so we can assert + // each URL produced its own labeled section. + for _, u := range []string{"https://a.example", "https://b.example", "https://c.example"} { + if strings.Contains(prompt, u) { + return "summary-of " + u, nil + } + } + return "summary", nil + }, + } + ctx := WithToolContext(context.Background(), tc) + + input, _ := json.Marshal(map[string]any{ + "urls": []string{"https://a.example", "https://b.example", "https://c.example"}, + "query": "what is this", + }) + out, err := (AgenticFetchTool{}).Execute(ctx, input) + if err != nil { + t.Fatalf("Execute: %v", err) + } + if got := atomic.LoadInt32(&calls); got != 3 { + t.Errorf("AgentSpawnFn called %d times, want 3", got) + } + for _, u := range []string{"https://a.example", "https://b.example", "https://c.example"} { + if !strings.Contains(out, "## "+u) { + t.Errorf("output missing labeled section for %q\n%s", u, out) + } + if !strings.Contains(out, "summary-of "+u) { + t.Errorf("output missing summary for %q", u) + } + } +} + +func TestAgenticFetch_SingleURLBackCompat(t *testing.T) { + var gotPrompt string + tc := &ToolContext{ + AgentSpawnFn: func(_ context.Context, prompt string) (string, error) { + gotPrompt = prompt + return "ok", nil + }, + } + ctx := WithToolContext(context.Background(), tc) + + input, _ := json.Marshal(map[string]any{"url": "https://only.example", "query": "q"}) + out, err := (AgenticFetchTool{}).Execute(ctx, input) + if err != nil { + t.Fatalf("Execute: %v", err) + } + // Single URL keeps the bare sub-agent output (no "## url" batch header). + if out != "ok" { + t.Errorf("single-url output = %q, want %q", out, "ok") + } + // The refusal contract must be present in the prompt. + if !strings.Contains(gotPrompt, "NO_RELEVANT_INFORMATION") { + t.Error("research prompt missing relevance-refusal contract") + } +} + +func TestAgenticFetch_RequiresURL(t *testing.T) { + input, _ := json.Marshal(map[string]any{"query": "q"}) + if _, err := (AgenticFetchTool{}).Execute(context.Background(), input); err == nil { + t.Error("expected error when neither url nor urls is provided") + } +} + +func TestWebSearch_QueryNormalization(t *testing.T) { + // Neither query nor queries → error, without touching the network. + input, _ := json.Marshal(map[string]any{"numResults": 3}) + if _, err := (WebSearchTool{}).Execute(context.Background(), input); err == nil { + t.Error("expected error when neither query nor queries is provided") + } +} diff --git a/internal/tool/web_search.go b/internal/tool/web_search.go index 6ed813ac..9e2f5257 100644 --- a/internal/tool/web_search.go +++ b/internal/tool/web_search.go @@ -8,9 +8,15 @@ import ( "net/http" "net/url" "strings" + "sync" "time" ) +// maxWebSearchConcurrency bounds the number of search backends contacted in +// parallel when a batch of queries is issued, so a large queries[] array does +// not open a burst of outbound connections (and trip provider rate limits). +const maxWebSearchConcurrency = 4 + // searchResult is the shared result type for all search backends. type searchResult struct { Title string @@ -33,7 +39,12 @@ func (WebSearchTool) Parameters() map[string]interface{} { "properties": map[string]interface{}{ "query": map[string]interface{}{ "type": "string", - "description": "Search query", + "description": "Search query. Provide this OR queries (not both).", + }, + "queries": map[string]interface{}{ + "type": "array", + "items": map[string]interface{}{"type": "string"}, + "description": "Multiple search queries to run concurrently in a single call. Use this to research several things at once instead of issuing one WebSearch per query.", }, "numResults": map[string]interface{}{ "type": "integer", @@ -49,22 +60,31 @@ func (WebSearchTool) Parameters() map[string]interface{} { "default": "web", }, }, - "required": []string{"query"}, + // Either query or queries must be supplied; validated in Execute since + // JSON Schema "required" cannot express an exclusive-or cleanly. } } -func (WebSearchTool) Execute(ctx context.Context, input json.RawMessage) (string, error) { +func (t WebSearchTool) Execute(ctx context.Context, input json.RawMessage) (string, error) { var p struct { - Query string `json:"query"` - NumResults int `json:"numResults"` - SearchType string `json:"searchType"` + Query string `json:"query"` + Queries []string `json:"queries"` + NumResults int `json:"numResults"` + SearchType string `json:"searchType"` } if err := json.Unmarshal(input, &p); err != nil { return "", err } - if p.Query == "" { - return "", fmt.Errorf("query is required") + + // Normalize the single/batch surfaces into one list of queries. + queries := p.Queries + if p.Query != "" { + queries = append(queries, p.Query) + } + if len(queries) == 0 { + return "", fmt.Errorf("query or queries is required") } + if p.NumResults <= 0 { p.NumResults = 5 } @@ -75,16 +95,58 @@ func (WebSearchTool) Execute(ctx context.Context, input json.RawMessage) (string p.SearchType = "web" } - // Try providers in order: Brave → SearXNG → DuckDuckGo + // Single query: keep the original output shape (no batch header). + if len(queries) == 1 { + out, err := t.searchOne(ctx, queries[0], p.NumResults) + if err != nil { + return "", err + } + return out, nil + } + + // Batch: fan out concurrently, bounded, and label each result section so + // the agent can attribute results to their query in one tool call. + outputs := make([]string, len(queries)) + var wg sync.WaitGroup + sem := make(chan struct{}, maxWebSearchConcurrency) + for i, q := range queries { + wg.Add(1) + go func(idx int, query string) { + defer wg.Done() + sem <- struct{}{} + defer func() { <-sem }() + out, err := t.searchOne(ctx, query, p.NumResults) + if err != nil { + outputs[idx] = fmt.Sprintf("Search results for: %s\n\nError: %v", query, err) + return + } + outputs[idx] = out + }(i, q) + } + wg.Wait() + + var sb strings.Builder + for i, out := range outputs { + if i > 0 { + sb.WriteString("\n\n---\n\n") + } + sb.WriteString(out) + } + return sb.String(), nil +} + +// searchOne runs a single query through the provider cascade +// (Brave → SearXNG → DuckDuckGo) and returns formatted results. +func (WebSearchTool) searchOne(ctx context.Context, query string, numResults int) (string, error) { var results []searchResult var err error // 1. Brave Search brave := newBraveClient() if brave.available() { - results, err = brave.search(ctx, p.Query, p.NumResults) + results, err = brave.search(ctx, query, numResults) if err == nil { - return formatSearchResults(p.Query, results), nil + return formatSearchResults(query, results), nil } // Fall through to next provider on error } @@ -92,19 +154,19 @@ func (WebSearchTool) Execute(ctx context.Context, input json.RawMessage) (string // 2. SearXNG searxng := newSearxngClient() if searxng.available() { - results, err = searxng.search(ctx, p.Query, p.NumResults) + results, err = searxng.search(ctx, query, numResults) if err == nil { - return formatSearchResults(p.Query, results), nil + return formatSearchResults(query, results), nil } // Fall through to DuckDuckGo on error } // 3. DuckDuckGo (fallback) - results, err = duckDuckGoSearch(ctx, p.Query, p.NumResults) + results, err = duckDuckGoSearch(ctx, query, numResults) if err != nil { return "", fmt.Errorf("all search providers failed: %w", err) } - return formatSearchResults(p.Query, results), nil + return formatSearchResults(query, results), nil } // formatSearchResults formats results as numbered structured text. diff --git a/internal/tool/web_test.go b/internal/tool/web_test.go index 9ce049c1..82444231 100644 --- a/internal/tool/web_test.go +++ b/internal/tool/web_test.go @@ -30,8 +30,8 @@ func TestWebSearchTool_EmptyQuery(t *testing.T) { var ws WebSearchTool input, _ := json.Marshal(map[string]string{"query": ""}) _, err := ws.Execute(context.Background(), input) - if err == nil || !strings.Contains(err.Error(), "query is required") { - t.Fatalf("expected 'query is required' error, got %v", err) + if err == nil || !strings.Contains(err.Error(), "is required") { + t.Fatalf("expected a 'required' error, got %v", err) } } From c637fc8e426bc270ecd32837b36cb21ad019eda4 Mon Sep 17 00:00:00 2001 From: Patel230 Date: Mon, 8 Jun 2026 00:25:12 +0530 Subject: [PATCH 2/3] chore: sync external submodules to latest origin/main Advance the eyrie and yaad submodule pointers to the current tip of their origin/main (the other four already matched). Picks up upstream commits that were present on the remotes but not yet recorded in hawk's index. Co-Authored-By: Claude Opus 4.8 (1M context) --- external/eyrie | 2 +- external/yaad | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/external/eyrie b/external/eyrie index 0f01cf69..5ffdd1f8 160000 --- a/external/eyrie +++ b/external/eyrie @@ -1 +1 @@ -Subproject commit 0f01cf692646cb12f0a2ec83b8eb57dbbb0f9b40 +Subproject commit 5ffdd1f8bd6f6a1b09485ea87d00d7a221cddf2e diff --git a/external/yaad b/external/yaad index 730bb90e..e7c21cca 160000 --- a/external/yaad +++ b/external/yaad @@ -1 +1 @@ -Subproject commit 730bb90e0ee7805d37afb07e1317d81b1f6459ca +Subproject commit e7c21cca3fc93faf01bb12b69cc20ba0a230f776 From ff905572118341cc108c4a891e7cc6c0ab7b0765 Mon Sep 17 00:00:00 2001 From: Patel230 Date: Mon, 8 Jun 2026 00:52:06 +0530 Subject: [PATCH 3/3] chore: bump eyrie submodule to merged main (efe3710) Point external/eyrie at the squash-merged eyrie main containing the provider error/image/reasoning/verify work, so hawk consumes the released eyrie changes. Co-Authored-By: Claude Opus 4.8 (1M context) --- external/eyrie | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/eyrie b/external/eyrie index 5ffdd1f8..efe3710c 160000 --- a/external/eyrie +++ b/external/eyrie @@ -1 +1 @@ -Subproject commit 5ffdd1f8bd6f6a1b09485ea87d00d7a221cddf2e +Subproject commit efe3710c2aa6cdf657fb376951762323c069f2ed