From 0052ed43d8b3a152bc7d4f5c37cf32df6ebd1448 Mon Sep 17 00:00:00 2001
From: Patel230 <Lakshmanp230@gmail.com>
Date: Sun, 7 Jun 2026 23:53:58 +0530
Subject: [PATCH 1/3] feat: adopt MiniMax-AI OSS patterns into hawk

Verified-net-new improvements identified by scanning MiniMax-AI's open-source
repos and adversarially mapping ideas onto hawk's actual source. Patterns only,
no code copied (all sources MIT).

- exit-code taxonomy: internal/hawkerr/exitcode.go classifies errors into stable
  codes (auth/network/ratelimit/timeout/...) so callers can branch on the
  failure reason instead of a bare 1; wired into cmd/hawk/main.go
- IsTerminal color gate: suppress ANSI/Unicode when stdout is piped (non-TTY),
  fixing ANSI leaking into captured/JSON output; FORCE_COLOR still overrides
- MCP client honors isError: parseToolCallResult surfaces remote tool failures
  as errors the agent can self-correct on
- batch web search/browse: queries[]/urls[] concurrent fan-out (bounded) in
  WebSearch and AgenticFetch; single query/url still works; adds a
  relevance-refusal contract to the AgenticFetch sub-agent prompt
- RiskLevel -> MCP annotations: ToolAnnotations (readOnly/destructive hints) on
  the MCP server's tools/list so clients can self-throttle
- hawk mcp serve / hawk mcp config: run hawk as an MCP server (machinery existed
  but had no entrypoint) and emit the client registration JSON block
- tool-call confusion-matrix eval: internal/feature/eval/toolmatrix.go scores
  trigger (TP/FN/FP/TN, precision/recall/F1) separately from payload accuracy;
  exposed via `hawk eval tools`

All new code is unit-tested; touched packages build, vet, and test clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cmd/eval_tools.go                        | 120 ++++++++++++
 cmd/formatter.go                         |  28 ++-
 cmd/formatter_test.go                    |  38 ++++
 cmd/hawk/main.go                         |   7 +-
 cmd/mcp_serve.go                         | 106 +++++++++++
 cmd/mcp_serve_test.go                    |  73 ++++++++
 cmd/root.go                              |   5 +-
 internal/feature/eval/toolmatrix.go      | 226 +++++++++++++++++++++++
 internal/feature/eval/toolmatrix_test.go | 139 ++++++++++++++
 internal/hawkerr/exitcode.go             | 122 ++++++++++++
 internal/hawkerr/exitcode_test.go        |  54 ++++++
 internal/mcp/mcp.go                      |  19 ++
 internal/mcp/server.go                   |  17 ++
 internal/mcp/server_test.go              |  69 +++++++
 internal/mcp/server_tools.go             |  30 ++-
 internal/mcp/toolresult_test.go          |  64 +++++++
 internal/tool/agentic_fetch.go           |  86 +++++++--
 internal/tool/agentic_fetch_test.go      |  87 +++++++++
 internal/tool/web_search.go              |  92 +++++++--
 internal/tool/web_test.go                |   4 +-
 20 files changed, 1348 insertions(+), 38 deletions(-)
 create mode 100644 cmd/eval_tools.go
 create mode 100644 cmd/mcp_serve.go
 create mode 100644 cmd/mcp_serve_test.go
 create mode 100644 internal/feature/eval/toolmatrix.go
 create mode 100644 internal/feature/eval/toolmatrix_test.go
 create mode 100644 internal/hawkerr/exitcode.go
 create mode 100644 internal/hawkerr/exitcode_test.go
 create mode 100644 internal/mcp/toolresult_test.go
 create mode 100644 internal/tool/agentic_fetch_test.go

diff --git a/cmd/eval_tools.go b/cmd/eval_tools.go
new file mode 100644
index 00000000..9a207816
--- /dev/null
+++ b/cmd/eval_tools.go
@@ -0,0 +1,120 @@
+package cmd
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"time"
+
+	hawkconfig "github.com/GrayCodeAI/hawk/internal/config"
+	"github.com/GrayCodeAI/hawk/internal/feature/eval"
+	"github.com/GrayCodeAI/hawk/internal/types"
+	"github.com/spf13/cobra"
+)
+
+var evalToolsOutput string
+
+func init() {
+	evalToolsCmd.Flags().StringVarP(&evalToolsOutput, "output", "o", "markdown", "Output format: markdown, json")
+	evalCmd.AddCommand(evalToolsCmd)
+}
+
+var evalToolsCmd = &cobra.Command{
+	Use:   "tools",
+	Short: "Evaluate tool selection: trigger confusion matrix + payload accuracy",
+	Long: "Run a model-in-the-loop tool-use evaluation. Each case is a prompt with an " +
+		"expected tool (or none). Triggering (did the model call a tool when it should) " +
+		"is scored as a confusion matrix, separately from payload accuracy (right tool + args).",
+	RunE: runEvalTools,
+}
+
+// defaultToolUseCases is a small built-in set exercising clear positive and
+// negative tool-trigger situations against hawk's standard tools.
+func defaultToolUseCases() []eval.ToolUseCase {
+	return []eval.ToolUseCase{
+		{
+			ID:       "read-existing-file",
+			Prompt:   "Show me the contents of go.mod.",
+			Expected: &eval.ExpectedCall{Tool: "Read"},
+		},
+		{
+			ID:       "list-directory",
+			Prompt:   "What files are in the cmd directory?",
+			Expected: &eval.ExpectedCall{Tool: "LS"},
+		},
+		{
+			ID:       "run-command",
+			Prompt:   "Run the test suite for this project.",
+			Expected: &eval.ExpectedCall{Tool: "Bash"},
+		},
+		{
+			ID:       "search-code",
+			Prompt:   "Find every place that defines an http handler in this repo.",
+			Expected: &eval.ExpectedCall{Tool: "Grep"},
+		},
+		{
+			// Negative case: a pure-knowledge question needs no tool.
+			ID:       "no-tool-trivia",
+			Prompt:   "In one sentence, what does the SOLID 'S' stand for?",
+			Expected: nil,
+		},
+		{
+			// Negative case: a greeting needs no tool.
+			ID:       "no-tool-greeting",
+			Prompt:   "Say hello.",
+			Expected: nil,
+		},
+	}
+}
+
+func runEvalTools(cmd *cobra.Command, _ []string) error {
+	settings := hawkconfig.LoadSettings()
+
+	registry, err := defaultRegistry(settings)
+	if err != nil {
+		return fmt.Errorf("building tool registry: %w", err)
+	}
+	systemPrompt, err := buildSystemPrompt()
+	if err != nil {
+		return err
+	}
+	model, provider := effectiveModelAndProvider(settings)
+	sess := newHawkSession(settings, provider, model, systemPrompt, registry)
+	if err := configureSession(sess, settings); err != nil {
+		return err
+	}
+
+	tools := registry.EyrieTools()
+
+	// caller performs one tool-aware turn and reports the first tool the model
+	// chose (if any). It does not execute the tool — we are scoring selection,
+	// not effects.
+	caller := func(ctx context.Context, c eval.ToolUseCase) (eval.ObservedCall, error) {
+		resp, err := sess.Chat(ctx, []types.EyrieMessage{
+			{Role: "user", Content: c.Prompt},
+		}, types.ChatOptions{Model: model, Tools: tools})
+		if err != nil {
+			return eval.ObservedCall{}, err
+		}
+		if resp == nil || len(resp.ToolCalls) == 0 {
+			return eval.ObservedCall{}, nil // no tool called
+		}
+		tc := resp.ToolCalls[0]
+		return eval.ObservedCall{Tool: tc.Name, Args: tc.Arguments}, nil
+	}
+
+	ctx, cancel := context.WithTimeout(cmd.Context(), 10*time.Minute)
+	defer cancel()
+
+	cmd.Printf("Evaluating tool selection on %d cases with model %s...\n", len(defaultToolUseCases()), model)
+	report := eval.ScoreToolUse(ctx, defaultToolUseCases(), caller)
+
+	switch evalToolsOutput {
+	case "json":
+		data, _ := json.MarshalIndent(report, "", "  ")
+		cmd.Println(string(data))
+	default:
+		cmd.Println(report.Markdown())
+	}
+	return nil
+}
diff --git a/cmd/formatter.go b/cmd/formatter.go
index 17499057..a582061f 100644
--- a/cmd/formatter.go
+++ b/cmd/formatter.go
@@ -6,8 +6,19 @@ import (
 	"strings"
 	"sync"
 	"time"
+
+	"golang.org/x/term"
 )
 
+// stdoutIsTerminal reports whether stdout is connected to a terminal (TTY).
+// When stdout is a pipe or file — which is exactly the case when an agent or
+// shell script captures hawk's output — this is false, and color/Unicode
+// chrome must be suppressed so the payload stays clean. It is a var so tests
+// can override it.
+var stdoutIsTerminal = func() bool {
+	return term.IsTerminal(int(os.Stdout.Fd()))
+}
+
 // TreeNode represents a node in a tree structure for FormatTree.
 type TreeNode struct {
 	Name     string
@@ -544,9 +555,16 @@ func DetectColorSupport() bool {
 		return false
 	}
 
+	// Stdout is not a TTY (piped to an agent, file, or another process):
+	// suppress ANSI so the captured output is clean. An explicit FORCE_COLOR
+	// above already overrode this for callers that pipe but still want color.
+	if !stdoutIsTerminal() {
+		return false
+	}
+
 	// Check TERM
-	term := os.Getenv("TERM")
-	if term == "dumb" || term == "" {
+	t := os.Getenv("TERM")
+	if t == "dumb" || t == "" {
 		return false
 	}
 
@@ -555,6 +573,12 @@ func DetectColorSupport() bool {
 
 // DetectUnicodeSupport checks if the terminal supports Unicode characters.
 func DetectUnicodeSupport() bool {
+	// Non-TTY stdout: emit ASCII so box-drawing/glyphs don't corrupt captured
+	// output. FORCE_COLOR is a color signal only, so it does not override here.
+	if !stdoutIsTerminal() {
+		return false
+	}
+
 	lang := os.Getenv("LANG")
 	lcAll := os.Getenv("LC_ALL")
 	lcCtype := os.Getenv("LC_CTYPE")
diff --git a/cmd/formatter_test.go b/cmd/formatter_test.go
index 01f29df8..a4f86aa6 100644
--- a/cmd/formatter_test.go
+++ b/cmd/formatter_test.go
@@ -7,6 +7,38 @@ import (
 	"time"
 )
 
+func TestDetectColorSupport_NonTTYStdout(t *testing.T) {
+	orig := stdoutIsTerminal
+	defer func() { stdoutIsTerminal = orig }()
+
+	t.Setenv("NO_COLOR", "")
+	t.Setenv("FORCE_COLOR", "")
+	t.Setenv("TERM", "xterm-256color")
+
+	// Piped stdout (not a TTY) with a normal TERM must still disable color —
+	// this is the "agent captured ANSI escapes in its JSON" regression.
+	stdoutIsTerminal = func() bool { return false }
+	if DetectColorSupport() {
+		t.Error("DetectColorSupport() = true for non-TTY stdout; want false")
+	}
+	if DetectUnicodeSupport() {
+		t.Error("DetectUnicodeSupport() = true for non-TTY stdout; want false")
+	}
+
+	// A TTY with a good TERM keeps color.
+	stdoutIsTerminal = func() bool { return true }
+	if !DetectColorSupport() {
+		t.Error("DetectColorSupport() = false for TTY stdout; want true")
+	}
+
+	// FORCE_COLOR overrides the non-TTY gate (deliberate piped color).
+	stdoutIsTerminal = func() bool { return false }
+	t.Setenv("FORCE_COLOR", "1")
+	if !DetectColorSupport() {
+		t.Error("DetectColorSupport() = false with FORCE_COLOR over a pipe; want true")
+	}
+}
+
 func newTestFormatter(color, unicode bool, width int) *OutputFormatter {
 	theme := OutputTheme{}
 	if color {
@@ -552,6 +584,12 @@ func TestDetectColorSupport(t *testing.T) {
 }
 
 func TestDetectUnicodeSupport(t *testing.T) {
+	// These subtests exercise the locale-env logic, so pin stdout to a TTY;
+	// the non-TTY suppression gate is covered by TestDetectColorSupport_NonTTYStdout.
+	origIsTTY := stdoutIsTerminal
+	stdoutIsTerminal = func() bool { return true }
+	defer func() { stdoutIsTerminal = origIsTTY }()
+
 	t.Run("UTF-8 lang", func(t *testing.T) {
 		origLang := os.Getenv("LANG")
 		origLcAll := os.Getenv("LC_ALL")
diff --git a/cmd/hawk/main.go b/cmd/hawk/main.go
index 46cec807..dd5bd0c7 100644
--- a/cmd/hawk/main.go
+++ b/cmd/hawk/main.go
@@ -7,6 +7,7 @@ import (
 
 	"github.com/GrayCodeAI/hawk/cmd"
 	"github.com/GrayCodeAI/hawk/internal/api"
+	"github.com/GrayCodeAI/hawk/internal/hawkerr"
 	"github.com/GrayCodeAI/hawk/internal/mcp"
 	"github.com/GrayCodeAI/hawk/internal/sandbox"
 )
@@ -42,10 +43,14 @@ func main() {
 
 	if err := cmd.Execute(); err != nil {
 		fmt.Fprintln(os.Stderr, err)
+		// An explicit ExitCodeError (e.g. a wrapped Bash exit status) wins —
+		// it already carries the intended code. Otherwise classify the failure
+		// into the stable exit-code taxonomy so callers can branch on the
+		// reason (auth vs rate-limit vs network) instead of seeing a bare 1.
 		var exitErr *cmd.ExitCodeError
 		if errors.As(err, &exitErr) {
 			os.Exit(exitErr.Code)
 		}
-		os.Exit(1)
+		os.Exit(hawkerr.ClassifyExitCode(err))
 	}
 }
diff --git a/cmd/mcp_serve.go b/cmd/mcp_serve.go
new file mode 100644
index 00000000..8997af38
--- /dev/null
+++ b/cmd/mcp_serve.go
@@ -0,0 +1,106 @@
+package cmd
+
+import (
+	"encoding/json"
+	"os"
+	"os/signal"
+	"syscall"
+
+	hawkconfig "github.com/GrayCodeAI/hawk/internal/config"
+	"github.com/GrayCodeAI/hawk/internal/mcp"
+	"github.com/spf13/cobra"
+)
+
+var mcpConfigWrite bool
+
+func init() {
+	mcpConfigCmd.Flags().BoolVar(&mcpConfigWrite, "write", false,
+		"also print the well-known client config paths to paste the block into")
+	mcpCmd.AddCommand(mcpServeCmd)
+	mcpCmd.AddCommand(mcpConfigCmd)
+}
+
+// mcpServeCmd runs hawk itself as an MCP server over stdio, exposing hawk's
+// capabilities (chat, search, memory, review, scan, compress) to MCP clients
+// such as Claude Desktop, Cursor, and Windsurf.
+var mcpServeCmd = &cobra.Command{
+	Use:   "serve",
+	Short: "Run hawk as an MCP server over stdio",
+	Long: "Run hawk as a Model Context Protocol server over stdio (JSON-RPC 2.0), " +
+		"exposing hawk's tools to MCP clients like Claude Desktop, Cursor, and Windsurf.\n\n" +
+		"Use `hawk mcp config` to print the JSON block that registers this command in a client.",
+	RunE: runMCPServe,
+}
+
+func runMCPServe(cmd *cobra.Command, _ []string) error {
+	settings := hawkconfig.LoadSettings()
+
+	serverVersion := version
+	if serverVersion == "" {
+		serverVersion = "dev"
+	}
+	server := mcp.NewMCPServer(mcp.ServerInfo{Name: "hawk", Version: serverVersion})
+
+	// Wire hawk's tool registry in as the executor so delegating tools run for
+	// real; a registry build failure degrades to not-configured rather than
+	// aborting (the server still answers initialize/tools/list).
+	registry, err := defaultRegistry(settings)
+	if err == nil {
+		mcp.RegisterDefaultTools(server, registry.Execute)
+	} else {
+		mcp.RegisterDefaultTools(server, nil)
+	}
+
+	ctx, stop := signal.NotifyContext(cmd.Context(), os.Interrupt, syscall.SIGTERM)
+	defer stop()
+	return server.ServeStdio(ctx)
+}
+
+// mcpConfigCmd emits the JSON block that registers hawk as an MCP server in a
+// client's config file, so users don't hand-edit JSON.
+var mcpConfigCmd = &cobra.Command{
+	Use:   "config",
+	Short: "Print the MCP-server config block to register hawk in a client",
+	Long: "Print the JSON block that registers hawk as an MCP server (pointing at " +
+		"`hawk mcp serve`) for clients like Claude Desktop, Cursor, and Windsurf.\n\n" +
+		"Pipe it to the client's config file, e.g.:\n" +
+		"  hawk mcp config >> ~/Library/Application Support/Claude/claude_desktop_config.json",
+	RunE: runMCPConfig,
+}
+
+func runMCPConfig(cmd *cobra.Command, _ []string) error {
+	exe := hawkExecutablePath()
+
+	block := map[string]any{
+		"mcpServers": map[string]any{
+			"hawk": map[string]any{
+				"command": exe,
+				"args":    []string{"mcp", "serve"},
+			},
+		},
+	}
+	out, err := json.MarshalIndent(block, "", "  ")
+	if err != nil {
+		return err
+	}
+
+	if mcpConfigWrite {
+		cmd.Println("# Add the \"hawk\" entry below into the \"mcpServers\" object of your client config:")
+		cmd.Println("#   Claude Desktop (macOS): ~/Library/Application Support/Claude/claude_desktop_config.json")
+		cmd.Println("#   Cursor:                 ~/.cursor/mcp.json")
+		cmd.Println("#   Windsurf:               ~/.codeium/windsurf/mcp_config.json")
+		cmd.Println()
+	}
+	cmd.Println(string(out))
+	return nil
+}
+
+// hawkExecutablePath returns the absolute path to the running hawk binary, or
+// the bare name "hawk" if it cannot be resolved (e.g. during `go run`), so the
+// emitted config is still copy-pasteable.
+func hawkExecutablePath() string {
+	if exe, err := os.Executable(); err == nil && exe != "" {
+		return exe
+	}
+	return "hawk"
+}
diff --git a/cmd/mcp_serve_test.go b/cmd/mcp_serve_test.go
new file mode 100644
index 00000000..8b37d46b
--- /dev/null
+++ b/cmd/mcp_serve_test.go
@@ -0,0 +1,73 @@
+package cmd
+
+import (
+	"bytes"
+	"encoding/json"
+	"strings"
+	"testing"
+
+	"github.com/spf13/cobra"
+)
+
+func TestMCPConfigEmitsValidServerBlock(t *testing.T) {
+	var buf bytes.Buffer
+	c := &cobra.Command{}
+	c.SetOut(&buf)
+	c.SetErr(&buf)
+
+	if err := runMCPConfig(c, nil); err != nil {
+		t.Fatalf("runMCPConfig: %v", err)
+	}
+
+	// Output should be a single JSON object (no leading comment lines without --write).
+	var cfg struct {
+		MCPServers map[string]struct {
+			Command string   `json:"command"`
+			Args    []string `json:"args"`
+		} `json:"mcpServers"`
+	}
+	if err := json.Unmarshal(buf.Bytes(), &cfg); err != nil {
+		t.Fatalf("emitted config is not valid JSON: %v\n%s", err, buf.String())
+	}
+
+	hawk, ok := cfg.MCPServers["hawk"]
+	if !ok {
+		t.Fatal(`config missing "hawk" server entry`)
+	}
+	if hawk.Command == "" {
+		t.Error("hawk server entry missing command")
+	}
+	if len(hawk.Args) != 2 || hawk.Args[0] != "mcp" || hawk.Args[1] != "serve" {
+		t.Errorf(`args = %v, want ["mcp" "serve"]`, hawk.Args)
+	}
+}
+
+func TestMCPConfigWriteAddsPathHints(t *testing.T) {
+	orig := mcpConfigWrite
+	defer func() { mcpConfigWrite = orig }()
+	mcpConfigWrite = true
+
+	var buf bytes.Buffer
+	c := &cobra.Command{}
+	c.SetOut(&buf)
+	c.SetErr(&buf)
+
+	if err := runMCPConfig(c, nil); err != nil {
+		t.Fatalf("runMCPConfig: %v", err)
+	}
+	out := buf.String()
+	for _, want := range []string{"claude_desktop_config.json", "cursor", "windsurf"} {
+		if !strings.Contains(strings.ToLower(out), strings.ToLower(want)) {
+			t.Errorf("--write output missing hint %q\n%s", want, out)
+		}
+	}
+	// The JSON block must still be present and parseable after the comments.
+	brace := strings.Index(out, "{")
+	if brace < 0 {
+		t.Fatal("no JSON block found in --write output")
+	}
+	var anyObj map[string]any
+	if err := json.Unmarshal([]byte(out[brace:]), &anyObj); err != nil {
+		t.Fatalf("JSON block after hints is invalid: %v", err)
+	}
+}
diff --git a/cmd/root.go b/cmd/root.go
index d8e1f60f..5d6b7ba3 100644
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -466,7 +466,10 @@ var configCmd = &cobra.Command{
 
 var mcpCmd = &cobra.Command{
 	Use:   "mcp",
-	Short: "Show MCP server configuration",
+	Short: "Show MCP configuration; run or register hawk as an MCP server",
+	Long: "With no subcommand, summarizes the MCP servers hawk connects to (consumes).\n" +
+		"  hawk mcp serve   — run hawk itself as an MCP server over stdio\n" +
+		"  hawk mcp config  — print the JSON block to register hawk in Claude Desktop/Cursor/Windsurf",
 	RunE: func(cmd *cobra.Command, args []string) error {
 		settings, err := loadEffectiveSettings()
 		if err != nil {
diff --git a/internal/feature/eval/toolmatrix.go b/internal/feature/eval/toolmatrix.go
new file mode 100644
index 00000000..f1d7a225
--- /dev/null
+++ b/internal/feature/eval/toolmatrix.go
@@ -0,0 +1,226 @@
+package eval
+
+import (
+	"context"
+	"fmt"
+	"sort"
+)
+
+// Tool-use evaluation.
+//
+// A coding agent's tool behavior has two independent failure modes that a
+// single pass/fail score conflates:
+//
+//  1. Triggering — did the agent invoke a tool when it should have (and refrain
+//     when it should not)? This is a binary classification problem, scored as a
+//     confusion matrix (TP/FN/FP/TN) with precision/recall.
+//  2. Well-formedness — given that a tool was triggered, did the agent pick the
+//     right tool and format the arguments correctly? Scored separately, and only
+//     over cases where a trigger actually fired.
+//
+// With 40+ tools and model-swapping via eyrie, "picked the wrong moment to call
+// a tool" and "called the right tool with bad args" are distinct regressions.
+// Scoring them apart makes which one regressed legible.
+
+// ExpectedCall describes the tool behavior a case expects. A nil ExpectedCall
+// means "the agent should NOT call any tool" (a negative case).
+type ExpectedCall struct {
+	Tool string // expected tool name
+	// ArgsMatch optionally validates the arguments the model supplied. If nil,
+	// only the tool name is checked for payload accuracy.
+	ArgsMatch func(args map[string]any) bool
+}
+
+// ToolUseCase is a single model-in-the-loop tool-selection test.
+type ToolUseCase struct {
+	ID     string
+	Prompt string
+	// Expected is the tool the model should call, or nil if it should not call
+	// any tool for this prompt.
+	Expected *ExpectedCall
+}
+
+// ObservedCall is what the model actually did for a case: the tool it invoked
+// (empty Tool means it invoked none) and the arguments it passed.
+type ObservedCall struct {
+	Tool string
+	Args map[string]any
+}
+
+// ToolCaller runs one case against the model under test and reports what tool
+// (if any) the model chose to call. Injecting this keeps scoring testable
+// without a live model.
+type ToolCaller func(ctx context.Context, c ToolUseCase) (ObservedCall, error)
+
+// TriggerMatrix is the confusion matrix for the binary "should a tool fire?"
+// decision, aggregated across cases.
+type TriggerMatrix struct {
+	TP int // expected a tool and the model called one
+	FN int // expected a tool but the model called none
+	FP int // expected no tool but the model called one
+	TN int // expected no tool and the model called none
+}
+
+// Precision = TP / (TP+FP); 1.0 when there are no positive predictions.
+func (m TriggerMatrix) Precision() float64 {
+	d := m.TP + m.FP
+	if d == 0 {
+		return 1.0
+	}
+	return float64(m.TP) / float64(d)
+}
+
+// Recall = TP / (TP+FN); 1.0 when there are no positive expectations.
+func (m TriggerMatrix) Recall() float64 {
+	d := m.TP + m.FN
+	if d == 0 {
+		return 1.0
+	}
+	return float64(m.TP) / float64(d)
+}
+
+// F1 is the harmonic mean of precision and recall.
+func (m TriggerMatrix) F1() float64 {
+	p, r := m.Precision(), m.Recall()
+	if p+r == 0 {
+		return 0
+	}
+	return 2 * p * r / (p + r)
+}
+
+// PayloadAccuracy scores well-formedness, conditioned on a trigger firing.
+type PayloadAccuracy struct {
+	Evaluated   int // cases where a tool was both expected and called (TP)
+	CorrectTool int // of those, the model picked the expected tool
+	CorrectArgs int // of those with the right tool, args also matched
+}
+
+// ToolName returns the fraction of triggered cases with the correct tool name.
+func (p PayloadAccuracy) ToolNameRate() float64 {
+	if p.Evaluated == 0 {
+		return 1.0
+	}
+	return float64(p.CorrectTool) / float64(p.Evaluated)
+}
+
+// ArgsRate returns the fraction of triggered cases with correct tool AND args.
+func (p PayloadAccuracy) ArgsRate() float64 {
+	if p.Evaluated == 0 {
+		return 1.0
+	}
+	return float64(p.CorrectArgs) / float64(p.Evaluated)
+}
+
+// CaseOutcome records the per-case verdict, useful for reporting which prompts
+// regressed.
+type CaseOutcome struct {
+	ID            string
+	ExpectedTool  string // "" means no tool expected
+	ObservedTool  string // "" means no tool called
+	TriggerResult string // "TP" | "FN" | "FP" | "TN"
+	CorrectTool   bool   // only meaningful when TriggerResult == "TP"
+	CorrectArgs   bool   // only meaningful when CorrectTool
+	Err           string
+}
+
+// ToolUseReport is the full result of scoring a set of cases.
+type ToolUseReport struct {
+	Matrix   TriggerMatrix
+	Payload  PayloadAccuracy
+	Outcomes []CaseOutcome
+}
+
+// ScoreToolUse runs each case through the caller and produces the trigger
+// confusion matrix and payload accuracy. A caller error is recorded on the
+// outcome and treated as "no tool called" for trigger purposes, so a model that
+// errors when it should have acted is counted as a false negative rather than
+// silently dropped.
+func ScoreToolUse(ctx context.Context, cases []ToolUseCase, caller ToolCaller) ToolUseReport {
+	rep := ToolUseReport{Outcomes: make([]CaseOutcome, 0, len(cases))}
+
+	for _, c := range cases {
+		obs, err := caller(ctx, c)
+		oc := CaseOutcome{ID: c.ID, ObservedTool: obs.Tool}
+		if err != nil {
+			oc.Err = err.Error()
+			obs.Tool = "" // an error means no usable tool call
+		}
+
+		expectsTool := c.Expected != nil
+		calledTool := obs.Tool != ""
+		if expectsTool {
+			oc.ExpectedTool = c.Expected.Tool
+		}
+
+		switch {
+		case expectsTool && calledTool:
+			rep.Matrix.TP++
+			oc.TriggerResult = "TP"
+			rep.Payload.Evaluated++
+			if obs.Tool == c.Expected.Tool {
+				oc.CorrectTool = true
+				rep.Payload.CorrectTool++
+				if c.Expected.ArgsMatch == nil || c.Expected.ArgsMatch(obs.Args) {
+					oc.CorrectArgs = true
+					rep.Payload.CorrectArgs++
+				}
+			}
+		case expectsTool && !calledTool:
+			rep.Matrix.FN++
+			oc.TriggerResult = "FN"
+		case !expectsTool && calledTool:
+			rep.Matrix.FP++
+			oc.TriggerResult = "FP"
+		default:
+			rep.Matrix.TN++
+			oc.TriggerResult = "TN"
+		}
+
+		rep.Outcomes = append(rep.Outcomes, oc)
+	}
+
+	return rep
+}
+
+// Markdown renders the report as a compact, human-readable summary.
+func (r ToolUseReport) Markdown() string {
+	m := r.Matrix
+	out := "## Tool-use evaluation\n\n"
+	out += "### Trigger confusion matrix\n\n"
+	out += "| | tool called | no tool called |\n"
+	out += "|---|---|---|\n"
+	out += fmt.Sprintf("| **tool expected** | %d (TP) | %d (FN) |\n", m.TP, m.FN)
+	out += fmt.Sprintf("| **no tool expected** | %d (FP) | %d (TN) |\n\n", m.FP, m.TN)
+	out += fmt.Sprintf("- precision: %.2f  recall: %.2f  F1: %.2f\n\n", m.Precision(), m.Recall(), m.F1())
+	out += "### Payload accuracy (over triggered cases)\n\n"
+	out += fmt.Sprintf("- correct tool: %.2f (%d/%d)\n", r.Payload.ToolNameRate(), r.Payload.CorrectTool, r.Payload.Evaluated)
+	out += fmt.Sprintf("- correct tool + args: %.2f (%d/%d)\n", r.Payload.ArgsRate(), r.Payload.CorrectArgs, r.Payload.Evaluated)
+
+	// List failures (deterministic order) so a regression is actionable.
+	var failures []CaseOutcome
+	for _, oc := range r.Outcomes {
+		if oc.TriggerResult == "FN" || oc.TriggerResult == "FP" || (oc.TriggerResult == "TP" && !oc.CorrectArgs) {
+			failures = append(failures, oc)
+		}
+	}
+	if len(failures) > 0 {
+		sort.Slice(failures, func(i, j int) bool { return failures[i].ID < failures[j].ID })
+		out += "\n### Failures\n\n"
+		for _, oc := range failures {
+			detail := oc.TriggerResult
+			if oc.TriggerResult == "TP" {
+				detail = "wrong tool/args"
+			}
+			out += fmt.Sprintf("- %s: %s (expected %q, got %q)\n",
+				oc.ID, detail, orNone(oc.ExpectedTool), orNone(oc.ObservedTool))
+		}
+	}
+	return out
+}
+
+func orNone(s string) string {
+	if s == "" {
+		return "<none>"
+	}
+	return s
+}
diff --git a/internal/feature/eval/toolmatrix_test.go b/internal/feature/eval/toolmatrix_test.go
new file mode 100644
index 00000000..fb52addb
--- /dev/null
+++ b/internal/feature/eval/toolmatrix_test.go
@@ -0,0 +1,139 @@
+package eval
+
+import (
+	"context"
+	"errors"
+	"math"
+	"strings"
+	"testing"
+)
+
+// scriptedCaller returns observations keyed by case ID.
+func scriptedCaller(script map[string]ObservedCall, errs map[string]error) ToolCaller {
+	return func(_ context.Context, c ToolUseCase) (ObservedCall, error) {
+		if errs != nil {
+			if e, ok := errs[c.ID]; ok {
+				return ObservedCall{}, e
+			}
+		}
+		return script[c.ID], nil
+	}
+}
+
+func approx(a, b float64) bool { return math.Abs(a-b) < 1e-9 }
+
+func TestScoreToolUse_AllMatrixCells(t *testing.T) {
+	readFile := &ExpectedCall{Tool: "read_file"}
+	cases := []ToolUseCase{
+		{ID: "tp", Prompt: "show me main.go", Expected: readFile},  // expects tool, calls it
+		{ID: "fn", Prompt: "open config.yaml", Expected: readFile}, // expects tool, calls none
+		{ID: "fp", Prompt: "what is 2+2?", Expected: nil},          // expects none, calls one
+		{ID: "tn", Prompt: "say hello", Expected: nil},             // expects none, calls none
+	}
+	script := map[string]ObservedCall{
+		"tp": {Tool: "read_file", Args: map[string]any{"path": "main.go"}},
+		"fn": {Tool: ""},
+		"fp": {Tool: "bash", Args: map[string]any{"cmd": "echo 4"}},
+		"tn": {Tool: ""},
+	}
+
+	rep := ScoreToolUse(context.Background(), cases, scriptedCaller(script, nil))
+	m := rep.Matrix
+	if m.TP != 1 || m.FN != 1 || m.FP != 1 || m.TN != 1 {
+		t.Fatalf("matrix = %+v, want one of each", m)
+	}
+	if !approx(m.Precision(), 0.5) {
+		t.Errorf("precision = %v, want 0.5", m.Precision())
+	}
+	if !approx(m.Recall(), 0.5) {
+		t.Errorf("recall = %v, want 0.5", m.Recall())
+	}
+	if !approx(m.F1(), 0.5) {
+		t.Errorf("F1 = %v, want 0.5", m.F1())
+	}
+}
+
+func TestScoreToolUse_PayloadAccuracy(t *testing.T) {
+	exp := &ExpectedCall{
+		Tool:      "read_file",
+		ArgsMatch: func(a map[string]any) bool { return a["path"] == "main.go" },
+	}
+	cases := []ToolUseCase{
+		{ID: "good", Expected: exp},      // right tool, right args
+		{ID: "badargs", Expected: exp},   // right tool, wrong args
+		{ID: "wrongtool", Expected: exp}, // wrong tool
+	}
+	script := map[string]ObservedCall{
+		"good":      {Tool: "read_file", Args: map[string]any{"path": "main.go"}},
+		"badargs":   {Tool: "read_file", Args: map[string]any{"path": "other.go"}},
+		"wrongtool": {Tool: "bash", Args: map[string]any{"cmd": "cat main.go"}},
+	}
+
+	rep := ScoreToolUse(context.Background(), cases, scriptedCaller(script, nil))
+	if rep.Matrix.TP != 3 {
+		t.Fatalf("all three triggered, TP = %d", rep.Matrix.TP)
+	}
+	p := rep.Payload
+	if p.Evaluated != 3 {
+		t.Errorf("Evaluated = %d, want 3", p.Evaluated)
+	}
+	if p.CorrectTool != 2 { // good + badargs picked read_file
+		t.Errorf("CorrectTool = %d, want 2", p.CorrectTool)
+	}
+	if p.CorrectArgs != 1 { // only good matched args
+		t.Errorf("CorrectArgs = %d, want 1", p.CorrectArgs)
+	}
+	if !approx(p.ToolNameRate(), 2.0/3.0) {
+		t.Errorf("ToolNameRate = %v", p.ToolNameRate())
+	}
+	if !approx(p.ArgsRate(), 1.0/3.0) {
+		t.Errorf("ArgsRate = %v", p.ArgsRate())
+	}
+}
+
+func TestScoreToolUse_ErrorCountsAsNoCall(t *testing.T) {
+	cases := []ToolUseCase{
+		{ID: "boom", Prompt: "do a thing", Expected: &ExpectedCall{Tool: "read_file"}},
+	}
+	errs := map[string]error{"boom": errors.New("provider exploded")}
+	rep := ScoreToolUse(context.Background(), cases, scriptedCaller(nil, errs))
+
+	// Expected a tool, model errored (=no call) → false negative, not a crash.
+	if rep.Matrix.FN != 1 || rep.Matrix.TP != 0 {
+		t.Errorf("matrix = %+v, want FN=1", rep.Matrix)
+	}
+	if rep.Outcomes[0].Err == "" {
+		t.Error("outcome should record the error")
+	}
+}
+
+func TestTriggerMatrix_EmptyDivisors(t *testing.T) {
+	// No positive predictions / expectations → precision & recall default to 1.
+	m := TriggerMatrix{TN: 5}
+	if !approx(m.Precision(), 1.0) || !approx(m.Recall(), 1.0) {
+		t.Errorf("empty matrix precision/recall = %v/%v, want 1/1", m.Precision(), m.Recall())
+	}
+}
+
+func TestToolUseReport_MarkdownListsFailures(t *testing.T) {
+	cases := []ToolUseCase{
+		{ID: "z-miss", Expected: &ExpectedCall{Tool: "read_file"}},
+		{ID: "a-spurious", Expected: nil},
+	}
+	script := map[string]ObservedCall{
+		"z-miss":     {Tool: ""},     // FN
+		"a-spurious": {Tool: "bash"}, // FP
+	}
+	rep := ScoreToolUse(context.Background(), cases, scriptedCaller(script, nil))
+	md := rep.Markdown()
+	if !strings.Contains(md, "confusion matrix") {
+		t.Error("markdown missing matrix section")
+	}
+	if !strings.Contains(md, "Failures") {
+		t.Error("markdown missing failures section")
+	}
+	// Failures must be sorted by ID: a-spurious before z-miss.
+	if strings.Index(md, "a-spurious") > strings.Index(md, "z-miss") {
+		t.Error("failures not sorted by ID")
+	}
+}
diff --git a/internal/hawkerr/exitcode.go b/internal/hawkerr/exitcode.go
new file mode 100644
index 00000000..cd224eda
--- /dev/null
+++ b/internal/hawkerr/exitcode.go
@@ -0,0 +1,122 @@
+package hawkerr
+
+import "strings"
+
+// Exit-code taxonomy.
+//
+// hawk historically collapsed every failure to exit code 1, which gives an
+// agent or shell script driving `hawk --print` no way to branch on *why* the
+// run failed without scraping stderr. The codes below assign a stable,
+// documented small-integer meaning to each failure class so a caller can, for
+// example, retry on a rate-limit (5) but stop immediately on bad credentials
+// (3).
+//
+// The numbers are part of hawk's public CLI contract — append new codes, do
+// not renumber existing ones. Codes are kept below 64 to avoid colliding with
+// the shell's 128+signal convention and 126/127 (not-executable / not-found).
+const (
+	ExitOK           = 0  // success
+	ExitGeneral      = 1  // unclassified / general failure
+	ExitUsage        = 2  // bad flags or arguments (reserved for the CLI layer)
+	ExitAuth         = 3  // missing/invalid/expired credentials, 401/403
+	ExitNetwork      = 4  // DNS, connection refused/reset, provider 5xx, TLS
+	ExitRateLimit    = 5  // 429 / quota / insufficient credits
+	ExitTimeout      = 6  // request timeout, deadline exceeded, cancellation
+	ExitToolFailure  = 7  // a tool execution failed or timed out
+	ExitPolicyBlock  = 8  // permission/guardrail/policy denial
+	ExitConfig       = 9  // malformed settings/config
+	ExitContextLimit = 10 // prompt exceeds the model's context window
+	ExitNotFound     = 11 // model/endpoint/resource not found (404)
+	ExitDiskFull     = 12 // out of disk space or quota
+)
+
+// ClassifyExitCode maps an error to a stable exit code from the taxonomy above.
+//
+// It deliberately mirrors the textual classification already performed by the
+// CLI's friendlyError (cmd/errors.go): the same provider/network/auth signals
+// that produce a friendly message here produce a stable exit code, so the two
+// never disagree about what kind of failure occurred. Order matters — the most
+// specific and most actionable classes are checked first.
+//
+// A nil error yields ExitOK; an unrecognized error yields ExitGeneral.
+func ClassifyExitCode(err error) int {
+	if err == nil {
+		return ExitOK
+	}
+	low := strings.ToLower(err.Error())
+
+	contains := func(subs ...string) bool {
+		for _, s := range subs {
+			if strings.Contains(low, s) {
+				return true
+			}
+		}
+		return false
+	}
+
+	switch {
+	// Rate limiting / quota / credits — checked before generic auth because a
+	// 429 is retriable whereas a 401 is not.
+	case contains("429", "rate limit", "rate_limit", "too many requests",
+		"insufficient credits", "insufficient balance", "out of credits",
+		"requires more credits", "can only afford", "quota exceeded"):
+		return ExitRateLimit
+
+	// Authentication / authorization — bad or missing key, 401/403.
+	case contains("401", "unauthorized", "invalid api key", "invalid_api_key",
+		"authentication", "api key is missing", "403", "forbidden",
+		"access denied", "payment required"):
+		return ExitAuth
+
+	// Context window overflow — distinct from a generic 400 so callers can
+	// react by compacting rather than aborting.
+	case contains("context length", "context_length", "context window",
+		"token limit", "too many tokens", "maximum context",
+		"max_tokens exceeded", "max tokens exceeded", "prompt is too long"):
+		return ExitContextLimit
+
+	// Policy / permission / guardrail denial.
+	case contains("permission denied", "guardrail", "policy", "blocked by",
+		"approval denied", "not permitted", "operation not allowed"):
+		return ExitPolicyBlock
+
+	// Tool execution failures and tool timeouts.
+	case contains("tool timeout", "tool_timeout", "tool execution",
+		"tool failed", "tool error"):
+		return ExitToolFailure
+
+	// Disk space / quota.
+	case contains("no space left", "disk full", "not enough space", "disk quota"):
+		return ExitDiskFull
+
+	// Malformed configuration.
+	case contains("invalid json in config") ||
+		(contains("settings", "config") && contains("unmarshal", "syntax error", "invalid character")):
+		return ExitConfig
+
+	// Not found — model/endpoint/resource (404).
+	case contains("model not found", "model_not_found", "unknown model",
+		"invalid model", "404", "no such host"):
+		// "no such host" is a DNS failure, not a 404 — route it to network.
+		if contains("no such host") {
+			return ExitNetwork
+		}
+		return ExitNotFound
+
+	// Network errors — DNS, refused/reset connections, provider 5xx, TLS.
+	case contains("network is unreachable", "network unreachable",
+		"connection refused", "connection reset", "broken pipe",
+		"dns", "lookup", "500", "502", "503", "504",
+		"internal server error", "bad gateway", "service unavailable",
+		"gateway timeout", "certificate", "tls", "x509"):
+		return ExitNetwork
+
+	// Timeouts / cancellation — checked after the 5xx gateway-timeout cases so
+	// "504 gateway timeout" stays a network error.
+	case contains("timeout", "timed out", "deadline exceeded", "context canceled"):
+		return ExitTimeout
+
+	default:
+		return ExitGeneral
+	}
+}
diff --git a/internal/hawkerr/exitcode_test.go b/internal/hawkerr/exitcode_test.go
new file mode 100644
index 00000000..1d8d6230
--- /dev/null
+++ b/internal/hawkerr/exitcode_test.go
@@ -0,0 +1,54 @@
+package hawkerr
+
+import (
+	"errors"
+	"testing"
+)
+
+func TestClassifyExitCode(t *testing.T) {
+	cases := []struct {
+		name string
+		err  error
+		want int
+	}{
+		{"nil", nil, ExitOK},
+		{"rate limit 429", errors.New("HTTP 429 Too Many Requests"), ExitRateLimit},
+		{"rate limit phrase", errors.New("you have hit the rate limit"), ExitRateLimit},
+		{"insufficient credits", errors.New("requires more credits to run"), ExitRateLimit},
+		{"auth 401", errors.New("401 Unauthorized"), ExitAuth},
+		{"invalid api key", errors.New("invalid_api_key provided"), ExitAuth},
+		{"forbidden 403", errors.New("403 Forbidden: access denied"), ExitAuth},
+		{"context window", errors.New("prompt is too long for the context window"), ExitContextLimit},
+		{"token limit", errors.New("maximum context length exceeded"), ExitContextLimit},
+		{"policy block", errors.New("operation not allowed by guardrail policy"), ExitPolicyBlock},
+		{"permission denied", errors.New("permission denied writing file"), ExitPolicyBlock},
+		{"tool timeout", errors.New("tool timeout after 120s"), ExitToolFailure},
+		{"disk full", errors.New("write failed: no space left on device"), ExitDiskFull},
+		{"bad config", errors.New("failed to parse settings: invalid character '}'"), ExitConfig},
+		{"model not found", errors.New("model_not_found: gpt-5-ultra"), ExitNotFound},
+		{"404 model", errors.New("404 unknown model"), ExitNotFound},
+		{"dns no such host", errors.New("dial tcp: lookup api.example.com: no such host"), ExitNetwork},
+		{"connection refused", errors.New("connection refused"), ExitNetwork},
+		{"provider 503", errors.New("503 Service Unavailable"), ExitNetwork},
+		{"tls", errors.New("x509: certificate signed by unknown authority"), ExitNetwork},
+		{"timeout", errors.New("context deadline exceeded"), ExitTimeout},
+		{"plain timeout", errors.New("request timed out"), ExitTimeout},
+		{"unclassified", errors.New("something weird happened"), ExitGeneral},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := ClassifyExitCode(tc.err); got != tc.want {
+				t.Errorf("ClassifyExitCode(%q) = %d, want %d", tc.err, got, tc.want)
+			}
+		})
+	}
+}
+
+// Wrapped errors must classify by their flattened message too.
+func TestClassifyExitCode_Wrapped(t *testing.T) {
+	wrapped := errors.New("call failed: 429 rate_limit")
+	if got := ClassifyExitCode(wrapped); got != ExitRateLimit {
+		t.Errorf("wrapped rate-limit = %d, want %d", got, ExitRateLimit)
+	}
+}
diff --git a/internal/mcp/mcp.go b/internal/mcp/mcp.go
index a542938d..25c20ea1 100644
--- a/internal/mcp/mcp.go
+++ b/internal/mcp/mcp.go
@@ -231,11 +231,24 @@ func (s *Server) CallTool(ctx context.Context, name string, args map[string]inte
 	if err != nil {
 		return "", err
 	}
+	return parseToolCallResult(result)
+}
+
+// parseToolCallResult decodes an MCP tools/call result into flattened text.
+//
+// It honors the spec's isError flag: when the remote tool reports a failure
+// (isError:true), the content carries the failure detail and this returns it as
+// a Go error so the agent loop surfaces it to the model — which can then
+// self-correct — instead of mistaking a failure for a successful result.
+// hawk's own MCP server sets this flag (internal/mcp/server.go), so the client
+// must read it for symmetry. An undecodable result falls back to the raw bytes.
+func parseToolCallResult(result json.RawMessage) (string, error) {
 	var resp struct {
 		Content []struct {
 			Type string `json:"type"`
 			Text string `json:"text"`
 		} `json:"content"`
+		IsError bool `json:"isError"`
 	}
 	if err := json.Unmarshal(result, &resp); err != nil {
 		return string(result), nil
@@ -246,6 +259,12 @@ func (s *Server) CallTool(ctx context.Context, name string, args map[string]inte
 			text += c.Text
 		}
 	}
+	if resp.IsError {
+		if text == "" {
+			text = "remote MCP tool reported an error"
+		}
+		return text, fmt.Errorf("MCP tool error: %s", text)
+	}
 	return text, nil
 }
 
diff --git a/internal/mcp/server.go b/internal/mcp/server.go
index 8cc64e4b..fb1532ef 100644
--- a/internal/mcp/server.go
+++ b/internal/mcp/server.go
@@ -62,9 +62,24 @@ type MCPToolHandler struct {
 	Name        string
 	Description string
 	InputSchema map[string]interface{}
+	// Annotations carry behavioral hints (read-only, destructive, …) so a
+	// connecting client can self-throttle — e.g. ask the user before a tool
+	// that may run shell commands or edit files. Nil means "no hints".
+	Annotations *ToolAnnotations
 	Handler     func(ctx context.Context, params json.RawMessage) (string, error)
 }
 
+// ToolAnnotations are the optional MCP tool behavior hints from the spec.
+// All fields are advisory and pointer-typed so an unset hint is omitted from
+// the wire payload rather than sent as a misleading false.
+type ToolAnnotations struct {
+	Title           string `json:"title,omitempty"`
+	ReadOnlyHint    *bool  `json:"readOnlyHint,omitempty"`
+	DestructiveHint *bool  `json:"destructiveHint,omitempty"`
+	IdempotentHint  *bool  `json:"idempotentHint,omitempty"`
+	OpenWorldHint   *bool  `json:"openWorldHint,omitempty"`
+}
+
 // JSON-RPC 2.0 request from a client.
 type JSONRPCRequest struct {
 	JSONRPC string          `json:"jsonrpc"`
@@ -251,6 +266,7 @@ func (s *MCPServer) handleToolsList(req *JSONRPCRequest) *JSONRPCResponse {
 		Name        string                 `json:"name"`
 		Description string                 `json:"description"`
 		InputSchema map[string]interface{} `json:"inputSchema"`
+		Annotations *ToolAnnotations       `json:"annotations,omitempty"`
 	}
 
 	tools := make([]toolSchema, 0, len(s.tools))
@@ -259,6 +275,7 @@ func (s *MCPServer) handleToolsList(req *JSONRPCRequest) *JSONRPCResponse {
 			Name:        t.Name,
 			Description: t.Description,
 			InputSchema: t.InputSchema,
+			Annotations: t.Annotations,
 		})
 	}
 
diff --git a/internal/mcp/server_test.go b/internal/mcp/server_test.go
index 2d8ea2e8..69d7cb8f 100644
--- a/internal/mcp/server_test.go
+++ b/internal/mcp/server_test.go
@@ -143,6 +143,75 @@ func TestToolsList(t *testing.T) {
 		if toolMap["inputSchema"] == nil {
 			t.Error("tool missing inputSchema")
 		}
+		// Neither alpha nor beta declared annotations, so the field must be
+		// omitted entirely (omitempty) rather than sent as null.
+		if _, present := toolMap["annotations"]; present {
+			t.Errorf("tool %v should omit annotations when unset", toolMap["name"])
+		}
+	}
+}
+
+func TestToolsListAnnotations(t *testing.T) {
+	server := NewMCPServer(ServerInfo{Name: "hawk", Version: "1.0.0"})
+	server.RegisterTool(MCPToolHandler{
+		Name:        "danger",
+		Description: "Runs an agent",
+		InputSchema: map[string]interface{}{"type": "object"},
+		Annotations: &ToolAnnotations{
+			Title:           "Run agent",
+			ReadOnlyHint:    boolPtr(false),
+			DestructiveHint: boolPtr(true),
+		},
+		Handler: func(ctx context.Context, params json.RawMessage) (string, error) { return "ok", nil },
+	})
+
+	resp := sendRequest(t, server, `{"jsonrpc":"2.0","id":7,"method":"tools/list"}`+"\n")
+	if resp.Error != nil {
+		t.Fatalf("unexpected error: %+v", resp.Error)
+	}
+	tools := resp.Result.(map[string]interface{})["tools"].([]interface{})
+	if len(tools) != 1 {
+		t.Fatalf("expected 1 tool, got %d", len(tools))
+	}
+	ann, ok := tools[0].(map[string]interface{})["annotations"].(map[string]interface{})
+	if !ok {
+		t.Fatalf("expected annotations map, got %T", tools[0].(map[string]interface{})["annotations"])
+	}
+	if ann["destructiveHint"] != true {
+		t.Errorf("destructiveHint = %v, want true", ann["destructiveHint"])
+	}
+	if ann["readOnlyHint"] != false {
+		t.Errorf("readOnlyHint = %v, want false", ann["readOnlyHint"])
+	}
+	if ann["title"] != "Run agent" {
+		t.Errorf("title = %v, want %q", ann["title"], "Run agent")
+	}
+	// idempotentHint was never set, so it must be omitted.
+	if _, present := ann["idempotentHint"]; present {
+		t.Error("idempotentHint should be omitted when unset")
+	}
+}
+
+// TestDefaultToolsHaveAnnotations guards that every default-registered tool
+// carries a risk hint, so a future tool addition can't silently ship without
+// one for connecting clients to self-throttle on.
+func TestDefaultToolsHaveAnnotations(t *testing.T) {
+	server := NewMCPServer(ServerInfo{Name: "hawk", Version: "1.0.0"})
+	RegisterDefaultTools(server, nil)
+
+	resp := sendRequest(t, server, `{"jsonrpc":"2.0","id":8,"method":"tools/list"}`+"\n")
+	if resp.Error != nil {
+		t.Fatalf("unexpected error: %+v", resp.Error)
+	}
+	tools := resp.Result.(map[string]interface{})["tools"].([]interface{})
+	if len(tools) == 0 {
+		t.Fatal("no default tools registered")
+	}
+	for _, raw := range tools {
+		tm := raw.(map[string]interface{})
+		if _, ok := tm["annotations"].(map[string]interface{}); !ok {
+			t.Errorf("default tool %v missing annotations", tm["name"])
+		}
 	}
 }
 
diff --git a/internal/mcp/server_tools.go b/internal/mcp/server_tools.go
index 2594df40..db535a87 100644
--- a/internal/mcp/server_tools.go
+++ b/internal/mcp/server_tools.go
@@ -10,6 +10,15 @@ import (
 // This avoids importing the tool package (which already imports mcp).
 type ToolExecutor func(ctx context.Context, name string, input json.RawMessage) (string, error)
 
+// boolPtr returns a pointer to b, for the pointer-typed MCP annotation hints.
+func boolPtr(b bool) *bool { return &b }
+
+// readOnlyAnnotations marks a tool that only reads/inspects and never mutates
+// the workspace, so a client can run it without prompting.
+func readOnlyAnnotations(title string) *ToolAnnotations {
+	return &ToolAnnotations{Title: title, ReadOnlyHint: boolPtr(true), DestructiveHint: boolPtr(false)}
+}
+
 // RegisterDefaultTools registers hawk's standard capabilities as MCP tools.
 // If executor is non-nil, tools that delegate to hawk's tool registry will
 // use it for execution; otherwise those tools return a not-configured error.
@@ -26,8 +35,15 @@ func RegisterDefaultTools(server *MCPServer, executor ToolExecutor) {
 // hawkChatTool sends a prompt to hawk and returns the response.
 func hawkChatTool(executor ToolExecutor) MCPToolHandler {
 	return MCPToolHandler{
-		Name:        "hawk_chat",
-		Description: "Send a prompt to the hawk AI coding agent and receive a response.",
+		Name: "hawk_chat",
+		Description: "Send a prompt to the hawk AI coding agent and receive a response. " +
+			"WARNING: this runs an autonomous agent that may execute shell commands and modify files.",
+		Annotations: &ToolAnnotations{
+			Title:           "Run hawk agent",
+			ReadOnlyHint:    boolPtr(false),
+			DestructiveHint: boolPtr(true),
+			OpenWorldHint:   boolPtr(true),
+		},
 		InputSchema: map[string]interface{}{
 			"type": "object",
 			"properties": map[string]interface{}{
@@ -58,6 +74,7 @@ func hawkSearchTool(executor ToolExecutor) MCPToolHandler {
 	return MCPToolHandler{
 		Name:        "hawk_search",
 		Description: "Search across hawk sessions and conversation history.",
+		Annotations: readOnlyAnnotations("Search hawk sessions"),
 		InputSchema: map[string]interface{}{
 			"type": "object",
 			"properties": map[string]interface{}{
@@ -93,6 +110,7 @@ func hawkMemoryRecallTool(executor ToolExecutor) MCPToolHandler {
 	return MCPToolHandler{
 		Name:        "hawk_memory_recall",
 		Description: "Recall stored information from hawk's persistent memory (yaad).",
+		Annotations: readOnlyAnnotations("Recall from hawk memory"),
 		InputSchema: map[string]interface{}{
 			"type": "object",
 			"properties": map[string]interface{}{
@@ -128,6 +146,11 @@ func hawkMemoryStoreTool(executor ToolExecutor) MCPToolHandler {
 	return MCPToolHandler{
 		Name:        "hawk_memory_store",
 		Description: "Store information in hawk's persistent memory (yaad) for future recall.",
+		Annotations: &ToolAnnotations{
+			Title:           "Store in hawk memory",
+			ReadOnlyHint:    boolPtr(false),
+			DestructiveHint: boolPtr(false), // additive write, does not destroy existing data
+		},
 		InputSchema: map[string]interface{}{
 			"type": "object",
 			"properties": map[string]interface{}{
@@ -168,6 +191,7 @@ func hawkReviewTool(executor ToolExecutor) MCPToolHandler {
 	return MCPToolHandler{
 		Name:        "hawk_review",
 		Description: "Trigger a code review using hawk's sight module. Analyzes code for quality, style, and potential issues.",
+		Annotations: readOnlyAnnotations("Review code (sight)"),
 		InputSchema: map[string]interface{}{
 			"type": "object",
 			"properties": map[string]interface{}{
@@ -203,6 +227,7 @@ func hawkScanTool(executor ToolExecutor) MCPToolHandler {
 	return MCPToolHandler{
 		Name:        "hawk_scan",
 		Description: "Trigger a security scan using hawk's inspect module. Identifies vulnerabilities and security issues.",
+		Annotations: readOnlyAnnotations("Security scan (inspect)"),
 		InputSchema: map[string]interface{}{
 			"type": "object",
 			"properties": map[string]interface{}{
@@ -239,6 +264,7 @@ func hawkCompressTool(executor ToolExecutor) MCPToolHandler {
 	return MCPToolHandler{
 		Name:        "hawk_compress",
 		Description: "Compress text using hawk's tok module to reduce token usage while preserving meaning.",
+		Annotations: readOnlyAnnotations("Compress text (tok)"),
 		InputSchema: map[string]interface{}{
 			"type": "object",
 			"properties": map[string]interface{}{
diff --git a/internal/mcp/toolresult_test.go b/internal/mcp/toolresult_test.go
new file mode 100644
index 00000000..3cda8d2b
--- /dev/null
+++ b/internal/mcp/toolresult_test.go
@@ -0,0 +1,64 @@
+package mcp
+
+import (
+	"encoding/json"
+	"testing"
+)
+
+func TestParseToolCallResult(t *testing.T) {
+	tests := []struct {
+		name     string
+		raw      string
+		wantText string
+		wantErr  bool
+	}{
+		{
+			name:     "success flattens text content",
+			raw:      `{"content":[{"type":"text","text":"hello "},{"type":"text","text":"world"}]}`,
+			wantText: "hello world",
+			wantErr:  false,
+		},
+		{
+			name:     "isError surfaces content as error",
+			raw:      `{"content":[{"type":"text","text":"file not found"}],"isError":true}`,
+			wantText: "file not found",
+			wantErr:  true,
+		},
+		{
+			name:     "isError with empty content gets a default message",
+			raw:      `{"content":[],"isError":true}`,
+			wantText: "remote MCP tool reported an error",
+			wantErr:  true,
+		},
+		{
+			name:     "isError false is a normal result",
+			raw:      `{"content":[{"type":"text","text":"ok"}],"isError":false}`,
+			wantText: "ok",
+			wantErr:  false,
+		},
+		{
+			name:     "non-text content is skipped",
+			raw:      `{"content":[{"type":"image","text":""},{"type":"text","text":"caption"}]}`,
+			wantText: "caption",
+			wantErr:  false,
+		},
+		{
+			name:     "undecodable result falls back to raw bytes",
+			raw:      `not json`,
+			wantText: "not json",
+			wantErr:  false,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			text, err := parseToolCallResult(json.RawMessage(tc.raw))
+			if (err != nil) != tc.wantErr {
+				t.Fatalf("err = %v, wantErr = %v", err, tc.wantErr)
+			}
+			if text != tc.wantText {
+				t.Errorf("text = %q, want %q", text, tc.wantText)
+			}
+		})
+	}
+}
diff --git a/internal/tool/agentic_fetch.go b/internal/tool/agentic_fetch.go
index 25a2680f..a16cbb04 100644
--- a/internal/tool/agentic_fetch.go
+++ b/internal/tool/agentic_fetch.go
@@ -4,8 +4,15 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"strings"
+	"sync"
 )
 
+// maxAgenticFetchConcurrency bounds how many pages are fetched and summarized
+// in parallel when a batch of URLs is supplied, matching the WebSearch cap so
+// the SSRF-aware fetch client does not face a connection storm.
+const maxAgenticFetchConcurrency = 4
+
 // AgenticFetchTool spawns a sub-agent to fetch, process, and summarize web content.
 type AgenticFetchTool struct{}
 
@@ -20,22 +27,33 @@ func (AgenticFetchTool) Parameters() map[string]interface{} {
 	return map[string]interface{}{
 		"type": "object",
 		"properties": map[string]interface{}{
-			"url":   map[string]interface{}{"type": "string", "description": "URL to fetch and analyze"},
-			"query": map[string]interface{}{"type": "string", "description": "What to look for or extract from the page"},
+			"url": map[string]interface{}{"type": "string", "description": "URL to fetch and analyze. Provide this OR urls (not both)."},
+			"urls": map[string]interface{}{
+				"type":        "array",
+				"items":       map[string]interface{}{"type": "string"},
+				"description": "Multiple URLs to fetch and summarize concurrently against the same query, in a single call.",
+			},
+			"query": map[string]interface{}{"type": "string", "description": "What to look for or extract from the page(s)"},
 		},
 	}
 }
 
-func (AgenticFetchTool) Execute(ctx context.Context, input json.RawMessage) (string, error) {
+func (t AgenticFetchTool) Execute(ctx context.Context, input json.RawMessage) (string, error) {
 	var p struct {
-		URL   string `json:"url"`
-		Query string `json:"query"`
+		URL   string   `json:"url"`
+		URLs  []string `json:"urls"`
+		Query string   `json:"query"`
 	}
 	if err := json.Unmarshal(input, &p); err != nil {
 		return "", err
 	}
-	if p.URL == "" {
-		return "", fmt.Errorf("url is required")
+
+	urls := p.URLs
+	if p.URL != "" {
+		urls = append(urls, p.URL)
+	}
+	if len(urls) == 0 {
+		return "", fmt.Errorf("url or urls is required")
 	}
 	if p.Query == "" {
 		p.Query = "Extract the key information from this page."
@@ -43,17 +61,55 @@ func (AgenticFetchTool) Execute(ctx context.Context, input json.RawMessage) (str
 
 	tc := GetToolContext(ctx)
 	if tc == nil || tc.AgentSpawnFn == nil {
-		// Fallback: do a direct fetch if no sub-agent available.
-		return (WebFetchTool{}).Execute(ctx, input)
+		// Fallback: do a direct fetch if no sub-agent available. Only the
+		// single-URL case maps onto WebFetch's input shape.
+		if len(urls) == 1 {
+			fetchInput, _ := json.Marshal(map[string]string{"url": urls[0]})
+			return (WebFetchTool{}).Execute(ctx, fetchInput)
+		}
+		return "", fmt.Errorf("batch agentic fetch requires a sub-agent, which is unavailable in this context")
+	}
+
+	if len(urls) == 1 {
+		return tc.AgentSpawnFn(ctx, t.researchPrompt(urls[0], p.Query))
 	}
 
-	prompt := fmt.Sprintf(`You are a web research agent. Your task:
+	// Batch: summarize each URL concurrently, bounded, with labeled sections.
+	outputs := make([]string, len(urls))
+	var wg sync.WaitGroup
+	sem := make(chan struct{}, maxAgenticFetchConcurrency)
+	for i, u := range urls {
+		wg.Add(1)
+		go func(idx int, u string) {
+			defer wg.Done()
+			sem <- struct{}{}
+			defer func() { <-sem }()
+			out, err := tc.AgentSpawnFn(ctx, t.researchPrompt(u, p.Query))
+			if err != nil {
+				outputs[idx] = fmt.Sprintf("## %s\n\nError: %v", u, err)
+				return
+			}
+			outputs[idx] = fmt.Sprintf("## %s\n\n%s", u, out)
+		}(i, u)
+	}
+	wg.Wait()
+
+	return strings.Join(outputs, "\n\n---\n\n"), nil
+}
+
+// researchPrompt builds the sub-agent instruction for one URL. It includes an
+// explicit relevance-refusal contract: if the page does not address the query,
+// the sub-agent returns a short "NO_RELEVANT_INFORMATION" signal instead of
+// padding a vague summary, keeping the caller's context clean.
+func (AgenticFetchTool) researchPrompt(url, query string) string {
+	return fmt.Sprintf(`You are a web research agent. Your task:
 1. Fetch the URL: %s
 2. Extract information relevant to: %s
-3. Return a concise, well-structured summary of the relevant content.
-4. Include specific details, code examples, or data points — not vague descriptions.
-
-Use the WebFetch tool to retrieve the page content, then analyze and summarize it.`, p.URL, p.Query)
+3. If the page genuinely does not contain information relevant to the query,
+   respond with exactly "NO_RELEVANT_INFORMATION" and one short sentence on why
+   — do not invent or pad a summary.
+4. Otherwise, return a concise, well-structured summary of the relevant content,
+   including specific details, code examples, or data points — not vague descriptions.
 
-	return tc.AgentSpawnFn(ctx, prompt)
+Use the WebFetch tool to retrieve the page content, then analyze and summarize it.`, url, query)
 }
diff --git a/internal/tool/agentic_fetch_test.go b/internal/tool/agentic_fetch_test.go
new file mode 100644
index 00000000..c630ace4
--- /dev/null
+++ b/internal/tool/agentic_fetch_test.go
@@ -0,0 +1,87 @@
+package tool
+
+import (
+	"context"
+	"encoding/json"
+	"strings"
+	"sync/atomic"
+	"testing"
+)
+
+func TestAgenticFetch_BatchFanOut(t *testing.T) {
+	var calls int32
+	tc := &ToolContext{
+		AgentSpawnFn: func(_ context.Context, prompt string) (string, error) {
+			atomic.AddInt32(&calls, 1)
+			// Echo back which URL this invocation was for so we can assert
+			// each URL produced its own labeled section.
+			for _, u := range []string{"https://a.example", "https://b.example", "https://c.example"} {
+				if strings.Contains(prompt, u) {
+					return "summary-of " + u, nil
+				}
+			}
+			return "summary", nil
+		},
+	}
+	ctx := WithToolContext(context.Background(), tc)
+
+	input, _ := json.Marshal(map[string]any{
+		"urls":  []string{"https://a.example", "https://b.example", "https://c.example"},
+		"query": "what is this",
+	})
+	out, err := (AgenticFetchTool{}).Execute(ctx, input)
+	if err != nil {
+		t.Fatalf("Execute: %v", err)
+	}
+	if got := atomic.LoadInt32(&calls); got != 3 {
+		t.Errorf("AgentSpawnFn called %d times, want 3", got)
+	}
+	for _, u := range []string{"https://a.example", "https://b.example", "https://c.example"} {
+		if !strings.Contains(out, "## "+u) {
+			t.Errorf("output missing labeled section for %q\n%s", u, out)
+		}
+		if !strings.Contains(out, "summary-of "+u) {
+			t.Errorf("output missing summary for %q", u)
+		}
+	}
+}
+
+func TestAgenticFetch_SingleURLBackCompat(t *testing.T) {
+	var gotPrompt string
+	tc := &ToolContext{
+		AgentSpawnFn: func(_ context.Context, prompt string) (string, error) {
+			gotPrompt = prompt
+			return "ok", nil
+		},
+	}
+	ctx := WithToolContext(context.Background(), tc)
+
+	input, _ := json.Marshal(map[string]any{"url": "https://only.example", "query": "q"})
+	out, err := (AgenticFetchTool{}).Execute(ctx, input)
+	if err != nil {
+		t.Fatalf("Execute: %v", err)
+	}
+	// Single URL keeps the bare sub-agent output (no "## url" batch header).
+	if out != "ok" {
+		t.Errorf("single-url output = %q, want %q", out, "ok")
+	}
+	// The refusal contract must be present in the prompt.
+	if !strings.Contains(gotPrompt, "NO_RELEVANT_INFORMATION") {
+		t.Error("research prompt missing relevance-refusal contract")
+	}
+}
+
+func TestAgenticFetch_RequiresURL(t *testing.T) {
+	input, _ := json.Marshal(map[string]any{"query": "q"})
+	if _, err := (AgenticFetchTool{}).Execute(context.Background(), input); err == nil {
+		t.Error("expected error when neither url nor urls is provided")
+	}
+}
+
+func TestWebSearch_QueryNormalization(t *testing.T) {
+	// Neither query nor queries → error, without touching the network.
+	input, _ := json.Marshal(map[string]any{"numResults": 3})
+	if _, err := (WebSearchTool{}).Execute(context.Background(), input); err == nil {
+		t.Error("expected error when neither query nor queries is provided")
+	}
+}
diff --git a/internal/tool/web_search.go b/internal/tool/web_search.go
index 6ed813ac..9e2f5257 100644
--- a/internal/tool/web_search.go
+++ b/internal/tool/web_search.go
@@ -8,9 +8,15 @@ import (
 	"net/http"
 	"net/url"
 	"strings"
+	"sync"
 	"time"
 )
 
+// maxWebSearchConcurrency bounds the number of search backends contacted in
+// parallel when a batch of queries is issued, so a large queries[] array does
+// not open a burst of outbound connections (and trip provider rate limits).
+const maxWebSearchConcurrency = 4
+
 // searchResult is the shared result type for all search backends.
 type searchResult struct {
 	Title       string
@@ -33,7 +39,12 @@ func (WebSearchTool) Parameters() map[string]interface{} {
 		"properties": map[string]interface{}{
 			"query": map[string]interface{}{
 				"type":        "string",
-				"description": "Search query",
+				"description": "Search query. Provide this OR queries (not both).",
+			},
+			"queries": map[string]interface{}{
+				"type":        "array",
+				"items":       map[string]interface{}{"type": "string"},
+				"description": "Multiple search queries to run concurrently in a single call. Use this to research several things at once instead of issuing one WebSearch per query.",
 			},
 			"numResults": map[string]interface{}{
 				"type":        "integer",
@@ -49,22 +60,31 @@ func (WebSearchTool) Parameters() map[string]interface{} {
 				"default":     "web",
 			},
 		},
-		"required": []string{"query"},
+		// Either query or queries must be supplied; validated in Execute since
+		// JSON Schema "required" cannot express an exclusive-or cleanly.
 	}
 }
 
-func (WebSearchTool) Execute(ctx context.Context, input json.RawMessage) (string, error) {
+func (t WebSearchTool) Execute(ctx context.Context, input json.RawMessage) (string, error) {
 	var p struct {
-		Query      string `json:"query"`
-		NumResults int    `json:"numResults"`
-		SearchType string `json:"searchType"`
+		Query      string   `json:"query"`
+		Queries    []string `json:"queries"`
+		NumResults int      `json:"numResults"`
+		SearchType string   `json:"searchType"`
 	}
 	if err := json.Unmarshal(input, &p); err != nil {
 		return "", err
 	}
-	if p.Query == "" {
-		return "", fmt.Errorf("query is required")
+
+	// Normalize the single/batch surfaces into one list of queries.
+	queries := p.Queries
+	if p.Query != "" {
+		queries = append(queries, p.Query)
+	}
+	if len(queries) == 0 {
+		return "", fmt.Errorf("query or queries is required")
 	}
+
 	if p.NumResults <= 0 {
 		p.NumResults = 5
 	}
@@ -75,16 +95,58 @@ func (WebSearchTool) Execute(ctx context.Context, input json.RawMessage) (string
 		p.SearchType = "web"
 	}
 
-	// Try providers in order: Brave → SearXNG → DuckDuckGo
+	// Single query: keep the original output shape (no batch header).
+	if len(queries) == 1 {
+		out, err := t.searchOne(ctx, queries[0], p.NumResults)
+		if err != nil {
+			return "", err
+		}
+		return out, nil
+	}
+
+	// Batch: fan out concurrently, bounded, and label each result section so
+	// the agent can attribute results to their query in one tool call.
+	outputs := make([]string, len(queries))
+	var wg sync.WaitGroup
+	sem := make(chan struct{}, maxWebSearchConcurrency)
+	for i, q := range queries {
+		wg.Add(1)
+		go func(idx int, query string) {
+			defer wg.Done()
+			sem <- struct{}{}
+			defer func() { <-sem }()
+			out, err := t.searchOne(ctx, query, p.NumResults)
+			if err != nil {
+				outputs[idx] = fmt.Sprintf("Search results for: %s\n\nError: %v", query, err)
+				return
+			}
+			outputs[idx] = out
+		}(i, q)
+	}
+	wg.Wait()
+
+	var sb strings.Builder
+	for i, out := range outputs {
+		if i > 0 {
+			sb.WriteString("\n\n---\n\n")
+		}
+		sb.WriteString(out)
+	}
+	return sb.String(), nil
+}
+
+// searchOne runs a single query through the provider cascade
+// (Brave → SearXNG → DuckDuckGo) and returns formatted results.
+func (WebSearchTool) searchOne(ctx context.Context, query string, numResults int) (string, error) {
 	var results []searchResult
 	var err error
 
 	// 1. Brave Search
 	brave := newBraveClient()
 	if brave.available() {
-		results, err = brave.search(ctx, p.Query, p.NumResults)
+		results, err = brave.search(ctx, query, numResults)
 		if err == nil {
-			return formatSearchResults(p.Query, results), nil
+			return formatSearchResults(query, results), nil
 		}
 		// Fall through to next provider on error
 	}
@@ -92,19 +154,19 @@ func (WebSearchTool) Execute(ctx context.Context, input json.RawMessage) (string
 	// 2. SearXNG
 	searxng := newSearxngClient()
 	if searxng.available() {
-		results, err = searxng.search(ctx, p.Query, p.NumResults)
+		results, err = searxng.search(ctx, query, numResults)
 		if err == nil {
-			return formatSearchResults(p.Query, results), nil
+			return formatSearchResults(query, results), nil
 		}
 		// Fall through to DuckDuckGo on error
 	}
 
 	// 3. DuckDuckGo (fallback)
-	results, err = duckDuckGoSearch(ctx, p.Query, p.NumResults)
+	results, err = duckDuckGoSearch(ctx, query, numResults)
 	if err != nil {
 		return "", fmt.Errorf("all search providers failed: %w", err)
 	}
-	return formatSearchResults(p.Query, results), nil
+	return formatSearchResults(query, results), nil
 }
 
 // formatSearchResults formats results as numbered structured text.
diff --git a/internal/tool/web_test.go b/internal/tool/web_test.go
index 9ce049c1..82444231 100644
--- a/internal/tool/web_test.go
+++ b/internal/tool/web_test.go
@@ -30,8 +30,8 @@ func TestWebSearchTool_EmptyQuery(t *testing.T) {
 	var ws WebSearchTool
 	input, _ := json.Marshal(map[string]string{"query": ""})
 	_, err := ws.Execute(context.Background(), input)
-	if err == nil || !strings.Contains(err.Error(), "query is required") {
-		t.Fatalf("expected 'query is required' error, got %v", err)
+	if err == nil || !strings.Contains(err.Error(), "is required") {
+		t.Fatalf("expected a 'required' error, got %v", err)
 	}
 }
 

From c637fc8e426bc270ecd32837b36cb21ad019eda4 Mon Sep 17 00:00:00 2001
From: Patel230 <Lakshmanp230@gmail.com>
Date: Mon, 8 Jun 2026 00:25:12 +0530
Subject: [PATCH 2/3] chore: sync external submodules to latest origin/main

Advance the eyrie and yaad submodule pointers to the current tip of their
origin/main (the other four already matched). Picks up upstream commits that
were present on the remotes but not yet recorded in hawk's index.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 external/eyrie | 2 +-
 external/yaad  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/external/eyrie b/external/eyrie
index 0f01cf69..5ffdd1f8 160000
--- a/external/eyrie
+++ b/external/eyrie
@@ -1 +1 @@
-Subproject commit 0f01cf692646cb12f0a2ec83b8eb57dbbb0f9b40
+Subproject commit 5ffdd1f8bd6f6a1b09485ea87d00d7a221cddf2e
diff --git a/external/yaad b/external/yaad
index 730bb90e..e7c21cca 160000
--- a/external/yaad
+++ b/external/yaad
@@ -1 +1 @@
-Subproject commit 730bb90e0ee7805d37afb07e1317d81b1f6459ca
+Subproject commit e7c21cca3fc93faf01bb12b69cc20ba0a230f776

From ff905572118341cc108c4a891e7cc6c0ab7b0765 Mon Sep 17 00:00:00 2001
From: Patel230 <Lakshmanp230@gmail.com>
Date: Mon, 8 Jun 2026 00:52:06 +0530
Subject: [PATCH 3/3] chore: bump eyrie submodule to merged main (efe3710)

Point external/eyrie at the squash-merged eyrie main containing the provider
error/image/reasoning/verify work, so hawk consumes the released eyrie changes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 external/eyrie | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/eyrie b/external/eyrie
index 5ffdd1f8..efe3710c 160000
--- a/external/eyrie
+++ b/external/eyrie
@@ -1 +1 @@
-Subproject commit 5ffdd1f8bd6f6a1b09485ea87d00d7a221cddf2e
+Subproject commit efe3710c2aa6cdf657fb376951762323c069f2ed