GrayCodeAI · Patel230 · Jun 7, 2026 · Jun 7, 2026 · Jun 7, 2026 · Jun 7, 2026
diff --git a/cmd/eval_tools.go b/cmd/eval_tools.go
@@ -0,0 +1,120 @@
+package cmd
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"time"
+
+	hawkconfig "github.com/GrayCodeAI/hawk/internal/config"
+	"github.com/GrayCodeAI/hawk/internal/feature/eval"
+	"github.com/GrayCodeAI/hawk/internal/types"
+	"github.com/spf13/cobra"
+)
+
+var evalToolsOutput string // --output/-o value; runEvalTools treats anything other than "json" as markdown
+
+func init() {
+	evalCmd.AddCommand(evalToolsCmd) // register under the parent eval command
+	evalToolsCmd.Flags().StringVarP(&evalToolsOutput, "output", "o", "markdown", "Output format: markdown, json")
+}
+
+// evalToolsCmd is the `tools` subcommand; init attaches it to evalCmd and it is executed by runEvalTools.
+var evalToolsCmd = &cobra.Command{
+	Use:   "tools",
+	Short: "Evaluate tool selection: trigger confusion matrix + payload accuracy",
+	Long: "Run a model-in-the-loop tool-use evaluation. Each case is a prompt with an expected tool (or none). " +
+		"Triggering (did the model call a tool when it should) is scored as a confusion matrix, separately from payload accuracy (right tool + args).",
+	RunE: runEvalTools,
+}
+
+// defaultToolUseCases returns the built-in evaluation set: four prompts that
+// are each expected to trigger one named tool, followed by two prompts that
+// are expected to trigger none.
+func defaultToolUseCases() []eval.ToolUseCase {
+	// Prompts for which one specific tool call is expected.
+	positives := []struct{ id, prompt, tool string }{
+		{"read-existing-file", "Show me the contents of go.mod.", "Read"},
+		{"list-directory", "What files are in the cmd directory?", "LS"},
+		{"run-command", "Run the test suite for this project.", "Bash"},
+		{"search-code", "Find every place that defines an http handler in this repo.", "Grep"},
+	}
+
+	// Prompts that should be answered without any tool call.
+	negatives := []struct{ id, prompt string }{
+		// A pure-knowledge question needs no tool.
+		{"no-tool-trivia", "In one sentence, what does the SOLID 'S' stand for?"},
+		// A greeting needs no tool.
+		{"no-tool-greeting", "Say hello."},
+	}
+
+	cases := make([]eval.ToolUseCase, 0, len(positives)+len(negatives))
+	for _, p := range positives {
+		cases = append(cases, eval.ToolUseCase{
+			ID:       p.id,
+			Prompt:   p.prompt,
+			Expected: &eval.ExpectedCall{Tool: p.tool},
+		})
+	}
+
+	for _, n := range negatives {
+		// Expected is left nil: that is how a no-tool case is expressed.
+		cases = append(cases, eval.ToolUseCase{
+			ID:     n.id,
+			Prompt: n.prompt,
+		})
+	}
+	return cases
+}
+
+// runEvalTools scores the built-in cases against the configured model and prints the report (markdown, or JSON with -o json).
+func runEvalTools(cmd *cobra.Command, _ []string) error {
+	settings := hawkconfig.LoadSettings()
+	registry, err := defaultRegistry(settings)
+	if err != nil {
+		return fmt.Errorf("building tool registry: %w", err)
+	}
+	systemPrompt, err := buildSystemPrompt()
+	if err != nil {
+		return err
+	}
+	model, provider := effectiveModelAndProvider(settings)
+	sess := newHawkSession(settings, provider, model, systemPrompt, registry)
+	if err := configureSession(sess, settings); err != nil {
+		return err
+	}
+	tools := registry.EyrieTools()
+	// caller performs one tool-aware turn and reports the first tool the model
+	// chose (if any). It does not execute the tool — we are scoring selection,
+	// not effects.
+	caller := func(ctx context.Context, c eval.ToolUseCase) (eval.ObservedCall, error) {
+		resp, err := sess.Chat(ctx, []types.EyrieMessage{
+			{Role: "user", Content: c.Prompt},
+		}, types.ChatOptions{Model: model, Tools: tools})
+		if err != nil {
+			return eval.ObservedCall{}, err
+		}
+		if resp == nil || len(resp.ToolCalls) == 0 {
+			return eval.ObservedCall{}, nil // no tool called
+		}
+		tc := resp.ToolCalls[0]
+		return eval.ObservedCall{Tool: tc.Name, Args: tc.Arguments}, nil
+	}
+
+	ctx, cancel := context.WithTimeout(cmd.Context(), 10*time.Minute)
+	defer cancel()
+	cases := defaultToolUseCases() // built once so the printed count and the scored set cannot diverge
+	cmd.Printf("Evaluating tool selection on %d cases with model %s...\n", len(cases), model)
+	report := eval.ScoreToolUse(ctx, cases, caller)
+
+	if evalToolsOutput != "json" {
+		cmd.Println(report.Markdown())
+		return nil
+	}
+	data, err := json.MarshalIndent(report, "", "  ")
+	if err != nil {
+		return fmt.Errorf("encoding report as JSON: %w", err)
+	}
+	cmd.Println(string(data))
+	return nil
+}
diff --git a/cmd/formatter.go b/cmd/formatter.go
@@ -6,8 +6,19 @@ import (
 	"strings"
 	"sync"
 	"time"
+
+	"golang.org/x/term"
 )
 
+// stdoutIsTerminal reports whether os.Stdout is attached to a terminal. The
+// color and Unicode detectors consult it so that piped or redirected output
+// is left free of ANSI and glyph chrome. It is a variable, not a function,
+// so tests can swap in a fixed answer.
+var stdoutIsTerminal = func() bool {
+	fd := int(os.Stdout.Fd())
+	return term.IsTerminal(fd)
+}
+
 // TreeNode represents a node in a tree structure for FormatTree.
 type TreeNode struct {
 	Name     string
@@ -544,9 +555,16 @@ func DetectColorSupport() bool {
 		return false
 	}
 
+	// Stdout is not a TTY (piped to an agent, file, or another process):
+	// suppress ANSI so the captured output is clean. An explicit FORCE_COLOR
+	// above already overrode this for callers that pipe but still want color.
+	if !stdoutIsTerminal() {
+		return false
+	}
+
 	// Check TERM
-	term := os.Getenv("TERM")
-	if term == "dumb" || term == "" {
+	t := os.Getenv("TERM")
+	if t == "dumb" || t == "" {
 		return false
 	}
 
@@ -555,6 +573,12 @@ func DetectColorSupport() bool {
 
 // DetectUnicodeSupport checks if the terminal supports Unicode characters.
 func DetectUnicodeSupport() bool {
+	// Non-TTY stdout: emit ASCII so box-drawing/glyphs don't corrupt captured
+	// output. FORCE_COLOR is a color signal only, so it does not override here.
+	if !stdoutIsTerminal() {
+		return false
+	}
+
 	lang := os.Getenv("LANG")
 	lcAll := os.Getenv("LC_ALL")
 	lcCtype := os.Getenv("LC_CTYPE")

diff --git a/cmd/formatter_test.go b/cmd/formatter_test.go
@@ -7,6 +7,38 @@ import (
 	"time"
 )
 
+// TestDetectColorSupport_NonTTYStdout covers the stdout-TTY gate of both detectors and its FORCE_COLOR override.
+func TestDetectColorSupport_NonTTYStdout(t *testing.T) {
+	saved := stdoutIsTerminal
+	t.Cleanup(func() { stdoutIsTerminal = saved })
+	fakeTTY := func(isTTY bool) { stdoutIsTerminal = func() bool { return isTTY } } // pins the hook's answer
+	t.Setenv("TERM", "xterm-256color")
+	t.Setenv("FORCE_COLOR", "")
+	t.Setenv("NO_COLOR", "")
+
+	// Piped stdout with a normal TERM: the "agent captured ANSI escapes in its JSON" regression.
+	fakeTTY(false)
+	if got := DetectColorSupport(); got {
+		t.Error("DetectColorSupport() = true for non-TTY stdout; want false")
+	}
+	if got := DetectUnicodeSupport(); got {
+		t.Error("DetectUnicodeSupport() = true for non-TTY stdout; want false")
+	}
+
+	// Same environment on a terminal: color stays on.
+	fakeTTY(true)
+	if got := DetectColorSupport(); !got {
+		t.Error("DetectColorSupport() = false for TTY stdout; want true")
+	}
+
+	// Piped again, but FORCE_COLOR asks for color explicitly and wins.
+	fakeTTY(false)
+	t.Setenv("FORCE_COLOR", "1")
+	if got := DetectColorSupport(); !got {
+		t.Error("DetectColorSupport() = false with FORCE_COLOR over a pipe; want true")
+	}
+}
+
 func newTestFormatter(color, unicode bool, width int) *OutputFormatter {
 	theme := OutputTheme{}
 	if color {
@@ -552,6 +584,12 @@ func TestDetectColorSupport(t *testing.T) {
 }
 
 func TestDetectUnicodeSupport(t *testing.T) {
+	// These subtests exercise the locale-env logic, so pin stdout to a TTY;
+	// the non-TTY suppression gate is covered by TestDetectColorSupport_NonTTYStdout.
+	origIsTTY := stdoutIsTerminal
+	stdoutIsTerminal = func() bool { return true }
+	defer func() { stdoutIsTerminal = origIsTTY }()
+
 	t.Run("UTF-8 lang", func(t *testing.T) {
 		origLang := os.Getenv("LANG")
 		origLcAll := os.Getenv("LC_ALL")

diff --git a/cmd/hawk/main.go b/cmd/hawk/main.go
@@ -7,6 +7,7 @@ import (
 
 	"github.com/GrayCodeAI/hawk/cmd"
 	"github.com/GrayCodeAI/hawk/internal/api"
+	"github.com/GrayCodeAI/hawk/internal/hawkerr"
 	"github.com/GrayCodeAI/hawk/internal/mcp"
 	"github.com/GrayCodeAI/hawk/internal/sandbox"
 )
@@ -42,10 +43,14 @@ func main() {
 
 	if err := cmd.Execute(); err != nil {
 		fmt.Fprintln(os.Stderr, err)
+		// An explicit ExitCodeError (e.g. a wrapped Bash exit status) wins —
+		// it already carries the intended code. Otherwise classify the failure
+		// into the stable exit-code taxonomy so callers can branch on the
+		// reason (auth vs rate-limit vs network) instead of seeing a bare 1.
 		var exitErr *cmd.ExitCodeError
 		if errors.As(err, &exitErr) {
 			os.Exit(exitErr.Code)
 		}
-		os.Exit(1)
+		os.Exit(hawkerr.ClassifyExitCode(err))
 	}
 }
diff --git a/cmd/mcp_serve.go b/cmd/mcp_serve.go
@@ -0,0 +1,106 @@
+package cmd
+
+import (
+	"encoding/json"
+	"os"
+	"os/signal"
+	"syscall"
+
+	hawkconfig "github.com/GrayCodeAI/hawk/internal/config"
+	"github.com/GrayCodeAI/hawk/internal/mcp"
+	"github.com/spf13/cobra"
+)
+
+// mcpConfigWrite backs `hawk mcp config --write`. Despite the name, nothing is
+// written to disk: runMCPConfig only prints the client config paths as hints.
+var mcpConfigWrite bool
+
+func init() {
+	mcpCmd.AddCommand(mcpServeCmd, mcpConfigCmd)
+	mcpConfigCmd.Flags().BoolVar(&mcpConfigWrite, "write", false, "also print the well-known client config paths to paste the block into")
+}
+
+// mcpServeCmd is `hawk mcp serve`: it runs hawk itself as a Model Context
+// Protocol server over stdio so that MCP clients (Claude Desktop, Cursor,
+// Windsurf, ...) can call the tools registered by mcp.RegisterDefaultTools.
+var mcpServeCmd = &cobra.Command{
+	Use:   "serve",
+	Short: "Run hawk as an MCP server over stdio",
+	Long: "Run hawk as a Model Context Protocol server over stdio (JSON-RPC 2.0), exposing hawk's tools to " +
+		"MCP clients like Claude Desktop, Cursor, and Windsurf.\n\n" +
+		"Use `hawk mcp config` to print the JSON block that registers this command in a client.",
+	RunE: runMCPServe,
+}
+
+// runMCPServe runs hawk's MCP server over stdio, handing ServeStdio a context
+// that is cancelled when the process receives an interrupt or SIGTERM.
+func runMCPServe(cmd *cobra.Command, _ []string) error {
+	serverVersion := version
+	if serverVersion == "" {
+		serverVersion = "dev"
+	}
+	server := mcp.NewMCPServer(mcp.ServerInfo{Name: "hawk", Version: serverVersion})
+
+	// The tool registry is the executor behind delegating tools. If it cannot be built the server still
+	// starts with a nil executor, but the cause is reported on stderr; stdout is the JSON-RPC channel.
+	registry, err := defaultRegistry(hawkconfig.LoadSettings())
+	if err != nil {
+		cmd.PrintErrf("warning: tool registry unavailable, serving without an executor: %v\n", err)
+		mcp.RegisterDefaultTools(server, nil)
+	} else {
+		mcp.RegisterDefaultTools(server, registry.Execute)
+	}
+
+	ctx, stop := signal.NotifyContext(cmd.Context(), os.Interrupt, syscall.SIGTERM)
+	defer stop()
+	return server.ServeStdio(ctx)
+}
+
+// mcpConfigCmd prints the JSON block that registers hawk as an MCP server, so
+// users can merge it into a client's config file instead of typing it out.
+var mcpConfigCmd = &cobra.Command{
+	Use:   "config",
+	Short: "Print the MCP-server config block to register hawk in a client",
+	Long: "Print the JSON block that registers hawk as an MCP server (pointing at " +
+		"`hawk mcp serve`) for clients like Claude Desktop, Cursor, and Windsurf.\n\n" +
+		"Merge the \"hawk\" entry into the \"mcpServers\" object of the client's config file.\n" +
+		"If that file does not exist yet, you can create it directly, e.g.:\n  hawk mcp config > ~/.cursor/mcp.json",
+	RunE: runMCPConfig,
+}
+
+// runMCPConfig writes the "mcpServers" JSON block for hawk to stdout; with
+// --write it also prints the well-known client config paths, on stderr.
+func runMCPConfig(cmd *cobra.Command, _ []string) error {
+	block := map[string]any{
+		"mcpServers": map[string]any{
+			"hawk": map[string]any{
+				"command": hawkExecutablePath(),
+				"args":    []string{"mcp", "serve"},
+			},
+		},
+	}
+	out, err := json.MarshalIndent(block, "", "  ")
+	if err != nil {
+		return err
+	}
+	// cmd.Println falls back to stderr; route explicitly so `hawk mcp config > file` captures only the JSON.
+	if mcpConfigWrite {
+		cmd.PrintErrln("# Add the \"hawk\" entry below into the \"mcpServers\" object of your client config:")
+		cmd.PrintErrln("#   Claude Desktop (macOS): ~/Library/Application Support/Claude/claude_desktop_config.json")
+		cmd.PrintErrln("#   Cursor:                 ~/.cursor/mcp.json")
+		cmd.PrintErrln("#   Windsurf:               ~/.codeium/windsurf/mcp_config.json")
+		cmd.PrintErrln()
+	}
+	_, err = cmd.OutOrStdout().Write(append(out, '\n'))
+	return err
+}
+
+// hawkExecutablePath returns the path reported by os.Executable, or the bare
+// name "hawk" when that call fails or yields "", so the config stays usable.
+func hawkExecutablePath() string {
+	path, err := os.Executable()
+	if err != nil || path == "" {
+		return "hawk"
+	}
+	return path
+}