Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions cmd/eval_tools.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package cmd

import (
"context"
"encoding/json"
"fmt"
"time"

hawkconfig "github.com/GrayCodeAI/hawk/internal/config"
"github.com/GrayCodeAI/hawk/internal/feature/eval"
"github.com/GrayCodeAI/hawk/internal/types"
"github.com/spf13/cobra"
)

var evalToolsOutput string

func init() {
evalToolsCmd.Flags().StringVarP(&evalToolsOutput, "output", "o", "markdown", "Output format: markdown, json")
evalCmd.AddCommand(evalToolsCmd)
}

// evalToolsCmd is the `tools` subcommand. Its work is done by runEvalTools;
// the Long text explains how triggering and payload accuracy are scored
// separately.
var evalToolsCmd = &cobra.Command{
	Use:   "tools",
	Short: "Evaluate tool selection: trigger confusion matrix + payload accuracy",
	Long: "Run a model-in-the-loop tool-use evaluation. " +
		"Each case is a prompt with an expected tool (or none). " +
		"Triggering (did the model call a tool when it should) is scored as a confusion matrix, " +
		"separately from payload accuracy (right tool + args).",
	RunE: runEvalTools,
}

// defaultToolUseCases returns the built-in evaluation set: four prompts that
// are each expected to trigger one specific tool (Read, LS, Bash, Grep) and
// two prompts that are expected to trigger no tool at all.
func defaultToolUseCases() []eval.ToolUseCase {
	// expect builds the expectation for a prompt that should call the named tool.
	expect := func(tool string) *eval.ExpectedCall {
		return &eval.ExpectedCall{Tool: tool}
	}

	return []eval.ToolUseCase{
		// Positive cases: the prompt should make the model pick a tool.
		{ID: "read-existing-file", Prompt: "Show me the contents of go.mod.", Expected: expect("Read")},
		{ID: "list-directory", Prompt: "What files are in the cmd directory?", Expected: expect("LS")},
		{ID: "run-command", Prompt: "Run the test suite for this project.", Expected: expect("Bash")},
		{ID: "search-code", Prompt: "Find every place that defines an http handler in this repo.", Expected: expect("Grep")},

		// Negative cases: a nil Expected means no tool call is wanted.
		// A pure-knowledge question needs no tool.
		{ID: "no-tool-trivia", Prompt: "In one sentence, what does the SOLID 'S' stand for?", Expected: nil},
		// A greeting needs no tool.
		{ID: "no-tool-greeting", Prompt: "Say hello.", Expected: nil},
	}
}

// runEvalTools implements the `tools` subcommand. It builds a session from the
// loaded settings, sends every built-in case to the model as a single
// tool-aware turn, scores the observed tool choices with eval.ScoreToolUse and
// prints the report in the format selected by --output ("json", otherwise
// markdown).
//
// The progress banner is written to the command's error stream so that the
// report itself can be captured (e.g. `-o json`) without a stray text line in
// front of it. The whole run is bounded by a 10-minute timeout derived from the
// command's context.
//
// It returns an error when the registry, system prompt or session cannot be
// set up, or when the report cannot be encoded as JSON.
func runEvalTools(cmd *cobra.Command, _ []string) error {
	settings := hawkconfig.LoadSettings()

	registry, err := defaultRegistry(settings)
	if err != nil {
		return fmt.Errorf("building tool registry: %w", err)
	}
	systemPrompt, err := buildSystemPrompt()
	if err != nil {
		return fmt.Errorf("building system prompt: %w", err)
	}
	model, provider := effectiveModelAndProvider(settings)
	sess := newHawkSession(settings, provider, model, systemPrompt, registry)
	if err := configureSession(sess, settings); err != nil {
		return fmt.Errorf("configuring session: %w", err)
	}

	tools := registry.EyrieTools()

	// caller performs one tool-aware turn and reports the first tool the model
	// chose (if any). It does not execute the tool — we are scoring selection,
	// not effects.
	caller := func(ctx context.Context, c eval.ToolUseCase) (eval.ObservedCall, error) {
		resp, err := sess.Chat(ctx, []types.EyrieMessage{
			{Role: "user", Content: c.Prompt},
		}, types.ChatOptions{Model: model, Tools: tools})
		if err != nil {
			return eval.ObservedCall{}, err
		}
		if resp == nil || len(resp.ToolCalls) == 0 {
			return eval.ObservedCall{}, nil // no tool called
		}
		tc := resp.ToolCalls[0]
		return eval.ObservedCall{Tool: tc.Name, Args: tc.Arguments}, nil
	}

	ctx, cancel := context.WithTimeout(cmd.Context(), 10*time.Minute)
	defer cancel()

	// Build the case list once so the banner count and the scored set cannot
	// drift apart.
	cases := defaultToolUseCases()

	// Status goes to the error stream; only the report goes to the output stream.
	cmd.PrintErrf("Evaluating tool selection on %d cases with model %s...\n", len(cases), model)
	report := eval.ScoreToolUse(ctx, cases, caller)

	switch evalToolsOutput {
	case "json":
		data, err := json.MarshalIndent(report, "", " ")
		if err != nil {
			return fmt.Errorf("encoding report as JSON: %w", err)
		}
		cmd.Println(string(data))
	default:
		cmd.Println(report.Markdown())
	}
	return nil
}
28 changes: 26 additions & 2 deletions cmd/formatter.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,19 @@ import (
"strings"
"sync"
"time"

"golang.org/x/term"
)

// stdoutIsTerminal reports whether os.Stdout refers to a terminal, as decided
// by term.IsTerminal on its file descriptor. Callers use a false result (stdout
// redirected to a pipe or file) to suppress color and Unicode decoration so
// captured output stays clean. It is declared as a variable rather than a
// function so tests can swap in a stub.
var stdoutIsTerminal = func() bool {
	fd := int(os.Stdout.Fd())
	return term.IsTerminal(fd)
}

// TreeNode represents a node in a tree structure for FormatTree.
type TreeNode struct {
Name string
Expand Down Expand Up @@ -544,9 +555,16 @@ func DetectColorSupport() bool {
return false
}

// Stdout is not a TTY (piped to an agent, file, or another process):
// suppress ANSI so the captured output is clean. An explicit FORCE_COLOR
// above already overrode this for callers that pipe but still want color.
if !stdoutIsTerminal() {
return false
}

// Check TERM
term := os.Getenv("TERM")
if term == "dumb" || term == "" {
t := os.Getenv("TERM")
if t == "dumb" || t == "" {
return false
}

Expand All @@ -555,6 +573,12 @@ func DetectColorSupport() bool {

// DetectUnicodeSupport checks if the terminal supports Unicode characters.
func DetectUnicodeSupport() bool {
// Non-TTY stdout: emit ASCII so box-drawing/glyphs don't corrupt captured
// output. FORCE_COLOR is a color signal only, so it does not override here.
if !stdoutIsTerminal() {
return false
}

lang := os.Getenv("LANG")
lcAll := os.Getenv("LC_ALL")
lcCtype := os.Getenv("LC_CTYPE")
Expand Down
38 changes: 38 additions & 0 deletions cmd/formatter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,38 @@ import (
"time"
)

// TestDetectColorSupport_NonTTYStdout checks the stdout-is-a-terminal gate:
// color and Unicode are off when stdout is not a TTY, color is on for a TTY
// with a usable TERM, and FORCE_COLOR re-enables color over a pipe.
func TestDetectColorSupport_NonTTYStdout(t *testing.T) {
	saved := stdoutIsTerminal
	t.Cleanup(func() { stdoutIsTerminal = saved })

	// pretendTTY replaces the terminal probe with a fixed answer.
	pretendTTY := func(isTTY bool) {
		stdoutIsTerminal = func() bool { return isTTY }
	}

	t.Setenv("NO_COLOR", "")
	t.Setenv("FORCE_COLOR", "")
	t.Setenv("TERM", "xterm-256color")

	// Piped stdout (not a TTY) with a normal TERM must still disable color —
	// this is the "agent captured ANSI escapes in its JSON" regression.
	pretendTTY(false)
	if got := DetectColorSupport(); got {
		t.Error("DetectColorSupport() = true for non-TTY stdout; want false")
	}
	if got := DetectUnicodeSupport(); got {
		t.Error("DetectUnicodeSupport() = true for non-TTY stdout; want false")
	}

	// A TTY with a good TERM keeps color.
	pretendTTY(true)
	if got := DetectColorSupport(); !got {
		t.Error("DetectColorSupport() = false for TTY stdout; want true")
	}

	// FORCE_COLOR overrides the non-TTY gate (deliberate piped color).
	pretendTTY(false)
	t.Setenv("FORCE_COLOR", "1")
	if got := DetectColorSupport(); !got {
		t.Error("DetectColorSupport() = false with FORCE_COLOR over a pipe; want true")
	}
}

func newTestFormatter(color, unicode bool, width int) *OutputFormatter {
theme := OutputTheme{}
if color {
Expand Down Expand Up @@ -552,6 +584,12 @@ func TestDetectColorSupport(t *testing.T) {
}

func TestDetectUnicodeSupport(t *testing.T) {
// These subtests exercise the locale-env logic, so pin stdout to a TTY;
// the non-TTY suppression gate is covered by TestDetectColorSupport_NonTTYStdout.
origIsTTY := stdoutIsTerminal
stdoutIsTerminal = func() bool { return true }
defer func() { stdoutIsTerminal = origIsTTY }()

t.Run("UTF-8 lang", func(t *testing.T) {
origLang := os.Getenv("LANG")
origLcAll := os.Getenv("LC_ALL")
Expand Down
7 changes: 6 additions & 1 deletion cmd/hawk/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (

"github.com/GrayCodeAI/hawk/cmd"
"github.com/GrayCodeAI/hawk/internal/api"
"github.com/GrayCodeAI/hawk/internal/hawkerr"
"github.com/GrayCodeAI/hawk/internal/mcp"
"github.com/GrayCodeAI/hawk/internal/sandbox"
)
Expand Down Expand Up @@ -42,10 +43,14 @@ func main() {

if err := cmd.Execute(); err != nil {
fmt.Fprintln(os.Stderr, err)
// An explicit ExitCodeError (e.g. a wrapped Bash exit status) wins —
// it already carries the intended code. Otherwise classify the failure
// into the stable exit-code taxonomy so callers can branch on the
// reason (auth vs rate-limit vs network) instead of seeing a bare 1.
var exitErr *cmd.ExitCodeError
if errors.As(err, &exitErr) {
os.Exit(exitErr.Code)
}
os.Exit(1)
os.Exit(hawkerr.ClassifyExitCode(err))
}
}
106 changes: 106 additions & 0 deletions cmd/mcp_serve.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
package cmd

import (
"encoding/json"
"os"
"os/signal"
"syscall"

hawkconfig "github.com/GrayCodeAI/hawk/internal/config"
"github.com/GrayCodeAI/hawk/internal/mcp"
"github.com/spf13/cobra"
)

var mcpConfigWrite bool

func init() {
mcpConfigCmd.Flags().BoolVar(&mcpConfigWrite, "write", false,
"also print the well-known client config paths to paste the block into")
mcpCmd.AddCommand(mcpServeCmd)
mcpCmd.AddCommand(mcpConfigCmd)
}

// mcpServeCmd is the `serve` subcommand: it starts hawk as an MCP server that
// talks to its client over stdio. The work is done by runMCPServe.
var mcpServeCmd = &cobra.Command{
	Use:   "serve",
	Short: "Run hawk as an MCP server over stdio",
	Long: "Run hawk as a Model Context Protocol server over stdio (JSON-RPC 2.0), exposing hawk's tools " +
		"to MCP clients like Claude Desktop, Cursor, and Windsurf.\n" +
		"\n" +
		"Use `hawk mcp config` to print the JSON block that registers this command in a client.",
	RunE: runMCPServe,
}

// runMCPServe implements `mcp serve`. It creates an MCP server named "hawk"
// (version taken from the package-level version, or "dev" when that is empty),
// registers the default tools, and serves over stdio until the command context
// is cancelled or the process receives an interrupt or SIGTERM.
//
// A failure to build the tool registry does not abort the server: the default
// tools are registered with a nil executor instead, and a warning is written
// to the command's error stream so the degraded state is visible to the
// operator. The returned error is whatever server.ServeStdio returns.
func runMCPServe(cmd *cobra.Command, _ []string) error {
	settings := hawkconfig.LoadSettings()

	serverVersion := version
	if serverVersion == "" {
		serverVersion = "dev"
	}
	server := mcp.NewMCPServer(mcp.ServerInfo{Name: "hawk", Version: serverVersion})

	// Wire hawk's tool registry in as the executor so delegating tools run for
	// real; a registry build failure degrades to not-configured rather than
	// aborting (the server still answers initialize/tools/list).
	registry, err := defaultRegistry(settings)
	if err != nil {
		// The server speaks JSON-RPC over stdio, so the diagnostic must not be
		// written to stdout; the error stream is used instead.
		cmd.PrintErrf("warning: building tool registry: %v; delegating tools are not configured\n", err)
		mcp.RegisterDefaultTools(server, nil)
	} else {
		mcp.RegisterDefaultTools(server, registry.Execute)
	}

	ctx, stop := signal.NotifyContext(cmd.Context(), os.Interrupt, syscall.SIGTERM)
	defer stop()
	return server.ServeStdio(ctx)
}

// mcpConfigCmd is the `config` subcommand: it prints the JSON block that
// registers hawk as an MCP server in a client's config file, so users don't
// hand-write the entry. The work is done by runMCPConfig.
//
// The help text deliberately tells users to merge the entry rather than append
// the output with `>>`: appending a second top-level object to an existing
// config file leaves it as invalid JSON.
var mcpConfigCmd = &cobra.Command{
	Use:   "config",
	Short: "Print the MCP-server config block to register hawk in a client",
	Long: "Print the JSON block that registers hawk as an MCP server (pointing at " +
		"`hawk mcp serve`) for clients like Claude Desktop, Cursor, and Windsurf.\n\n" +
		"Merge the \"hawk\" entry into the \"mcpServers\" object of the client's config file. " +
		"Do not append the output with `>>`: the file must remain a single JSON document.\n\n" +
		"Use --write to also list the well-known client config paths.",
	RunE: runMCPConfig,
}

// runMCPConfig implements `mcp config`. It prints an indented JSON document of
// the form {"mcpServers": {"hawk": {"command": <path>, "args": ["mcp","serve"]}}},
// where <path> comes from hawkExecutablePath. With --write it first prints
// "#"-prefixed hint lines naming the well-known client config files, followed
// by a blank line. It returns an error only if JSON encoding fails.
func runMCPConfig(cmd *cobra.Command, _ []string) error {
	hawkEntry := map[string]any{
		"command": hawkExecutablePath(),
		"args":    []string{"mcp", "serve"},
	}
	document := map[string]any{
		"mcpServers": map[string]any{"hawk": hawkEntry},
	}

	encoded, err := json.MarshalIndent(document, "", " ")
	if err != nil {
		return err
	}

	if mcpConfigWrite {
		hints := []string{
			"# Add the \"hawk\" entry below into the \"mcpServers\" object of your client config:",
			"# Claude Desktop (macOS): ~/Library/Application Support/Claude/claude_desktop_config.json",
			"# Cursor: ~/.cursor/mcp.json",
			"# Windsurf: ~/.codeium/windsurf/mcp_config.json",
		}
		for _, hint := range hints {
			cmd.Println(hint)
		}
		cmd.Println()
	}

	cmd.Println(string(encoded))
	return nil
}

// hawkExecutablePath returns the path reported by os.Executable for the
// running binary. If that call fails or yields an empty string it falls back
// to the bare name "hawk", so the emitted config is still copy-pasteable.
func hawkExecutablePath() string {
	exe, err := os.Executable()
	if err != nil || exe == "" {
		return "hawk"
	}
	return exe
}
Loading
Loading