Skip to content

Commit a26d27a

Browse files
authored
feat: adopt MiniMax-AI OSS patterns (exit codes, MCP server/annotations, batch search, tool eval) (#14)
* feat: adopt MiniMax-AI OSS patterns into hawk Verified-net-new improvements identified by scanning MiniMax-AI's open-source repos and adversarially mapping ideas onto hawk's actual source. Patterns only, no code copied (all sources MIT). - exit-code taxonomy: internal/hawkerr/exitcode.go classifies errors into stable codes (auth/network/ratelimit/timeout/...) so callers can branch on the failure reason instead of a bare 1; wired into cmd/hawk/main.go - IsTerminal color gate: suppress ANSI/Unicode when stdout is piped (non-TTY), fixing ANSI leaking into captured/JSON output; FORCE_COLOR still overrides - MCP client honors isError: parseToolCallResult surfaces remote tool failures as errors the agent can self-correct on - batch web search/browse: queries[]/urls[] concurrent fan-out (bounded) in WebSearch and AgenticFetch; single query/url still works; adds a relevance-refusal contract to the AgenticFetch sub-agent prompt - RiskLevel -> MCP annotations: ToolAnnotations (readOnly/destructive hints) on the MCP server's tools/list so clients can self-throttle - hawk mcp serve / hawk mcp config: run hawk as an MCP server (machinery existed but had no entrypoint) and emit the client registration JSON block - tool-call confusion-matrix eval: internal/feature/eval/toolmatrix.go scores trigger (TP/FN/FP/TN, precision/recall/F1) separately from payload accuracy; exposed via `hawk eval tools` All new code is unit-tested; touched packages build, vet, and test clean. * chore: sync external submodules to latest origin/main Advance the eyrie and yaad submodule pointers to the current tip of their origin/main (the other four already matched). Picks up upstream commits that were present on the remotes but not yet recorded in hawk's index. * chore: bump eyrie submodule to merged main (efe3710) Point external/eyrie at the squash-merged eyrie main containing the provider error/image/reasoning/verify work, so hawk consumes the released eyrie changes. ---------
1 parent d9f7077 commit a26d27a

22 files changed

Lines changed: 1350 additions & 40 deletions

cmd/eval_tools.go

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
package cmd
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"time"
8+
9+
hawkconfig "github.com/GrayCodeAI/hawk/internal/config"
10+
"github.com/GrayCodeAI/hawk/internal/feature/eval"
11+
"github.com/GrayCodeAI/hawk/internal/types"
12+
"github.com/spf13/cobra"
13+
)
14+
15+
var evalToolsOutput string
16+
17+
func init() {
18+
evalToolsCmd.Flags().StringVarP(&evalToolsOutput, "output", "o", "markdown", "Output format: markdown, json")
19+
evalCmd.AddCommand(evalToolsCmd)
20+
}
21+
22+
var evalToolsCmd = &cobra.Command{
23+
Use: "tools",
24+
Short: "Evaluate tool selection: trigger confusion matrix + payload accuracy",
25+
Long: "Run a model-in-the-loop tool-use evaluation. Each case is a prompt with an " +
26+
"expected tool (or none). Triggering (did the model call a tool when it should) " +
27+
"is scored as a confusion matrix, separately from payload accuracy (right tool + args).",
28+
RunE: runEvalTools,
29+
}
30+
31+
// defaultToolUseCases is a small built-in set exercising clear positive and
32+
// negative tool-trigger situations against hawk's standard tools.
33+
func defaultToolUseCases() []eval.ToolUseCase {
34+
return []eval.ToolUseCase{
35+
{
36+
ID: "read-existing-file",
37+
Prompt: "Show me the contents of go.mod.",
38+
Expected: &eval.ExpectedCall{Tool: "Read"},
39+
},
40+
{
41+
ID: "list-directory",
42+
Prompt: "What files are in the cmd directory?",
43+
Expected: &eval.ExpectedCall{Tool: "LS"},
44+
},
45+
{
46+
ID: "run-command",
47+
Prompt: "Run the test suite for this project.",
48+
Expected: &eval.ExpectedCall{Tool: "Bash"},
49+
},
50+
{
51+
ID: "search-code",
52+
Prompt: "Find every place that defines an http handler in this repo.",
53+
Expected: &eval.ExpectedCall{Tool: "Grep"},
54+
},
55+
{
56+
// Negative case: a pure-knowledge question needs no tool.
57+
ID: "no-tool-trivia",
58+
Prompt: "In one sentence, what does the SOLID 'S' stand for?",
59+
Expected: nil,
60+
},
61+
{
62+
// Negative case: a greeting needs no tool.
63+
ID: "no-tool-greeting",
64+
Prompt: "Say hello.",
65+
Expected: nil,
66+
},
67+
}
68+
}
69+
70+
func runEvalTools(cmd *cobra.Command, _ []string) error {
71+
settings := hawkconfig.LoadSettings()
72+
73+
registry, err := defaultRegistry(settings)
74+
if err != nil {
75+
return fmt.Errorf("building tool registry: %w", err)
76+
}
77+
systemPrompt, err := buildSystemPrompt()
78+
if err != nil {
79+
return err
80+
}
81+
model, provider := effectiveModelAndProvider(settings)
82+
sess := newHawkSession(settings, provider, model, systemPrompt, registry)
83+
if err := configureSession(sess, settings); err != nil {
84+
return err
85+
}
86+
87+
tools := registry.EyrieTools()
88+
89+
// caller performs one tool-aware turn and reports the first tool the model
90+
// chose (if any). It does not execute the tool — we are scoring selection,
91+
// not effects.
92+
caller := func(ctx context.Context, c eval.ToolUseCase) (eval.ObservedCall, error) {
93+
resp, err := sess.Chat(ctx, []types.EyrieMessage{
94+
{Role: "user", Content: c.Prompt},
95+
}, types.ChatOptions{Model: model, Tools: tools})
96+
if err != nil {
97+
return eval.ObservedCall{}, err
98+
}
99+
if resp == nil || len(resp.ToolCalls) == 0 {
100+
return eval.ObservedCall{}, nil // no tool called
101+
}
102+
tc := resp.ToolCalls[0]
103+
return eval.ObservedCall{Tool: tc.Name, Args: tc.Arguments}, nil
104+
}
105+
106+
ctx, cancel := context.WithTimeout(cmd.Context(), 10*time.Minute)
107+
defer cancel()
108+
109+
cmd.Printf("Evaluating tool selection on %d cases with model %s...\n", len(defaultToolUseCases()), model)
110+
report := eval.ScoreToolUse(ctx, defaultToolUseCases(), caller)
111+
112+
switch evalToolsOutput {
113+
case "json":
114+
data, _ := json.MarshalIndent(report, "", " ")
115+
cmd.Println(string(data))
116+
default:
117+
cmd.Println(report.Markdown())
118+
}
119+
return nil
120+
}

cmd/formatter.go

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,19 @@ import (
66
"strings"
77
"sync"
88
"time"
9+
10+
"golang.org/x/term"
911
)
1012

13+
// stdoutIsTerminal reports whether stdout is connected to a terminal (TTY).
14+
// When stdout is a pipe or file — which is exactly the case when an agent or
15+
// shell script captures hawk's output — this is false, and color/Unicode
16+
// chrome must be suppressed so the payload stays clean. It is a var so tests
17+
// can override it.
18+
var stdoutIsTerminal = func() bool {
19+
return term.IsTerminal(int(os.Stdout.Fd()))
20+
}
21+
1122
// TreeNode represents a node in a tree structure for FormatTree.
1223
type TreeNode struct {
1324
Name string
@@ -544,9 +555,16 @@ func DetectColorSupport() bool {
544555
return false
545556
}
546557

558+
// Stdout is not a TTY (piped to an agent, file, or another process):
559+
// suppress ANSI so the captured output is clean. An explicit FORCE_COLOR
560+
// above already overrode this for callers that pipe but still want color.
561+
if !stdoutIsTerminal() {
562+
return false
563+
}
564+
547565
// Check TERM
548-
term := os.Getenv("TERM")
549-
if term == "dumb" || term == "" {
566+
t := os.Getenv("TERM")
567+
if t == "dumb" || t == "" {
550568
return false
551569
}
552570

@@ -555,6 +573,12 @@ func DetectColorSupport() bool {
555573

556574
// DetectUnicodeSupport checks if the terminal supports Unicode characters.
557575
func DetectUnicodeSupport() bool {
576+
// Non-TTY stdout: emit ASCII so box-drawing/glyphs don't corrupt captured
577+
// output. FORCE_COLOR is a color signal only, so it does not override here.
578+
if !stdoutIsTerminal() {
579+
return false
580+
}
581+
558582
lang := os.Getenv("LANG")
559583
lcAll := os.Getenv("LC_ALL")
560584
lcCtype := os.Getenv("LC_CTYPE")

cmd/formatter_test.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,38 @@ import (
77
"time"
88
)
99

10+
func TestDetectColorSupport_NonTTYStdout(t *testing.T) {
11+
orig := stdoutIsTerminal
12+
defer func() { stdoutIsTerminal = orig }()
13+
14+
t.Setenv("NO_COLOR", "")
15+
t.Setenv("FORCE_COLOR", "")
16+
t.Setenv("TERM", "xterm-256color")
17+
18+
// Piped stdout (not a TTY) with a normal TERM must still disable color —
19+
// this is the "agent captured ANSI escapes in its JSON" regression.
20+
stdoutIsTerminal = func() bool { return false }
21+
if DetectColorSupport() {
22+
t.Error("DetectColorSupport() = true for non-TTY stdout; want false")
23+
}
24+
if DetectUnicodeSupport() {
25+
t.Error("DetectUnicodeSupport() = true for non-TTY stdout; want false")
26+
}
27+
28+
// A TTY with a good TERM keeps color.
29+
stdoutIsTerminal = func() bool { return true }
30+
if !DetectColorSupport() {
31+
t.Error("DetectColorSupport() = false for TTY stdout; want true")
32+
}
33+
34+
// FORCE_COLOR overrides the non-TTY gate (deliberate piped color).
35+
stdoutIsTerminal = func() bool { return false }
36+
t.Setenv("FORCE_COLOR", "1")
37+
if !DetectColorSupport() {
38+
t.Error("DetectColorSupport() = false with FORCE_COLOR over a pipe; want true")
39+
}
40+
}
41+
1042
func newTestFormatter(color, unicode bool, width int) *OutputFormatter {
1143
theme := OutputTheme{}
1244
if color {
@@ -552,6 +584,12 @@ func TestDetectColorSupport(t *testing.T) {
552584
}
553585

554586
func TestDetectUnicodeSupport(t *testing.T) {
587+
// These subtests exercise the locale-env logic, so pin stdout to a TTY;
588+
// the non-TTY suppression gate is covered by TestDetectColorSupport_NonTTYStdout.
589+
origIsTTY := stdoutIsTerminal
590+
stdoutIsTerminal = func() bool { return true }
591+
defer func() { stdoutIsTerminal = origIsTTY }()
592+
555593
t.Run("UTF-8 lang", func(t *testing.T) {
556594
origLang := os.Getenv("LANG")
557595
origLcAll := os.Getenv("LC_ALL")

cmd/hawk/main.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77

88
"github.com/GrayCodeAI/hawk/cmd"
99
"github.com/GrayCodeAI/hawk/internal/api"
10+
"github.com/GrayCodeAI/hawk/internal/hawkerr"
1011
"github.com/GrayCodeAI/hawk/internal/mcp"
1112
"github.com/GrayCodeAI/hawk/internal/sandbox"
1213
)
@@ -42,10 +43,14 @@ func main() {
4243

4344
if err := cmd.Execute(); err != nil {
4445
fmt.Fprintln(os.Stderr, err)
46+
// An explicit ExitCodeError (e.g. a wrapped Bash exit status) wins —
47+
// it already carries the intended code. Otherwise classify the failure
48+
// into the stable exit-code taxonomy so callers can branch on the
49+
// reason (auth vs rate-limit vs network) instead of seeing a bare 1.
4550
var exitErr *cmd.ExitCodeError
4651
if errors.As(err, &exitErr) {
4752
os.Exit(exitErr.Code)
4853
}
49-
os.Exit(1)
54+
os.Exit(hawkerr.ClassifyExitCode(err))
5055
}
5156
}

cmd/mcp_serve.go

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
package cmd
2+
3+
import (
4+
"encoding/json"
5+
"os"
6+
"os/signal"
7+
"syscall"
8+
9+
hawkconfig "github.com/GrayCodeAI/hawk/internal/config"
10+
"github.com/GrayCodeAI/hawk/internal/mcp"
11+
"github.com/spf13/cobra"
12+
)
13+
14+
var mcpConfigWrite bool
15+
16+
func init() {
17+
mcpConfigCmd.Flags().BoolVar(&mcpConfigWrite, "write", false,
18+
"also print the well-known client config paths to paste the block into")
19+
mcpCmd.AddCommand(mcpServeCmd)
20+
mcpCmd.AddCommand(mcpConfigCmd)
21+
}
22+
23+
// mcpServeCmd runs hawk itself as an MCP server over stdio, exposing hawk's
24+
// capabilities (chat, search, memory, review, scan, compress) to MCP clients
25+
// such as Claude Desktop, Cursor, and Windsurf.
26+
var mcpServeCmd = &cobra.Command{
27+
Use: "serve",
28+
Short: "Run hawk as an MCP server over stdio",
29+
Long: "Run hawk as a Model Context Protocol server over stdio (JSON-RPC 2.0), " +
30+
"exposing hawk's tools to MCP clients like Claude Desktop, Cursor, and Windsurf.\n\n" +
31+
"Use `hawk mcp config` to print the JSON block that registers this command in a client.",
32+
RunE: runMCPServe,
33+
}
34+
35+
func runMCPServe(cmd *cobra.Command, _ []string) error {
36+
settings := hawkconfig.LoadSettings()
37+
38+
serverVersion := version
39+
if serverVersion == "" {
40+
serverVersion = "dev"
41+
}
42+
server := mcp.NewMCPServer(mcp.ServerInfo{Name: "hawk", Version: serverVersion})
43+
44+
// Wire hawk's tool registry in as the executor so delegating tools run for
45+
// real; a registry build failure degrades to not-configured rather than
46+
// aborting (the server still answers initialize/tools/list).
47+
registry, err := defaultRegistry(settings)
48+
if err == nil {
49+
mcp.RegisterDefaultTools(server, registry.Execute)
50+
} else {
51+
mcp.RegisterDefaultTools(server, nil)
52+
}
53+
54+
ctx, stop := signal.NotifyContext(cmd.Context(), os.Interrupt, syscall.SIGTERM)
55+
defer stop()
56+
return server.ServeStdio(ctx)
57+
}
58+
59+
// mcpConfigCmd emits the JSON block that registers hawk as an MCP server in a
60+
// client's config file, so users don't hand-edit JSON.
61+
var mcpConfigCmd = &cobra.Command{
62+
Use: "config",
63+
Short: "Print the MCP-server config block to register hawk in a client",
64+
Long: "Print the JSON block that registers hawk as an MCP server (pointing at " +
65+
"`hawk mcp serve`) for clients like Claude Desktop, Cursor, and Windsurf.\n\n" +
66+
"Pipe it to the client's config file, e.g.:\n" +
67+
" hawk mcp config >> ~/Library/Application Support/Claude/claude_desktop_config.json",
68+
RunE: runMCPConfig,
69+
}
70+
71+
func runMCPConfig(cmd *cobra.Command, _ []string) error {
72+
exe := hawkExecutablePath()
73+
74+
block := map[string]any{
75+
"mcpServers": map[string]any{
76+
"hawk": map[string]any{
77+
"command": exe,
78+
"args": []string{"mcp", "serve"},
79+
},
80+
},
81+
}
82+
out, err := json.MarshalIndent(block, "", " ")
83+
if err != nil {
84+
return err
85+
}
86+
87+
if mcpConfigWrite {
88+
cmd.Println("# Add the \"hawk\" entry below into the \"mcpServers\" object of your client config:")
89+
cmd.Println("# Claude Desktop (macOS): ~/Library/Application Support/Claude/claude_desktop_config.json")
90+
cmd.Println("# Cursor: ~/.cursor/mcp.json")
91+
cmd.Println("# Windsurf: ~/.codeium/windsurf/mcp_config.json")
92+
cmd.Println()
93+
}
94+
cmd.Println(string(out))
95+
return nil
96+
}
97+
98+
// hawkExecutablePath returns the path of the running binary as reported by
// os.Executable. If that call fails or yields an empty string, it returns the
// bare name "hawk" so the emitted config still names a command.
func hawkExecutablePath() string {
	path, err := os.Executable()
	if err != nil || path == "" {
		return "hawk"
	}
	return path
}

0 commit comments

Comments
 (0)