feat(search): add match_reasons annotation (#2 from proposals)

razvan · razvan · commit 771a0344f318 · 2026-03-08T10:21:05.000+02:00
- pkg/scoring: DetectMatchReasons() pure function
  - case-insensitive substring matching of query tokens per field (symbol_name/signature/content/docstring)
  - reuses FilterTokens for consistency with rest of scoring package
  - 6 unit tests: single field, multiple fields, case-insensitivity, short tokens, empty query, no match

- SmartSearchInput: new include_reasons bool field
- serializeResults/resultToMap: accept query + includeReasons
- match_reasons added to result map only when include_reasons=true and the query is non-empty
  (match detection is skipped entirely when not requested)
diff --git a/internal/service/tools/smart_search.go b/internal/service/tools/smart_search.go
@@ -47,7 +47,9 @@ func (t *SmartSearchTool) Description() string {
 		"Use 'mode'=\"strict_docs\" when searching for architectural plans or summaries. " +
 		"Use 'mode'=\"all\" or omit for broad scans. " +
 		"Set 'min_score' (0.0-1.0) to filter out low-relevance results. When omitted, an automatic threshold is applied: " +
-		"if the top result scores above 0.70, results below 40% of the top score are automatically pruned."
+		"if the top result scores above 0.70, results below 40% of the top score are automatically pruned. " +
+		"Set 'include_reasons' to true to include a 'match_reasons' field in each result, explaining which fields " +
+		"(symbol_name, signature, content, docstring) contributed to the match — useful for understanding result relevance."
 }
 
 type SmartSearchInput struct {
@@ -57,6 +59,7 @@ type SmartSearchInput struct {
 	MinScore           float32 `json:"min_score,omitempty"`
 	IncludeFullContent bool    `json:"include_full_content,omitempty"`
 	IncludeDocs        bool    `json:"include_docs,omitempty"`
+	IncludeReasons     bool    `json:"include_reasons,omitempty"`
 	Mode               string  `json:"mode,omitempty"`
 }
 
@@ -125,7 +128,7 @@ func (t *SmartSearchTool) Execute(ctx context.Context, input SmartSearchInput) (
 
 	isFallback := sr.meta.collection == "fallback"
 	response := t.buildResponseMeta(sr.meta, useCompact)
-	serializeResults(&response, merged, useCompact, isFallback)
+	serializeResults(&response, merged, useCompact, isFallback, query, input.IncludeReasons)
 
 	return response.JSON()
 }
diff --git a/internal/service/tools/smart_search_pipeline.go b/internal/service/tools/smart_search_pipeline.go
@@ -237,7 +237,9 @@ func (t *SmartSearchTool) buildResponseMeta(meta searchMetadata, useCompact bool
 
 // resultToMap converts a mergedResult to the output map format.
 // includeContent controls whether the full source code is included.
-func resultToMap(m mergedResult, includeContent bool) map[string]any {
+// When includeReasons is true, a match_reasons field is added explaining
+// which payload fields (symbol_name, signature, content, docstring) matched the query.
+func resultToMap(m mergedResult, includeContent bool, query string, includeReasons bool) map[string]any {
 	item := map[string]any{
 		"score":      m.score,
 		"file_path":  m.filePath,
@@ -257,6 +259,9 @@ func resultToMap(m mergedResult, includeContent bool) map[string]any {
 	if m.source != "" {
 		item["_source"] = m.source
 	}
+	if includeReasons && query != "" {
+		item["match_reasons"] = scoring.DetectMatchReasons(query, m.name, m.signature, m.content, m.docstring)
+	}
 	return item
 }
 
@@ -276,14 +281,15 @@ func buildResultsMessage(count int, useCompact, isFallback bool) string {
 
 // serializeResults populates the ToolResponse with either compact or full result data,
 // calculates telemetry savings, and detects stale indexed files.
-func serializeResults(response *ToolResponse, merged []mergedResult, useCompact, isFallback bool) {
+// query and includeReasons control the optional match_reasons annotation per result.
+func serializeResults(response *ToolResponse, merged []mergedResult, useCompact, isFallback bool, query string, includeReasons bool) {
 	data := make([]map[string]any, 0, len(merged))
 	var baselineBytes, actualBytes int64
 	seenFiles := make(map[string]bool)
 	var staleFiles []string
 
 	for _, m := range merged {
-		data = append(data, resultToMap(m, !useCompact))
+		data = append(data, resultToMap(m, !useCompact, query, includeReasons))
 
 		if !useCompact {
 			actualBytes += int64(len(m.content))
diff --git a/pkg/scoring/match_reasons.go b/pkg/scoring/match_reasons.go
@@ -0,0 +1,46 @@
+package scoring
+
+import "strings"
+
+// ─── Match Reason Annotation ─────────────────────────────────────────────────
+
+// MatchReasons records, for each payload field of a search result, whether that
+// field matched the query. It is meant to help AI agents understand WHY a result
+// was returned and decide whether to request full content or how far to trust it.
+type MatchReasons struct {
+	SymbolName bool `json:"symbol_name"` // at least one query token occurs in the symbol name
+	Signature  bool `json:"signature"`   // at least one query token occurs in the signature
+	Content    bool `json:"content"`     // at least one query token occurs in the content
+	Docstring  bool `json:"docstring"`   // at least one query token occurs in the docstring
+}
+
+// DetectMatchReasons reports, for each of name, signature, content and docstring,
+// whether that field contains at least one query token as a case-insensitive substring.
+// Tokens are the whitespace-separated words of the lowercased query that FilterTokens keeps;
+// when none remain (for example, an empty query) the zero MatchReasons is returned.
+//
+// NOTE(review): said to mirror the fallback lexical scorer (not shown here); each field is lowercased per call.
+func DetectMatchReasons(query, name, signature, content, docstring string) MatchReasons {
+	lower := strings.ToLower(query)
+	tokens := FilterTokens(strings.Fields(lower)) // split on whitespace, then keep only what FilterTokens returns
+	if len(tokens) == 0 {
+		return MatchReasons{}
+	}
+
+	containsAny := func(text string) bool { // true when any token is a substring of text, ignoring case
+		t := strings.ToLower(text)
+		for _, tok := range tokens {
+			if strings.Contains(t, tok) {
+				return true
+			}
+		}
+		return false
+	}
+
+	return MatchReasons{
+		SymbolName: containsAny(name),
+		Signature:  containsAny(signature),
+		Content:    containsAny(content),
+		Docstring:  containsAny(docstring),
+	}
+}
diff --git a/pkg/scoring/match_reasons_test.go b/pkg/scoring/match_reasons_test.go
@@ -0,0 +1,58 @@
+package scoring
+
+import "testing"
+
+func TestDetectMatchReasonsSymbolName(t *testing.T) {
+	r := DetectMatchReasons("Calculator", "Calculator", "", "", "") // only name is non-empty, so only SymbolName may be set
+	if !r.SymbolName {
+		t.Error("expected SymbolName=true")
+	}
+	if r.Signature || r.Content || r.Docstring {
+		t.Error("expected other fields false")
+	}
+}
+
+func TestDetectMatchReasonsMultipleFields(t *testing.T) {
+	r := DetectMatchReasons("auth token", "authenticate", "func authenticate(token string)", "", "validates auth") // content is empty; the other three fields contain "auth" and/or "token"
+	if !r.SymbolName {
+		t.Error("expected SymbolName=true (auth in authenticate)")
+	}
+	if !r.Signature {
+		t.Error("expected Signature=true (token in signature)")
+	}
+	if !r.Docstring {
+		t.Error("expected Docstring=true (auth in docstring)")
+	}
+	if r.Content {
+		t.Error("expected Content=false")
+	}
+}
+
+func TestDetectMatchReasonsCaseInsensitive(t *testing.T) {
+	r := DetectMatchReasons("CALCULATOR", "Calculator", "", "", "") // upper-case query against a mixed-case name
+	if !r.SymbolName {
+		t.Error("expected case-insensitive match on SymbolName")
+	}
+}
+
+func TestDetectMatchReasonsShortTokensIgnored(t *testing.T) {
+	// "ab" and "x" are expected to be dropped by FilterTokens (assumed to discard tokens of ≤2 chars; not shown here), so no field can match
+	r := DetectMatchReasons("ab x", "ab", "x", "ab x", "ab")
+	if r.SymbolName || r.Signature || r.Content || r.Docstring {
+		t.Error("short tokens should be filtered, no match expected")
+	}
+}
+
+func TestDetectMatchReasonsEmptyQuery(t *testing.T) {
+	r := DetectMatchReasons("", "Calculator", "func Calculator()", "body", "docs") // an empty query has no words, so no field is expected to match
+	if r.SymbolName || r.Signature || r.Content || r.Docstring {
+		t.Error("empty query should produce no matches")
+	}
+}
+
+func TestDetectMatchReasonsNoMatch(t *testing.T) {
+	r := DetectMatchReasons("Payment", "UserAuth", "func UserAuth()", "body code", "user authenticates") // "payment" occurs in none of the four fields
+	if r.SymbolName || r.Signature || r.Content || r.Docstring {
+		t.Error("payment not in any field, expected all false")
+	}
+}