Skip to content

Commit 0d4dd84

Browse files
committed
feat(search): implement tree-based deduplication and strict modes
1 parent 595619a commit 0d4dd84

2 files changed

Lines changed: 292 additions & 1 deletion

File tree

internal/service/tools/smart_search.go

Lines changed: 186 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"errors"
66
"fmt"
77
"os"
8+
"path/filepath"
89
"sort"
910
"strings"
1011
"sync"
@@ -42,7 +43,10 @@ func (t *SmartSearchTool) Description() string {
4243
"high-confidence matches return full source code, exploratory results return compact summaries. " +
4344
"No need to choose a search mode. Provide 'file_path' for faster workspace detection, or omit it for Auto-Discovery. " +
4445
"Set 'include_full_content' to true to force full source code in all results, overriding compact mode. " +
45-
"Set 'include_docs' to true to also search project documentation (README, guides, Markdown files) alongside code."
46+
"Set 'include_docs' to true to also search project documentation (README, guides, Markdown files) alongside code. " +
47+
"Use 'mode'=\"strict_code\" when you ONLY want to see implementation logic exactly (Go, Python, etc) and strictly ignore documentation. " +
48+
"Use 'mode'=\"strict_docs\" when searching for architectural plans or summaries. " +
49+
"Use 'mode'=\"all\" or omit for broad scans."
4650
}
4751

4852
type SmartSearchInput struct {
@@ -51,6 +55,7 @@ type SmartSearchInput struct {
5155
Limit int `json:"limit,omitempty"`
5256
IncludeFullContent bool `json:"include_full_content,omitempty"`
5357
IncludeDocs bool `json:"include_docs,omitempty"`
58+
Mode string `json:"mode,omitempty"`
5459
}
5560

5661
// highConfidenceThreshold: if top result score exceeds this, return full content.
@@ -172,6 +177,24 @@ func (t *SmartSearchTool) Execute(ctx context.Context, input SmartSearchInput) (
172177
// Merge and deduplicate results from both strategies
173178
merged := t.mergeResults(semanticRes, hybridRes, limit)
174179

180+
// Apply mode filtering
181+
var filtered []mergedResult
182+
for _, m := range merged {
183+
// Strict code mode: ignore completely any markdown or documentation type
184+
if input.Mode == "strict_code" && (m.symbolType == "documentation" || m.symbolType == "markdown" || m.symbolType == "code_block" || strings.HasSuffix(strings.ToLower(m.filePath), ".md") || strings.HasSuffix(strings.ToLower(m.filePath), ".html")) {
185+
continue
186+
}
187+
// Strict docs mode: ignore anything that isn't documentation
188+
if input.Mode == "strict_docs" && !(m.symbolType == "documentation" || m.symbolType == "markdown" || m.symbolType == "code_block" || strings.HasSuffix(strings.ToLower(m.filePath), ".md") || strings.HasSuffix(strings.ToLower(m.filePath), ".html")) {
189+
continue
190+
}
191+
filtered = append(filtered, m)
192+
}
193+
merged = filtered
194+
195+
// Apply tree-based grouping for documentation chunks
196+
merged = t.groupDocsByTree(merged)
197+
175198
if len(merged) == 0 {
176199
response := ToolResponse{
177200
Status: "no_results",
@@ -457,3 +480,165 @@ func (t *SmartSearchTool) handleSearchError(err error, workspaceRoot, workspaceI
457480
response.Error = fmt.Sprintf("search failed: %v", err)
458481
return response.JSON()
459482
}
483+
484+
// readLines returns lines startLine through endLine (1-indexed, inclusive) of
// the file at filePath, joined with "\n".
//
// A startLine below 1 is clamped to 1 and an endLine past the last line is
// clamped to the last line. A single trailing newline terminates the final
// line instead of starting an extra empty one, so "a\nb\n" has two lines and
// an empty file has none.
//
// An error is returned when the file cannot be read, or when the clamped
// range is empty (startLine is past the end of the file or greater than
// endLine).
func readLines(filePath string, startLine, endLine int) (string, error) {
	content, err := os.ReadFile(filePath)
	if err != nil {
		return "", err
	}

	// Drop the terminating newline before splitting; otherwise Split reports a
	// phantom empty line after the real last line.
	var lines []string
	if len(content) > 0 {
		lines = strings.Split(strings.TrimSuffix(string(content), "\n"), "\n")
	}

	first, last := startLine, endLine
	if first < 1 {
		first = 1
	}
	if last > len(lines) {
		last = len(lines)
	}
	// After clamping, first > last also covers "first is past the end of file".
	if first > last {
		return "", fmt.Errorf("invalid line range %d-%d: file has %d lines", startLine, endLine, len(lines))
	}

	return strings.Join(lines[first-1:last], "\n"), nil
}
505+
506+
// groupDocsByTree aggregates "documentation" and "code_block" chunks
507+
// from the same file and AST Signature (Markdown heading) into single unified blocks,
508+
// fetching the continuous text from disk to prevent Frankenstein gaps.
509+
func (t *SmartSearchTool) groupDocsByTree(results []mergedResult) []mergedResult {
510+
if len(results) == 0 {
511+
return results
512+
}
513+
514+
var out []mergedResult
515+
516+
// Groups are keyed by: filePath_|_signature -> slice of mergedResult indices in groups slice
517+
type docGroup struct {
518+
filePath string
519+
signature string
520+
items []*mergedResult
521+
maxScore float32
522+
minLine int
523+
maxLine int
524+
source string
525+
}
526+
527+
groupsMap := make(map[string]*docGroup)
528+
var orderedGroups []string // keep track of the first time we see a group to maintain rough sorting
529+
530+
for i := range results {
531+
res := &results[i]
532+
533+
// Only group documentation types and files ending in .md or .html
534+
ext := strings.ToLower(filepath.Ext(res.filePath))
535+
isDocFile := ext == ".md" || ext == ".markdown" || ext == ".html" || ext == ".htm" ||
536+
ext == ".yaml" || ext == ".yml" || ext == ".json" || ext == ".xml" ||
537+
ext == ".toml" || ext == ".rst" || ext == ".css" || ext == ".scss" || ext == ".svelte" || ext == ".sql" || ext == ".sh"
538+
539+
isDocType := res.symbolType == "documentation" || res.symbolType == "code_block" || res.symbolType == "markdown"
540+
541+
if !isDocType || !isDocFile || res.signature == "" {
542+
// Pass-through code or items without signature
543+
out = append(out, *res)
544+
continue
545+
}
546+
547+
key := fmt.Sprintf("%s_|_%s", res.filePath, res.signature)
548+
if g, exists := groupsMap[key]; exists {
549+
g.items = append(g.items, res)
550+
if res.score > g.maxScore {
551+
g.maxScore = res.score
552+
}
553+
if res.startLine > 0 && (g.minLine == 0 || res.startLine < g.minLine) {
554+
g.minLine = res.startLine
555+
}
556+
if res.endLine > 0 && res.endLine > g.maxLine {
557+
g.maxLine = res.endLine
558+
}
559+
if g.source != "both" && g.source != res.source {
560+
g.source = "both"
561+
}
562+
} else {
563+
minL := res.startLine
564+
if minL == 0 {
565+
minL = 1
566+
}
567+
maxL := res.endLine
568+
if maxL == 0 {
569+
maxL = 1
570+
}
571+
groupsMap[key] = &docGroup{
572+
filePath: res.filePath,
573+
signature: res.signature,
574+
items: []*mergedResult{res},
575+
maxScore: res.score,
576+
minLine: minL,
577+
maxLine: maxL,
578+
source: res.source,
579+
}
580+
orderedGroups = append(orderedGroups, key)
581+
}
582+
}
583+
584+
// Reconstruct the grouped items
585+
for _, key := range orderedGroups {
586+
g := groupsMap[key]
587+
588+
if len(g.items) == 1 {
589+
// Nothing to merge, just append
590+
out = append(out, *g.items[0])
591+
continue
592+
}
593+
594+
// Multiple chunks in this group. Let's merge them!
595+
// Attempt to read the full continuous block from the file
596+
fullContent := ""
597+
if g.minLine > 0 && g.maxLine >= g.minLine {
598+
content, err := readLines(g.filePath, g.minLine, g.maxLine)
599+
if err == nil {
600+
fullContent = content
601+
}
602+
}
603+
604+
// If reading from disk failed, append the contents manually with an ellipsis
605+
if fullContent == "" {
606+
var contents []string
607+
// Sort items by line number
608+
sortedItems := make([]*mergedResult, len(g.items))
609+
copy(sortedItems, g.items)
610+
sort.Slice(sortedItems, func(i, j int) bool {
611+
return sortedItems[i].startLine < sortedItems[j].startLine
612+
})
613+
for _, item := range sortedItems {
614+
contents = append(contents, strings.TrimSpace(item.content))
615+
}
616+
fullContent = strings.Join(contents, "\n\n[...]\n\n")
617+
}
618+
619+
baseItem := g.items[0] // take the first item as a prototype
620+
merged := mergedResult{
621+
id: fmt.Sprintf("merged_%s_%d_%d", baseItem.id, g.minLine, g.maxLine),
622+
score: g.maxScore,
623+
filePath: g.filePath,
624+
name: baseItem.name,
625+
symbolType: "documentation_merged",
626+
signature: g.signature,
627+
pkg: baseItem.pkg,
628+
docstring: fmt.Sprintf("Merged %d chunks spanning %d lines.", len(g.items), g.maxLine-g.minLine+1),
629+
content: fullContent,
630+
startLine: g.minLine,
631+
endLine: g.maxLine,
632+
source: g.source,
633+
}
634+
out = append(out, merged)
635+
}
636+
637+
// After mixing merged chunks and original unmerged items, we should re-sort by score
638+
sort.Slice(out, func(i, j int) bool {
639+
return out[i].score > out[j].score
640+
})
641+
642+
return out
643+
}
644+
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
package tools
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
"strings"
7+
"testing"
8+
)
9+
10+
func TestGroupDocsByTree(t *testing.T) {
11+
// Create a temporary file for testing disk reads
12+
tempDir := t.TempDir()
13+
tempFilePath := filepath.Join(tempDir, "test_doc.md")
14+
15+
// Create a dummy document with 20 lines
16+
lines := make([]string, 20)
17+
for i := 0; i < 20; i++ {
18+
lines[i] = "Line " + string(rune('A'+i)) // Line A, Line B, etc.
19+
}
20+
os.WriteFile(tempFilePath, []byte(strings.Join(lines, "\n")), 0644)
21+
22+
// Create tool instance (we only need the groupDocsByTree method)
23+
tool := &SmartSearchTool{}
24+
25+
tests := []struct {
26+
name string
27+
input []mergedResult
28+
expected int // Expected number of results after grouping
29+
check func(t *testing.T, results []mergedResult)
30+
}{
31+
{
32+
name: "No grouping needed for code",
33+
input: []mergedResult{
34+
{id: "1", filePath: "main.go", symbolType: "function", score: 0.9},
35+
{id: "2", filePath: "main.go", symbolType: "function", score: 0.8},
36+
},
37+
expected: 2,
38+
check: func(t *testing.T, results []mergedResult) {
39+
if len(results) != 2 {
40+
t.Errorf("Expected 2 results, got %d", len(results))
41+
}
42+
},
43+
},
44+
{
45+
name: "Group documentation chunks from same file and signature",
46+
input: []mergedResult{
47+
{
48+
id: "chunk_1", filePath: tempFilePath, symbolType: "documentation", signature: "### Intro",
49+
startLine: 1, endLine: 3, score: 0.8,
50+
},
51+
{
52+
id: "chunk_2", filePath: tempFilePath, symbolType: "documentation", signature: "### Intro",
53+
startLine: 4, endLine: 6, score: 0.9,
54+
},
55+
// This one is in a different signature
56+
{
57+
id: "chunk_3", filePath: tempFilePath, symbolType: "documentation", signature: "### Setup",
58+
startLine: 8, endLine: 10, score: 0.5,
59+
},
60+
// This one is code, ignore grouping
61+
{
62+
id: "chunk_4", filePath: tempFilePath, symbolType: "code_block", signature: "### Intro",
63+
startLine: 7, endLine: 7, score: 0.85,
64+
},
65+
},
66+
// Expect: 1 merged block for "### Intro", 1 block for "### Setup".
67+
// Note: chunk_4 has "code_block" type! Code blocks in markdown also get grouped!
68+
expected: 2,
69+
check: func(t *testing.T, results []mergedResult) {
70+
if len(results) != 2 {
71+
t.Fatalf("Expected 2 results, got %d", len(results))
72+
}
73+
74+
// results are sorted by score. The merged "### Intro" should have max score 0.9
75+
if results[0].score != 0.9 {
76+
t.Errorf("Expected max score 0.9, got %v", results[0].score)
77+
}
78+
79+
// Check start/end line bounds for the merged "### Intro"
80+
if results[0].startLine != 1 || results[0].endLine != 7 {
81+
t.Errorf("Expected lines 1-7, got %d-%d", results[0].startLine, results[0].endLine)
82+
}
83+
84+
// Verify the content was loaded from disk and contains Lines A to G (1 to 7)
85+
if !strings.Contains(results[0].content, "Line A") || !strings.Contains(results[0].content, "Line G") {
86+
t.Errorf("Merged content does not match expected lines from disk")
87+
}
88+
89+
// Setup should be untouched
90+
if results[1].signature != "### Setup" {
91+
t.Errorf("Expected second result to be '### Setup'")
92+
}
93+
},
94+
},
95+
}
96+
97+
for _, tc := range tests {
98+
t.Run(tc.name, func(t *testing.T) {
99+
got := tool.groupDocsByTree(tc.input)
100+
if len(got) != tc.expected {
101+
t.Errorf("Expected %d results, got %d", tc.expected, len(got))
102+
}
103+
tc.check(t, got)
104+
})
105+
}
106+
}

0 commit comments

Comments
 (0)