Skip to content

Commit 9a1c628

Browse files
authored
Implement post-retrieval chunk merging and strict search modes for rag_search.
Changes: - Tree-based chunk merging (groupDocsByTree) consolidates adjacent doc chunks from the same file and AST signature into unified blocks with disk gap fill - Strict search modes: strict_code, strict_docs, all - Auto-enable IncludeDocs when mode=strict_docs - Memory-efficient readLines using bufio.Scanner - Collision-safe struct groupKey instead of string separator - Shared isDocSymbolType/isDocExtension helpers for consistent filtering - Removed non-doc extensions (.sh, .sql, .css, .scss, .svelte) from doc filter - Unit tests for grouping, gap retrieval, multi-file isolation, and fallback
2 parents 595619a + 433e6e5 commit 9a1c628

2 files changed

Lines changed: 403 additions & 1 deletion

File tree

internal/service/tools/smart_search.go

Lines changed: 219 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
package tools
22

33
import (
4+
"bufio"
45
"context"
56
"errors"
67
"fmt"
78
"os"
9+
"path/filepath"
810
"sort"
911
"strings"
1012
"sync"
@@ -42,7 +44,10 @@ func (t *SmartSearchTool) Description() string {
4244
"high-confidence matches return full source code, exploratory results return compact summaries. " +
4345
"No need to choose a search mode. Provide 'file_path' for faster workspace detection, or omit it for Auto-Discovery. " +
4446
"Set 'include_full_content' to true to force full source code in all results, overriding compact mode. " +
45-
"Set 'include_docs' to true to also search project documentation (README, guides, Markdown files) alongside code."
47+
"Set 'include_docs' to true to also search project documentation (README, guides, Markdown files) alongside code. " +
48+
"Use 'mode'=\"strict_code\" when you ONLY want to see implementation logic exactly (Go, Python, etc) and strictly ignore documentation. " +
49+
"Use 'mode'=\"strict_docs\" when searching for architectural plans or summaries. " +
50+
"Use 'mode'=\"all\" or omit for broad scans."
4651
}
4752

4853
type SmartSearchInput struct {
@@ -51,6 +56,7 @@ type SmartSearchInput struct {
5156
Limit int `json:"limit,omitempty"`
5257
IncludeFullContent bool `json:"include_full_content,omitempty"`
5358
IncludeDocs bool `json:"include_docs,omitempty"`
59+
Mode string `json:"mode,omitempty"`
5460
}
5561

5662
// highConfidenceThreshold: if top result score exceeds this, return full content.
@@ -93,6 +99,12 @@ func (t *SmartSearchTool) Execute(ctx context.Context, input SmartSearchInput) (
9399
limit = input.Limit
94100
}
95101

102+
// When strict_docs mode is requested, automatically enable docs search
103+
// so that the semantic engine actually fetches documentation results.
104+
if input.Mode == "strict_docs" {
105+
input.IncludeDocs = true
106+
}
107+
96108
// Run both search strategies in parallel
97109
type searchResult struct {
98110
label string
@@ -172,6 +184,25 @@ func (t *SmartSearchTool) Execute(ctx context.Context, input SmartSearchInput) (
172184
// Merge and deduplicate results from both strategies
173185
merged := t.mergeResults(semanticRes, hybridRes, limit)
174186

187+
// Apply mode filtering
188+
var filtered []mergedResult
189+
for _, m := range merged {
190+
isDoc := isDocSymbolType(m.symbolType) || isDocExtension(m.filePath)
191+
// Strict code mode: ignore completely any documentation type or doc file
192+
if input.Mode == "strict_code" && isDoc {
193+
continue
194+
}
195+
// Strict docs mode: ignore anything that isn't documentation
196+
if input.Mode == "strict_docs" && !isDoc {
197+
continue
198+
}
199+
filtered = append(filtered, m)
200+
}
201+
merged = filtered
202+
203+
// Apply tree-based grouping for documentation chunks
204+
merged = t.groupDocsByTree(merged)
205+
175206
if len(merged) == 0 {
176207
response := ToolResponse{
177208
Status: "no_results",
@@ -457,3 +488,190 @@ func (t *SmartSearchTool) handleSearchError(err error, workspaceRoot, workspaceI
457488
response.Error = fmt.Sprintf("search failed: %v", err)
458489
return response.JSON()
459490
}
491+
492+
// isDocSymbolType reports whether the given symbol type classifies a chunk
// as documentation content rather than implementation code.
func isDocSymbolType(symbolType string) bool {
	switch symbolType {
	case "documentation", "code_block", "markdown":
		return true
	default:
		return false
	}
}
496+
497+
// isDocExtension reports whether filePath ends in an extension associated
// with documentation or structured text. The comparison is case-insensitive.
func isDocExtension(filePath string) bool {
	// Group the accepted extensions by family for readability; behavior is
	// a flat membership test over the lowercased extension.
	switch strings.ToLower(filepath.Ext(filePath)) {
	case ".md", ".markdown", ".rst":
		return true // prose documentation
	case ".html", ".htm", ".xml":
		return true // markup
	case ".yaml", ".yml", ".json", ".toml":
		return true // structured config/text
	default:
		return false
	}
}
508+
509+
// readLines reads the inclusive, 1-indexed line range [startLine, endLine]
// from the file at filePath using a buffered scanner, so only the requested
// lines are held in memory. The collected lines are returned joined by "\n".
//
// It returns an error when the range is malformed, the file cannot be
// opened, scanning fails, or the range lies entirely past the end of the
// file. A range that merely extends past the last line returns the lines
// that do exist, without error.
func readLines(filePath string, startLine, endLine int) (string, error) {
	if startLine < 1 || endLine < startLine {
		return "", fmt.Errorf("invalid line range %d-%d", startLine, endLine)
	}

	f, err := os.Open(filePath)
	if err != nil {
		return "", err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	// The default Scanner token limit is 64KiB per line. The doc and
	// structured-text files this helper serves (minified JSON/HTML among
	// them) routinely exceed that on a single line, which would surface as
	// bufio.ErrTooLong. Allow lines up to 1MiB.
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	var collected []string
	lineNum := 0
	for scanner.Scan() {
		lineNum++
		if lineNum > endLine {
			// Past the requested range: stop without reading the rest.
			break
		}
		if lineNum >= startLine {
			collected = append(collected, scanner.Text())
		}
	}
	if err := scanner.Err(); err != nil {
		return "", err
	}
	if len(collected) == 0 {
		// startLine was beyond the end of the file.
		return "", fmt.Errorf("invalid line range: file has %d lines, requested %d-%d", lineNum, startLine, endLine)
	}

	return strings.Join(collected, "\n"), nil
}
543+
544+
// groupDocsByTree aggregates "documentation" and "code_block" chunks
545+
// from the same file and AST Signature (Markdown heading) into single unified blocks,
546+
// fetching the continuous text from disk to prevent Frankenstein gaps.
547+
func (t *SmartSearchTool) groupDocsByTree(results []mergedResult) []mergedResult {
548+
if len(results) == 0 {
549+
return results
550+
}
551+
552+
var out []mergedResult
553+
554+
type groupKey struct {
555+
filePath string
556+
signature string
557+
}
558+
559+
type docGroup struct {
560+
key groupKey
561+
items []*mergedResult
562+
maxScore float32
563+
minLine int
564+
maxLine int
565+
source string
566+
}
567+
568+
groupsMap := make(map[groupKey]*docGroup)
569+
var orderedGroups []groupKey // keep track of the first time we see a group to maintain rough sorting
570+
571+
for i := range results {
572+
res := &results[i]
573+
574+
// Only group documentation types and documentation/structured text files
575+
if !isDocSymbolType(res.symbolType) || !isDocExtension(res.filePath) || res.signature == "" {
576+
// Pass-through code or items without signature
577+
out = append(out, *res)
578+
continue
579+
}
580+
581+
key := groupKey{filePath: res.filePath, signature: res.signature}
582+
if g, exists := groupsMap[key]; exists {
583+
g.items = append(g.items, res)
584+
if res.score > g.maxScore {
585+
g.maxScore = res.score
586+
}
587+
if res.startLine > 0 && (g.minLine == 0 || res.startLine < g.minLine) {
588+
g.minLine = res.startLine
589+
}
590+
if res.endLine > 0 && res.endLine > g.maxLine {
591+
g.maxLine = res.endLine
592+
}
593+
if g.source != "both" && g.source != res.source {
594+
g.source = "both"
595+
}
596+
} else {
597+
minL := res.startLine
598+
if minL == 0 {
599+
minL = 1
600+
}
601+
maxL := res.endLine
602+
if maxL == 0 {
603+
maxL = 1
604+
}
605+
groupsMap[key] = &docGroup{
606+
key: key,
607+
items: []*mergedResult{res},
608+
maxScore: res.score,
609+
minLine: minL,
610+
maxLine: maxL,
611+
source: res.source,
612+
}
613+
orderedGroups = append(orderedGroups, key)
614+
}
615+
}
616+
617+
// Reconstruct the grouped items
618+
for _, key := range orderedGroups {
619+
g := groupsMap[key]
620+
621+
if len(g.items) == 1 {
622+
// Nothing to merge, just append
623+
out = append(out, *g.items[0])
624+
continue
625+
}
626+
627+
// Multiple chunks in this group. Let's merge them!
628+
// Attempt to read the full continuous block from the file
629+
fullContent := ""
630+
if g.minLine > 0 && g.maxLine >= g.minLine {
631+
content, err := readLines(g.key.filePath, g.minLine, g.maxLine)
632+
if err == nil {
633+
fullContent = content
634+
}
635+
}
636+
637+
// If reading from disk failed, append the contents manually with an ellipsis
638+
if fullContent == "" {
639+
var contents []string
640+
// Sort items by line number
641+
sortedItems := make([]*mergedResult, len(g.items))
642+
copy(sortedItems, g.items)
643+
sort.Slice(sortedItems, func(i, j int) bool {
644+
return sortedItems[i].startLine < sortedItems[j].startLine
645+
})
646+
for _, item := range sortedItems {
647+
contents = append(contents, strings.TrimSpace(item.content))
648+
}
649+
fullContent = strings.Join(contents, "\n\n[...]\n\n")
650+
}
651+
652+
baseItem := g.items[0] // take the first item as a prototype
653+
merged := mergedResult{
654+
id: fmt.Sprintf("merged_%s_%d_%d", baseItem.id, g.minLine, g.maxLine),
655+
score: g.maxScore,
656+
filePath: g.key.filePath,
657+
name: baseItem.name,
658+
symbolType: "documentation_merged",
659+
signature: g.key.signature,
660+
pkg: baseItem.pkg,
661+
docstring: fmt.Sprintf("Merged %d chunks spanning %d lines.", len(g.items), g.maxLine-g.minLine+1),
662+
content: fullContent,
663+
startLine: g.minLine,
664+
endLine: g.maxLine,
665+
source: g.source,
666+
}
667+
out = append(out, merged)
668+
}
669+
670+
// After mixing merged chunks and original unmerged items, we should re-sort by score
671+
sort.Slice(out, func(i, j int) bool {
672+
return out[i].score > out[j].score
673+
})
674+
675+
return out
676+
}
677+

0 commit comments

Comments
 (0)