Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 57 additions & 5 deletions internal/ragcode/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"encoding/json"
"fmt"
"hash/fnv"
"log"
"path/filepath"
"strings"

Expand All @@ -13,6 +14,56 @@ import (
"github.com/doITmagic/rag-code-mcp/internal/memory"
)

// maxEmbedChars is the maximum number of Unicode characters sent to the embedding
// model. Common models (e.g. nomic-embed-text) have an 8 192-token context window
// (~4 chars/token → ~32 768 chars). We use 30 000, which leaves roughly 8% headroom
// under that estimate.
// NOTE(review): the 4 chars/token ratio is a rule of thumb; confirm it holds for
// the tokenizer of the embedding model actually in use.
const maxEmbedChars = 30_000

// buildEmbedText constructs the text to embed for a CodeChunk, then truncates it
// to maxChars (rune-safe, UTF-8 correct) to avoid exceeding the model's context
// window. Metadata (docstring, signature) is always preserved in full; only Code
// is truncated when the total exceeds maxChars.
// Returns (text, wasTruncated).
Comment thread
doITmagic marked this conversation as resolved.
func buildEmbedText(ch codetypes.CodeChunk, maxChars int) (string, bool) {
meta := strings.TrimSpace(strings.Join(filterNonEmpty([]string{
ch.Docstring,
ch.Signature,
}), "\n\n"))

var full string
if ch.Code != "" {
if meta != "" {
full = meta + "\n\n" + ch.Code
} else {
full = ch.Code
}
} else {
full = meta
}

runes := []rune(full)
if len(runes) <= maxChars {
Comment thread
doITmagic marked this conversation as resolved.
Outdated
return full, false
}

// Truncate only the Code portion — keep metadata intact.
metaWithSep := meta
if meta != "" && ch.Code != "" {
metaWithSep = meta + "\n\n"
}
metaRunes := []rune(metaWithSep)
remaining := maxChars - len(metaRunes)
if remaining < 0 {
remaining = 0
}
codeRunes := []rune(ch.Code)
if remaining > len(codeRunes) {
remaining = len(codeRunes)
}
return metaWithSep + string(codeRunes[:remaining]), true
Comment thread
doITmagic marked this conversation as resolved.
Outdated
Comment thread
doITmagic marked this conversation as resolved.
Outdated
}

// Indexer indexes CodeChunks into LongTermMemory using an embedding Provider.
type Indexer struct {
analyzer codetypes.PathAnalyzer
Expand All @@ -34,14 +85,15 @@ func (i *Indexer) IndexPaths(ctx context.Context, paths []string, sourceTag stri

indexed := 0
for _, ch := range chunks {
text := strings.TrimSpace(strings.Join(filterNonEmpty([]string{
ch.Docstring,
ch.Signature,
ch.Code,
}), "\n\n"))
text, wasTruncated := buildEmbedText(ch, maxEmbedChars)
text = strings.TrimSpace(text)
if text == "" {
continue
}
if wasTruncated {
log.Printf("[WARN] embed text truncated for %s (%s:%d-%d) — content exceeds model context window",
ch.Name, filepath.Base(ch.FilePath), ch.StartLine, ch.EndLine)
}

emb, err := i.embedder.Embed(ctx, text)
if err != nil {
Expand Down
112 changes: 112 additions & 0 deletions internal/ragcode/indexer_embed_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
package ragcode

import (
"strings"
"testing"

"github.com/doITmagic/rag-code-mcp/internal/codetypes"
)

// TestBuildEmbedText_NoTruncation checks that a chunk far below the limit is
// reported as untruncated and that the docstring, signature and code all
// appear in the resulting text.
func TestBuildEmbedText_NoTruncation(t *testing.T) {
	chunk := codetypes.CodeChunk{
		Package:   "mypkg",
		Name:      "MyFunc",
		Signature: "func MyFunc() string",
		Code:      "return \"hello\"",
		Docstring: "MyFunc returns hello.",
		FilePath:  "foo.go",
	}

	got, cut := buildEmbedText(chunk, maxEmbedChars)
	if cut {
		t.Fatal("expected no truncation for small chunk")
	}

	// Each expected fragment is paired with the failure reported when it is absent.
	expectations := []struct {
		fragment string
		failure  string
	}{
		{chunk.Docstring, "embed text missing docstring"},
		{chunk.Signature, "embed text missing signature"},
		{chunk.Code, "embed text missing code"},
	}
	for _, want := range expectations {
		if strings.Contains(got, want.fragment) {
			continue
		}
		t.Error(want.failure)
	}
}

// TestBuildEmbedText_TruncatesCode feeds a 40 000-character code body with a
// 30 000 limit and expects the result to be flagged as truncated, to fit the
// limit in runes, and to still contain the signature.
func TestBuildEmbedText_TruncatesCode(t *testing.T) {
	const limit = 30_000

	chunk := codetypes.CodeChunk{
		Package:   "mypkg",
		Name:      "BigFunc",
		Signature: "func BigFunc()",
		Code:      strings.Repeat("x", 40_000),
		FilePath:  "big.go",
	}

	got, cut := buildEmbedText(chunk, limit)
	if !cut {
		t.Fatal("expected truncation for large code body")
	}
	if n := len([]rune(got)); n > limit {
		t.Errorf("truncated text has %d runes, want <= %d", n, limit)
	}
	if !strings.Contains(got, "func BigFunc()") {
		t.Errorf("embed text missing signature after truncation")
	}
}

// TestBuildEmbedText_WithDocstring_TruncatesCode repeats the oversized-code
// scenario for a chunk that also has a docstring: the docstring must still be
// present and the result must fit within the limit.
func TestBuildEmbedText_WithDocstring_TruncatesCode(t *testing.T) {
	const limit = 30_000

	chunk := codetypes.CodeChunk{
		Signature: "func Fn()",
		Code:      strings.Repeat("y", 40_000),
		Docstring: "This is a docstring.",
		FilePath:  "fn.go",
	}

	got, cut := buildEmbedText(chunk, limit)
	if !cut {
		t.Fatal("expected truncation")
	}
	if !strings.Contains(got, "This is a docstring.") {
		t.Errorf("docstring missing after truncation")
	}
	if runeCount := len([]rune(got)); runeCount > limit {
		t.Errorf("text exceeds limit after truncation")
	}
}
Comment on lines +57 to +76
Copy link

Copilot AI Apr 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test coverage doesn’t currently include the case where metadata alone (very large Docstring or Signature) exceeds the limit. That’s the scenario where buildEmbedText should still guarantee the returned text is <= limit (or clearly define how it behaves). Adding a test for an oversized docstring/signature would prevent regressions and catch the current overflow behavior.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in cc23127.


// TestBuildEmbedText_ExactlyAtLimit builds a chunk whose embed text is exactly
// `limit` runes long and checks the boundary: the text must be returned
// untruncated and with its full length.
func TestBuildEmbedText_ExactlyAtLimit(t *testing.T) {
	const limit = 100
	// The signature plus the "\n\n" separator takes 11 runes; the code fills
	// the rest so the total lands exactly on the limit.
	const prefix = "func Fn()\n\n"
	ch := codetypes.CodeChunk{
		Signature: "func Fn()",
		Code:      strings.Repeat("a", limit-len([]rune(prefix))),
		FilePath:  "fn.go",
	}

	text, truncated := buildEmbedText(ch, limit)
	got := len([]rune(text))
	if truncated {
		t.Fatalf("expected no truncation at exact boundary; len=%d", got)
	}
	// The flag alone does not prove the text survived intact, so pin the length too.
	if got != limit {
		t.Errorf("text has %d runes, want exactly %d", got, limit)
	}
}

// TestBuildEmbedText_EmptyCode covers a chunk with no code body: the result
// must not be flagged as truncated and must carry both docstring and signature.
func TestBuildEmbedText_EmptyCode(t *testing.T) {
	chunk := codetypes.CodeChunk{
		Signature: "func Empty()",
		Code:      "",
		Docstring: "Empty function.",
		FilePath:  "empty.go",
	}

	got, cut := buildEmbedText(chunk, maxEmbedChars)
	if cut {
		t.Fatal("expected no truncation for empty code")
	}

	hasDoc := strings.Contains(got, "Empty function.")
	if !hasDoc {
		t.Errorf("docstring missing")
	}
	hasSig := strings.Contains(got, "func Empty()")
	if !hasSig {
		t.Errorf("signature missing")
	}
}
Loading