diff --git a/internal/ragcode/indexer.go b/internal/ragcode/indexer.go
index c9fd8f3..9c61e4e 100644
--- a/internal/ragcode/indexer.go
+++ b/internal/ragcode/indexer.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"hash/fnv"
+	"log"
 	"path/filepath"
 	"strings"
 
@@ -13,6 +14,73 @@ import (
 	"github.com/doITmagic/rag-code-mcp/internal/memory"
 )
 
+// maxEmbedChars is the maximum number of Unicode characters sent to the embedding
+// model. Common models (e.g. nomic-embed-text) have an 8 192-token context window
+// (~4 chars/token → ~32 768 chars). We use 30 000 to give ~8% headroom and stay
+// compatible with smaller-window models.
+const maxEmbedChars = 30_000
+
+// buildEmbedText constructs the text to embed for a CodeChunk and caps it at
+// maxChars Unicode code points (never splitting a UTF-8 sequence) so it cannot
+// exceed the model's context window.
+//
+// The layout is docstring, signature, then code, separated by blank lines.
+// Metadata comes first so it is the last thing to be lost; the cap is still
+// hard, so metadata that alone exceeds maxChars is cut as well. A negative
+// maxChars is treated as zero.
+//
+// It returns the text and whether any content was dropped. The flag is false
+// when everything fits, including when the text fills maxChars exactly.
+func buildEmbedText(ch codetypes.CodeChunk, maxChars int) (string, bool) {
+	meta := strings.TrimSpace(strings.Join(filterNonEmpty([]string{
+		ch.Docstring,
+		ch.Signature,
+	}), "\n\n"))
+
+	if meta == "" && ch.Code == "" {
+		return "", false
+	}
+
+	// Clamp a negative budget: slicing with a negative bound would panic.
+	if maxChars < 0 {
+		maxChars = 0
+	}
+
+	// The separator is only needed when code follows the metadata.
+	metaWithSep := meta
+	if meta != "" && ch.Code != "" {
+		metaWithSep += "\n\n"
+	}
+
+	metaRunes := []rune(metaWithSep)
+	// Strictly greater: metadata that exactly fills the budget is not truncated
+	// by itself; any code after it is handled (and flagged) below.
+	if len(metaRunes) > maxChars {
+		return string(metaRunes[:maxChars]), true
+	}
+
+	remaining := maxChars - len(metaRunes)
+
+	// Fast path: a string never has more runes than bytes, so a byte length
+	// within budget guarantees the code fits without counting runes.
+	if len(ch.Code) <= remaining {
+		return metaWithSep + ch.Code, false
+	}
+
+	// Slow path: ranging over a string yields the byte offset of each rune, so
+	// the cut lands on a rune boundary without copying ch.Code into a []rune.
+	runeCount := 0
+	for byteIndex := range ch.Code {
+		if runeCount >= remaining {
+			return metaWithSep + ch.Code[:byteIndex], true
+		}
+		runeCount++
+	}
+
+	// More bytes than the budget but not more runes: everything fits.
+	return metaWithSep + ch.Code, false
+}
+
 // Indexer indexes CodeChunks into LongTermMemory using an embedding Provider.
 type Indexer struct {
 	analyzer codetypes.PathAnalyzer
@@ -34,14 +102,15 @@ func (i *Indexer) IndexPaths(ctx context.Context, paths []string, sourceTag stri
 
 	indexed := 0
 	for _, ch := range chunks {
-		text := strings.TrimSpace(strings.Join(filterNonEmpty([]string{
-			ch.Docstring,
-			ch.Signature,
-			ch.Code,
-		}), "\n\n"))
+		text, wasTruncated := buildEmbedText(ch, maxEmbedChars)
+		text = strings.TrimSpace(text)
 		if text == "" {
 			continue
 		}
+		if wasTruncated {
+			log.Printf("[WARN] embed text truncated for %s (%s:%d-%d) — content exceeds model context window",
+				ch.Name, filepath.Base(ch.FilePath), ch.StartLine, ch.EndLine)
+		}
 
 		emb, err := i.embedder.Embed(ctx, text)
 		if err != nil {
diff --git a/internal/ragcode/indexer_embed_test.go b/internal/ragcode/indexer_embed_test.go
new file mode 100644
index 0000000..73d971c
--- /dev/null
+++ b/internal/ragcode/indexer_embed_test.go
@@ -0,0 +1,202 @@
+package ragcode
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/doITmagic/rag-code-mcp/internal/codetypes"
+)
+
+func TestBuildEmbedText_NoTruncation(t *testing.T) {
+	ch := codetypes.CodeChunk{
+		Package:   "mypkg",
+		Name:      "MyFunc",
+		Signature: "func MyFunc() string",
+		Code:      "return \"hello\"",
+		Docstring: "MyFunc returns hello.",
+		FilePath:  "foo.go",
+	}
+	text, truncated := buildEmbedText(ch, maxEmbedChars)
+	if truncated {
+		t.Fatal("expected no truncation for small chunk")
+	}
+	if !strings.Contains(text, ch.Docstring) {
+		t.Errorf("embed text missing docstring")
+	}
+	if !strings.Contains(text, ch.Signature) {
+		t.Errorf("embed text missing signature")
+	}
+	if !strings.Contains(text, ch.Code) {
+		t.Errorf("embed text missing code")
+	}
+}
+
+func TestBuildEmbedText_TruncatesCode(t *testing.T) {
+	bigCode := strings.Repeat("x", 40_000)
+	ch := codetypes.CodeChunk{
+		Package:   "mypkg",
+		Name:      "BigFunc",
+		Signature: "func BigFunc()",
+		Code:      bigCode,
+		FilePath:  "big.go",
+	}
+	limit := 30_000
+	text, truncated := buildEmbedText(ch, limit)
+	if !truncated {
+		t.Fatal("expected truncation for large code body")
+	}
+	runes := []rune(text)
+	if len(runes) > limit {
+		t.Errorf("truncated text has %d runes, want <= %d", len(runes), limit)
+	}
+	if !strings.Contains(text, "func BigFunc()") {
+		t.Errorf("embed text missing signature after truncation")
+	}
+}
+
+func TestBuildEmbedText_WithDocstring_TruncatesCode(t *testing.T) {
+	bigCode := strings.Repeat("y", 40_000)
+	ch := codetypes.CodeChunk{
+		Signature: "func Fn()",
+		Code:      bigCode,
+		Docstring: "This is a docstring.",
+		FilePath:  "fn.go",
+	}
+	limit := 30_000
+	text, truncated := buildEmbedText(ch, limit)
+	if !truncated {
+		t.Fatal("expected truncation")
+	}
+	if !strings.Contains(text, "This is a docstring.") {
+		t.Errorf("docstring missing after truncation")
+	}
+	if len([]rune(text)) > limit {
+		t.Errorf("text exceeds limit after truncation")
+	}
+}
+
+func TestBuildEmbedText_ExactlyAtLimit(t *testing.T) {
+	limit := 100
+	// meta = "func Fn()\n\n" → 11 chars
+	meta := "func Fn()\n\n"
+	codeLen := limit - len([]rune(meta))
+	ch := codetypes.CodeChunk{
+		Signature: "func Fn()",
+		Code:      strings.Repeat("a", codeLen),
+		FilePath:  "fn.go",
+	}
+	text, truncated := buildEmbedText(ch, limit)
+	if truncated {
+		t.Fatalf("expected no truncation at exact boundary; len=%d", len([]rune(text)))
+	}
+	if got := len([]rune(text)); got != limit {
+		t.Errorf("text has %d runes, want exactly %d", got, limit)
+	}
+}
+
+func TestBuildEmbedText_EmptyCode(t *testing.T) {
+	ch := codetypes.CodeChunk{
+		Signature: "func Empty()",
+		Code:      "",
+		Docstring: "Empty function.",
+		FilePath:  "empty.go",
+	}
+	text, truncated := buildEmbedText(ch, maxEmbedChars)
+	if truncated {
+		t.Fatal("expected no truncation for empty code")
+	}
+	if !strings.Contains(text, "Empty function.") {
+		t.Errorf("docstring missing")
+	}
+	if !strings.Contains(text, "func Empty()") {
+		t.Errorf("signature missing")
+	}
+}
+
+func TestBuildEmbedText_OversizedMetadata(t *testing.T) {
+	limit := 100
+	bigDocstring := strings.Repeat("D", 200)
+	ch := codetypes.CodeChunk{
+		Signature: "func Big()",
+		Code:      "return nil",
+		Docstring: bigDocstring,
+		FilePath:  "big.go",
+	}
+	text, truncated := buildEmbedText(ch, limit)
+	if !truncated {
+		t.Fatal("expected truncation when metadata alone exceeds limit")
+	}
+	runes := []rune(text)
+	if len(runes) > limit {
+		t.Errorf("text has %d runes, want <= %d", len(runes), limit)
+	}
+	if len(runes) != limit {
+		t.Errorf("text has %d runes, want exactly %d (hard-capped)", len(runes), limit)
+	}
+}
+
+func TestBuildEmbedText_OversizedMetadata_NoCode(t *testing.T) {
+	limit := 50
+	bigDocstring := strings.Repeat("Z", 80)
+	ch := codetypes.CodeChunk{
+		Signature: "func Huge()",
+		Code:      "",
+		Docstring: bigDocstring,
+		FilePath:  "huge.go",
+	}
+	text, truncated := buildEmbedText(ch, limit)
+	if !truncated {
+		t.Fatal("expected truncation when metadata exceeds limit")
+	}
+	runes := []rune(text)
+	if len(runes) > limit {
+		t.Errorf("text has %d runes, want <= %d", len(runes), limit)
+	}
+}
+
+func TestBuildEmbedText_MetadataExactlyAtLimit_NoCode(t *testing.T) {
+	sig := "func Fn()"
+	ch := codetypes.CodeChunk{
+		Signature: sig,
+		FilePath:  "fn.go",
+	}
+	text, truncated := buildEmbedText(ch, len([]rune(sig)))
+	if truncated {
+		t.Fatal("expected no truncation when metadata exactly fills the limit")
+	}
+	if text != sig {
+		t.Errorf("text = %q, want %q", text, sig)
+	}
+}
+
+func TestBuildEmbedText_MultiByteRunes(t *testing.T) {
+	limit := 20
+	ch := codetypes.CodeChunk{
+		Signature: "func Fn()",
+		Code:      strings.Repeat("é", 50),
+		FilePath:  "fn.go",
+	}
+	text, truncated := buildEmbedText(ch, limit)
+	if !truncated {
+		t.Fatal("expected truncation for multi-byte code body")
+	}
+	want := "func Fn()\n\n" + strings.Repeat("é", 9)
+	if text != want {
+		t.Errorf("text = %q, want %q", text, want)
+	}
+}
+
+func TestBuildEmbedText_NegativeLimit(t *testing.T) {
+	ch := codetypes.CodeChunk{
+		Signature: "func Fn()",
+		Code:      "return nil",
+		FilePath:  "fn.go",
+	}
+	text, truncated := buildEmbedText(ch, -1)
+	if !truncated {
+		t.Fatal("expected truncation for a negative limit")
+	}
+	if text != "" {
+		t.Errorf("text = %q, want empty", text)
+	}
+}