Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 63 additions & 5 deletions internal/ragcode/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"encoding/json"
"fmt"
"hash/fnv"
"log"
"path/filepath"
"strings"

Expand All @@ -13,6 +14,62 @@ import (
"github.com/doITmagic/rag-code-mcp/internal/memory"
)

// maxEmbedChars is the maximum number of Unicode characters sent to the embedding
// model. Common models (e.g. nomic-embed-text) have an 8 192-token context window;
// assuming roughly 4 chars/token that is ~32 768 chars. We use 30 000, which
// leaves ~8% headroom under that estimate. The chars-per-token ratio is a
// heuristic, so token-dense input may still need a smaller limit.
const maxEmbedChars = 30_000

// buildEmbedText constructs the text to embed for a CodeChunk, then truncates it
// to maxChars (rune-safe, UTF-8 correct) to avoid exceeding the model's context
// window. Metadata (docstring, signature) is placed first so it is naturally
// preserved during truncation. When the total exceeds maxChars the result is
// hard-capped — no part of the text (including metadata) is allowed to overflow.
// Returns (text, wasTruncated).
Comment thread
doITmagic marked this conversation as resolved.
func buildEmbedText(ch codetypes.CodeChunk, maxChars int) (string, bool) {
meta := strings.TrimSpace(strings.Join(filterNonEmpty([]string{
ch.Docstring,
ch.Signature,
}), "\n\n"))

var metaWithSep string
if meta != "" {
if ch.Code != "" {
metaWithSep = meta + "\n\n"
} else {
metaWithSep = meta
}
} else if ch.Code == "" {
return "", false
}

metaRunes := []rune(metaWithSep)
if len(metaRunes) >= maxChars {
return string(metaRunes[:maxChars]), true
}

remaining := maxChars - len(metaRunes)

// Memory optimization for huge code chunks: avoid []rune conversion
// and full string concatenation unless needed.
if len(ch.Code) <= remaining {
// Fast path: byte length <= remaining runes guaranteed to fit
return metaWithSep + ch.Code, false
}

// Slower path: count runes by ranging over the string which gives byte indices
// at rune boundaries, avoiding copying the massive ch.Code into a []rune.
charCount := 0
for byteIndex := range ch.Code {
if charCount >= remaining {
return metaWithSep + ch.Code[:byteIndex], true
}
charCount++
}

return metaWithSep + ch.Code, false
}

// Indexer indexes CodeChunks into LongTermMemory using an embedding Provider.
type Indexer struct {
analyzer codetypes.PathAnalyzer
Expand All @@ -34,14 +91,15 @@ func (i *Indexer) IndexPaths(ctx context.Context, paths []string, sourceTag stri

indexed := 0
for _, ch := range chunks {
text := strings.TrimSpace(strings.Join(filterNonEmpty([]string{
ch.Docstring,
ch.Signature,
ch.Code,
}), "\n\n"))
text, wasTruncated := buildEmbedText(ch, maxEmbedChars)
text = strings.TrimSpace(text)
if text == "" {
continue
}
if wasTruncated {
log.Printf("[WARN] embed text truncated for %s (%s:%d-%d) — content exceeds model context window",
ch.Name, filepath.Base(ch.FilePath), ch.StartLine, ch.EndLine)
}

emb, err := i.embedder.Embed(ctx, text)
if err != nil {
Expand Down
153 changes: 153 additions & 0 deletions internal/ragcode/indexer_embed_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
package ragcode

import (
"strings"
"testing"

"github.com/doITmagic/rag-code-mcp/internal/codetypes"
)

// TestBuildEmbedText_NoTruncation checks that a small chunk is returned
// untruncated and that docstring, signature and code all appear in the result.
func TestBuildEmbedText_NoTruncation(t *testing.T) {
	chunk := codetypes.CodeChunk{
		Package:   "mypkg",
		Name:      "MyFunc",
		Signature: "func MyFunc() string",
		Code:      "return \"hello\"",
		Docstring: "MyFunc returns hello.",
		FilePath:  "foo.go",
	}

	got, cut := buildEmbedText(chunk, maxEmbedChars)
	if cut {
		t.Fatal("expected no truncation for small chunk")
	}

	// Every part of the chunk must survive when nothing is truncated.
	parts := []struct {
		label string
		want  string
	}{
		{"docstring", chunk.Docstring},
		{"signature", chunk.Signature},
		{"code", chunk.Code},
	}
	for _, p := range parts {
		if !strings.Contains(got, p.want) {
			t.Errorf("embed text missing %s", p.label)
		}
	}
}

// TestBuildEmbedText_TruncatesCode checks that an oversized code body is cut
// down to the limit while the signature is kept.
func TestBuildEmbedText_TruncatesCode(t *testing.T) {
	const limit = 30_000

	chunk := codetypes.CodeChunk{
		Package:   "mypkg",
		Name:      "BigFunc",
		Signature: "func BigFunc()",
		Code:      strings.Repeat("x", 40_000),
		FilePath:  "big.go",
	}

	got, cut := buildEmbedText(chunk, limit)
	if !cut {
		t.Fatal("expected truncation for large code body")
	}
	if n := len([]rune(got)); n > limit {
		t.Errorf("truncated text has %d runes, want <= %d", n, limit)
	}
	// Metadata is placed first, so it must still be present after the cut.
	if !strings.Contains(got, "func BigFunc()") {
		t.Errorf("embed text missing signature after truncation")
	}
}

// TestBuildEmbedText_WithDocstring_TruncatesCode checks that the docstring is
// kept and the limit respected when the code body is too large.
func TestBuildEmbedText_WithDocstring_TruncatesCode(t *testing.T) {
	const limit = 30_000

	chunk := codetypes.CodeChunk{
		Signature: "func Fn()",
		Code:      strings.Repeat("y", 40_000),
		Docstring: "This is a docstring.",
		FilePath:  "fn.go",
	}

	got, cut := buildEmbedText(chunk, limit)
	if !cut {
		t.Fatal("expected truncation")
	}
	if hasDoc := strings.Contains(got, "This is a docstring."); !hasDoc {
		t.Errorf("docstring missing after truncation")
	}
	if runeLen := len([]rune(got)); runeLen > limit {
		t.Errorf("text exceeds limit after truncation")
	}
}
Comment on lines +57 to +76
Copy link

Copilot AI Apr 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test coverage doesn’t currently include the case where metadata alone (very large Docstring or Signature) exceeds the limit. That’s the scenario where buildEmbedText should still guarantee the returned text is <= limit (or clearly define how it behaves). Adding a test for an oversized docstring/signature would prevent regressions and catch the current overflow behavior.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in cc23127.


// TestBuildEmbedText_ExactlyAtLimit checks the boundary case where metadata
// plus code is exactly limit runes long: nothing may be reported as truncated
// and the text must come back complete and unmodified.
func TestBuildEmbedText_ExactlyAtLimit(t *testing.T) {
	limit := 100
	// Signature plus the blank-line separator: "func Fn()\n\n" is 11 runes.
	meta := "func Fn()\n\n"
	code := strings.Repeat("a", limit-len([]rune(meta)))
	ch := codetypes.CodeChunk{
		Signature: "func Fn()",
		Code:      code,
		FilePath:  "fn.go",
	}

	text, truncated := buildEmbedText(ch, limit)
	if truncated {
		t.Fatalf("expected no truncation at exact boundary; len=%d", len([]rune(text)))
	}
	// The untruncated flag alone is not enough: make sure no rune was lost.
	if got := len([]rune(text)); got != limit {
		t.Errorf("text has %d runes, want exactly %d", got, limit)
	}
	if want := meta + code; text != want {
		t.Errorf("text altered at exact boundary: got %d bytes, want %d bytes", len(text), len(want))
	}
}

// TestBuildEmbedText_EmptyCode checks that a chunk without code still yields
// its docstring and signature and is not reported as truncated.
func TestBuildEmbedText_EmptyCode(t *testing.T) {
	chunk := codetypes.CodeChunk{
		Signature: "func Empty()",
		Code:      "",
		Docstring: "Empty function.",
		FilePath:  "empty.go",
	}

	got, cut := buildEmbedText(chunk, maxEmbedChars)
	if cut {
		t.Fatal("expected no truncation for empty code")
	}

	hasDoc := strings.Contains(got, "Empty function.")
	hasSig := strings.Contains(got, "func Empty()")
	if !hasDoc {
		t.Errorf("docstring missing")
	}
	if !hasSig {
		t.Errorf("signature missing")
	}
}

// TestBuildEmbedText_OversizedMetadata checks that when the docstring alone is
// longer than the limit, the result is flagged as truncated and hard-capped at
// exactly limit runes.
func TestBuildEmbedText_OversizedMetadata(t *testing.T) {
	const limit = 100

	chunk := codetypes.CodeChunk{
		Signature: "func Big()",
		Code:      "return nil",
		Docstring: strings.Repeat("D", 200),
		FilePath:  "big.go",
	}

	got, cut := buildEmbedText(chunk, limit)
	if !cut {
		t.Fatal("expected truncation when metadata alone exceeds limit")
	}

	n := len([]rune(got))
	if n > limit {
		t.Errorf("text has %d runes, want <= %d", n, limit)
	}
	if n != limit {
		t.Errorf("text has %d runes, want exactly %d (hard-capped)", n, limit)
	}
}

// TestBuildEmbedText_OversizedMetadata_NoCode checks that oversized metadata is
// truncated to the limit even when the chunk has no code body.
func TestBuildEmbedText_OversizedMetadata_NoCode(t *testing.T) {
	const limit = 50

	chunk := codetypes.CodeChunk{
		Signature: "func Huge()",
		Code:      "",
		Docstring: strings.Repeat("Z", 80),
		FilePath:  "huge.go",
	}

	got, cut := buildEmbedText(chunk, limit)
	if !cut {
		t.Fatal("expected truncation when metadata exceeds limit")
	}
	if n := len([]rune(got)); n > limit {
		t.Errorf("text has %d runes, want <= %d", n, limit)
	}
}
Loading