From fe3bdf24ff52b0f2bab6f1dded8db2209353c31d Mon Sep 17 00:00:00 2001 From: doITmagic Date: Thu, 16 Apr 2026 20:27:38 +0300 Subject: [PATCH 1/3] fix: truncate embed text to prevent context length overflow (issue #53) Add buildEmbedText() helper that caps embed payload at 30 000 chars (~7 500 tokens, safe for 8 192-token models like nomic-embed-text). Metadata (docstring + signature) is always preserved in full; only the raw Code body is truncated when the total exceeds the limit. A [WARN] log is emitted when truncation occurs. Fixes #53 --- internal/ragcode/indexer.go | 62 ++++++++++++-- internal/ragcode/indexer_embed_test.go | 112 +++++++++++++++++++++++++ 2 files changed, 169 insertions(+), 5 deletions(-) create mode 100644 internal/ragcode/indexer_embed_test.go diff --git a/internal/ragcode/indexer.go b/internal/ragcode/indexer.go index c9fd8f3..2edee6f 100644 --- a/internal/ragcode/indexer.go +++ b/internal/ragcode/indexer.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "hash/fnv" + "log" "path/filepath" "strings" @@ -13,6 +14,56 @@ import ( "github.com/doITmagic/rag-code-mcp/internal/memory" ) +// maxEmbedChars is the maximum number of Unicode characters sent to the embedding +// model. Common models (e.g. nomic-embed-text) have an 8 192-token context window +// (~4 chars/token → ~32 768 chars). We use 30 000 to give ~8% headroom and stay +// compatible with smaller-window models. +const maxEmbedChars = 30_000 + +// buildEmbedText constructs the text to embed for a CodeChunk, then truncates it +// to maxChars (rune-safe, UTF-8 correct) to avoid exceeding the model's context +// window. Metadata (docstring, signature) is always preserved in full; only Code +// is truncated when the total exceeds maxChars. +// Returns (text, wasTruncated). 
+func buildEmbedText(ch codetypes.CodeChunk, maxChars int) (string, bool) { + meta := strings.TrimSpace(strings.Join(filterNonEmpty([]string{ + ch.Docstring, + ch.Signature, + }), "\n\n")) + + var full string + if ch.Code != "" { + if meta != "" { + full = meta + "\n\n" + ch.Code + } else { + full = ch.Code + } + } else { + full = meta + } + + runes := []rune(full) + if len(runes) <= maxChars { + return full, false + } + + // Truncate only the Code portion — keep metadata intact. + metaWithSep := meta + if meta != "" && ch.Code != "" { + metaWithSep = meta + "\n\n" + } + metaRunes := []rune(metaWithSep) + remaining := maxChars - len(metaRunes) + if remaining < 0 { + remaining = 0 + } + codeRunes := []rune(ch.Code) + if remaining > len(codeRunes) { + remaining = len(codeRunes) + } + return metaWithSep + string(codeRunes[:remaining]), true +} + // Indexer indexes CodeChunks into LongTermMemory using an embedding Provider. type Indexer struct { analyzer codetypes.PathAnalyzer @@ -34,14 +85,15 @@ func (i *Indexer) IndexPaths(ctx context.Context, paths []string, sourceTag stri indexed := 0 for _, ch := range chunks { - text := strings.TrimSpace(strings.Join(filterNonEmpty([]string{ - ch.Docstring, - ch.Signature, - ch.Code, - }), "\n\n")) + text, wasTruncated := buildEmbedText(ch, maxEmbedChars) + text = strings.TrimSpace(text) if text == "" { continue } + if wasTruncated { + log.Printf("[WARN] embed text truncated for %s (%s:%d-%d) — content exceeds model context window", + ch.Name, filepath.Base(ch.FilePath), ch.StartLine, ch.EndLine) + } emb, err := i.embedder.Embed(ctx, text) if err != nil { diff --git a/internal/ragcode/indexer_embed_test.go b/internal/ragcode/indexer_embed_test.go new file mode 100644 index 0000000..96366e3 --- /dev/null +++ b/internal/ragcode/indexer_embed_test.go @@ -0,0 +1,112 @@ +package ragcode + +import ( + "strings" + "testing" + + "github.com/doITmagic/rag-code-mcp/internal/codetypes" +) + +func TestBuildEmbedText_NoTruncation(t 
*testing.T) { + ch := codetypes.CodeChunk{ + Package: "mypkg", + Name: "MyFunc", + Signature: "func MyFunc() string", + Code: "return \"hello\"", + Docstring: "MyFunc returns hello.", + FilePath: "foo.go", + } + text, truncated := buildEmbedText(ch, maxEmbedChars) + if truncated { + t.Fatal("expected no truncation for small chunk") + } + if !strings.Contains(text, ch.Docstring) { + t.Errorf("embed text missing docstring") + } + if !strings.Contains(text, ch.Signature) { + t.Errorf("embed text missing signature") + } + if !strings.Contains(text, ch.Code) { + t.Errorf("embed text missing code") + } +} + +func TestBuildEmbedText_TruncatesCode(t *testing.T) { + bigCode := strings.Repeat("x", 40_000) + ch := codetypes.CodeChunk{ + Package: "mypkg", + Name: "BigFunc", + Signature: "func BigFunc()", + Code: bigCode, + FilePath: "big.go", + } + limit := 30_000 + text, truncated := buildEmbedText(ch, limit) + if !truncated { + t.Fatal("expected truncation for large code body") + } + runes := []rune(text) + if len(runes) > limit { + t.Errorf("truncated text has %d runes, want <= %d", len(runes), limit) + } + if !strings.Contains(text, "func BigFunc()") { + t.Errorf("embed text missing signature after truncation") + } +} + +func TestBuildEmbedText_WithDocstring_TruncatesCode(t *testing.T) { + bigCode := strings.Repeat("y", 40_000) + ch := codetypes.CodeChunk{ + Signature: "func Fn()", + Code: bigCode, + Docstring: "This is a docstring.", + FilePath: "fn.go", + } + limit := 30_000 + text, truncated := buildEmbedText(ch, limit) + if !truncated { + t.Fatal("expected truncation") + } + if !strings.Contains(text, "This is a docstring.") { + t.Errorf("docstring missing after truncation") + } + if len([]rune(text)) > limit { + t.Errorf("text exceeds limit after truncation") + } +} + +func TestBuildEmbedText_ExactlyAtLimit(t *testing.T) { + limit := 100 + // meta = "func Fn()\n\n" → 11 chars + meta := "func Fn()\n\n" + codeLen := limit - len([]rune(meta)) + ch := codetypes.CodeChunk{ 
+ Signature: "func Fn()", + Code: strings.Repeat("a", codeLen), + FilePath: "fn.go", + } + text, truncated := buildEmbedText(ch, limit) + if truncated { + t.Fatalf("expected no truncation at exact boundary; len=%d", len([]rune(text))) + } + _ = text +} + +func TestBuildEmbedText_EmptyCode(t *testing.T) { + ch := codetypes.CodeChunk{ + Signature: "func Empty()", + Code: "", + Docstring: "Empty function.", + FilePath: "empty.go", + } + text, truncated := buildEmbedText(ch, maxEmbedChars) + if truncated { + t.Fatal("expected no truncation for empty code") + } + if !strings.Contains(text, "Empty function.") { + t.Errorf("docstring missing") + } + if !strings.Contains(text, "func Empty()") { + t.Errorf("signature missing") + } +} From cc23127a13f6a218fd6eba2d359a9c6bd7927d26 Mon Sep 17 00:00:00 2001 From: doITmagic Date: Mon, 20 Apr 2026 04:46:53 +0300 Subject: [PATCH 2/3] fix: hard-cap embed text truncation to strictly enforce maxChars limit Address PR #55 review comments: - Simplify buildEmbedText() to use single runes[:maxChars] truncation (gemini-code-assist suggestion). This guarantees the limit is never exceeded, even when metadata alone is larger than maxChars. - Remove redundant []rune conversions in truncation path (Copilot memory concern). - Add tests for oversized metadata with and without code body (Copilot test coverage request). --- internal/ragcode/indexer.go | 22 ++++---------- internal/ragcode/indexer_embed_test.go | 41 ++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 17 deletions(-) diff --git a/internal/ragcode/indexer.go b/internal/ragcode/indexer.go index 2edee6f..3a469af 100644 --- a/internal/ragcode/indexer.go +++ b/internal/ragcode/indexer.go @@ -22,8 +22,9 @@ const maxEmbedChars = 30_000 // buildEmbedText constructs the text to embed for a CodeChunk, then truncates it // to maxChars (rune-safe, UTF-8 correct) to avoid exceeding the model's context -// window. 
Metadata (docstring, signature) is always preserved in full; only Code -// is truncated when the total exceeds maxChars. +// window. Metadata (docstring, signature) is placed first so it is naturally +// preserved during truncation. When the total exceeds maxChars the result is +// hard-capped — no part of the text (including metadata) is allowed to overflow. // Returns (text, wasTruncated). func buildEmbedText(ch codetypes.CodeChunk, maxChars int) (string, bool) { meta := strings.TrimSpace(strings.Join(filterNonEmpty([]string{ @@ -47,21 +48,8 @@ func buildEmbedText(ch codetypes.CodeChunk, maxChars int) (string, bool) { return full, false } - // Truncate only the Code portion — keep metadata intact. - metaWithSep := meta - if meta != "" && ch.Code != "" { - metaWithSep = meta + "\n\n" - } - metaRunes := []rune(metaWithSep) - remaining := maxChars - len(metaRunes) - if remaining < 0 { - remaining = 0 - } - codeRunes := []rune(ch.Code) - if remaining > len(codeRunes) { - remaining = len(codeRunes) - } - return metaWithSep + string(codeRunes[:remaining]), true + // Truncate to maxChars. Since metadata is at the start, it is preserved. + return string(runes[:maxChars]), true } // Indexer indexes CodeChunks into LongTermMemory using an embedding Provider. 
diff --git a/internal/ragcode/indexer_embed_test.go b/internal/ragcode/indexer_embed_test.go index 96366e3..73d971c 100644 --- a/internal/ragcode/indexer_embed_test.go +++ b/internal/ragcode/indexer_embed_test.go @@ -110,3 +110,44 @@ func TestBuildEmbedText_EmptyCode(t *testing.T) { t.Errorf("signature missing") } } + +func TestBuildEmbedText_OversizedMetadata(t *testing.T) { + limit := 100 + bigDocstring := strings.Repeat("D", 200) + ch := codetypes.CodeChunk{ + Signature: "func Big()", + Code: "return nil", + Docstring: bigDocstring, + FilePath: "big.go", + } + text, truncated := buildEmbedText(ch, limit) + if !truncated { + t.Fatal("expected truncation when metadata alone exceeds limit") + } + runes := []rune(text) + if len(runes) > limit { + t.Errorf("text has %d runes, want <= %d", len(runes), limit) + } + if len(runes) != limit { + t.Errorf("text has %d runes, want exactly %d (hard-capped)", len(runes), limit) + } +} + +func TestBuildEmbedText_OversizedMetadata_NoCode(t *testing.T) { + limit := 50 + bigDocstring := strings.Repeat("Z", 80) + ch := codetypes.CodeChunk{ + Signature: "func Huge()", + Code: "", + Docstring: bigDocstring, + FilePath: "huge.go", + } + text, truncated := buildEmbedText(ch, limit) + if !truncated { + t.Fatal("expected truncation when metadata exceeds limit") + } + runes := []rune(text) + if len(runes) > limit { + t.Errorf("text has %d runes, want <= %d", len(runes), limit) + } +} From c56300a724bb1b9a3cca0a7b31809038f8537bdc Mon Sep 17 00:00:00 2001 From: doITmagic Date: Mon, 20 Apr 2026 05:22:08 +0300 Subject: [PATCH 3/3] perf: optimize memory usage during embed text truncation Avoid massive allocations by not concatenating large code chunks with metadata before truncation. Instead, compute available space and only extract the necessary substring. Uses string iteration to safely truncate at rune boundaries without large []rune casting. 
--- internal/ragcode/indexer.go | 42 ++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/internal/ragcode/indexer.go b/internal/ragcode/indexer.go index 3a469af..9c61e4e 100644 --- a/internal/ragcode/indexer.go +++ b/internal/ragcode/indexer.go @@ -32,24 +32,42 @@ func buildEmbedText(ch codetypes.CodeChunk, maxChars int) (string, bool) { ch.Signature, }), "\n\n")) - var full string - if ch.Code != "" { - if meta != "" { - full = meta + "\n\n" + ch.Code + var metaWithSep string + if meta != "" { + if ch.Code != "" { + metaWithSep = meta + "\n\n" } else { - full = ch.Code + metaWithSep = meta } - } else { - full = meta + } else if ch.Code == "" { + return "", false } - runes := []rune(full) - if len(runes) <= maxChars { - return full, false + metaRunes := []rune(metaWithSep) + if len(metaRunes) >= maxChars { + return string(metaRunes[:maxChars]), true } - // Truncate to maxChars. Since metadata is at the start, it is preserved. - return string(runes[:maxChars]), true + remaining := maxChars - len(metaRunes) + + // Memory optimization for huge code chunks: avoid []rune conversion + // and full string concatenation unless needed. + if len(ch.Code) <= remaining { + // Fast path: byte length <= remaining runes guaranteed to fit + return metaWithSep + ch.Code, false + } + + // Slower path: count runes by ranging over the string which gives byte indices + // at rune boundaries, avoiding copying the massive ch.Code into a []rune. + charCount := 0 + for byteIndex := range ch.Code { + if charCount >= remaining { + return metaWithSep + ch.Code[:byteIndex], true + } + charCount++ + } + + return metaWithSep + ch.Code, false } // Indexer indexes CodeChunks into LongTermMemory using an embedding Provider.