Skip to content

Commit 9f7c8ee

Browse files
authored
fix: truncate embed text to prevent context length overflow (issue #53)
* Simplified `buildEmbedText()` truncation using a single rune slice to guarantee a hard cap, even if the metadata alone exceeds the limit.
* Optimized memory usage for massive code chunks by iterating over rune boundaries, avoiding full string instantiation and `[]rune` allocation.
* Added robust edge-case tests validating the oversized-metadata logic (metadata that exceeds the limit by itself).
2 parents 7de6476 + c56300a commit 9f7c8ee

2 files changed

Lines changed: 216 additions & 5 deletions

File tree

internal/ragcode/indexer.go

Lines changed: 63 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"encoding/json"
66
"fmt"
77
"hash/fnv"
8+
"log"
89
"path/filepath"
910
"strings"
1011

@@ -13,6 +14,62 @@ import (
1314
"github.com/doITmagic/rag-code-mcp/internal/memory"
1415
)
1516

17+
// maxEmbedChars is the maximum number of Unicode characters sent to the embedding
18+
// model. Common models (e.g. nomic-embed-text) have an 8 192-token context window
19+
// (~4 chars/token → ~32 768 chars). We use 30 000 to give ~8% headroom and stay
20+
// compatible with smaller-window models.
21+
const maxEmbedChars = 30_000
22+
23+
// buildEmbedText constructs the text to embed for a CodeChunk, then truncates it
24+
// to maxChars (rune-safe, UTF-8 correct) to avoid exceeding the model's context
25+
// window. Metadata (docstring, signature) is placed first so it is naturally
26+
// preserved during truncation. When the total exceeds maxChars the result is
27+
// hard-capped — no part of the text (including metadata) is allowed to overflow.
28+
// Returns (text, wasTruncated).
29+
func buildEmbedText(ch codetypes.CodeChunk, maxChars int) (string, bool) {
30+
meta := strings.TrimSpace(strings.Join(filterNonEmpty([]string{
31+
ch.Docstring,
32+
ch.Signature,
33+
}), "\n\n"))
34+
35+
var metaWithSep string
36+
if meta != "" {
37+
if ch.Code != "" {
38+
metaWithSep = meta + "\n\n"
39+
} else {
40+
metaWithSep = meta
41+
}
42+
} else if ch.Code == "" {
43+
return "", false
44+
}
45+
46+
metaRunes := []rune(metaWithSep)
47+
if len(metaRunes) >= maxChars {
48+
return string(metaRunes[:maxChars]), true
49+
}
50+
51+
remaining := maxChars - len(metaRunes)
52+
53+
// Memory optimization for huge code chunks: avoid []rune conversion
54+
// and full string concatenation unless needed.
55+
if len(ch.Code) <= remaining {
56+
// Fast path: byte length <= remaining runes guaranteed to fit
57+
return metaWithSep + ch.Code, false
58+
}
59+
60+
// Slower path: count runes by ranging over the string which gives byte indices
61+
// at rune boundaries, avoiding copying the massive ch.Code into a []rune.
62+
charCount := 0
63+
for byteIndex := range ch.Code {
64+
if charCount >= remaining {
65+
return metaWithSep + ch.Code[:byteIndex], true
66+
}
67+
charCount++
68+
}
69+
70+
return metaWithSep + ch.Code, false
71+
}
72+
1673
// Indexer indexes CodeChunks into LongTermMemory using an embedding Provider.
1774
type Indexer struct {
1875
analyzer codetypes.PathAnalyzer
@@ -34,14 +91,15 @@ func (i *Indexer) IndexPaths(ctx context.Context, paths []string, sourceTag stri
3491

3592
indexed := 0
3693
for _, ch := range chunks {
37-
text := strings.TrimSpace(strings.Join(filterNonEmpty([]string{
38-
ch.Docstring,
39-
ch.Signature,
40-
ch.Code,
41-
}), "\n\n"))
94+
text, wasTruncated := buildEmbedText(ch, maxEmbedChars)
95+
text = strings.TrimSpace(text)
4296
if text == "" {
4397
continue
4498
}
99+
if wasTruncated {
100+
log.Printf("[WARN] embed text truncated for %s (%s:%d-%d) — content exceeds model context window",
101+
ch.Name, filepath.Base(ch.FilePath), ch.StartLine, ch.EndLine)
102+
}
45103

46104
emb, err := i.embedder.Embed(ctx, text)
47105
if err != nil {
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
package ragcode
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
"github.com/doITmagic/rag-code-mcp/internal/codetypes"
8+
)
9+
10+
func TestBuildEmbedText_NoTruncation(t *testing.T) {
11+
ch := codetypes.CodeChunk{
12+
Package: "mypkg",
13+
Name: "MyFunc",
14+
Signature: "func MyFunc() string",
15+
Code: "return \"hello\"",
16+
Docstring: "MyFunc returns hello.",
17+
FilePath: "foo.go",
18+
}
19+
text, truncated := buildEmbedText(ch, maxEmbedChars)
20+
if truncated {
21+
t.Fatal("expected no truncation for small chunk")
22+
}
23+
if !strings.Contains(text, ch.Docstring) {
24+
t.Errorf("embed text missing docstring")
25+
}
26+
if !strings.Contains(text, ch.Signature) {
27+
t.Errorf("embed text missing signature")
28+
}
29+
if !strings.Contains(text, ch.Code) {
30+
t.Errorf("embed text missing code")
31+
}
32+
}
33+
34+
func TestBuildEmbedText_TruncatesCode(t *testing.T) {
35+
bigCode := strings.Repeat("x", 40_000)
36+
ch := codetypes.CodeChunk{
37+
Package: "mypkg",
38+
Name: "BigFunc",
39+
Signature: "func BigFunc()",
40+
Code: bigCode,
41+
FilePath: "big.go",
42+
}
43+
limit := 30_000
44+
text, truncated := buildEmbedText(ch, limit)
45+
if !truncated {
46+
t.Fatal("expected truncation for large code body")
47+
}
48+
runes := []rune(text)
49+
if len(runes) > limit {
50+
t.Errorf("truncated text has %d runes, want <= %d", len(runes), limit)
51+
}
52+
if !strings.Contains(text, "func BigFunc()") {
53+
t.Errorf("embed text missing signature after truncation")
54+
}
55+
}
56+
57+
func TestBuildEmbedText_WithDocstring_TruncatesCode(t *testing.T) {
58+
bigCode := strings.Repeat("y", 40_000)
59+
ch := codetypes.CodeChunk{
60+
Signature: "func Fn()",
61+
Code: bigCode,
62+
Docstring: "This is a docstring.",
63+
FilePath: "fn.go",
64+
}
65+
limit := 30_000
66+
text, truncated := buildEmbedText(ch, limit)
67+
if !truncated {
68+
t.Fatal("expected truncation")
69+
}
70+
if !strings.Contains(text, "This is a docstring.") {
71+
t.Errorf("docstring missing after truncation")
72+
}
73+
if len([]rune(text)) > limit {
74+
t.Errorf("text exceeds limit after truncation")
75+
}
76+
}
77+
78+
func TestBuildEmbedText_ExactlyAtLimit(t *testing.T) {
79+
limit := 100
80+
// meta = "func Fn()\n\n" → 11 chars
81+
meta := "func Fn()\n\n"
82+
codeLen := limit - len([]rune(meta))
83+
ch := codetypes.CodeChunk{
84+
Signature: "func Fn()",
85+
Code: strings.Repeat("a", codeLen),
86+
FilePath: "fn.go",
87+
}
88+
text, truncated := buildEmbedText(ch, limit)
89+
if truncated {
90+
t.Fatalf("expected no truncation at exact boundary; len=%d", len([]rune(text)))
91+
}
92+
_ = text
93+
}
94+
95+
func TestBuildEmbedText_EmptyCode(t *testing.T) {
96+
ch := codetypes.CodeChunk{
97+
Signature: "func Empty()",
98+
Code: "",
99+
Docstring: "Empty function.",
100+
FilePath: "empty.go",
101+
}
102+
text, truncated := buildEmbedText(ch, maxEmbedChars)
103+
if truncated {
104+
t.Fatal("expected no truncation for empty code")
105+
}
106+
if !strings.Contains(text, "Empty function.") {
107+
t.Errorf("docstring missing")
108+
}
109+
if !strings.Contains(text, "func Empty()") {
110+
t.Errorf("signature missing")
111+
}
112+
}
113+
114+
func TestBuildEmbedText_OversizedMetadata(t *testing.T) {
115+
limit := 100
116+
bigDocstring := strings.Repeat("D", 200)
117+
ch := codetypes.CodeChunk{
118+
Signature: "func Big()",
119+
Code: "return nil",
120+
Docstring: bigDocstring,
121+
FilePath: "big.go",
122+
}
123+
text, truncated := buildEmbedText(ch, limit)
124+
if !truncated {
125+
t.Fatal("expected truncation when metadata alone exceeds limit")
126+
}
127+
runes := []rune(text)
128+
if len(runes) > limit {
129+
t.Errorf("text has %d runes, want <= %d", len(runes), limit)
130+
}
131+
if len(runes) != limit {
132+
t.Errorf("text has %d runes, want exactly %d (hard-capped)", len(runes), limit)
133+
}
134+
}
135+
136+
func TestBuildEmbedText_OversizedMetadata_NoCode(t *testing.T) {
137+
limit := 50
138+
bigDocstring := strings.Repeat("Z", 80)
139+
ch := codetypes.CodeChunk{
140+
Signature: "func Huge()",
141+
Code: "",
142+
Docstring: bigDocstring,
143+
FilePath: "huge.go",
144+
}
145+
text, truncated := buildEmbedText(ch, limit)
146+
if !truncated {
147+
t.Fatal("expected truncation when metadata exceeds limit")
148+
}
149+
runes := []rune(text)
150+
if len(runes) > limit {
151+
t.Errorf("text has %d runes, want <= %d", len(runes), limit)
152+
}
153+
}

0 commit comments

Comments
 (0)