Skip to content

Commit 88cb8c7

Browse files
greynewell and claude authored
fix(build): truncate search index descriptions by rune count not bytes (#100)
* test(build): add regression tests for search-index rune-boundary truncation

  Adds build_test.go covering generateSearchIndex:
  - short description written verbatim
  - long ASCII description truncated to exactly 120 runes
  - multi-byte (é, 2-byte) description truncated at rune boundary → valid UTF-8
  - search disabled → no file written

  Regression for the byte-slice truncation bug fixed in build.go.

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix(build): truncate search index descriptions by rune count not bytes

  generateSearchIndex was using len(desc)/desc[:120] (byte operations) to limit descriptions to 120 characters. For multi-byte UTF-8 characters (é, ñ, ü, CJK, emoji) this could split a character in the middle, producing a replacement character (U+FFFD) when json.Marshal silently replaces invalid UTF-8 sequences in the output JSON.

  Fix: convert to []rune, check/slice by rune count, convert back to string.

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent ac01b6d commit 88cb8c7

2 files changed

Lines changed: 142 additions & 2 deletions

File tree

internal/archdocs/pssg/build/build.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1342,8 +1342,8 @@ func (b *Builder) generateSearchIndex(entities []*entity.Entity, outDir string)
13421342
entries := make([]searchEntry, 0, len(entities))
13431343
for _, e := range entities {
13441344
desc := e.GetString("description")
1345-
if len(desc) > 120 {
1346-
desc = desc[:120]
1345+
if runes := []rune(desc); len(runes) > 120 {
1346+
desc = string(runes[:120])
13471347
}
13481348
entries = append(entries, searchEntry{
13491349
T: e.GetString("title"),
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
package build
2+
3+
import (
	"encoding/json"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"unicode/utf8"

	"github.com/supermodeltools/cli/internal/archdocs/pssg/config"
	"github.com/supermodeltools/cli/internal/archdocs/pssg/entity"
)
13+
14+
func newBuilder(outDir string) *Builder {
15+
return NewBuilder(&config.Config{
16+
Search: config.SearchConfig{Enabled: true},
17+
Paths: config.PathsConfig{Output: outDir},
18+
}, false)
19+
}
20+
21+
func makeEntity(slug, title, description string) *entity.Entity {
22+
return &entity.Entity{
23+
Slug: slug,
24+
Fields: map[string]interface{}{
25+
"title": title,
26+
"description": description,
27+
},
28+
}
29+
}
30+
31+
// TestGenerateSearchIndex_ShortDescription verifies that descriptions under
32+
// the 120-rune limit are written verbatim.
33+
func TestGenerateSearchIndex_ShortDescription(t *testing.T) {
34+
outDir := t.TempDir()
35+
b := newBuilder(outDir)
36+
37+
ent := makeEntity("test-slug", "Test Title", "Short description.")
38+
if err := b.generateSearchIndex([]*entity.Entity{ent}, outDir); err != nil {
39+
t.Fatalf("generateSearchIndex: %v", err)
40+
}
41+
42+
entries := readSearchIndex(t, outDir)
43+
if len(entries) != 1 {
44+
t.Fatalf("expected 1 entry, got %d", len(entries))
45+
}
46+
if entries[0]["d"] != "Short description." {
47+
t.Errorf("description mismatch: got %q", entries[0]["d"])
48+
}
49+
}
50+
51+
// TestGenerateSearchIndex_LongASCIIDescription verifies ASCII-only descriptions
52+
// longer than 120 chars are truncated to exactly 120 runes.
53+
func TestGenerateSearchIndex_LongASCIIDescription(t *testing.T) {
54+
outDir := t.TempDir()
55+
b := newBuilder(outDir)
56+
57+
// build a 200-char ASCII string
58+
long := ""
59+
for i := 0; i < 200; i++ {
60+
long += "a"
61+
}
62+
63+
ent := makeEntity("slug", "Title", long)
64+
if err := b.generateSearchIndex([]*entity.Entity{ent}, outDir); err != nil {
65+
t.Fatalf("generateSearchIndex: %v", err)
66+
}
67+
68+
entries := readSearchIndex(t, outDir)
69+
got := entries[0]["d"]
70+
if len([]rune(got)) != 120 {
71+
t.Errorf("expected 120 runes, got %d", len([]rune(got)))
72+
}
73+
}
74+
75+
// TestGenerateSearchIndex_MultiByteDescriptionTruncation is the regression test
76+
// for the byte-vs-rune truncation bug. A description whose byte length exceeds
77+
// 120 but whose rune count does not must NOT be truncated. A description whose
78+
// rune count exceeds 120 must be truncated at a rune boundary so the result
79+
// is valid UTF-8.
80+
func TestGenerateSearchIndex_MultiByteDescriptionTruncation(t *testing.T) {
81+
outDir := t.TempDir()
82+
b := newBuilder(outDir)
83+
84+
// Each 'é' is 2 bytes (U+00E9). We build a string of 121 'é' characters:
85+
// rune length = 121 (> 120) so it must be truncated to 120 runes.
86+
// byte length = 242, so the old code would have produced a split in the
87+
// middle of a multi-byte sequence → invalid UTF-8.
88+
longMultiByte := ""
89+
for i := 0; i < 121; i++ {
90+
longMultiByte += "é"
91+
}
92+
93+
ent := makeEntity("slug", "Title", longMultiByte)
94+
if err := b.generateSearchIndex([]*entity.Entity{ent}, outDir); err != nil {
95+
t.Fatalf("generateSearchIndex: %v", err)
96+
}
97+
98+
entries := readSearchIndex(t, outDir)
99+
got := entries[0]["d"]
100+
101+
if !utf8.ValidString(got) {
102+
t.Errorf("truncated description is not valid UTF-8: %q", got)
103+
}
104+
if runes := []rune(got); len(runes) != 120 {
105+
t.Errorf("expected 120 runes after truncation, got %d", len(runes))
106+
}
107+
}
108+
109+
// TestGenerateSearchIndex_DisabledSearch verifies no file is written when search
110+
// is disabled.
111+
func TestGenerateSearchIndex_DisabledSearch(t *testing.T) {
112+
outDir := t.TempDir()
113+
b := NewBuilder(&config.Config{
114+
Search: config.SearchConfig{Enabled: false},
115+
Paths: config.PathsConfig{Output: outDir},
116+
}, false)
117+
118+
ent := makeEntity("slug", "Title", "desc")
119+
if err := b.generateSearchIndex([]*entity.Entity{ent}, outDir); err != nil {
120+
t.Fatalf("generateSearchIndex: %v", err)
121+
}
122+
123+
if _, err := os.Stat(filepath.Join(outDir, "search-index.json")); !os.IsNotExist(err) {
124+
t.Error("search-index.json should not be written when search is disabled")
125+
}
126+
}
127+
128+
// readSearchIndex loads search-index.json from outDir and decodes it into a
// slice of string-to-string maps. It fails the test immediately if the file
// cannot be read or parsed.
func readSearchIndex(t *testing.T, outDir string) []map[string]string {
	t.Helper()

	indexPath := filepath.Join(outDir, "search-index.json")
	raw, readErr := os.ReadFile(indexPath)
	if readErr != nil {
		t.Fatalf("reading search-index.json: %v", readErr)
	}

	var index []map[string]string
	if decodeErr := json.Unmarshal(raw, &index); decodeErr != nil {
		t.Fatalf("unmarshaling search-index.json: %v", decodeErr)
	}
	return index
}

0 commit comments

Comments
 (0)