Skip to content

Commit fde726f

Browse files
committed
index: fix merge bug by sharing more code between write and merge
1 parent e53428b commit fde726f

5 files changed

Lines changed: 172 additions & 91 deletions

File tree

cmd/cindex/cindex.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ var (
6262
resetFlag = flag.Bool("reset", false, "discard existing index")
6363
verboseFlag = flag.Bool("verbose", false, "print extra information")
6464
cpuProfile = flag.String("cpuprofile", "", "write cpu profile to this file")
65+
checkFlag = flag.Bool("check", false, "check index is well-formatted")
6566
zipFlag = flag.Bool("zip", false, "index content in zip files")
6667
statsFlag = flag.Bool("stats", false, "print index size statistics")
6768
)
@@ -73,6 +74,11 @@ func main() {
7374

7475
if *listFlag {
7576
ix := index.Open(index.File())
77+
if *checkFlag {
78+
if err := ix.Check(); err != nil {
79+
log.Fatal(err)
80+
}
81+
}
7682
for p := range ix.Roots().All() {
7783
fmt.Printf("%s\n", p)
7884
}
@@ -119,6 +125,12 @@ func main() {
119125
file := master
120126
if !*resetFlag {
121127
file += "~"
128+
if *checkFlag {
129+
ix := index.Open(master)
130+
if err := ix.Check(); err != nil {
131+
log.Fatal(err)
132+
}
133+
}
122134
}
123135

124136
ix := index.Create(file)
@@ -156,9 +168,23 @@ func main() {
156168
if !*resetFlag {
157169
log.Printf("merge %s %s", master, file)
158170
index.Merge(file+"~", master, file)
171+
if *checkFlag {
172+
ix := index.Open(file + "~")
173+
if err := ix.Check(); err != nil {
174+
log.Fatal(err)
175+
}
176+
}
159177
os.Remove(file)
160178
os.Rename(file+"~", master)
179+
} else {
180+
if *checkFlag {
181+
ix := index.Open(file)
182+
if err := ix.Check(); err != nil {
183+
log.Fatal(err)
184+
}
185+
}
161186
}
187+
162188
log.Printf("done")
163189

164190
if *statsFlag {

index/check.go

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package index
6+
7+
import (
8+
"encoding/binary"
9+
"encoding/hex"
10+
"fmt"
11+
"os"
12+
)
13+
14+
func (ix *Index) Check() error {
15+
if ix.version == 1 {
16+
return nil
17+
}
18+
19+
// TODO move to Index
20+
old := panicOnCorrupt
21+
panicOnCorrupt = true
22+
defer func() {
23+
panicOnCorrupt = old
24+
}()
25+
26+
// Read all names.
27+
for _ = range ix.NamesAt(0, ix.numName).All() {
28+
}
29+
30+
// Read all posting lists blocks.
31+
pblocks := ix.slice(ix.postIndex, ix.numPostBlock*postBlockSize)
32+
pdata := ix.slice(ix.postData, ix.nameIndex-ix.postData)
33+
pblocks0 := pblocks
34+
n := 0
35+
for len(pblocks) > 0 {
36+
b := pblocks[:postBlockSize]
37+
pblocks = pblocks[postBlockSize:]
38+
offset := 0
39+
b0 := b
40+
_ = b0
41+
for len(b) > 3 && (b[0] != 0 || b[1] != 0 || b[2] != 0) {
42+
t := b[:3]
43+
trigram := uint32(b[0])<<16 | uint32(b[1])<<8 | uint32(b[2])
44+
_ = trigram
45+
count, l1 := binary.Uvarint(b[3:])
46+
if l1 <= 0 {
47+
ix.corrupt()
48+
}
49+
o, l2 := binary.Uvarint(b[3+l1:])
50+
if l2 <= 0 {
51+
ix.corrupt()
52+
}
53+
offset += int(o)
54+
b = b[3+l1+l2:]
55+
56+
// Read posting list for this trigram.
57+
plist := pdata[offset:]
58+
if len(plist) < 3 || string(plist[:3]) != string(t) {
59+
fmt.Fprintf(os.Stderr, "BLOCK %d at %d %#x %d %d\n%s\nPLIST\n%s", n, cap(b0)-cap(t), trigram, count, offset, hex.Dump(pblocks0[:len(pblocks0)-len(pblocks)]), hex.Dump(plist[:min(256, len(plist))]))
60+
ix.corrupt()
61+
}
62+
var dr deltaReader
63+
dr.init(ix, plist[3:])
64+
for range count {
65+
d := dr.next()
66+
if d == 0 {
67+
ix.corrupt()
68+
}
69+
}
70+
if dr.next() != 0 {
71+
ix.corrupt()
72+
}
73+
}
74+
n++
75+
}
76+
return nil
77+
}

index/merge.go

Lines changed: 44 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,8 @@ func Merge(dst, src1, src2 string) {
193193
var w postDataWriter
194194
r1.init(ix1, map1)
195195
r2.init(ix2, map2)
196-
w.init(ix, true)
196+
postIndexFile := bufCreate("")
197+
w.init(ix, postIndexFile)
197198
old1, old2 := uint32(0), uint32(0)
198199
for {
199200
if !(r1.trigram > old1 || r2.trigram > old2) {
@@ -237,7 +238,9 @@ func Merge(dst, src1, src2 string) {
237238
w.endTrigram()
238239
}
239240
}
240-
w.postIndexFile.Align(postBlockSize)
241+
if len(w.block) > 0 {
242+
w.flush()
243+
}
241244

242245
// Name index
243246
ix.Align(16)
@@ -247,7 +250,7 @@ func Merge(dst, src1, src2 string) {
247250
// Posting list index
248251
ix.Align(16)
249252
postIndex := ix.Offset()
250-
copyFile(ix, w.postIndexFile)
253+
copyFile(ix, postIndexFile)
251254

252255
// Trailer
253256
ix.Align(16)
@@ -389,63 +392,72 @@ type postDataWriter struct {
389392
t uint32
390393
delta deltaWriter
391394
numTrigram int
395+
tmp [32]byte
396+
block []byte
392397
}
393398

394-
func (w *postDataWriter) init(out *Buffer, doIndex bool) {
395-
w.out = out
396-
w.base = out.Offset()
399+
func (w *postDataWriter) flush() {
400+
if w.postIndexFile != nil && len(w.block) > 0 {
401+
w.postIndexFile.Write(w.block[:cap(w.block)])
402+
w.block = w.block[:0]
403+
}
404+
}
405+
406+
func (w *postDataWriter) init(postData, postIndex *Buffer) {
407+
w.out = postData
408+
w.base = w.out.Offset()
397409
w.postIndexFile = nil
398-
w.delta.init(out)
410+
w.delta.init(w.out)
399411
w.lastOffset = w.base
400-
if doIndex {
401-
w.postIndexFile = bufCreate("")
402-
}
412+
w.postIndexFile = postIndex
413+
w.block = make([]byte, 0, postBlockSize)
403414
}
404415

405416
func (w *postDataWriter) trigram(t uint32) {
417+
if t == 0 {
418+
panic("invalid trigram")
419+
}
406420
w.offset = w.out.Offset()
407421
w.count = 0
408422
w.t = t
409423
w.lastID = -1
410424
w.numTrigram++
425+
w.out.WriteTrigram(w.t)
411426
}
412427

413428
func (w *postDataWriter) fileid(id int) {
414-
if w.count == 0 {
415-
w.out.WriteTrigram(w.t)
416-
}
417429
w.delta.Write(id - w.lastID)
418430
w.lastID = id
419431
w.count++
420432
}
421433

422434
func (w *postDataWriter) endTrigram() {
423-
if w.count == 0 {
424-
return
425-
}
426435
w.delta.Write(0)
427436
w.delta.Flush()
428437
if w.postIndexFile == nil {
429438
return
430439
}
431-
if writeVersion == 2 {
432-
var buf []byte
433-
buf = append(buf, byte(w.t>>16), byte(w.t>>8), byte(w.t))
434-
buf = binary.AppendUvarint(buf, uint64(w.count))
435-
cbuf := buf
436-
buf = binary.AppendUvarint(buf, uint64(w.offset-w.lastOffset))
437-
if w.postIndexFile.Offset()/postBlockSize != (w.postIndexFile.Offset()+len(buf)-1)/postBlockSize {
438-
for w.postIndexFile.Offset()%postBlockSize != 0 {
439-
w.postIndexFile.WriteByte(0)
440-
}
441-
w.lastOffset = w.base
442-
buf = binary.AppendUvarint(cbuf, uint64(w.offset-w.lastOffset))
443-
}
444-
w.postIndexFile.Write(buf[:])
445-
w.lastOffset = w.offset
446-
} else {
440+
if writeVersion == 1 {
447441
w.postIndexFile.WriteTrigram(w.t)
448442
w.postIndexFile.WriteUint(w.count)
449443
w.postIndexFile.WriteUint(w.offset - w.base)
444+
return
445+
}
446+
447+
buf := w.tmp[:]
448+
buf[0] = byte(w.t >> 16)
449+
buf[1] = byte(w.t >> 8)
450+
buf[2] = byte(w.t)
451+
452+
n := 3
453+
n += binary.PutUvarint(buf[n:], uint64(w.count))
454+
n1 := binary.PutUvarint(buf[n:], uint64(w.offset-w.lastOffset))
455+
if len(w.block)+n+n1 > cap(w.block) {
456+
w.postIndexFile.Write(w.block[:cap(w.block)])
457+
clear(w.block)
458+
w.block = w.block[:0]
459+
n1 = binary.PutUvarint(buf[n:], uint64(w.offset-w.base))
450460
}
461+
w.block = append(w.block, buf[:n+n1]...)
462+
w.lastOffset = w.offset
451463
}

index/read.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,18 +58,18 @@ package index
5858
//
5959
// The file count and offset are varint-encoded, breaking random
6060
// access to the posting list index. To restore that, any index
61-
// entry that would otherwise cross a 128-byte boundary is preceded
61+
// entry that would otherwise cross a 256-byte boundary is preceded
6262
// by zeroed padding bytes up to the boundary. The overall index
63-
// is also zero-padded to a multiple of 128 bytes.
64-
// The offsets in each 128-byte chunk are delta-encoded starting
63+
// is also zero-padded to a multiple of 256 bytes.
64+
// The offsets in each 256-byte chunk are delta-encoded starting
6565
// from a base offset of 0.
6666
//
6767
// Index entries are only written for the non-empty posting lists,
6868
// so finding the posting list for a specific trigram requires a
6969
// binary search over the posting list index. To find an entry
70-
// in the index for a given trigram, binary search on the 128-byte
71-
// sections to find the 128-byte entry where it would be,
72-
// and then linear search in the 128-byte section.
70+
// in the index for a given trigram, binary search on the 256-byte
71+
// sections to find the 256-byte entry where it would be,
72+
// and then linear search in the 256-byte section.
7373
//
7474
// In practice, the majority of the possible trigrams are never
7575
// seen, so omitting the missing ones represents a significant
@@ -635,7 +635,7 @@ func mergeOr(l1, l2 []int) []int {
635635
return l
636636
}
637637

638-
var panicOnCorrupt = false
638+
var panicOnCorrupt = true
639639

640640
func (ix *Index) corrupt() {
641641
if panicOnCorrupt {

index/write.go

Lines changed: 18 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,12 @@ func (ix *IndexWriter) add(name string, f io.Reader) error {
244244
if n++; n >= 3 {
245245
ix.trigram.Add(tv)
246246
}
247+
if c == 0 {
248+
if ix.LogSkip {
249+
log.Printf("%s: contains NUL, ignoring\n", name)
250+
}
251+
return nil
252+
}
247253
if !validUTF8((tv>>8)&0xFF, tv&0xFF) {
248254
if ix.LogSkip {
249255
log.Printf("%s: invalid UTF-8, ignoring\n", name)
@@ -393,7 +399,7 @@ func (ix *IndexWriter) flushPost() {
393399

394400
start := ix.postFile.Offset()
395401
var w postDataWriter
396-
w.init(ix.postFile, false)
402+
w.init(ix.postFile, nil)
397403
trigram := invalidTrigram
398404
for _, p := range ix.post {
399405
if t := p.trigram(); t != trigram {
@@ -429,63 +435,23 @@ func (ix *IndexWriter) mergePost(out *Buffer) {
429435
sortPost(ix.post)
430436
h.addMem(ix.post)
431437

438+
var w postDataWriter
439+
w.init(out, ix.postIndex)
440+
432441
e := h.next()
433-
offset0 := out.Offset()
434-
var delta deltaWriter
435-
delta.init(out)
436-
var block []byte
437-
out.WriteString("")
438-
lastBlockOffset := 0
439442
for {
440-
offset := out.Offset() - offset0
441-
trigram := e.trigram()
442-
443-
ix.buf[0] = byte(trigram >> 16)
444-
ix.buf[1] = byte(trigram >> 8)
445-
ix.buf[2] = byte(trigram)
446-
447-
// posting list
448-
fileid := -1
449-
nfile := 0
450-
out.Write(ix.buf[:3])
451-
for ; e.trigram() == trigram && trigram != invalidTrigram; e = h.next() {
452-
delta.Write(e.fileid() - fileid)
453-
fileid = e.fileid()
454-
nfile++
443+
t := e.trigram()
444+
w.trigram(t)
445+
for ; e.trigram() == t && t != invalidTrigram; e = h.next() {
446+
w.fileid(e.fileid())
455447
}
456-
delta.Write(0)
457-
delta.Flush()
458-
459-
// index entry
460-
if writeVersion == 1 {
461-
ix.postIndex.Write(ix.buf[:3])
462-
ix.postIndex.WriteUint(nfile)
463-
ix.postIndex.WriteUint(offset)
464-
} else {
465-
n := 3
466-
n += binary.PutUvarint(ix.buf[n:], uint64(nfile))
467-
n1 := binary.PutUvarint(ix.buf[n:], uint64(offset-lastBlockOffset))
468-
if len(block)+n+n1 > cap(block) {
469-
if block == nil {
470-
block = make([]byte, 0, postBlockSize)
471-
} else {
472-
ix.postIndex.Write(block[:cap(block)])
473-
clear(block)
474-
block = block[:0]
475-
}
476-
n1 = binary.PutUvarint(ix.buf[n:], uint64(offset))
477-
}
478-
block = append(block, ix.buf[:n+n1]...)
479-
lastBlockOffset = offset
480-
}
481-
ix.numTrigram++
482-
if trigram == 1<<24-1 {
448+
w.endTrigram()
449+
if t == invalidTrigram {
483450
break
484451
}
485452
}
486-
if len(block) > 0 {
487-
ix.postIndex.Write(block[:cap(block)])
488-
}
453+
w.flush()
454+
ix.numTrigram = w.numTrigram
489455
}
490456

491457
// A postChunk represents a chunk of post entries flushed to disk or

0 commit comments

Comments
 (0)