Skip to content

Commit a61b226

Browse files
perf(bm25): replace facet storage with compact direct Badger KV encoding
Replace the facet-based BM25 storage (~40-50 bytes/posting) with compact varint-encoded binary blobs stored as direct Badger KV entries (~4-6 bytes/posting, ~10x reduction). Add bm25_score pseudo-predicate for variable-based score ordering following the similar_to pattern. - Add posting/bm25enc package for compact binary encode/decode - Rewrite write path in posting/index.go for direct Badger KV - Add bm25Writes buffer to LocalCache with read-your-own-writes - Flush BM25 blobs in CommitToDisk with BitBM25Data UserMeta - Rewrite read path in worker/task.go with direct blob decoding - Add bm25_score pseudo-predicate in query/query.go - Add score ordering integration tests Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 937da2e commit a61b226

9 files changed

Lines changed: 562 additions & 163 deletions

File tree

posting/bm25enc/bm25enc.go

Lines changed: 147 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,147 @@
1+
/*
2+
* SPDX-FileCopyrightText: © 2017-2025 Istari Digital, Inc.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
// Package bm25enc provides compact binary encoding for BM25 index data.
7+
//
8+
// Two types of lists share the same format:
9+
// - Term posting lists: (UID, term-frequency) pairs
10+
// - Document length lists: (UID, doc-length) pairs
11+
//
12+
// Binary format:
13+
//
14+
// Header:
15+
// [4 bytes] uint32 big-endian: entry count
16+
// Entries (sorted ascending by UID):
17+
// [uvarint] UID delta from the previous entry (the first entry stores the absolute UID)
18+
// [uvarint] value (TF or doclen)
19+
package bm25enc
20+
21+
import (
22+
"encoding/binary"
23+
"sort"
24+
)
25+
26+
// Entry represents a single (UID, Value) pair in a BM25 posting list.
// Per the package comment, Value carries a term frequency in term posting
// lists and a document length in document length lists.
27+
type Entry struct {
28+
// UID is the key the list is sorted and searched by.
UID uint64
29+
// Value is the payload stored for UID (TF or doclen).
Value uint32
30+
}
31+
32+
// Encode encodes a sorted slice of entries into the compact binary format.
33+
// Entries must be sorted by UID ascending. Returns nil for empty input.
34+
func Encode(entries []Entry) []byte {
35+
if len(entries) == 0 {
36+
return nil
37+
}
38+
39+
// Pre-allocate: 4 header + ~6 bytes per entry is a reasonable estimate.
40+
buf := make([]byte, 4, 4+len(entries)*6)
41+
binary.BigEndian.PutUint32(buf, uint32(len(entries)))
42+
43+
var tmp [binary.MaxVarintLen64]byte
44+
var prevUID uint64
45+
for _, e := range entries {
46+
delta := e.UID - prevUID
47+
n := binary.PutUvarint(tmp[:], delta)
48+
buf = append(buf, tmp[:n]...)
49+
n = binary.PutUvarint(tmp[:], uint64(e.Value))
50+
buf = append(buf, tmp[:n]...)
51+
prevUID = e.UID
52+
}
53+
return buf
54+
}
55+
56+
// Decode decodes the binary format into a sorted slice of entries.
57+
// Returns nil for nil/empty input.
58+
func Decode(data []byte) []Entry {
59+
if len(data) < 4 {
60+
return nil
61+
}
62+
count := binary.BigEndian.Uint32(data[:4])
63+
if count == 0 {
64+
return nil
65+
}
66+
67+
entries := make([]Entry, 0, count)
68+
pos := 4
69+
var prevUID uint64
70+
for i := uint32(0); i < count; i++ {
71+
delta, n := binary.Uvarint(data[pos:])
72+
if n <= 0 {
73+
break
74+
}
75+
pos += n
76+
77+
val, n := binary.Uvarint(data[pos:])
78+
if n <= 0 {
79+
break
80+
}
81+
pos += n
82+
83+
uid := prevUID + delta
84+
entries = append(entries, Entry{UID: uid, Value: uint32(val)})
85+
prevUID = uid
86+
}
87+
return entries
88+
}
89+
90+
// Upsert inserts or updates the entry for uid in a sorted entries slice.
91+
// Returns the new sorted slice.
92+
func Upsert(entries []Entry, uid uint64, value uint32) []Entry {
93+
i := sort.Search(len(entries), func(i int) bool { return entries[i].UID >= uid })
94+
if i < len(entries) && entries[i].UID == uid {
95+
entries[i].Value = value
96+
return entries
97+
}
98+
// Insert at position i.
99+
entries = append(entries, Entry{})
100+
copy(entries[i+1:], entries[i:])
101+
entries[i] = Entry{UID: uid, Value: value}
102+
return entries
103+
}
104+
105+
// Remove removes the entry for uid from a sorted entries slice.
106+
// Returns the new slice (may be shorter).
107+
func Remove(entries []Entry, uid uint64) []Entry {
108+
i := sort.Search(len(entries), func(i int) bool { return entries[i].UID >= uid })
109+
if i < len(entries) && entries[i].UID == uid {
110+
return append(entries[:i], entries[i+1:]...)
111+
}
112+
return entries
113+
}
114+
115+
// Search returns the value for uid using binary search, and whether it was found.
116+
func Search(entries []Entry, uid uint64) (uint32, bool) {
117+
i := sort.Search(len(entries), func(i int) bool { return entries[i].UID >= uid })
118+
if i < len(entries) && entries[i].UID == uid {
119+
return entries[i].Value, true
120+
}
121+
return 0, false
122+
}
123+
124+
// UIDs extracts just the UIDs from entries as a uint64 slice.
125+
func UIDs(entries []Entry) []uint64 {
126+
uids := make([]uint64, len(entries))
127+
for i, e := range entries {
128+
uids[i] = e.UID
129+
}
130+
return uids
131+
}
132+
133+
// EncodeStats packs the BM25 corpus statistics into a 16-byte slice: docCount
// in the first eight bytes and totalTerms in the last eight, both big-endian.
func EncodeStats(docCount, totalTerms uint64) []byte {
	var raw [16]byte
	binary.BigEndian.PutUint64(raw[:8], docCount)
	binary.BigEndian.PutUint64(raw[8:], totalTerms)
	return raw[:]
}
140+
141+
// DecodeStats unpacks corpus statistics from a 16-byte big-endian blob:
// docCount from the first eight bytes, totalTerms from the last eight. Input
// of any other length is treated as invalid and yields (0, 0).
func DecodeStats(data []byte) (docCount, totalTerms uint64) {
	if len(data) == 16 {
		docCount = binary.BigEndian.Uint64(data[:8])
		totalTerms = binary.BigEndian.Uint64(data[8:])
	}
	return docCount, totalTerms
}

posting/bm25enc/bm25enc_test.go

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
/*
2+
* SPDX-FileCopyrightText: © 2017-2025 Istari Digital, Inc.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package bm25enc
7+
8+
import (
9+
"testing"
10+
11+
"github.com/stretchr/testify/require"
12+
)
13+
14+
func TestRoundtrip(t *testing.T) {
15+
entries := []Entry{
16+
{UID: 1, Value: 3},
17+
{UID: 5, Value: 1},
18+
{UID: 100, Value: 7},
19+
{UID: 200, Value: 2},
20+
}
21+
data := Encode(entries)
22+
got := Decode(data)
23+
require.Equal(t, entries, got)
24+
}
25+
26+
func TestRoundtripEmpty(t *testing.T) {
27+
require.Nil(t, Encode(nil))
28+
require.Nil(t, Encode([]Entry{}))
29+
require.Nil(t, Decode(nil))
30+
require.Nil(t, Decode([]byte{}))
31+
require.Nil(t, Decode([]byte{0, 0, 0, 0})) // count=0
32+
}
33+
34+
func TestRoundtripSingle(t *testing.T) {
35+
entries := []Entry{{UID: 42, Value: 10}}
36+
got := Decode(Encode(entries))
37+
require.Equal(t, entries, got)
38+
}
39+
40+
func TestRoundtripLargeUIDs(t *testing.T) {
41+
entries := []Entry{
42+
{UID: 1<<40 + 1, Value: 1},
43+
{UID: 1<<40 + 1000, Value: 5},
44+
{UID: 1<<50 + 999, Value: 99},
45+
}
46+
got := Decode(Encode(entries))
47+
require.Equal(t, entries, got)
48+
}
49+
50+
func TestUpsertNew(t *testing.T) {
51+
entries := []Entry{{UID: 1, Value: 3}, {UID: 5, Value: 1}}
52+
entries = Upsert(entries, 3, 7)
53+
require.Equal(t, []Entry{{UID: 1, Value: 3}, {UID: 3, Value: 7}, {UID: 5, Value: 1}}, entries)
54+
}
55+
56+
func TestUpsertExisting(t *testing.T) {
57+
entries := []Entry{{UID: 1, Value: 3}, {UID: 5, Value: 1}}
58+
entries = Upsert(entries, 5, 99)
59+
require.Equal(t, []Entry{{UID: 1, Value: 3}, {UID: 5, Value: 99}}, entries)
60+
}
61+
62+
func TestUpsertEmpty(t *testing.T) {
63+
var entries []Entry
64+
entries = Upsert(entries, 10, 5)
65+
require.Equal(t, []Entry{{UID: 10, Value: 5}}, entries)
66+
}
67+
68+
func TestRemove(t *testing.T) {
69+
entries := []Entry{{UID: 1, Value: 3}, {UID: 5, Value: 1}, {UID: 10, Value: 2}}
70+
entries = Remove(entries, 5)
71+
require.Equal(t, []Entry{{UID: 1, Value: 3}, {UID: 10, Value: 2}}, entries)
72+
}
73+
74+
func TestRemoveNotFound(t *testing.T) {
75+
entries := []Entry{{UID: 1, Value: 3}, {UID: 5, Value: 1}}
76+
entries = Remove(entries, 99)
77+
require.Equal(t, []Entry{{UID: 1, Value: 3}, {UID: 5, Value: 1}}, entries)
78+
}
79+
80+
func TestSearch(t *testing.T) {
81+
entries := []Entry{{UID: 1, Value: 3}, {UID: 5, Value: 1}, {UID: 100, Value: 7}}
82+
v, ok := Search(entries, 5)
83+
require.True(t, ok)
84+
require.Equal(t, uint32(1), v)
85+
86+
_, ok = Search(entries, 50)
87+
require.False(t, ok)
88+
}
89+
90+
func TestUIDs(t *testing.T) {
91+
entries := []Entry{{UID: 1, Value: 3}, {UID: 5, Value: 1}, {UID: 100, Value: 7}}
92+
require.Equal(t, []uint64{1, 5, 100}, UIDs(entries))
93+
}
94+
95+
func TestStatsRoundtrip(t *testing.T) {
96+
data := EncodeStats(12345, 98765)
97+
dc, tt := DecodeStats(data)
98+
require.Equal(t, uint64(12345), dc)
99+
require.Equal(t, uint64(98765), tt)
100+
}
101+
102+
func TestStatsInvalid(t *testing.T) {
103+
dc, tt := DecodeStats(nil)
104+
require.Zero(t, dc)
105+
require.Zero(t, tt)
106+
dc, tt = DecodeStats([]byte{1, 2, 3})
107+
require.Zero(t, dc)
108+
require.Zero(t, tt)
109+
}
110+
111+
func BenchmarkEncode(b *testing.B) {
112+
entries := make([]Entry, 10000)
113+
for i := range entries {
114+
entries[i] = Entry{UID: uint64(i*3 + 1), Value: uint32(i % 100)}
115+
}
116+
b.ResetTimer()
117+
for i := 0; i < b.N; i++ {
118+
Encode(entries)
119+
}
120+
}
121+
122+
func BenchmarkDecode(b *testing.B) {
123+
entries := make([]Entry, 10000)
124+
for i := range entries {
125+
entries[i] = Entry{UID: uint64(i*3 + 1), Value: uint32(i % 100)}
126+
}
127+
data := Encode(entries)
128+
b.ResetTimer()
129+
for i := 0; i < b.N; i++ {
130+
Decode(data)
131+
}
132+
}

0 commit comments

Comments
 (0)