Skip to content

Commit b4d0479

Browse files
committed
perf(pruning): skip empty shards + per-shard byte counters
CompactSharded256 and CompactPrefixHex256 sweep the full 256-byte keyspace once `compaction_interval` is crossed. CometBFT key schemas ("tx.hash", "tx.height", "block.height", event-type prefixes) live in the lowercase-ASCII zone (~30 of 256 byte values), so ~220 of those shard sweeps target ranges with zero keys. Each empty Compact still spins up goleveldb's compaction-scheduling state and (on populated databases) can trigger background level work. On a 45 GB tx_index.db with 23k SSTs we observed 275 GB written for 21,935 deleted entries (12.5 MB/entry) — the per-prune sweep was amplifying ~6× the DB size into write traffic and ~33 GB of OS page cache. Three changes: 1. shardHasKeys probe: one IteratorWithOpts(DontFillCache=true).Valid() per shard, microseconds. Empty shards are skipped, populated ones proceed unchanged. 2. CompactAndLog now reads /proc/self/io read_bytes/write_bytes around db.Compact and logs the delta. Pins per-shard amplification cost in production logs so we can attribute the spike post hoc without a rebuild. On non-linux the helper returns 0/0 and the log line shows zero-deltas — no functional change off-linux. 3. WaitTimeBetweenCompactions: 50ms -> 200ms. With skip-empty the active shard count drops to ~30, so 30 * 200ms = 6s extra pacing per sweep — well inside the 3h pruner budget — gives the kernel real time to drain dirty pages between bursts. Tests: - TestCompactSharded256_SkipsEmptyShards: seed only 't.*' keys, assert exactly one shard fires. - TestCompactSharded256_EmptyDB_NoShardsCompacted: empty DB, zero shards. - TestCompactPrefixHex256_SkipsEmptyShards: same coverage for prefix-hex. Existing CompactIntSharded tests untouched (path is height-bounded so sharding is dense by construction; skip-empty there would never fire).
1 parent b940185 commit b4d0479

2 files changed

Lines changed: 147 additions & 8 deletions

File tree

internal/db/db_utils.go

Lines changed: 92 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
package db
22

33
import (
4+
"bufio"
45
"bytes"
56
"encoding/binary"
67
"encoding/hex"
78
"fmt"
89
"log"
10+
"os"
911
"strconv"
12+
"strings"
1013
"time"
1114
"unicode/utf8"
1215

@@ -24,11 +27,11 @@ const (
2427
// WaitTimeBetweenCompactions throttles successive shard compactions to
2528
// (a) yield to the scheduler so consensus goroutines aren't starved and
2629
// (b) give the kernel time to drain dirty page-cache pages before the
27-
// next shard re-fills it. 2ms (the prior value) wins (a) but is far
28-
// below Linux's 5s vm.dirty_writeback_centisecs default; 50ms gives
29-
// writeback a real window at negligible cycle-time cost
30-
// (~75s extra on the heaviest 1500-shard pruner cycle).
31-
WaitTimeBetweenCompactions = 50 * time.Millisecond
30+
// next shard re-fills it. 200ms is still far below Linux's 5s
31+
// vm.dirty_writeback_centisecs default, but gives writeback a real window between bursts.
32+
// Combined with skip-empty-shard, sweeps now spend most time on the
33+
// ~30 populated shards (out of 256) so total wall-time cost is small.
34+
WaitTimeBetweenCompactions = 200 * time.Millisecond
3235
)
3336

3437
var (
@@ -146,13 +149,18 @@ func CompactIntSharded(
146149
//
147150
// The final shard ends at "BH:fg" so that every key starting with "BH:ff"
148151
// compares < "BH:fg" (since 'g' is the next ASCII char after 'f').
152+
//
153+
// Shards that contain no keys are detected by a one-row Iterator probe and
154+
// skipped — on real workloads this prunes ~220 of 256 shards because key
155+
// schemas concentrate in the lowercase-ASCII zone.
149156
func CompactPrefixHex256(db dbm.DB, prefix string, label string) error {
150157
startAll := time.Now()
151158

152159
if prefix == "" {
153160
return fmt.Errorf("prefix must be non-empty")
154161
}
155162

163+
var compacted, skipped int
156164
for b := 0; b <= 0xFF; b++ {
157165
start := []byte(fmt.Sprintf("%s%02x", prefix, b))
158166

@@ -173,20 +181,36 @@ func CompactPrefixHex256(db dbm.DB, prefix string, label string) error {
173181
shardLabel = fmt.Sprintf("%s %s ff-fg", label, prefix)
174182
}
175183

184+
hasAny, err := shardHasKeys(db, start, end)
185+
if err != nil {
186+
return fmt.Errorf("compaction %s probe failed: %w", shardLabel, err)
187+
}
188+
if !hasAny {
189+
skipped++
190+
continue
191+
}
192+
compacted++
176193
if err := compactAndLog(db, start, end, shardLabel); err != nil {
177194
return err
178195
}
179196
}
180197

181-
log.Printf("compaction %s prefix %q ALL 256 SHARDS DONE in %s", label, prefix, time.Since(startAll))
198+
log.Printf("compaction %s prefix %q DONE in %s (compacted=%d skipped_empty=%d)",
199+
label, prefix, time.Since(startAll), compacted, skipped)
182200
return nil
183201
}
184202

185203
// CompactSharded256 compacts the DB into 256 ranges:
186204
// [0x00,0x01), [0x01,0x02), …, [0xFE,0xFF), [0xFF,∞)
205+
//
206+
// Shards that contain no keys are detected by a one-row Iterator probe and
207+
// skipped. CometBFT key schemas (e.g. "tx.hash", "block.height", event-type
208+
// prefixes) concentrate keys in the lowercase-ASCII zone, leaving ~220 of
209+
// the 256 shards empty on real workloads.
187210
func CompactSharded256(db dbm.DB, label string) error {
188211
startAll := time.Now()
189212

213+
var compacted, skipped int
190214
for b := 0; b < 256; b++ {
191215
start := []byte{byte(b)}
192216
var end []byte
@@ -203,32 +227,92 @@ func CompactSharded256(db dbm.DB, label string) error {
203227
shardLabel = fmt.Sprintf("%s shard %02x-%02x", label, b, b+1)
204228
}
205229

230+
hasAny, err := shardHasKeys(db, start, end)
231+
if err != nil {
232+
return fmt.Errorf("compaction %s probe failed: %w", shardLabel, err)
233+
}
234+
if !hasAny {
235+
skipped++
236+
continue
237+
}
238+
compacted++
206239
if err := compactAndLog(db, start, end, shardLabel); err != nil {
207240
return err
208241
}
209242
}
210243

211-
log.Printf("compaction %s ALL SHARDS DONE in %s", label, time.Since(startAll))
244+
log.Printf("compaction %s DONE in %s (compacted=%d skipped_empty=%d)",
245+
label, time.Since(startAll), compacted, skipped)
212246
return nil
213247
}
214248

215-
// CompactAndLog compacts [start, limit) and logs the range and duration.
249+
// shardHasKeys reports whether any key exists in [start, end). Cost is one
250+
// SeekGE — microseconds — versus a full CompactRange which on a populated DB
251+
// rewrites overlapping SSTs. Uses DontFillCache so the probe doesn't pollute
252+
// goleveldb's block cache.
253+
func shardHasKeys(db dbm.DB, start, end []byte) (bool, error) {
254+
it, err := dbm.IteratorWithOpts(db, start, end, &dbm.ReadOptions{DontFillCache: true})
255+
if err != nil {
256+
return false, err
257+
}
258+
defer it.Close()
259+
return it.Valid(), nil
260+
}
261+
262+
// CompactAndLog compacts [start, limit) and logs the range, duration, and
263+
// the process-level read/write byte deltas observed during the call. The
264+
// byte deltas come from /proc/self/io and capture *all* I/O issued by the
265+
// process during the compact (including background goroutines), so they're
266+
// upper bounds — but they pin per-shard amplification cost in production.
216267
func CompactAndLog(db dbm.DB, start, limit []byte, label string) error {
217268
time.Sleep(WaitTimeBetweenCompactions)
218269

219270
rng := fmt.Sprintf("[%s, %s)", prettyKey(start), prettyKey(limit))
220271

272+
rb0, wb0 := procIOBytes()
221273
t0 := time.Now()
222274
err := db.Compact(start, limit)
223275
elapsed := time.Since(t0)
276+
rb1, wb1 := procIOBytes()
224277

225278
if err != nil {
226279
log.Printf("compaction %s range %s FAILED after %s: %v", label, rng, elapsed, err)
227280
return err
228281
}
282+
log.Printf("compaction %s range %s done in %s dRead=%dB dWrite=%dB",
283+
label, rng, elapsed, rb1-rb0, wb1-wb0)
229284
return nil
230285
}
231286

287+
// procIOBytes reads /proc/self/io and returns the (read_bytes, write_bytes)
// values found there.
//
// It is best-effort instrumentation and never reports an error. It returns
// (0, 0) when the file cannot be opened (for example on non-Linux platforms)
// or when reading it fails part-way through, so callers never see a partial
// sample. A counter whose line is missing or whose value does not parse is
// left at 0.
func procIOBytes() (uint64, uint64) {
	f, err := os.Open("/proc/self/io")
	if err != nil {
		return 0, 0
	}
	defer f.Close()

	var rb, wb uint64
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		line := sc.Text()
		switch {
		case strings.HasPrefix(line, "read_bytes:"):
			v, err := strconv.ParseUint(strings.TrimSpace(strings.TrimPrefix(line, "read_bytes:")), 10, 64)
			if err == nil {
				rb = v
			}
		case strings.HasPrefix(line, "write_bytes:"):
			v, err := strconv.ParseUint(strings.TrimSpace(strings.TrimPrefix(line, "write_bytes:")), 10, 64)
			if err == nil {
				wb = v
			}
		}
	}
	// Scan() stops silently on a read error; treat that as a failed sample
	// rather than returning whatever was parsed before the error.
	if err := sc.Err(); err != nil {
		return 0, 0
	}
	return rb, wb
}
315+
232316
// prettyKey renders a key as quoted ASCII if possible, otherwise as hex.
233317
// nil renders as ∞ (end-of-keyspace).
234318
func prettyKey(b []byte) string {

internal/db/db_utils_test.go

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,3 +163,58 @@ func TestCompactIntSharded_ResumeFromStoredMeta_NoGaps(t *testing.T) {
163163
require.NoError(t, err)
164164
require.Equal(t, end2-1, last2)
165165
}
166+
167+
// TestCompactSharded256_SkipsEmptyShards seeds keys only under the lowercase
168+
// 't' prefix and asserts that CompactSharded256 invokes compactAndLog only
169+
// for that single shard out of 256.
170+
func TestCompactSharded256_SkipsEmptyShards(t *testing.T) {
171+
var calls [][2][]byte
172+
restore := swapCompactAndLog(func(db dbm.DB, start, end []byte, lbl string) error {
173+
calls = append(calls, [2][]byte{append([]byte(nil), start...), append([]byte(nil), end...)})
174+
return nil
175+
})
176+
defer restore()
177+
178+
memdb := dbm.NewMemDB()
179+
require.NoError(t, memdb.Set([]byte("tx.height/100"), []byte{1}))
180+
require.NoError(t, memdb.Set([]byte("tx.hash/abcd"), []byte{1}))
181+
182+
require.NoError(t, CompactSharded256(memdb, "test"))
183+
184+
require.Len(t, calls, 1, "only the 't' shard (0x74) should be compacted")
185+
require.Equal(t, byte('t'), calls[0][0][0], "compacted shard must start at 0x74 ('t')")
186+
}
187+
188+
// TestCompactSharded256_EmptyDB_NoShardsCompacted asserts that on an empty
189+
// DB no shard is compacted at all.
190+
func TestCompactSharded256_EmptyDB_NoShardsCompacted(t *testing.T) {
191+
var calls int
192+
restore := swapCompactAndLog(func(db dbm.DB, start, end []byte, lbl string) error {
193+
calls++
194+
return nil
195+
})
196+
defer restore()
197+
198+
require.NoError(t, CompactSharded256(dbm.NewMemDB(), "empty"))
199+
require.Zero(t, calls, "empty DB must yield zero compactions")
200+
}
201+
202+
// TestCompactPrefixHex256_SkipsEmptyShards seeds two keys under "BH:" prefix
203+
// ("BH:31" and "BH:31000") that both fall in the same hex shard [BH:31, BH:32),
205+
// so exactly one shard should fire and the other 255 are skipped as empty.
205+
func TestCompactPrefixHex256_SkipsEmptyShards(t *testing.T) {
206+
var calls int
207+
restore := swapCompactAndLog(func(db dbm.DB, start, end []byte, lbl string) error {
208+
calls++
209+
return nil
210+
})
211+
defer restore()
212+
213+
memdb := dbm.NewMemDB()
214+
require.NoError(t, memdb.Set([]byte("BH:31"), []byte{1}))
215+
require.NoError(t, memdb.Set([]byte("BH:31000"), []byte{1}))
216+
217+
require.NoError(t, CompactPrefixHex256(memdb, "BH:", "test"))
218+
219+
require.Equal(t, 1, calls, "only BH:31-32 shard should fire")
220+
}

0 commit comments

Comments
 (0)