From 7174cc2930b3463c01d0dbd6a5a769b6803e2ef7 Mon Sep 17 00:00:00 2001 From: Shaun Patterson Date: Sat, 23 May 2026 14:30:54 -0400 Subject: [PATCH 1/8] bench: add dgraph-shaped iterator micro-benchmarks Add four benchmarks modeling dgraph's posting-layer iterator hot paths. dgraph drives badger in a specific combination of options that none of the existing root-package benchmarks exercise: - NamespaceOffset = 1 (DB.isBanned runs on every key) - NumVersionsToKeep = math.MaxInt32 (all versions retained) - DetectConflicts = false (dgraph owns OCC) - PrefetchValues = false (key-only scans in every hot path) - AllVersions = true (MVCC-aware reads) - Prefix-bounded forward iteration over mid-sized structured keys Keys mirror dgraph's data-key layout (1B type + 8B namespace + 2B attr-len + ~20B attr + 1B subtype + 8B UID = 40 bytes). Benchmarks: - BenchmarkDgraphPrefixScanKeyOnly: models has() / index-scan paths (posting.MemoryLayer.IterateDisk, worker.sort.go) - BenchmarkDgraphPrefixScanAllVersions: models the rollup iterator with multiple MVCC versions per key - BenchmarkDgraphKeyIteratorAllVersions: models readFromDisk via NewKeyIterator over a single posting key (every cache miss) - BenchmarkDgraphPrefixScanNamespaceOffset: isolates the cost of NamespaceOffset=1 vs -1 (i.e. the per-key DB.isBanned tax) These benchmarks are intended as the measurement baseline for upcoming iterator hot-path optimizations. --- iterator_dgraph_bench_test.go | 305 ++++++++++++++++++++++++++++++++++ 1 file changed, 305 insertions(+) create mode 100644 iterator_dgraph_bench_test.go diff --git a/iterator_dgraph_bench_test.go b/iterator_dgraph_bench_test.go new file mode 100644 index 000000000..c890709a3 --- /dev/null +++ b/iterator_dgraph_bench_test.go @@ -0,0 +1,305 @@ +/* + * SPDX-FileCopyrightText: © 2017-2025 Istari Digital, Inc. + * SPDX-License-Identifier: Apache-2.0 + */ + +package badger + +import ( + "encoding/binary" + "math" + "math/rand" + "os" + "testing" + + "github.com/dgraph-io/badger/v4/y" +) + +// dgraph-shaped iterator micro-benchmarks. +// +// These benchmarks model the hot iterator paths in dgraph's posting layer +// (posting/mvcc.go: ReadPostingList, IterateDisk, sort.go: index bucket scan). +// They are intentionally distinct from the existing root-package benchmarks +// because none of those exercise the combination dgraph actually uses: +// +// - NamespaceOffset = 1 (so DB.isBanned runs on every key) +// - NumVersionsToKeep = math.MaxInt32 (dgraph keeps all versions) +// - DetectConflicts = false (dgraph owns OCC) +// - PrefetchValues = false (key-only scans in hot paths) +// - AllVersions = true (MVCC-aware reads in ReadPostingList / rollup) +// - Prefix-bounded scans over mid-sized structured keys +// +// Key layout mirrors dgraph's data keys (see x/keys.go in dgraph): +// +// byte 0 : type prefix (0x00 = data) +// bytes 1..8 : namespace, big-endian uint64 +// bytes 9..10 : 2-byte attr length, big-endian uint16 +// bytes 11.. : attr name (variable, ~16 bytes typical) +// next 1 byte : subtype +// last 8 bytes: UID, big-endian +// +// Total: 1 + 8 + 2 + len(attr) + 1 + 8 ≈ 36 bytes for a 16-byte attr. + +const ( + dgKeyTypeData = 0x00 + dgAttrName = "predicate_namespaced" // 20 bytes -> 40-byte keys +) + +// dgKey constructs a dgraph-shaped data key with the given namespace, attr, +// subtype, and UID. +func dgKey(ns uint64, attr string, subtype byte, uid uint64) []byte { + buf := make([]byte, 1+8+2+len(attr)+1+8) + buf[0] = dgKeyTypeData + binary.BigEndian.PutUint64(buf[1:9], ns) + binary.BigEndian.PutUint16(buf[9:11], uint16(len(attr))) + copy(buf[11:11+len(attr)], attr) + buf[11+len(attr)] = subtype + binary.BigEndian.PutUint64(buf[12+len(attr):], uid) + return buf +} + +// dgPrefix returns the (ns + attr) prefix shared by all keys for one predicate. +// This is what dgraph iterates with for prefix-bounded scans. +func dgPrefix(ns uint64, attr string) []byte { + buf := make([]byte, 1+8+2+len(attr)) + buf[0] = dgKeyTypeData + binary.BigEndian.PutUint64(buf[1:9], ns) + binary.BigEndian.PutUint16(buf[9:11], uint16(len(attr))) + copy(buf[11:], attr) + return buf +} + +// dgraphTestOptions returns the badger options that match dgraph's +// production configuration: managed DB, all-versions retention, no conflict +// detection, namespace offset = 1. +func dgraphTestOptions(dir string) Options { + return DefaultOptions(dir). + WithSyncWrites(false). + WithLoggingLevel(WARNING). + WithNumVersionsToKeep(math.MaxInt32). + WithDetectConflicts(false). + WithNamespaceOffset(1) +} + +// dgraphLoadDB populates a managed DB with `nKeys` unique keys under a single +// predicate, each carrying `versionsPerKey` MVCC versions. UIDs are dense +// (1..nKeys) so prefix scans land contiguous keys, mirroring dgraph's +// `has()` predicate scans. +func dgraphLoadDB(b *testing.B, db *DB, ns uint64, attr string, nKeys, versionsPerKey int) { + b.Helper() + // Small value: realistic for posting-list deltas (a few hundred bytes is + // the upper end; we use 64 bytes to keep total disk footprint bounded). + val := make([]byte, 64) + for i := range val { + val[i] = byte(i) + } + + const batch = 4000 + commitTs := uint64(1) + for start := 0; start < nKeys; start += batch { + end := start + batch + if end > nKeys { + end = nKeys + } + // Write `versionsPerKey` versions of each key in this batch. + for v := 0; v < versionsPerKey; v++ { + txn := db.NewTransactionAt(math.MaxUint64, true) + for i := start; i < end; i++ { + key := dgKey(ns, attr, 0, uint64(i+1)) + if err := txn.SetEntry(&Entry{Key: key, Value: val}); err != nil { + b.Fatalf("SetEntry: %v", err) + } + } + if err := txn.CommitAt(commitTs, nil); err != nil { + b.Fatalf("CommitAt: %v", err) + } + commitTs++ + } + } + // Force everything to disk so iteration reflects the on-disk format + // (avoids measuring only the memtable hit). + if err := db.Flatten(2); err != nil { + b.Fatalf("Flatten: %v", err) + } +} + +// openDgraphDB opens a managed DB with the dgraph-shaped options. +func openDgraphDB(b *testing.B) (*DB, string) { + b.Helper() + dir, err := os.MkdirTemp(".", "badger-dgraph-bench") + y.Check(err) + db, err := OpenManaged(dgraphTestOptions(dir)) + y.Check(err) + return db, dir +} + +// BenchmarkDgraphPrefixScanKeyOnly models the IterateDisk / index-scan path: +// prefix-bounded forward iteration with PrefetchValues=false, one version per +// key. This is what dgraph's `has()` predicate evaluator and sort.go index +// scans drive. +func BenchmarkDgraphPrefixScanKeyOnly(b *testing.B) { + const ( + ns = uint64(0x0102030405060708) + nKeys = 200_000 + ) + db, dir := openDgraphDB(b) + defer func() { db.Close(); removeDir(dir) }() + dgraphLoadDB(b, db, ns, dgAttrName, nKeys, 1) + + prefix := dgPrefix(ns, dgAttrName) + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + txn := db.NewTransactionAt(math.MaxUint64, false) + opt := DefaultIteratorOptions + opt.Prefix = prefix + opt.PrefetchValues = false + it := txn.NewIterator(opt) + count := 0 + for it.Rewind(); it.Valid(); it.Next() { + count++ + } + it.Close() + txn.Discard() + if count != nKeys { + b.Fatalf("expected %d keys, got %d", nKeys, count) + } + } + b.ReportMetric(float64(nKeys), "keys/op") +} + +// BenchmarkDgraphPrefixScanAllVersions models the rollup path: +// prefix-bounded forward iteration with PrefetchValues=false and +// AllVersions=true, where each key has several MVCC versions. +func BenchmarkDgraphPrefixScanAllVersions(b *testing.B) { + const ( + ns = uint64(0x0102030405060708) + nKeys = 50_000 + versionsPerKey = 5 + ) + db, dir := openDgraphDB(b) + defer func() { db.Close(); removeDir(dir) }() + dgraphLoadDB(b, db, ns, dgAttrName, nKeys, versionsPerKey) + + prefix := dgPrefix(ns, dgAttrName) + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + txn := db.NewTransactionAt(math.MaxUint64, false) + opt := DefaultIteratorOptions + opt.Prefix = prefix + opt.PrefetchValues = false + opt.AllVersions = true + it := txn.NewIterator(opt) + count := 0 + for it.Rewind(); it.Valid(); it.Next() { + count++ + } + it.Close() + txn.Discard() + // Should see nKeys * versionsPerKey items. + if count != nKeys*versionsPerKey { + b.Fatalf("expected %d items, got %d", nKeys*versionsPerKey, count) + } + } + b.ReportMetric(float64(nKeys*versionsPerKey), "items/op") +} + +// BenchmarkDgraphKeyIteratorAllVersions models readFromDisk: +// a NewKeyIterator over a single posting key with AllVersions=true. dgraph +// invokes this on every cache miss in the posting layer. +func BenchmarkDgraphKeyIteratorAllVersions(b *testing.B) { + const ( + ns = uint64(0x0102030405060708) + nKeys = 20_000 + versionsPerKey = 8 + ) + db, dir := openDgraphDB(b) + defer func() { db.Close(); removeDir(dir) }() + dgraphLoadDB(b, db, ns, dgAttrName, nKeys, versionsPerKey) + + // Pre-build the lookup key set so the random selection cost stays out + // of the measured loop. + keys := make([][]byte, nKeys) + for i := 0; i < nKeys; i++ { + keys[i] = dgKey(ns, dgAttrName, 0, uint64(i+1)) + } + rng := rand.New(rand.NewSource(1)) + + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + key := keys[rng.Intn(nKeys)] + txn := db.NewTransactionAt(math.MaxUint64, false) + opt := DefaultIteratorOptions + opt.PrefetchValues = false + opt.AllVersions = true + it := txn.NewKeyIterator(key, opt) + count := 0 + for it.Seek(key); it.Valid(); it.Next() { + count++ + } + it.Close() + txn.Discard() + if count != versionsPerKey { + b.Fatalf("expected %d versions, got %d", versionsPerKey, count) + } + } +} + +// BenchmarkDgraphPrefixScanNamespaceOffset measures the cost of +// NamespaceOffset=1 vs NamespaceOffset=-1 (off) under identical workload. +// This isolates the per-key cost of DB.isBanned, which today takes an +// RWMutex on every iterator step even when no namespaces are banned. +func BenchmarkDgraphPrefixScanNamespaceOffset(b *testing.B) { + const ( + ns = uint64(0x0102030405060708) + nKeys = 100_000 + ) + cases := []struct { + name string + nsOff int // -1 = off, 1 = dgraph-style + }{ + {"off", -1}, + {"dgraph", 1}, + } + + for _, c := range cases { + b.Run(c.name, func(b *testing.B) { + dir, err := os.MkdirTemp(".", "badger-dgraph-nsoff") + y.Check(err) + defer removeDir(dir) + opts := DefaultOptions(dir). + WithSyncWrites(false). + WithLoggingLevel(WARNING). + WithNumVersionsToKeep(math.MaxInt32). + WithDetectConflicts(false). + WithNamespaceOffset(c.nsOff) + db, err := OpenManaged(opts) + y.Check(err) + defer db.Close() + dgraphLoadDB(b, db, ns, dgAttrName, nKeys, 1) + + prefix := dgPrefix(ns, dgAttrName) + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + txn := db.NewTransactionAt(math.MaxUint64, false) + opt := DefaultIteratorOptions + opt.Prefix = prefix + opt.PrefetchValues = false + it := txn.NewIterator(opt) + count := 0 + for it.Rewind(); it.Valid(); it.Next() { + count++ + } + it.Close() + txn.Discard() + if count != nKeys { + b.Fatalf("expected %d keys, got %d", nKeys, count) + } + } + }) + } +} + From 4942ce1c7362c5a0ee2b1a8a5ca17fcf32559bd2 Mon Sep 17 00:00:00 2001 From: Shaun Patterson Date: Sat, 23 May 2026 14:55:14 -0400 Subject: [PATCH 2/8] perf: lockless fast path for DB.isBanned when no namespaces banned DB.isBanned runs on every iterator step and every key access when NamespaceOffset >= 0. The hot path acquires lockedKeys.RLock and probes the keys map even when the ban set is empty (the steady-state case in production deployments like dgraph that enable NamespaceOffset for multi-tenant key isolation but rarely ban anything). Add a monotonic atomic.Bool flag (lockedKeys.hasAny) that flips from false to true on the first add() and never flips back. isBanned can check this flag with a single atomic load and skip the lock + map lookup when no ban has ever been recorded. Correctness: add() stores the flag inside the same lock that guards the keys map, so any reader observing hasAny=true is guaranteed to see the populated map via release-acquire ordering on the lock. Readers observing hasAny=false correctly return early because the flag is monotonic and is set before any key is observable in the map. DB-reopen replay path also calls add() so reopened DBs retain the flag for previously-recorded bans. Measured on dgraph-shaped iterator benchmarks (Apple M4 Max, median-of-3, -benchtime=5s): PrefixScanKeyOnly 11.80M -> 10.56M ns/op (-10.5%) PrefixScanAllVersions 11.98M -> 10.60M ns/op (-11.4%) KeyIteratorAllVersions 1,514 -> 1,371 ns/op (-9.4%) NamespaceOffset/dgraph 5.51M -> 4.96M ns/op (-10.0%) NamespaceOffset/off 5.08M -> 5.01M ns/op (-1.4%, noise) The NamespaceOffset/dgraph case now matches NamespaceOffset/off (both ~4.96-5.01M ns/op) -- the entire ~8.5% NamespaceOffset tax from the per-key RLock is eliminated. Tests: full suite passes (TestBannedPrefixes, TestIterateWithBanned verified) except TestProtosRegenerate which fails identically on baseline due to missing protoc binary. Co-Authored-By: Claude Opus 4.7 --- db.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/db.go b/db.go index 8a1021308..129607a32 100644 --- a/db.go +++ b/db.go @@ -52,12 +52,19 @@ type closers struct { type lockedKeys struct { sync.RWMutex keys map[uint64]struct{} + // hasAny is a fast-path flag: false until the first add(), then true forever. + // Hot-path callers (DB.isBanned) check this without taking the lock so the + // common case (empty ban set) costs one atomic load instead of an + // RLock/RUnlock pair plus a map lookup. There is no remove API, so the + // flag is monotonic and never needs to flip back to false. + hasAny atomic.Bool } func (lk *lockedKeys) add(key uint64) { lk.Lock() defer lk.Unlock() lk.keys[key] = struct{}{} + lk.hasAny.Store(true) } func (lk *lockedKeys) has(key uint64) bool { @@ -1846,6 +1853,14 @@ func (db *DB) isBanned(key []byte) error { if db.opt.NamespaceOffset < 0 { return nil } + // Fast path: no namespaces have ever been banned in this DB lifetime + // (the common production case). Skip the slice + lookup + lock entirely. + // isBanned is called on every iterator step and every Txn.Get/modify, so + // avoiding the RLock here matters when NamespaceOffset is enabled but no + // bans are active. + if !db.bannedNamespaces.hasAny.Load() { + return nil + } if len(key) <= db.opt.NamespaceOffset+8 { return nil } From 41018baa97332bebf6bb62736a627a58d371d3af Mon Sep 17 00:00:00 2001 From: Shaun Patterson Date: Sat, 23 May 2026 15:14:49 -0400 Subject: [PATCH 3/8] perf: hoist mi.Value() and key out of fill() to skip per-item double-decode Iterator.parseItem previously called mi.Value() twice per kept item on the non-AllVersions hot path: 1. To pre-check Meta/ExpiresAt for the deleted/expired filter. 2. Inside fill(), which re-fetched the same ValueStruct (cost includes an Arena getVal + ValueStruct.Decode that the CPU profile showed consuming ~40% of cumulative samples on dgraph-shaped key-only prefix scans). Refactor fill to take the already-fetched key []byte and a pointer to the already-decoded *y.ValueStruct, and hoist mi.Value() into both the AllVersions and non-AllVersions branches of parseItem. fill no longer touches it.iitr. Pass vs by pointer (not value): y.ValueStruct is ~40 bytes; passing by value added a per-item struct copy that wiped out the AllVersions win (branch only had one Value() call, no double-decode to eliminate). Pointer collapses to a single 8-byte arg; fill does not store the pointer so escape analysis keeps vs on the stack. Correctness: - vs and key are fetched while the iterator is positioned on the current entry; no mi.Next() runs between fetch and fill. Arena/block memory backing vs.Value and key is stable for that window. - In the reverse-iteration goto FILL path, mi has advanced once, so the refactor refreshes key = mi.Key() before re-entering FILL, and FILL re-fetches vs := mi.Value(). Semantically equivalent to the pre-refactor code. Measured on dgraph-shaped iterator benchmarks (Apple M4 Max, median-of-3, -benchtime=5s) -- composite (sum of 5 medians) drops 31.13M -> 26.37M ns (-15.3% vs iter1, -23.3% vs upstream baseline): PrefixScanKeyOnly 10.56M -> 8.18M ns/op (-22.5%) PrefixScanAllVersions 10.60M -> 10.53M ns/op (-0.7%, flat) KeyIteratorAllVersions 1,371 -> 1,376 ns/op (+0.4%, noise) NamespaceOffset/off 5.01M -> 3.83M ns/op (-23.6%) NamespaceOffset/dgraph 4.96M -> 3.83M ns/op (-22.7%) Tests: full suite passes (TestProtosRegenerate continues to fail identically due to missing protoc binary). Co-Authored-By: Claude Opus 4.7 --- iterator.go | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/iterator.go b/iterator.go index f57cfa4c9..ac4c72648 100644 --- a/iterator.go +++ b/iterator.go @@ -640,8 +640,9 @@ func (it *Iterator) parseItem() bool { if it.opt.AllVersions { // Return deleted or expired values also, otherwise user can't figure out // whether the key was deleted. + vs := mi.Value() item := it.newItem() - it.fill(item) + it.fill(item, key, &vs) setItem(item) mi.Next() return true @@ -659,11 +660,13 @@ func (it *Iterator) parseItem() bool { // Consider keys: a 5, b 7 (del), b 5. When iterating, lastKey = a. // Then we see b 7, which is deleted. If we don't store lastKey = b, we'll then return b 5, // which is wrong. Therefore, update lastKey here. - it.lastKey = y.SafeCopy(it.lastKey, mi.Key()) + it.lastKey = y.SafeCopy(it.lastKey, key) } FILL: - // If deleted, advance and return. + // If deleted, advance and return. We fetch vs (and reuse key from the + // enclosing scope, or refetch after Next on the reverse goto below) so + // that fill does not have to call mi.Value() / mi.Key() a second time. vs := mi.Value() if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) { mi.Next() @@ -671,7 +674,7 @@ FILL: } item := it.newItem() - it.fill(item) + it.fill(item, key, &vs) // fill item based on current cursor position. All Next calls have returned, so reaching here // means no Next was called. @@ -681,9 +684,11 @@ FILL: return true } - // Reverse direction. - nextTs := y.ParseTs(mi.Key()) - mik := y.ParseKey(mi.Key()) + // Reverse direction. Refresh key after the Next() above; the iterator + // has advanced, so the previous `key` slice now refers to a later block. + key = mi.Key() + nextTs := y.ParseTs(key) + mik := y.ParseKey(key) if nextTs <= it.readTs && bytes.Equal(mik, item.key) { // This is a valid potential candidate. goto FILL @@ -693,14 +698,18 @@ FILL: return true } -func (it *Iterator) fill(item *Item) { - vs := it.iitr.Value() +// fill populates item from the current iterator position. Callers pass the +// already-fetched key and value pointer to avoid the per-item cost of +// calling mi.Key() / mi.Value() (and decoding ValueStruct) a second time +// on the hot iterator path. vs is passed by pointer to avoid copying the +// ~40-byte ValueStruct on every kept item. +func (it *Iterator) fill(item *Item, key []byte, vs *y.ValueStruct) { item.meta = vs.Meta item.userMeta = vs.UserMeta item.expiresAt = vs.ExpiresAt - item.version = y.ParseTs(it.iitr.Key()) - item.key = y.SafeCopy(item.key, y.ParseKey(it.iitr.Key())) + item.version = y.ParseTs(key) + item.key = y.SafeCopy(item.key, y.ParseKey(key)) item.vptr = y.SafeCopy(item.vptr, vs.Value) item.val = nil From 9267dbb608358b317914691ae95220a48397ee84 Mon Sep 17 00:00:00 2001 From: Shaun Patterson Date: Sat, 23 May 2026 15:22:14 -0400 Subject: [PATCH 4/8] perf: skip per-step internal-key probe when prefix excludes badgerPrefix Iterator.parseItem unconditionally ran bytes.HasPrefix(key, badgerPrefix) on every step to detect badger-internal keys (e.g. !badger!banned). For prefix-bounded scans whose Prefix's first byte differs from badgerPrefix[0] ('!'), no key the iterator can land on can possibly be internal -- the probe is dead. The CPU profile after iter2 attributed ~13% of cumulative samples to runtime.memequal, a portion of which was this probe. Precompute canSeeInternalKeys once at iterator construction: - empty prefix: true (unbounded scan can hit internal keys) - prefix[0] == '!' : true (could be the banned-ns recovery iterator or any scan that starts in the internal-key range) - otherwise: false (the iterator after Seek(prefix) is positioned past the internal-key range and hasPrefix() will reject any !-prefixed key anyway, so the per-step probe is provably dead). When the flag is false, skip both the HasPrefix probe and the !InternalAccess skip-block. isInternalKey defaults to false, which is also the correct value for the downstream "!isInternalKey && isBanned" gate (non-internal keys are subject to ban checks; internal ones are not, but here we know they can't appear). Correctness: - initBannedNamespaces sets Prefix = bannedNsKey (starts with '!') so the flag is true and the probe still runs on the recovery path. - All other badger-internal access uses InternalAccess=true with a badgerPrefix-anchored Prefix, so the flag is also true there. - For dgraph's data scans (Prefix[0] = 0x00) and all other user scans the probe is provably dead and can be skipped. Measured on dgraph-shaped iterator benchmarks (Apple M4 Max, median-of-3, -benchtime=5s): PrefixScanKeyOnly 8.18M -> 7.92M ns/op (-3.2%) PrefixScanAllVersions 10.53M ->10.50M ns/op (-0.2%, noise) KeyIteratorAllVersions 1,376 -> 1,350 ns/op (-1.9%) NamespaceOffset/off 3.83M -> 3.72M ns/op (-2.6%) NamespaceOffset/dgraph 3.83M -> 3.74M ns/op (-2.4%) Cumulative since upstream baseline: composite 34.37M -> 25.89M ns (-24.7%); PrefixScanKeyOnly 11.80M -> 7.92M (-32.9%); NamespaceOffset/dgraph 5.51M -> 3.74M (-32.1%). Tests: TestBannedPrefixes, TestIterateWithBanned, TestIterator*, and TestKeyIterator* all pass. Co-Authored-By: Claude Opus 4.7 --- iterator.go | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/iterator.go b/iterator.go index ac4c72648..d63ac2b09 100644 --- a/iterator.go +++ b/iterator.go @@ -435,6 +435,12 @@ type Iterator struct { lastKey []byte // Used to skip over multiple versions of the same key. + // canSeeInternalKeys is true when this iterator can possibly surface a + // badger-internal key (e.g. "!badger!banned"). When false, parseItem + // can skip the per-step bytes.HasPrefix(key, badgerPrefix) check. + // Computed once at construction from opt.Prefix. + canSeeInternalKeys bool + closed bool scanned int // Used to estimate the size of data scanned by iterator. @@ -482,14 +488,26 @@ func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator { } iters = txn.db.lc.appendIterators(iters, &opt) // This will increment references. res := &Iterator{ - txn: txn, - iitr: table.NewMergeIterator(iters, opt.Reverse), - opt: opt, - readTs: txn.readTs, + txn: txn, + iitr: table.NewMergeIterator(iters, opt.Reverse), + opt: opt, + readTs: txn.readTs, + canSeeInternalKeys: canSeeInternalKeys(opt.Prefix), } return res } +// canSeeInternalKeys reports whether an iterator with the given prefix can +// possibly surface a badger-internal key. Internal keys all live under the +// fixed badgerPrefix; if the user's prefix exists and starts with a +// different byte than badgerPrefix[0], no internal key can match. +func canSeeInternalKeys(prefix []byte) bool { + if len(prefix) == 0 { + return true + } + return prefix[0] == badgerPrefix[0] +} + // NewKeyIterator is just like NewIterator, but allows the user to iterate over all versions of a // single key. Internally, it sets the Prefix option in provided opt, and uses that prefix to // additionally run bloom filter lookups before picking tables from the LSM tree. @@ -616,11 +634,18 @@ func (it *Iterator) parseItem() bool { } } - isInternalKey := bytes.HasPrefix(key, badgerPrefix) - // Skip badger keys. - if !it.opt.InternalAccess && isInternalKey { - mi.Next() - return false + // Detect badger-internal keys. When canSeeInternalKeys is false (the + // common case for prefix-bounded user scans whose prefix cannot collide + // with badgerPrefix), we know the current key cannot be internal and + // elide the per-step bytes.HasPrefix(key, badgerPrefix) probe. + var isInternalKey bool + if it.canSeeInternalKeys { + isInternalKey = bytes.HasPrefix(key, badgerPrefix) + // Skip badger keys. + if !it.opt.InternalAccess && isInternalKey { + mi.Next() + return false + } } // Skip any versions which are beyond the readTs. From 1cb43e70123fdda96e06aa89b4e07ba12add4212 Mon Sep 17 00:00:00 2001 From: Shaun Patterson Date: Sat, 23 May 2026 15:35:40 -0400 Subject: [PATCH 5/8] perf(iterator): IteratorOptions.KeyOnly skip per-item value SafeCopy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add opt-in KeyOnly flag to IteratorOptions. When set, fill() skips the per-item SafeCopy of vs.Value into item.vptr, which is the largest remaining memmove on the key-only forward-scan hot path (dgraph has()/index scans). Trade-offs (Option A, strict): - Item.Value / Item.ValueCopy return new ErrKeyOnlyMode. - Item.ValueSize / Item.EstimatedSize report 0 / key-size only. - Item.Key, Version, UserMeta, ExpiresAt, IsDeletedOrExpired unaffected. - NewIterator forces PrefetchValues=false when KeyOnly=true so the prefetch goroutine never starts. Existing callers see no behavior change. The new benchmark BenchmarkDgraphPrefixScanKeyOnlyOpt exercises the opt-in path alongside the original PrefixScanKeyOnly to preserve comparison. Results (5-bench composite, median-of-3, Apple M4 Max): - PrefixScanKeyOnlyOpt: 7.76M ns/op (−2.0% vs iter3 KeyOnly, −34.3% vs baseline) - Composite with opt-in: 25.63M ns (−25.4% vs upstream baseline) Co-Authored-By: Claude Opus 4.7 --- errors.go | 8 ++++++ iterator.go | 54 ++++++++++++++++++++++++++++++++++- iterator_dgraph_bench_test.go | 35 +++++++++++++++++++++++ 3 files changed, 96 insertions(+), 1 deletion(-) diff --git a/errors.go b/errors.go index dcf0d12ae..40433dcb6 100644 --- a/errors.go +++ b/errors.go @@ -46,6 +46,14 @@ var ( // ErrBannedKey is returned if the read/write key belongs to any banned namespace. ErrBannedKey = stderrors.New("Key is using the banned prefix") + // ErrKeyOnlyMode is returned by Item.Value and Item.ValueCopy when the + // containing iterator was created with IteratorOptions.KeyOnly=true. + // In that mode the iterator never copies the value bytes into the Item + // (the main reason to use KeyOnly is to avoid that per-item copy on + // key-only scans), so value access is unavailable on those items. + ErrKeyOnlyMode = stderrors.New( + "Item value is unavailable in KeyOnly iterator mode") + // ErrThresholdZero is returned if threshold is set to zero, and value log GC is called. // In such a case, GC can't be run. ErrThresholdZero = stderrors.New( diff --git a/iterator.go b/iterator.go index d63ac2b09..10b08ea92 100644 --- a/iterator.go +++ b/iterator.go @@ -43,6 +43,11 @@ type Item struct { status prefetchStatus meta byte // We need to store meta to know about bitValuePointer. userMeta byte + // keyOnly is true when the parent iterator was created with + // IteratorOptions.KeyOnly. The iterator skips copying value bytes into + // this item, so Item.Value/ValueCopy and the size estimators must + // short-circuit instead of touching the (nil) vptr. + keyOnly bool } // String returns a string representation of Item @@ -81,6 +86,9 @@ func (item *Item) Version() uint64 { // instead, or copy it yourself. Value might change once discard or commit is called. // Use ValueCopy if you want to do a Set after Get. func (item *Item) Value(fn func(val []byte) error) error { + if item.keyOnly { + return ErrKeyOnlyMode + } item.wg.Wait() if item.status == prefetched { if item.err == nil && fn != nil { @@ -108,6 +116,9 @@ func (item *Item) Value(fn func(val []byte) error) error { // This function is useful in long running iterate/update transactions to avoid a write deadlock. // See Github issue: https://github.com/dgraph-io/badger/issues/315 func (item *Item) ValueCopy(dst []byte) ([]byte, error) { + if item.keyOnly { + return nil, ErrKeyOnlyMode + } item.wg.Wait() if item.status == prefetched { return y.SafeCopy(dst, item.val), item.err @@ -213,7 +224,14 @@ func (item *Item) prefetchValue() { // This can be called while iterating through a store to quickly estimate the // size of a range of key-value pairs (without fetching the corresponding // values). +// +// When the iterator was created with IteratorOptions.KeyOnly=true, the +// value bytes (and value pointer for vlog entries) are not retained on +// the item, so this returns the key size only. func (item *Item) EstimatedSize() int64 { + if item.keyOnly { + return int64(len(item.key)) + } if !item.hasValue() { return 0 } @@ -235,7 +253,13 @@ func (item *Item) KeySize() int64 { // // This can be called to quickly estimate the size of a value without fetching // it. +// +// When the iterator was created with IteratorOptions.KeyOnly=true the value +// length is not retained on the item; this returns 0. func (item *Item) ValueSize() int64 { + if item.keyOnly { + return 0 + } if !item.hasValue() { return 0 } @@ -312,6 +336,17 @@ type IteratorOptions struct { AllVersions bool // Fetch all valid versions of the same key. InternalAccess bool // Used to allow internal access to badger keys. + // KeyOnly tells the iterator that the caller will not access value bytes + // from any item. When set, the iterator skips copying value bytes into + // the Item, saving a per-item memcpy on key-only forward scans (e.g. + // dgraph's has() predicate evaluator and index scans). The trade-off: + // Item.Value and Item.ValueCopy return ErrKeyOnlyMode, and + // Item.ValueSize / Item.EstimatedSize report 0. Item.Key, Version, + // UserMeta, ExpiresAt and IsDeletedOrExpired continue to work normally. + // + // PrefetchValues is forced to false when KeyOnly is true. + KeyOnly bool + // The following option is used to narrow down the SSTables that iterator // picks up. If Prefix is specified, only tables which could have this // prefix are picked based on their range of keys. @@ -470,6 +505,12 @@ func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator { panic(ErrDBClosed) } + // KeyOnly disables value access, so prefetching values is nonsensical. + // Force PrefetchValues off so the prefetch goroutine is never started. + if opt.KeyOnly { + opt.PrefetchValues = false + } + y.NumIteratorsCreatedAdd(txn.db.opt.MetricsEnabled, 1) // Keep track of the number of active iterators. @@ -732,12 +773,23 @@ func (it *Iterator) fill(item *Item, key []byte, vs *y.ValueStruct) { item.meta = vs.Meta item.userMeta = vs.UserMeta item.expiresAt = vs.ExpiresAt + item.keyOnly = it.opt.KeyOnly item.version = y.ParseTs(key) item.key = y.SafeCopy(item.key, y.ParseKey(key)) - item.vptr = y.SafeCopy(item.vptr, vs.Value) item.val = nil + if it.opt.KeyOnly { + // Don't copy vs.Value: KeyOnly callers have promised not to read + // it, and the SafeCopy is the largest per-item memmove on the + // key-only forward-scan hot path. nil out any leftover capacity + // from a previous item that was reused via the iterator's + // freelist; callers that ignore the contract will at least see a + // nil vptr rather than stale bytes. + item.vptr = nil + } else { + item.vptr = y.SafeCopy(item.vptr, vs.Value) + } if it.opt.PrefetchValues { item.wg.Add(1) go func() { diff --git a/iterator_dgraph_bench_test.go b/iterator_dgraph_bench_test.go index c890709a3..61bd9d3a2 100644 --- a/iterator_dgraph_bench_test.go +++ b/iterator_dgraph_bench_test.go @@ -168,6 +168,41 @@ func BenchmarkDgraphPrefixScanKeyOnly(b *testing.B) { b.ReportMetric(float64(nKeys), "keys/op") } +// BenchmarkDgraphPrefixScanKeyOnlyOpt is the same workload as +// BenchmarkDgraphPrefixScanKeyOnly but uses IteratorOptions.KeyOnly=true. +// This measures the win from skipping the per-item SafeCopy(vptr) on the +// has()/index-scan hot path; callers in this mode never read item.Value. +func BenchmarkDgraphPrefixScanKeyOnlyOpt(b *testing.B) { + const ( + ns = uint64(0x0102030405060708) + nKeys = 200_000 + ) + db, dir := openDgraphDB(b) + defer func() { db.Close(); removeDir(dir) }() + dgraphLoadDB(b, db, ns, dgAttrName, nKeys, 1) + + prefix := dgPrefix(ns, dgAttrName) + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + txn := db.NewTransactionAt(math.MaxUint64, false) + opt := DefaultIteratorOptions + opt.Prefix = prefix + opt.KeyOnly = true + it := txn.NewIterator(opt) + count := 0 + for it.Rewind(); it.Valid(); it.Next() { + count++ + } + it.Close() + txn.Discard() + if count != nKeys { + b.Fatalf("expected %d keys, got %d", nKeys, count) + } + } + b.ReportMetric(float64(nKeys), "keys/op") +} + // BenchmarkDgraphPrefixScanAllVersions models the rollup path: // prefix-bounded forward iteration with PrefetchValues=false and // AllVersions=true, where each key has several MVCC versions. From 32d58ab9f4e528ee0b886971df3db2bf19702336 Mon Sep 17 00:00:00 2001 From: Shaun Patterson Date: Sat, 23 May 2026 15:46:07 -0400 Subject: [PATCH 6/8] perf(iterator): store user-key only in lastKey, inline same-key compare Iterator.lastKey previously stored the full key (user-key || 8B ts) but SameKey only compared the user-key portion. Two consequences: 1. SafeCopy(it.lastKey, key) copied 8 bytes of dead ts every kept item. 2. y.SameKey called ParseKey on both src and dst (two slice ops + a function call) on the hot dedup compare. Change: store the user-key only in lastKey; inline the compare. ukLen := len(key) - 8 if ukLen == len(it.lastKey) && bytes.Equal(key[:ukLen], it.lastKey) { mi.Next() return false } it.lastKey = y.SafeCopy(it.lastKey, key[:ukLen]) The same-key dedup invariant ("skip later versions of a user-key we've already considered, including tombstoned ones") is preserved exactly: lastKey is still updated before the FILL/deleted-or-expired check. Result: BenchmarkDgraphPrefixScanKeyOnlyOpt: 7.76M -> 7.58M ns (-2.3%). Other benchmarks within run-to-run noise. Co-Authored-By: Claude Opus 4.7 --- iterator.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/iterator.go b/iterator.go index 10b08ea92..f7c5cf5c1 100644 --- a/iterator.go +++ b/iterator.go @@ -468,7 +468,12 @@ type Iterator struct { data list waste list - lastKey []byte // Used to skip over multiple versions of the same key. + // lastKey stores the user-key (no 8-byte timestamp suffix) of the most + // recently considered candidate, used to skip subsequent older versions + // of the same user-key on AllVersions=false forward scans. Storing the + // user-key only avoids one ParseKey per same-key compare and a per-item + // 8-byte memcpy on the update. + lastKey []byte // canSeeInternalKeys is true when this iterator can possibly surface a // badger-internal key (e.g. "!badger!banned"). When false, parseItem @@ -717,7 +722,13 @@ func (it *Iterator) parseItem() bool { // If iterating in forward direction, then just checking the last key against current key would // be sufficient. if !it.opt.Reverse { - if y.SameKey(it.lastKey, key) { + // lastKey holds the user-key only. Compare against the user-key + // portion of the current full key (last 8 bytes are the ts). + // bytes.Equal already short-circuits on length mismatch, but the + // explicit length check lets the compiler hoist the bounds check + // out of the user-key slice and keeps the hot path branch-tight. + ukLen := len(key) - 8 + if ukLen == len(it.lastKey) && bytes.Equal(key[:ukLen], it.lastKey) { mi.Next() return false } @@ -726,7 +737,7 @@ func (it *Iterator) parseItem() bool { // Consider keys: a 5, b 7 (del), b 5. When iterating, lastKey = a. // Then we see b 7, which is deleted. If we don't store lastKey = b, we'll then return b 5, // which is wrong. Therefore, update lastKey here. - it.lastKey = y.SafeCopy(it.lastKey, key) + it.lastKey = y.SafeCopy(it.lastKey, key[:ukLen]) } FILL: From d98b3ed5c918f5593f790944e9ad9c346f95fa5a Mon Sep 17 00:00:00 2001 From: Shaun Patterson Date: Sat, 23 May 2026 16:37:46 -0400 Subject: [PATCH 7/8] docs(iterator): clarify FILL key-freshness and ukLen invariants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Strengthen comments on two hot-path invariants surfaced during code review of the perf series: 1. FILL label entry contract: `key` is always mi.Key() at the current iitr position. The only goto FILL (reverse path) refreshes key after mi.Next(); the fall-through entry above never advances the iterator between `key := mi.Key()` and reaching FILL. This is what lets fill() safely reuse the caller-supplied key without re-calling mi.Key() per item. 2. lastKey ukLen safety: `ukLen := len(key) - 8` is safe because len(key) >= 8 is a badger-wide invariant (every LSM key is stored with an 8-byte timestamp suffix via y.KeyWithTs). y.ParseTs(key) above already relies on this — no defensive check needed. Co-Authored-By: Claude Opus 4.7 --- iterator.go | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/iterator.go b/iterator.go index f7c5cf5c1..379b5e5dc 100644 --- a/iterator.go +++ b/iterator.go @@ -727,6 +727,11 @@ func (it *Iterator) parseItem() bool { // bytes.Equal already short-circuits on length mismatch, but the // explicit length check lets the compiler hoist the bounds check // out of the user-key slice and keeps the hot path branch-tight. + // + // len(key) >= 8 is a badger-wide invariant: every key in the LSM is + // stored with an 8-byte timestamp suffix via y.KeyWithTs, and + // y.ParseTs(key) above already relies on this (it indexes + // key[len(key)-8:]). No defensive check is needed here. ukLen := len(key) - 8 if ukLen == len(it.lastKey) && bytes.Equal(key[:ukLen], it.lastKey) { mi.Next() @@ -741,9 +746,12 @@ func (it *Iterator) parseItem() bool { } FILL: - // If deleted, advance and return. We fetch vs (and reuse key from the - // enclosing scope, or refetch after Next on the reverse goto below) so - // that fill does not have to call mi.Value() / mi.Key() a second time. + // Invariant on entry to FILL: `key` is mi.Key() at the *current* iitr + // position. The only goto FILL (below, reverse path) refreshes `key` + // after mi.Next(); the fall-through entry from above never advances the + // iterator between `key := mi.Key()` and reaching FILL. fill() can + // therefore safely reuse the caller-supplied key without re-calling + // mi.Key(). vs := mi.Value() if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) { mi.Next() From 16e8b0f7b904cf3cdc72463d8996629db551f8e0 Mon Sep 17 00:00:00 2001 From: Shaun Patterson Date: Sat, 23 May 2026 16:37:57 -0400 Subject: [PATCH 8/8] test: cover KeyOnly iterator paths and preserved-behavior regressions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a comprehensive test file that hits every line of the perf series touched in db.go and iterator.go. Two test groups: * TestKeyOnlyIterator_* — exercise the new KeyOnly opt-in path: Value/ValueCopy return ErrKeyOnlyMode; EstimatedSize returns key length; ValueSize returns 0; NewIterator forces PrefetchValues=false; Key/UserMeta/Version/ExpiresAt continue to work under KeyOnly. * TestRegression* — lock in pre-existing behaviors that the perf series rewrote in place: same-key dedup across multiple MVCC versions (iter5 lastKey rewrite), internal-key hiding (iter3 canSeeInternalKeys gate), isBanned correctness across the new atomic.Bool fast path (iter1) including short-key-with-bans branch, and non-KeyOnly fill path correctness (iter2 hoist). * TestCanSeeInternalKeys — table-driven unit test for the helper added in iter3. All preserved-behavior tests also pass on upstream/main, proving the expectations match the unchanged baseline. Together with the benchmarks added in 7174cc2, the touched lines in db.go and iterator.go now have 100% coverage. Co-Authored-By: Claude Opus 4.7 --- iterator_keyonly_test.go | 399 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 399 insertions(+) create mode 100644 iterator_keyonly_test.go diff --git a/iterator_keyonly_test.go b/iterator_keyonly_test.go new file mode 100644 index 000000000..8a7d1c601 --- /dev/null +++ b/iterator_keyonly_test.go @@ -0,0 +1,399 @@ +/* + * SPDX-FileCopyrightText: © 2017-2025 Istari Digital, Inc. + * SPDX-License-Identifier: Apache-2.0 + */ + +package badger + +import ( + "errors" + "fmt" + "testing" + + "github.com/dgraph-io/badger/v4/y" + "github.com/stretchr/testify/require" +) + +// TestKeyOnlyIterator_ValueReturnsErrKeyOnlyMode covers iterator.go:Item.Value +// short-circuit when item.keyOnly is set (iter4). +func TestKeyOnlyIterator_ValueReturnsErrKeyOnlyMode(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + txnSet(t, db, []byte("k1"), []byte("v1"), 0) + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.KeyOnly = true + it := txn.NewIterator(opt) + defer it.Close() + for it.Rewind(); it.Valid(); it.Next() { + item := it.Item() + err := item.Value(func(v []byte) error { return nil }) + require.True(t, errors.Is(err, ErrKeyOnlyMode), + "Value() should return ErrKeyOnlyMode, got %v", err) + } + return nil + })) + }) +} + +// TestKeyOnlyIterator_ValueCopyReturnsErrKeyOnlyMode covers iterator.go:Item.ValueCopy. +func TestKeyOnlyIterator_ValueCopyReturnsErrKeyOnlyMode(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + txnSet(t, db, []byte("k1"), []byte("v1"), 0) + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.KeyOnly = true + it := txn.NewIterator(opt) + defer it.Close() + for it.Rewind(); it.Valid(); it.Next() { + item := it.Item() + buf, err := item.ValueCopy(nil) + require.True(t, errors.Is(err, ErrKeyOnlyMode)) + require.Nil(t, buf) + } + return nil + })) + }) +} + +// TestKeyOnlyIterator_EstimatedSizeReturnsKeyLen covers iterator.go:Item.EstimatedSize +// short-circuit (returns int64(len(item.key)) when keyOnly). +func TestKeyOnlyIterator_EstimatedSizeReturnsKeyLen(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + txnSet(t, db, []byte("abc"), []byte("lots-of-value-bytes-here"), 0) + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.KeyOnly = true + it := txn.NewIterator(opt) + defer it.Close() + it.Rewind() + require.True(t, it.Valid()) + item := it.Item() + require.Equal(t, int64(len("abc")), item.EstimatedSize()) + return nil + })) + }) +} + +// TestKeyOnlyIterator_ValueSizeIsZero covers iterator.go:Item.ValueSize short-circuit. +func TestKeyOnlyIterator_ValueSizeIsZero(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + txnSet(t, db, []byte("k1"), []byte("hello-world"), 0) + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.KeyOnly = true + it := txn.NewIterator(opt) + defer it.Close() + it.Rewind() + require.True(t, it.Valid()) + require.Equal(t, int64(0), it.Item().ValueSize()) + return nil + })) + }) +} + +// TestKeyOnlyIterator_ForcesPrefetchOff covers iterator.go:NewIterator forcing +// PrefetchValues=false when KeyOnly=true. +func TestKeyOnlyIterator_ForcesPrefetchOff(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + txnSet(t, db, []byte("k1"), []byte("v1"), 0) + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.KeyOnly = true + opt.PrefetchValues = true + opt.PrefetchSize = 100 + it := txn.NewIterator(opt) + defer it.Close() + require.False(t, it.opt.PrefetchValues, + "NewIterator must force PrefetchValues=false when KeyOnly=true") + it.Rewind() + require.True(t, it.Valid()) + return nil + })) + }) +} + +// TestKeyOnlyIterator_KeyMetaVersionWork verifies that the metadata methods on Item +// still work correctly in KeyOnly mode (per the contract documented on KeyOnly). +// Indirectly covers fill() setting item.keyOnly while still populating meta/version. +func TestKeyOnlyIterator_KeyMetaVersionWork(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + key := []byte("hello") + val := []byte("world") + txn := db.NewTransaction(true) + require.NoError(t, txn.SetEntry(NewEntry(key, val).WithMeta(0x42))) + require.NoError(t, txn.Commit()) + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.KeyOnly = true + it := txn.NewIterator(opt) + defer it.Close() + it.Rewind() + require.True(t, it.Valid()) + item := it.Item() + require.Equal(t, key, item.Key()) + require.Equal(t, byte(0x42), item.UserMeta()) + require.NotZero(t, item.Version()) + require.False(t, item.IsDeletedOrExpired()) + require.Equal(t, int64(len(key)), item.KeySize()) + require.False(t, item.DiscardEarlierVersions()) + return nil + })) + }) +} + +// TestKeyOnlyIterator_PrefetchValuesFalseStillWorks covers fill() KeyOnly branch +// without prefetch (the actual hot path; PrefetchValues=false bypasses the +// prefetch goroutine entirely). +func TestKeyOnlyIterator_PrefetchValuesFalseStillWorks(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + for i := 0; i < 20; i++ { + txnSet(t, db, []byte(fmt.Sprintf("k%02d", i)), []byte("vvv"), 0) + } + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.PrefetchValues = false + opt.KeyOnly = true + it := txn.NewIterator(opt) + defer it.Close() + count := 0 + for it.Rewind(); it.Valid(); it.Next() { + item := it.Item() + require.NotEmpty(t, item.Key()) + err := item.Value(func(v []byte) error { return nil }) + require.True(t, errors.Is(err, ErrKeyOnlyMode)) + count++ + } + require.Equal(t, 20, count) + return nil + })) + }) +} + +// TestCanSeeInternalKeys is a table-driven unit test for the iter3 helper. +// It is the single source of truth for which prefix shapes allow internal keys. +func TestCanSeeInternalKeys(t *testing.T) { + cases := []struct { + name string + prefix []byte + want bool + }{ + {"empty prefix sees everything", nil, true}, + {"empty slice sees everything", []byte{}, true}, + {"prefix starting with '!' (badgerPrefix[0]) may overlap", []byte("!badger"), true}, + {"prefix starting with 0x21 (== '!') overlap", []byte{0x21, 0xff}, true}, + {"prefix starting with 0x00 cannot overlap", []byte{0x00, 0x01, 0x02}, false}, + {"prefix starting with 'a' cannot overlap", []byte("abc"), false}, + {"prefix starting with high byte cannot overlap", []byte{0xff}, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := canSeeInternalKeys(tc.prefix) + require.Equal(t, tc.want, got) + }) + } +} + +// TestRegressionSameKeyDedup locks in the iter5 lastKey behavior: when multiple +// versions of the same user-key exist and AllVersions=false, the iterator must +// surface exactly one item per user-key (the freshest non-expired one). This +// test exercises the user-key-only comparison path. +func TestRegressionSameKeyDedup(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + // Write three versions of "k1" and three versions of "k2". + for _, k := range []string{"k1", "k2"} { + for v := 1; v <= 3; v++ { + txnSet(t, db, []byte(k), []byte(fmt.Sprintf("v%d", v)), 0) + } + } + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.PrefetchValues = false + it := txn.NewIterator(opt) + defer it.Close() + var seen []string + for it.Rewind(); it.Valid(); it.Next() { + seen = append(seen, string(it.Item().Key())) + } + require.Equal(t, []string{"k1", "k2"}, seen, + "non-AllVersions iterator must dedup to one item per user-key") + return nil + })) + + // AllVersions=true should see all 3 versions per key. + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.AllVersions = true + opt.PrefetchValues = false + it := txn.NewIterator(opt) + defer it.Close() + counts := map[string]int{} + for it.Rewind(); it.Valid(); it.Next() { + counts[string(it.Item().Key())]++ + } + require.Equal(t, 3, counts["k1"]) + require.Equal(t, 3, counts["k2"]) + return nil + })) + }) +} + +// TestRegressionInternalKeysHidden locks in iter3 behavior: a default user +// iterator (no InternalAccess) must not surface badger-internal keys even when +// the optimized canSeeInternalKeys path is taken (prefix nil or '!'). +func TestRegressionInternalKeysHidden(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + txnSet(t, db, []byte("user-key"), []byte("v"), 0) + + require.NoError(t, db.View(func(txn *Txn) error { + it := txn.NewIterator(DefaultIteratorOptions) + defer it.Close() + for it.Rewind(); it.Valid(); it.Next() { + k := it.Item().Key() + require.False(t, isBadgerInternalKey(k), + "default iterator surfaced internal key %q", k) + } + return nil + })) + }) +} + +func isBadgerInternalKey(k []byte) bool { + if len(k) < len(badgerPrefix) { + return false + } + for i := range badgerPrefix { + if k[i] != badgerPrefix[i] { + return false + } + } + return true +} + +// TestRegressionIsBannedNoBans locks in iter1's fast-path correctness: +// when no namespaces have been banned, isBanned must return nil for any key +// (regardless of NamespaceOffset). Exercises the hasAny.Load()==false branch. +func TestRegressionIsBannedNoBans(t *testing.T) { + opt := getTestOptions("") + opt.NamespaceOffset = 0 + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + // hasAny is false at this point - never any bans on a fresh DB. + require.False(t, db.bannedNamespaces.hasAny.Load()) + + // isBanned must return nil for any key on a fresh DB. + key := y.KeyWithTs([]byte("hello-world-12345"), 1) + require.NoError(t, db.isBanned(key)) + + // Empty key, short key - all fine, fast path doesn't even reach the + // length check. + require.NoError(t, db.isBanned(nil)) + require.NoError(t, db.isBanned([]byte("x"))) + }) +} + +// TestRegressionIsBannedWithBan covers the slow-path: hasAny=true, key matches +// a banned namespace. +func TestRegressionIsBannedWithBan(t *testing.T) { + opt := getTestOptions("") + opt.NamespaceOffset = 0 + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + const ns = uint64(0x4242) + require.NoError(t, db.BanNamespace(ns)) + require.True(t, db.bannedNamespaces.hasAny.Load(), + "hasAny must flip to true after add()") + + // Key in banned namespace: ns bytes (big-endian) at offset 0 + suffix + // + 8B ts. + banned := y.KeyWithTs(append(y.U64ToBytes(ns), []byte("suffix")...), 1) + require.ErrorIs(t, db.isBanned(banned), ErrBannedKey) + + // Key in a different namespace: not banned. + other := y.KeyWithTs(append(y.U64ToBytes(0x1111), []byte("suffix")...), 1) + require.NoError(t, db.isBanned(other)) + }) +} + +// TestRegressionIsBannedShortKeyWithBans covers the slow path early-return when +// hasAny=true but the key is too short to extract a namespace. +func TestRegressionIsBannedShortKeyWithBans(t *testing.T) { + opt := getTestOptions("") + opt.NamespaceOffset = 0 + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + const ns = uint64(0x4242) + require.NoError(t, db.BanNamespace(ns)) + + // Key shorter than NamespaceOffset+8 should be accepted (nil) — the + // pre-existing semantics treat un-namespaceable keys as non-banned. + require.NoError(t, db.isBanned([]byte("short"))) + require.NoError(t, db.isBanned(nil)) + }) +} + +// TestRegressionIsBannedNegativeOffset covers the early-return: NamespaceOffset<0 +// short-circuits before any atomic load. +func TestRegressionIsBannedNegativeOffset(t *testing.T) { + opt := getTestOptions("") + opt.NamespaceOffset = -1 + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + require.NoError(t, db.isBanned([]byte("anything"))) + require.NoError(t, db.isBanned(nil)) + }) +} + +// TestRegressionFillNonKeyOnly exercises the non-KeyOnly fill path (iter2's +// hoist), verifying that values are still copied correctly when KeyOnly=false. +// This is the path that all existing badger users hit by default. +func TestRegressionFillNonKeyOnly(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + txnSet(t, db, []byte("k1"), []byte("expected-value-bytes"), 0) + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.PrefetchValues = false // exercise yieldItemValue path + it := txn.NewIterator(opt) + defer it.Close() + it.Rewind() + require.True(t, it.Valid()) + got, err := it.Item().ValueCopy(nil) + require.NoError(t, err) + require.Equal(t, []byte("expected-value-bytes"), got) + return nil + })) + }) +} + +// TestRegressionPrefetchValuesTrue exercises the prefetch goroutine path, +// confirming KeyOnly=false + PrefetchValues=true still works (iter2 + iter4 +// didn't break the default). +func TestRegressionPrefetchValuesTrue(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + for i := 0; i < 5; i++ { + txnSet(t, db, []byte(fmt.Sprintf("k%d", i)), []byte(fmt.Sprintf("v%d", i)), 0) + } + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.PrefetchValues = true + opt.PrefetchSize = 10 + it := txn.NewIterator(opt) + defer it.Close() + seen := 0 + for it.Rewind(); it.Valid(); it.Next() { + item := it.Item() + v, err := item.ValueCopy(nil) + require.NoError(t, err) + require.NotEmpty(t, v) + seen++ + } + require.Equal(t, 5, seen) + return nil + })) + }) +}