diff --git a/db.go b/db.go index 8a1021308..129607a32 100644 --- a/db.go +++ b/db.go @@ -52,12 +52,19 @@ type closers struct { type lockedKeys struct { sync.RWMutex keys map[uint64]struct{} + // hasAny is a fast-path flag: false until the first add(), then true forever. + // Hot-path callers (DB.isBanned) check this without taking the lock so the + // common case (empty ban set) costs one atomic load instead of an + // RLock/RUnlock pair plus a map lookup. There is no remove API, so the + // flag is monotonic and never needs to flip back to false. + hasAny atomic.Bool } func (lk *lockedKeys) add(key uint64) { lk.Lock() defer lk.Unlock() lk.keys[key] = struct{}{} + lk.hasAny.Store(true) } func (lk *lockedKeys) has(key uint64) bool { @@ -1846,6 +1853,14 @@ func (db *DB) isBanned(key []byte) error { if db.opt.NamespaceOffset < 0 { return nil } + // Fast path: no namespaces have ever been banned in this DB lifetime + // (the common production case). Skip the slice + lookup + lock entirely. + // isBanned is called on every iterator step and every Txn.Get/modify, so + // avoiding the RLock here matters when NamespaceOffset is enabled but no + // bans are active. + if !db.bannedNamespaces.hasAny.Load() { + return nil + } if len(key) <= db.opt.NamespaceOffset+8 { return nil } diff --git a/errors.go b/errors.go index dcf0d12ae..40433dcb6 100644 --- a/errors.go +++ b/errors.go @@ -46,6 +46,14 @@ var ( // ErrBannedKey is returned if the read/write key belongs to any banned namespace. ErrBannedKey = stderrors.New("Key is using the banned prefix") + // ErrKeyOnlyMode is returned by Item.Value and Item.ValueCopy when the + // containing iterator was created with IteratorOptions.KeyOnly=true. + // In that mode the iterator never copies the value bytes into the Item + // (the main reason to use KeyOnly is to avoid that per-item copy on + // key-only scans), so value access is unavailable on those items. + ErrKeyOnlyMode = stderrors.New( + "Item value is unavailable in KeyOnly iterator mode") + // ErrThresholdZero is returned if threshold is set to zero, and value log GC is called. // In such a case, GC can't be run. ErrThresholdZero = stderrors.New( diff --git a/iterator.go b/iterator.go index f57cfa4c9..379b5e5dc 100644 --- a/iterator.go +++ b/iterator.go @@ -43,6 +43,11 @@ type Item struct { status prefetchStatus meta byte // We need to store meta to know about bitValuePointer. userMeta byte + // keyOnly is true when the parent iterator was created with + // IteratorOptions.KeyOnly. The iterator skips copying value bytes into + // this item, so Item.Value/ValueCopy and the size estimators must + // short-circuit instead of touching the (nil) vptr. + keyOnly bool } // String returns a string representation of Item @@ -81,6 +86,9 @@ func (item *Item) Version() uint64 { // instead, or copy it yourself. Value might change once discard or commit is called. // Use ValueCopy if you want to do a Set after Get. func (item *Item) Value(fn func(val []byte) error) error { + if item.keyOnly { + return ErrKeyOnlyMode + } item.wg.Wait() if item.status == prefetched { if item.err == nil && fn != nil { @@ -108,6 +116,9 @@ func (item *Item) Value(fn func(val []byte) error) error { // This function is useful in long running iterate/update transactions to avoid a write deadlock. // See Github issue: https://github.com/dgraph-io/badger/issues/315 func (item *Item) ValueCopy(dst []byte) ([]byte, error) { + if item.keyOnly { + return nil, ErrKeyOnlyMode + } item.wg.Wait() if item.status == prefetched { return y.SafeCopy(dst, item.val), item.err @@ -213,7 +224,14 @@ func (item *Item) prefetchValue() { // This can be called while iterating through a store to quickly estimate the // size of a range of key-value pairs (without fetching the corresponding // values). +// +// When the iterator was created with IteratorOptions.KeyOnly=true, the +// value bytes (and value pointer for vlog entries) are not retained on +// the item, so this returns the key size only. func (item *Item) EstimatedSize() int64 { + if item.keyOnly { + return int64(len(item.key)) + } if !item.hasValue() { return 0 } @@ -235,7 +253,13 @@ func (item *Item) KeySize() int64 { // // This can be called to quickly estimate the size of a value without fetching // it. +// +// When the iterator was created with IteratorOptions.KeyOnly=true the value +// length is not retained on the item; this returns 0. func (item *Item) ValueSize() int64 { + if item.keyOnly { + return 0 + } if !item.hasValue() { return 0 } @@ -312,6 +336,17 @@ type IteratorOptions struct { AllVersions bool // Fetch all valid versions of the same key. InternalAccess bool // Used to allow internal access to badger keys. + // KeyOnly tells the iterator that the caller will not access value bytes + // from any item. When set, the iterator skips copying value bytes into + // the Item, saving a per-item memcpy on key-only forward scans (e.g. + // dgraph's has() predicate evaluator and index scans). The trade-off: + // Item.Value and Item.ValueCopy return ErrKeyOnlyMode, and + // Item.ValueSize / Item.EstimatedSize report 0. Item.Key, Version, + // UserMeta, ExpiresAt and IsDeletedOrExpired continue to work normally. + // + // PrefetchValues is forced to false when KeyOnly is true. + KeyOnly bool + // The following option is used to narrow down the SSTables that iterator // picks up. If Prefix is specified, only tables which could have this // prefix are picked based on their range of keys. @@ -433,7 +468,18 @@ type Iterator struct { data list waste list - lastKey []byte // Used to skip over multiple versions of the same key. + // lastKey stores the user-key (no 8-byte timestamp suffix) of the most + // recently considered candidate, used to skip subsequent older versions + // of the same user-key on AllVersions=false forward scans. Storing the + // user-key only avoids one ParseKey per same-key compare and a per-item + // 8-byte memcpy on the update. + lastKey []byte + + // canSeeInternalKeys is true when this iterator can possibly surface a + // badger-internal key (e.g. "!badger!banned"). When false, parseItem + // can skip the per-step bytes.HasPrefix(key, badgerPrefix) check. + // Computed once at construction from opt.Prefix. + canSeeInternalKeys bool closed bool scanned int // Used to estimate the size of data scanned by iterator. @@ -464,6 +510,12 @@ func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator { panic(ErrDBClosed) } + // KeyOnly disables value access, so prefetching values is nonsensical. + // Force PrefetchValues off so the prefetch goroutine is never started. + if opt.KeyOnly { + opt.PrefetchValues = false + } + y.NumIteratorsCreatedAdd(txn.db.opt.MetricsEnabled, 1) // Keep track of the number of active iterators. @@ -482,14 +534,26 @@ func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator { } iters = txn.db.lc.appendIterators(iters, &opt) // This will increment references. res := &Iterator{ - txn: txn, - iitr: table.NewMergeIterator(iters, opt.Reverse), - opt: opt, - readTs: txn.readTs, + txn: txn, + iitr: table.NewMergeIterator(iters, opt.Reverse), + opt: opt, + readTs: txn.readTs, + canSeeInternalKeys: canSeeInternalKeys(opt.Prefix), } return res } +// canSeeInternalKeys reports whether an iterator with the given prefix can +// possibly surface a badger-internal key. Internal keys all live under the +// fixed badgerPrefix; if the user's prefix exists and starts with a +// different byte than badgerPrefix[0], no internal key can match. +func canSeeInternalKeys(prefix []byte) bool { + if len(prefix) == 0 { + return true + } + return prefix[0] == badgerPrefix[0] +} + // NewKeyIterator is just like NewIterator, but allows the user to iterate over all versions of a // single key. Internally, it sets the Prefix option in provided opt, and uses that prefix to // additionally run bloom filter lookups before picking tables from the LSM tree. @@ -616,11 +680,18 @@ func (it *Iterator) parseItem() bool { } } - isInternalKey := bytes.HasPrefix(key, badgerPrefix) - // Skip badger keys. - if !it.opt.InternalAccess && isInternalKey { - mi.Next() - return false + // Detect badger-internal keys. When canSeeInternalKeys is false (the + // common case for prefix-bounded user scans whose prefix cannot collide + // with badgerPrefix), we know the current key cannot be internal and + // elide the per-step bytes.HasPrefix(key, badgerPrefix) probe. + var isInternalKey bool + if it.canSeeInternalKeys { + isInternalKey = bytes.HasPrefix(key, badgerPrefix) + // Skip badger keys. + if !it.opt.InternalAccess && isInternalKey { + mi.Next() + return false + } } // Skip any versions which are beyond the readTs. @@ -640,8 +711,9 @@ func (it *Iterator) parseItem() bool { if it.opt.AllVersions { // Return deleted or expired values also, otherwise user can't figure out // whether the key was deleted. + vs := mi.Value() item := it.newItem() - it.fill(item) + it.fill(item, key, &vs) setItem(item) mi.Next() return true @@ -650,7 +722,18 @@ func (it *Iterator) parseItem() bool { // If iterating in forward direction, then just checking the last key against current key would // be sufficient. if !it.opt.Reverse { - if y.SameKey(it.lastKey, key) { + // lastKey holds the user-key only. Compare against the user-key + // portion of the current full key (last 8 bytes are the ts). + // bytes.Equal already short-circuits on length mismatch, but the + // explicit length check lets the compiler hoist the bounds check + // out of the user-key slice and keeps the hot path branch-tight. + // + // len(key) >= 8 is a badger-wide invariant: every key in the LSM is + // stored with an 8-byte timestamp suffix via y.KeyWithTs, and + // y.ParseTs(key) above already relies on this (it indexes + // key[len(key)-8:]). No defensive check is needed here. + ukLen := len(key) - 8 + if ukLen == len(it.lastKey) && bytes.Equal(key[:ukLen], it.lastKey) { mi.Next() return false } @@ -659,11 +742,16 @@ func (it *Iterator) parseItem() bool { // Consider keys: a 5, b 7 (del), b 5. When iterating, lastKey = a. // Then we see b 7, which is deleted. If we don't store lastKey = b, we'll then return b 5, // which is wrong. Therefore, update lastKey here. - it.lastKey = y.SafeCopy(it.lastKey, mi.Key()) + it.lastKey = y.SafeCopy(it.lastKey, key[:ukLen]) } FILL: - // If deleted, advance and return. + // Invariant on entry to FILL: `key` is mi.Key() at the *current* iitr + // position. The only goto FILL (below, reverse path) refreshes `key` + // after mi.Next(); the fall-through entry from above never advances the + // iterator between `key := mi.Key()` and reaching FILL. fill() can + // therefore safely reuse the caller-supplied key without re-calling + // mi.Key(). vs := mi.Value() if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) { mi.Next() @@ -671,7 +759,7 @@ FILL: } item := it.newItem() - it.fill(item) + it.fill(item, key, &vs) // fill item based on current cursor position. All Next calls have returned, so reaching here // means no Next was called. @@ -681,9 +769,11 @@ FILL: return true } - // Reverse direction. - nextTs := y.ParseTs(mi.Key()) - mik := y.ParseKey(mi.Key()) + // Reverse direction. Refresh key after the Next() above; the iterator + // has advanced, so the previous `key` slice now refers to a later block. + key = mi.Key() + nextTs := y.ParseTs(key) + mik := y.ParseKey(key) if nextTs <= it.readTs && bytes.Equal(mik, item.key) { // This is a valid potential candidate. goto FILL @@ -693,17 +783,32 @@ FILL: return true } -func (it *Iterator) fill(item *Item) { - vs := it.iitr.Value() +// fill populates item from the current iterator position. Callers pass the +// already-fetched key and value pointer to avoid the per-item cost of +// calling mi.Key() / mi.Value() (and decoding ValueStruct) a second time +// on the hot iterator path. vs is passed by pointer to avoid copying the +// ~40-byte ValueStruct on every kept item. +func (it *Iterator) fill(item *Item, key []byte, vs *y.ValueStruct) { item.meta = vs.Meta item.userMeta = vs.UserMeta item.expiresAt = vs.ExpiresAt + item.keyOnly = it.opt.KeyOnly - item.version = y.ParseTs(it.iitr.Key()) - item.key = y.SafeCopy(item.key, y.ParseKey(it.iitr.Key())) + item.version = y.ParseTs(key) + item.key = y.SafeCopy(item.key, y.ParseKey(key)) - item.vptr = y.SafeCopy(item.vptr, vs.Value) item.val = nil + if it.opt.KeyOnly { + // Don't copy vs.Value: KeyOnly callers have promised not to read + // it, and the SafeCopy is the largest per-item memmove on the + // key-only forward-scan hot path. nil out any leftover capacity + // from a previous item that was reused via the iterator's + // freelist; callers that ignore the contract will at least see a + // nil vptr rather than stale bytes. + item.vptr = nil + } else { + item.vptr = y.SafeCopy(item.vptr, vs.Value) + } if it.opt.PrefetchValues { item.wg.Add(1) go func() { diff --git a/iterator_dgraph_bench_test.go b/iterator_dgraph_bench_test.go new file mode 100644 index 000000000..61bd9d3a2 --- /dev/null +++ b/iterator_dgraph_bench_test.go @@ -0,0 +1,340 @@ +/* + * SPDX-FileCopyrightText: © 2017-2025 Istari Digital, Inc. + * SPDX-License-Identifier: Apache-2.0 + */ + +package badger + +import ( + "encoding/binary" + "math" + "math/rand" + "os" + "testing" + + "github.com/dgraph-io/badger/v4/y" +) + +// dgraph-shaped iterator micro-benchmarks. +// +// These benchmarks model the hot iterator paths in dgraph's posting layer +// (posting/mvcc.go: ReadPostingList, IterateDisk, sort.go: index bucket scan). +// They are intentionally distinct from the existing root-package benchmarks +// because none of those exercise the combination dgraph actually uses: +// +// - NamespaceOffset = 1 (so DB.isBanned runs on every key) +// - NumVersionsToKeep = math.MaxInt32 (dgraph keeps all versions) +// - DetectConflicts = false (dgraph owns OCC) +// - PrefetchValues = false (key-only scans in hot paths) +// - AllVersions = true (MVCC-aware reads in ReadPostingList / rollup) +// - Prefix-bounded scans over mid-sized structured keys +// +// Key layout mirrors dgraph's data keys (see x/keys.go in dgraph): +// +// byte 0 : type prefix (0x00 = data) +// bytes 1..8 : namespace, big-endian uint64 +// bytes 9..10 : 2-byte attr length, big-endian uint16 +// bytes 11.. : attr name (variable, ~16 bytes typical) +// next 1 byte : subtype +// last 8 bytes: UID, big-endian +// +// Total: 1 + 8 + 2 + len(attr) + 1 + 8 ≈ 36 bytes for a 16-byte attr. + +const ( + dgKeyTypeData = 0x00 + dgAttrName = "predicate_namespaced" // 20 bytes -> 40-byte keys +) + +// dgKey constructs a dgraph-shaped data key with the given namespace, attr, +// subtype, and UID. +func dgKey(ns uint64, attr string, subtype byte, uid uint64) []byte { + buf := make([]byte, 1+8+2+len(attr)+1+8) + buf[0] = dgKeyTypeData + binary.BigEndian.PutUint64(buf[1:9], ns) + binary.BigEndian.PutUint16(buf[9:11], uint16(len(attr))) + copy(buf[11:11+len(attr)], attr) + buf[11+len(attr)] = subtype + binary.BigEndian.PutUint64(buf[12+len(attr):], uid) + return buf +} + +// dgPrefix returns the (ns + attr) prefix shared by all keys for one predicate. +// This is what dgraph iterates with for prefix-bounded scans. +func dgPrefix(ns uint64, attr string) []byte { + buf := make([]byte, 1+8+2+len(attr)) + buf[0] = dgKeyTypeData + binary.BigEndian.PutUint64(buf[1:9], ns) + binary.BigEndian.PutUint16(buf[9:11], uint16(len(attr))) + copy(buf[11:], attr) + return buf +} + +// dgraphTestOptions returns the badger options that match dgraph's +// production configuration: managed DB, all-versions retention, no conflict +// detection, namespace offset = 1. +func dgraphTestOptions(dir string) Options { + return DefaultOptions(dir). + WithSyncWrites(false). + WithLoggingLevel(WARNING). + WithNumVersionsToKeep(math.MaxInt32). + WithDetectConflicts(false). + WithNamespaceOffset(1) +} + +// dgraphLoadDB populates a managed DB with `nKeys` unique keys under a single +// predicate, each carrying `versionsPerKey` MVCC versions. UIDs are dense +// (1..nKeys) so prefix scans land contiguous keys, mirroring dgraph's +// `has()` predicate scans. +func dgraphLoadDB(b *testing.B, db *DB, ns uint64, attr string, nKeys, versionsPerKey int) { + b.Helper() + // Small value: realistic for posting-list deltas (a few hundred bytes is + // the upper end; we use 64 bytes to keep total disk footprint bounded). + val := make([]byte, 64) + for i := range val { + val[i] = byte(i) + } + + const batch = 4000 + commitTs := uint64(1) + for start := 0; start < nKeys; start += batch { + end := start + batch + if end > nKeys { + end = nKeys + } + // Write `versionsPerKey` versions of each key in this batch. + for v := 0; v < versionsPerKey; v++ { + txn := db.NewTransactionAt(math.MaxUint64, true) + for i := start; i < end; i++ { + key := dgKey(ns, attr, 0, uint64(i+1)) + if err := txn.SetEntry(&Entry{Key: key, Value: val}); err != nil { + b.Fatalf("SetEntry: %v", err) + } + } + if err := txn.CommitAt(commitTs, nil); err != nil { + b.Fatalf("CommitAt: %v", err) + } + commitTs++ + } + } + // Force everything to disk so iteration reflects the on-disk format + // (avoids measuring only the memtable hit). + if err := db.Flatten(2); err != nil { + b.Fatalf("Flatten: %v", err) + } +} + +// openDgraphDB opens a managed DB with the dgraph-shaped options. +func openDgraphDB(b *testing.B) (*DB, string) { + b.Helper() + dir, err := os.MkdirTemp(".", "badger-dgraph-bench") + y.Check(err) + db, err := OpenManaged(dgraphTestOptions(dir)) + y.Check(err) + return db, dir +} + +// BenchmarkDgraphPrefixScanKeyOnly models the IterateDisk / index-scan path: +// prefix-bounded forward iteration with PrefetchValues=false, one version per +// key. This is what dgraph's `has()` predicate evaluator and sort.go index +// scans drive. +func BenchmarkDgraphPrefixScanKeyOnly(b *testing.B) { + const ( + ns = uint64(0x0102030405060708) + nKeys = 200_000 + ) + db, dir := openDgraphDB(b) + defer func() { db.Close(); removeDir(dir) }() + dgraphLoadDB(b, db, ns, dgAttrName, nKeys, 1) + + prefix := dgPrefix(ns, dgAttrName) + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + txn := db.NewTransactionAt(math.MaxUint64, false) + opt := DefaultIteratorOptions + opt.Prefix = prefix + opt.PrefetchValues = false + it := txn.NewIterator(opt) + count := 0 + for it.Rewind(); it.Valid(); it.Next() { + count++ + } + it.Close() + txn.Discard() + if count != nKeys { + b.Fatalf("expected %d keys, got %d", nKeys, count) + } + } + b.ReportMetric(float64(nKeys), "keys/op") +} + +// BenchmarkDgraphPrefixScanKeyOnlyOpt is the same workload as +// BenchmarkDgraphPrefixScanKeyOnly but uses IteratorOptions.KeyOnly=true. +// This measures the win from skipping the per-item SafeCopy(vptr) on the +// has()/index-scan hot path; callers in this mode never read item.Value. +func BenchmarkDgraphPrefixScanKeyOnlyOpt(b *testing.B) { + const ( + ns = uint64(0x0102030405060708) + nKeys = 200_000 + ) + db, dir := openDgraphDB(b) + defer func() { db.Close(); removeDir(dir) }() + dgraphLoadDB(b, db, ns, dgAttrName, nKeys, 1) + + prefix := dgPrefix(ns, dgAttrName) + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + txn := db.NewTransactionAt(math.MaxUint64, false) + opt := DefaultIteratorOptions + opt.Prefix = prefix + opt.KeyOnly = true + it := txn.NewIterator(opt) + count := 0 + for it.Rewind(); it.Valid(); it.Next() { + count++ + } + it.Close() + txn.Discard() + if count != nKeys { + b.Fatalf("expected %d keys, got %d", nKeys, count) + } + } + b.ReportMetric(float64(nKeys), "keys/op") +} + +// BenchmarkDgraphPrefixScanAllVersions models the rollup path: +// prefix-bounded forward iteration with PrefetchValues=false and +// AllVersions=true, where each key has several MVCC versions. +func BenchmarkDgraphPrefixScanAllVersions(b *testing.B) { + const ( + ns = uint64(0x0102030405060708) + nKeys = 50_000 + versionsPerKey = 5 + ) + db, dir := openDgraphDB(b) + defer func() { db.Close(); removeDir(dir) }() + dgraphLoadDB(b, db, ns, dgAttrName, nKeys, versionsPerKey) + + prefix := dgPrefix(ns, dgAttrName) + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + txn := db.NewTransactionAt(math.MaxUint64, false) + opt := DefaultIteratorOptions + opt.Prefix = prefix + opt.PrefetchValues = false + opt.AllVersions = true + it := txn.NewIterator(opt) + count := 0 + for it.Rewind(); it.Valid(); it.Next() { + count++ + } + it.Close() + txn.Discard() + // Should see nKeys * versionsPerKey items. + if count != nKeys*versionsPerKey { + b.Fatalf("expected %d items, got %d", nKeys*versionsPerKey, count) + } + } + b.ReportMetric(float64(nKeys*versionsPerKey), "items/op") +} + +// BenchmarkDgraphKeyIteratorAllVersions models readFromDisk: +// a NewKeyIterator over a single posting key with AllVersions=true. dgraph +// invokes this on every cache miss in the posting layer. +func BenchmarkDgraphKeyIteratorAllVersions(b *testing.B) { + const ( + ns = uint64(0x0102030405060708) + nKeys = 20_000 + versionsPerKey = 8 + ) + db, dir := openDgraphDB(b) + defer func() { db.Close(); removeDir(dir) }() + dgraphLoadDB(b, db, ns, dgAttrName, nKeys, versionsPerKey) + + // Pre-build the lookup key set so the random selection cost stays out + // of the measured loop. + keys := make([][]byte, nKeys) + for i := 0; i < nKeys; i++ { + keys[i] = dgKey(ns, dgAttrName, 0, uint64(i+1)) + } + rng := rand.New(rand.NewSource(1)) + + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + key := keys[rng.Intn(nKeys)] + txn := db.NewTransactionAt(math.MaxUint64, false) + opt := DefaultIteratorOptions + opt.PrefetchValues = false + opt.AllVersions = true + it := txn.NewKeyIterator(key, opt) + count := 0 + for it.Seek(key); it.Valid(); it.Next() { + count++ + } + it.Close() + txn.Discard() + if count != versionsPerKey { + b.Fatalf("expected %d versions, got %d", versionsPerKey, count) + } + } +} + +// BenchmarkDgraphPrefixScanNamespaceOffset measures the cost of +// NamespaceOffset=1 vs NamespaceOffset=-1 (off) under identical workload. +// This isolates the per-key cost of DB.isBanned, which today takes an +// RWMutex on every iterator step even when no namespaces are banned. +func BenchmarkDgraphPrefixScanNamespaceOffset(b *testing.B) { + const ( + ns = uint64(0x0102030405060708) + nKeys = 100_000 + ) + cases := []struct { + name string + nsOff int // -1 = off, 1 = dgraph-style + }{ + {"off", -1}, + {"dgraph", 1}, + } + + for _, c := range cases { + b.Run(c.name, func(b *testing.B) { + dir, err := os.MkdirTemp(".", "badger-dgraph-nsoff") + y.Check(err) + defer removeDir(dir) + opts := DefaultOptions(dir). + WithSyncWrites(false). + WithLoggingLevel(WARNING). + WithNumVersionsToKeep(math.MaxInt32). + WithDetectConflicts(false). + WithNamespaceOffset(c.nsOff) + db, err := OpenManaged(opts) + y.Check(err) + defer db.Close() + dgraphLoadDB(b, db, ns, dgAttrName, nKeys, 1) + + prefix := dgPrefix(ns, dgAttrName) + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + txn := db.NewTransactionAt(math.MaxUint64, false) + opt := DefaultIteratorOptions + opt.Prefix = prefix + opt.PrefetchValues = false + it := txn.NewIterator(opt) + count := 0 + for it.Rewind(); it.Valid(); it.Next() { + count++ + } + it.Close() + txn.Discard() + if count != nKeys { + b.Fatalf("expected %d keys, got %d", nKeys, count) + } + } + }) + } +} + diff --git a/iterator_keyonly_test.go b/iterator_keyonly_test.go new file mode 100644 index 000000000..8a7d1c601 --- /dev/null +++ b/iterator_keyonly_test.go @@ -0,0 +1,399 @@ +/* + * SPDX-FileCopyrightText: © 2017-2025 Istari Digital, Inc. + * SPDX-License-Identifier: Apache-2.0 + */ + +package badger + +import ( + "errors" + "fmt" + "testing" + + "github.com/dgraph-io/badger/v4/y" + "github.com/stretchr/testify/require" +) + +// TestKeyOnlyIterator_ValueReturnsErrKeyOnlyMode covers iterator.go:Item.Value +// short-circuit when item.keyOnly is set (iter4). +func TestKeyOnlyIterator_ValueReturnsErrKeyOnlyMode(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + txnSet(t, db, []byte("k1"), []byte("v1"), 0) + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.KeyOnly = true + it := txn.NewIterator(opt) + defer it.Close() + for it.Rewind(); it.Valid(); it.Next() { + item := it.Item() + err := item.Value(func(v []byte) error { return nil }) + require.True(t, errors.Is(err, ErrKeyOnlyMode), + "Value() should return ErrKeyOnlyMode, got %v", err) + } + return nil + })) + }) +} + +// TestKeyOnlyIterator_ValueCopyReturnsErrKeyOnlyMode covers iterator.go:Item.ValueCopy. +func TestKeyOnlyIterator_ValueCopyReturnsErrKeyOnlyMode(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + txnSet(t, db, []byte("k1"), []byte("v1"), 0) + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.KeyOnly = true + it := txn.NewIterator(opt) + defer it.Close() + for it.Rewind(); it.Valid(); it.Next() { + item := it.Item() + buf, err := item.ValueCopy(nil) + require.True(t, errors.Is(err, ErrKeyOnlyMode)) + require.Nil(t, buf) + } + return nil + })) + }) +} + +// TestKeyOnlyIterator_EstimatedSizeReturnsKeyLen covers iterator.go:Item.EstimatedSize +// short-circuit (returns int64(len(item.key)) when keyOnly). +func TestKeyOnlyIterator_EstimatedSizeReturnsKeyLen(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + txnSet(t, db, []byte("abc"), []byte("lots-of-value-bytes-here"), 0) + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.KeyOnly = true + it := txn.NewIterator(opt) + defer it.Close() + it.Rewind() + require.True(t, it.Valid()) + item := it.Item() + require.Equal(t, int64(len("abc")), item.EstimatedSize()) + return nil + })) + }) +} + +// TestKeyOnlyIterator_ValueSizeIsZero covers iterator.go:Item.ValueSize short-circuit. +func TestKeyOnlyIterator_ValueSizeIsZero(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + txnSet(t, db, []byte("k1"), []byte("hello-world"), 0) + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.KeyOnly = true + it := txn.NewIterator(opt) + defer it.Close() + it.Rewind() + require.True(t, it.Valid()) + require.Equal(t, int64(0), it.Item().ValueSize()) + return nil + })) + }) +} + +// TestKeyOnlyIterator_ForcesPrefetchOff covers iterator.go:NewIterator forcing +// PrefetchValues=false when KeyOnly=true. +func TestKeyOnlyIterator_ForcesPrefetchOff(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + txnSet(t, db, []byte("k1"), []byte("v1"), 0) + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.KeyOnly = true + opt.PrefetchValues = true + opt.PrefetchSize = 100 + it := txn.NewIterator(opt) + defer it.Close() + require.False(t, it.opt.PrefetchValues, + "NewIterator must force PrefetchValues=false when KeyOnly=true") + it.Rewind() + require.True(t, it.Valid()) + return nil + })) + }) +} + +// TestKeyOnlyIterator_KeyMetaVersionWork verifies that the metadata methods on Item +// still work correctly in KeyOnly mode (per the contract documented on KeyOnly). +// Indirectly covers fill() setting item.keyOnly while still populating meta/version. +func TestKeyOnlyIterator_KeyMetaVersionWork(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + key := []byte("hello") + val := []byte("world") + txn := db.NewTransaction(true) + require.NoError(t, txn.SetEntry(NewEntry(key, val).WithMeta(0x42))) + require.NoError(t, txn.Commit()) + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.KeyOnly = true + it := txn.NewIterator(opt) + defer it.Close() + it.Rewind() + require.True(t, it.Valid()) + item := it.Item() + require.Equal(t, key, item.Key()) + require.Equal(t, byte(0x42), item.UserMeta()) + require.NotZero(t, item.Version()) + require.False(t, item.IsDeletedOrExpired()) + require.Equal(t, int64(len(key)), item.KeySize()) + require.False(t, item.DiscardEarlierVersions()) + return nil + })) + }) +} + +// TestKeyOnlyIterator_PrefetchValuesFalseStillWorks covers fill() KeyOnly branch +// without prefetch (the actual hot path; PrefetchValues=false bypasses the +// prefetch goroutine entirely). +func TestKeyOnlyIterator_PrefetchValuesFalseStillWorks(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + for i := 0; i < 20; i++ { + txnSet(t, db, []byte(fmt.Sprintf("k%02d", i)), []byte("vvv"), 0) + } + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.PrefetchValues = false + opt.KeyOnly = true + it := txn.NewIterator(opt) + defer it.Close() + count := 0 + for it.Rewind(); it.Valid(); it.Next() { + item := it.Item() + require.NotEmpty(t, item.Key()) + err := item.Value(func(v []byte) error { return nil }) + require.True(t, errors.Is(err, ErrKeyOnlyMode)) + count++ + } + require.Equal(t, 20, count) + return nil + })) + }) +} + +// TestCanSeeInternalKeys is a table-driven unit test for the iter3 helper. +// It is the single source of truth for which prefix shapes allow internal keys. +func TestCanSeeInternalKeys(t *testing.T) { + cases := []struct { + name string + prefix []byte + want bool + }{ + {"empty prefix sees everything", nil, true}, + {"empty slice sees everything", []byte{}, true}, + {"prefix starting with '!' (badgerPrefix[0]) may overlap", []byte("!badger"), true}, + {"prefix starting with 0x21 (== '!') overlap", []byte{0x21, 0xff}, true}, + {"prefix starting with 0x00 cannot overlap", []byte{0x00, 0x01, 0x02}, false}, + {"prefix starting with 'a' cannot overlap", []byte("abc"), false}, + {"prefix starting with high byte cannot overlap", []byte{0xff}, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := canSeeInternalKeys(tc.prefix) + require.Equal(t, tc.want, got) + }) + } +} + +// TestRegressionSameKeyDedup locks in the iter5 lastKey behavior: when multiple +// versions of the same user-key exist and AllVersions=false, the iterator must +// surface exactly one item per user-key (the freshest non-expired one). This +// test exercises the user-key-only comparison path. +func TestRegressionSameKeyDedup(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + // Write three versions of "k1" and three versions of "k2". + for _, k := range []string{"k1", "k2"} { + for v := 1; v <= 3; v++ { + txnSet(t, db, []byte(k), []byte(fmt.Sprintf("v%d", v)), 0) + } + } + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.PrefetchValues = false + it := txn.NewIterator(opt) + defer it.Close() + var seen []string + for it.Rewind(); it.Valid(); it.Next() { + seen = append(seen, string(it.Item().Key())) + } + require.Equal(t, []string{"k1", "k2"}, seen, + "non-AllVersions iterator must dedup to one item per user-key") + return nil + })) + + // AllVersions=true should see all 3 versions per key. + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.AllVersions = true + opt.PrefetchValues = false + it := txn.NewIterator(opt) + defer it.Close() + counts := map[string]int{} + for it.Rewind(); it.Valid(); it.Next() { + counts[string(it.Item().Key())]++ + } + require.Equal(t, 3, counts["k1"]) + require.Equal(t, 3, counts["k2"]) + return nil + })) + }) +} + +// TestRegressionInternalKeysHidden locks in iter3 behavior: a default user +// iterator (no InternalAccess) must not surface badger-internal keys even when +// the optimized canSeeInternalKeys path is taken (prefix nil or '!'). +func TestRegressionInternalKeysHidden(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + txnSet(t, db, []byte("user-key"), []byte("v"), 0) + + require.NoError(t, db.View(func(txn *Txn) error { + it := txn.NewIterator(DefaultIteratorOptions) + defer it.Close() + for it.Rewind(); it.Valid(); it.Next() { + k := it.Item().Key() + require.False(t, isBadgerInternalKey(k), + "default iterator surfaced internal key %q", k) + } + return nil + })) + }) +} + +func isBadgerInternalKey(k []byte) bool { + if len(k) < len(badgerPrefix) { + return false + } + for i := range badgerPrefix { + if k[i] != badgerPrefix[i] { + return false + } + } + return true +} + +// TestRegressionIsBannedNoBans locks in iter1's fast-path correctness: +// when no namespaces have been banned, isBanned must return nil for any key +// (regardless of NamespaceOffset). Exercises the hasAny.Load()==false branch. +func TestRegressionIsBannedNoBans(t *testing.T) { + opt := getTestOptions("") + opt.NamespaceOffset = 0 + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + // hasAny is false at this point - never any bans on a fresh DB. + require.False(t, db.bannedNamespaces.hasAny.Load()) + + // isBanned must return nil for any key on a fresh DB. + key := y.KeyWithTs([]byte("hello-world-12345"), 1) + require.NoError(t, db.isBanned(key)) + + // Empty key, short key - all fine, fast path doesn't even reach the + // length check. + require.NoError(t, db.isBanned(nil)) + require.NoError(t, db.isBanned([]byte("x"))) + }) +} + +// TestRegressionIsBannedWithBan covers the slow-path: hasAny=true, key matches +// a banned namespace. +func TestRegressionIsBannedWithBan(t *testing.T) { + opt := getTestOptions("") + opt.NamespaceOffset = 0 + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + const ns = uint64(0x4242) + require.NoError(t, db.BanNamespace(ns)) + require.True(t, db.bannedNamespaces.hasAny.Load(), + "hasAny must flip to true after add()") + + // Key in banned namespace: ns bytes (big-endian) at offset 0 + suffix + // + 8B ts. + banned := y.KeyWithTs(append(y.U64ToBytes(ns), []byte("suffix")...), 1) + require.ErrorIs(t, db.isBanned(banned), ErrBannedKey) + + // Key in a different namespace: not banned. + other := y.KeyWithTs(append(y.U64ToBytes(0x1111), []byte("suffix")...), 1) + require.NoError(t, db.isBanned(other)) + }) +} + +// TestRegressionIsBannedShortKeyWithBans covers the slow path early-return when +// hasAny=true but the key is too short to extract a namespace. +func TestRegressionIsBannedShortKeyWithBans(t *testing.T) { + opt := getTestOptions("") + opt.NamespaceOffset = 0 + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + const ns = uint64(0x4242) + require.NoError(t, db.BanNamespace(ns)) + + // Key shorter than NamespaceOffset+8 should be accepted (nil) — the + // pre-existing semantics treat un-namespaceable keys as non-banned. + require.NoError(t, db.isBanned([]byte("short"))) + require.NoError(t, db.isBanned(nil)) + }) +} + +// TestRegressionIsBannedNegativeOffset covers the early-return: NamespaceOffset<0 +// short-circuits before any atomic load. +func TestRegressionIsBannedNegativeOffset(t *testing.T) { + opt := getTestOptions("") + opt.NamespaceOffset = -1 + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + require.NoError(t, db.isBanned([]byte("anything"))) + require.NoError(t, db.isBanned(nil)) + }) +} + +// TestRegressionFillNonKeyOnly exercises the non-KeyOnly fill path (iter2's +// hoist), verifying that values are still copied correctly when KeyOnly=false. +// This is the path that all existing badger users hit by default. +func TestRegressionFillNonKeyOnly(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + txnSet(t, db, []byte("k1"), []byte("expected-value-bytes"), 0) + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.PrefetchValues = false // exercise yieldItemValue path + it := txn.NewIterator(opt) + defer it.Close() + it.Rewind() + require.True(t, it.Valid()) + got, err := it.Item().ValueCopy(nil) + require.NoError(t, err) + require.Equal(t, []byte("expected-value-bytes"), got) + return nil + })) + }) +} + +// TestRegressionPrefetchValuesTrue exercises the prefetch goroutine path, +// confirming KeyOnly=false + PrefetchValues=true still works (iter2 + iter4 +// didn't break the default). +func TestRegressionPrefetchValuesTrue(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + for i := 0; i < 5; i++ { + txnSet(t, db, []byte(fmt.Sprintf("k%d", i)), []byte(fmt.Sprintf("v%d", i)), 0) + } + + require.NoError(t, db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.PrefetchValues = true + opt.PrefetchSize = 10 + it := txn.NewIterator(opt) + defer it.Close() + seen := 0 + for it.Rewind(); it.Valid(); it.Next() { + item := it.Item() + v, err := item.ValueCopy(nil) + require.NoError(t, err) + require.NotEmpty(t, v) + seen++ + } + require.Equal(t, 5, seen) + return nil + })) + }) +}