Skip to content
Open
15 changes: 15 additions & 0 deletions db.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,19 @@ type closers struct {
// lockedKeys is a set of uint64 keys guarded by the embedded RWMutex.
//
// The zero value is ready to use: add allocates the backing map on first use.
type lockedKeys struct {
	sync.RWMutex
	// keys holds the members of the set. It must only be read or written
	// while the embedded mutex is held.
	keys map[uint64]struct{}
	// hasAny is a lock-free fast-path flag. It starts out false and add sets
	// it to true; add never resets it. Hot-path callers (DB.isBanned) load it
	// without taking the lock, so the common case of an empty set costs one
	// atomic load instead of an RLock/RUnlock pair plus a map lookup.
	hasAny atomic.Bool
}

// add inserts key into the set and then raises the hasAny fast-path flag.
//
// The flag is stored only after the map write, while the write lock is still
// held. A reader that observes hasAny == true and then takes the read lock
// therefore sees the key written by the add call that raised the flag.
func (lk *lockedKeys) add(key uint64) {
	lk.Lock()
	defer lk.Unlock()
	if lk.keys == nil {
		// Assigning into a nil map panics; allocate lazily so that a
		// zero-value lockedKeys can be used without a constructor.
		lk.keys = make(map[uint64]struct{})
	}
	lk.keys[key] = struct{}{}
	lk.hasAny.Store(true)
}

func (lk *lockedKeys) has(key uint64) bool {
Expand Down Expand Up @@ -1846,6 +1853,14 @@ func (db *DB) isBanned(key []byte) error {
if db.opt.NamespaceOffset < 0 {
return nil
}
// Fast path: no namespaces have ever been banned in this DB lifetime
// (the common production case). Skip the slice + lookup + lock entirely.
// isBanned is called on every iterator step and every Txn.Get/modify, so
// avoiding the RLock here matters when NamespaceOffset is enabled but no
// bans are active.
if !db.bannedNamespaces.hasAny.Load() {
return nil
}
if len(key) <= db.opt.NamespaceOffset+8 {
return nil
}
Expand Down
8 changes: 8 additions & 0 deletions errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@ var (
// ErrBannedKey is returned if the read/write key belongs to any banned namespace.
ErrBannedKey = stderrors.New("Key is using the banned prefix")

// ErrKeyOnlyMode is returned by Item.Value and Item.ValueCopy when the
// containing iterator was created with IteratorOptions.KeyOnly=true.
// In that mode the iterator never copies the value bytes into the Item
// (the main reason to use KeyOnly is to avoid that per-item copy on
// key-only scans), so value access is unavailable on those items.
ErrKeyOnlyMode = stderrors.New(
"Item value is unavailable in KeyOnly iterator mode")

// ErrThresholdZero is returned if threshold is set to zero, and value log GC is called.
// In such a case, GC can't be run.
ErrThresholdZero = stderrors.New(
Expand Down
151 changes: 128 additions & 23 deletions iterator.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ type Item struct {
status prefetchStatus
meta byte // We need to store meta to know about bitValuePointer.
userMeta byte
// keyOnly is true when the parent iterator was created with
// IteratorOptions.KeyOnly. The iterator skips copying value bytes into
// this item, so Item.Value/ValueCopy and the size estimators must
// short-circuit instead of touching the (nil) vptr.
keyOnly bool
}

// String returns a string representation of Item
Expand Down Expand Up @@ -81,6 +86,9 @@ func (item *Item) Version() uint64 {
// instead, or copy it yourself. Value might change once discard or commit is called.
// Use ValueCopy if you want to do a Set after Get.
func (item *Item) Value(fn func(val []byte) error) error {
if item.keyOnly {
return ErrKeyOnlyMode
}
item.wg.Wait()
if item.status == prefetched {
if item.err == nil && fn != nil {
Expand Down Expand Up @@ -108,6 +116,9 @@ func (item *Item) Value(fn func(val []byte) error) error {
// This function is useful in long running iterate/update transactions to avoid a write deadlock.
// See Github issue: https://github.com/dgraph-io/badger/issues/315
func (item *Item) ValueCopy(dst []byte) ([]byte, error) {
if item.keyOnly {
return nil, ErrKeyOnlyMode
}
item.wg.Wait()
if item.status == prefetched {
return y.SafeCopy(dst, item.val), item.err
Expand Down Expand Up @@ -213,7 +224,14 @@ func (item *Item) prefetchValue() {
// This can be called while iterating through a store to quickly estimate the
// size of a range of key-value pairs (without fetching the corresponding
// values).
//
// When the iterator was created with IteratorOptions.KeyOnly=true, the
// value bytes (and value pointer for vlog entries) are not retained on
// the item, so this returns the key size only.
func (item *Item) EstimatedSize() int64 {
if item.keyOnly {
return int64(len(item.key))
}
if !item.hasValue() {
return 0
}
Expand All @@ -235,7 +253,13 @@ func (item *Item) KeySize() int64 {
//
// This can be called to quickly estimate the size of a value without fetching
// it.
//
// When the iterator was created with IteratorOptions.KeyOnly=true the value
// length is not retained on the item; this returns 0.
func (item *Item) ValueSize() int64 {
if item.keyOnly {
return 0
}
if !item.hasValue() {
return 0
}
Expand Down Expand Up @@ -312,6 +336,17 @@ type IteratorOptions struct {
AllVersions bool // Fetch all valid versions of the same key.
InternalAccess bool // Used to allow internal access to badger keys.

// KeyOnly tells the iterator that the caller will not access value bytes
// from any item. When set, the iterator skips copying value bytes into
// the Item, saving a per-item memcpy on key-only forward scans (e.g.
// dgraph's has() predicate evaluator and index scans). The trade-off:
// Item.Value and Item.ValueCopy return ErrKeyOnlyMode, Item.ValueSize
// reports 0, and Item.EstimatedSize reports only the key size. Item.Key,
// Version, UserMeta, ExpiresAt and IsDeletedOrExpired continue to work
// normally.
//
// PrefetchValues is forced to false when KeyOnly is true.
KeyOnly bool

// The following option is used to narrow down the SSTables that iterator
// picks up. If Prefix is specified, only tables which could have this
// prefix are picked based on their range of keys.
Expand Down Expand Up @@ -433,7 +468,18 @@ type Iterator struct {
data list
waste list

lastKey []byte // Used to skip over multiple versions of the same key.
// lastKey stores the user-key (no 8-byte timestamp suffix) of the most
// recently considered candidate, used to skip subsequent older versions
// of the same user-key on AllVersions=false forward scans. Storing the
// user-key only avoids one ParseKey per same-key compare and a per-item
// 8-byte memcpy on the update.
lastKey []byte

// canSeeInternalKeys is true when this iterator can possibly surface a
// badger-internal key (e.g. "!badger!banned"). When false, parseItem
// can skip the per-step bytes.HasPrefix(key, badgerPrefix) check.
// Computed once at construction from opt.Prefix.
canSeeInternalKeys bool

closed bool
scanned int // Used to estimate the size of data scanned by iterator.
Expand Down Expand Up @@ -464,6 +510,12 @@ func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator {
panic(ErrDBClosed)
}

// KeyOnly disables value access, so prefetching values is nonsensical.
// Force PrefetchValues off so the prefetch goroutine is never started.
if opt.KeyOnly {
opt.PrefetchValues = false
}

y.NumIteratorsCreatedAdd(txn.db.opt.MetricsEnabled, 1)

// Keep track of the number of active iterators.
Expand All @@ -482,14 +534,26 @@ func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator {
}
iters = txn.db.lc.appendIterators(iters, &opt) // This will increment references.
res := &Iterator{
txn: txn,
iitr: table.NewMergeIterator(iters, opt.Reverse),
opt: opt,
readTs: txn.readTs,
txn: txn,
iitr: table.NewMergeIterator(iters, opt.Reverse),
opt: opt,
readTs: txn.readTs,
canSeeInternalKeys: canSeeInternalKeys(opt.Prefix),
}
return res
}

// canSeeInternalKeys tells whether a scan restricted to prefix might yield a
// badger-internal key, i.e. a key that begins with badgerPrefix.
//
// An empty (or nil) prefix places no restriction on the scan, so the answer
// is true. For a non-empty prefix only the first byte is compared against the
// first byte of badgerPrefix: a mismatch rules internal keys out. The check
// is deliberately conservative and answers true whenever the first bytes
// agree, even if later bytes of prefix diverge from badgerPrefix.
func canSeeInternalKeys(prefix []byte) bool {
	return len(prefix) == 0 || prefix[0] == badgerPrefix[0]
}

// NewKeyIterator is just like NewIterator, but allows the user to iterate over all versions of a
// single key. Internally, it sets the Prefix option in provided opt, and uses that prefix to
// additionally run bloom filter lookups before picking tables from the LSM tree.
Expand Down Expand Up @@ -616,11 +680,18 @@ func (it *Iterator) parseItem() bool {
}
}

isInternalKey := bytes.HasPrefix(key, badgerPrefix)
// Skip badger keys.
if !it.opt.InternalAccess && isInternalKey {
mi.Next()
return false
// Detect badger-internal keys. When canSeeInternalKeys is false (the
// common case for prefix-bounded user scans whose prefix cannot collide
// with badgerPrefix), we know the current key cannot be internal and
// elide the per-step bytes.HasPrefix(key, badgerPrefix) probe.
var isInternalKey bool
if it.canSeeInternalKeys {
isInternalKey = bytes.HasPrefix(key, badgerPrefix)
// Skip badger keys.
if !it.opt.InternalAccess && isInternalKey {
mi.Next()
return false
}
}

// Skip any versions which are beyond the readTs.
Expand All @@ -640,8 +711,9 @@ func (it *Iterator) parseItem() bool {
if it.opt.AllVersions {
// Return deleted or expired values also, otherwise user can't figure out
// whether the key was deleted.
vs := mi.Value()
item := it.newItem()
it.fill(item)
it.fill(item, key, &vs)
setItem(item)
mi.Next()
return true
Expand All @@ -650,7 +722,18 @@ func (it *Iterator) parseItem() bool {
// If iterating in forward direction, then just checking the last key against current key would
// be sufficient.
if !it.opt.Reverse {
if y.SameKey(it.lastKey, key) {
// lastKey holds the user-key only. Compare against the user-key
// portion of the current full key (last 8 bytes are the ts).
// bytes.Equal already short-circuits on length mismatch, but the
// explicit length check lets the compiler hoist the bounds check
// out of the user-key slice and keeps the hot path branch-tight.
//
// len(key) >= 8 is a badger-wide invariant: every key in the LSM is
// stored with an 8-byte timestamp suffix via y.KeyWithTs, and
// y.ParseTs(key) above already relies on this (it indexes
// key[len(key)-8:]). No defensive check is needed here.
ukLen := len(key) - 8
if ukLen == len(it.lastKey) && bytes.Equal(key[:ukLen], it.lastKey) {
mi.Next()
return false
}
Expand All @@ -659,19 +742,24 @@ func (it *Iterator) parseItem() bool {
// Consider keys: a 5, b 7 (del), b 5. When iterating, lastKey = a.
// Then we see b 7, which is deleted. If we don't store lastKey = b, we'll then return b 5,
// which is wrong. Therefore, update lastKey here.
it.lastKey = y.SafeCopy(it.lastKey, mi.Key())
it.lastKey = y.SafeCopy(it.lastKey, key[:ukLen])
}

FILL:
// If deleted, advance and return.
// Invariant on entry to FILL: `key` is mi.Key() at the *current* iitr
// position. The only goto FILL (below, reverse path) refreshes `key`
// after mi.Next(); the fall-through entry from above never advances the
// iterator between `key := mi.Key()` and reaching FILL. fill() can
// therefore safely reuse the caller-supplied key without re-calling
// mi.Key().
vs := mi.Value()
if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) {
mi.Next()
return false
}

item := it.newItem()
it.fill(item)
it.fill(item, key, &vs)
// fill item based on current cursor position. All Next calls have returned, so reaching here
// means no Next was called.

Expand All @@ -681,9 +769,11 @@ FILL:
return true
}

// Reverse direction.
nextTs := y.ParseTs(mi.Key())
mik := y.ParseKey(mi.Key())
// Reverse direction. Refresh key after the Next() above; the iterator
// has advanced, so the previous `key` slice now refers to a later block.
key = mi.Key()
nextTs := y.ParseTs(key)
mik := y.ParseKey(key)
if nextTs <= it.readTs && bytes.Equal(mik, item.key) {
// This is a valid potential candidate.
goto FILL
Expand All @@ -693,17 +783,32 @@ FILL:
return true
}

func (it *Iterator) fill(item *Item) {
vs := it.iitr.Value()
// fill populates item from the current iterator position. Callers pass the
// key and the ValueStruct they have already fetched from the iterator, so
// fill does not have to call mi.Key() / mi.Value() (and decode the
// ValueStruct) a second time on the hot iterator path. vs is passed by
// pointer to avoid copying the ~40-byte ValueStruct on every kept item.
func (it *Iterator) fill(item *Item, key []byte, vs *y.ValueStruct) {
item.meta = vs.Meta
item.userMeta = vs.UserMeta
item.expiresAt = vs.ExpiresAt
item.keyOnly = it.opt.KeyOnly

item.version = y.ParseTs(it.iitr.Key())
item.key = y.SafeCopy(item.key, y.ParseKey(it.iitr.Key()))
item.version = y.ParseTs(key)
item.key = y.SafeCopy(item.key, y.ParseKey(key))

item.vptr = y.SafeCopy(item.vptr, vs.Value)
item.val = nil
if it.opt.KeyOnly {
// Don't copy vs.Value: KeyOnly callers have promised not to read
// it, and the SafeCopy is the largest per-item memmove on the
// key-only forward-scan hot path. nil out any leftover capacity
// from a previous item that was reused via the iterator's
// freelist; callers that ignore the contract will at least see a
// nil vptr rather than stale bytes.
item.vptr = nil
} else {
item.vptr = y.SafeCopy(item.vptr, vs.Value)
}
if it.opt.PrefetchValues {
item.wg.Add(1)
go func() {
Expand Down
Loading