dgraph-io · shaunpatterson · May 23, 2026 · May 23, 2026 · May 23, 2026 · May 23, 2026
@@ -52,12 +52,19 @@ type closers struct {
 type lockedKeys struct {
 	sync.RWMutex
 	keys map[uint64]struct{}
+	// hasAny is a fast-path flag: false until the first add(), which sets it
+	// under the write lock after the map insert; it then stays true forever.
+	// Hot-path callers (DB.isBanned) load it without taking the lock, so an
+	// empty ban set costs one atomic load instead of an RLock/RUnlock pair plus
+	// a map lookup. There is no remove API, so the flag never flips back.
+	hasAny atomic.Bool
 }
 
 func (lk *lockedKeys) add(key uint64) {
 	lk.Lock()
 	defer lk.Unlock()
 	lk.keys[key] = struct{}{}
+	lk.hasAny.Store(true) // set only after the insert, still under the write lock
 }
 
 func (lk *lockedKeys) has(key uint64) bool {
@@ -1846,6 +1853,14 @@ func (db *DB) isBanned(key []byte) error {
 	if db.opt.NamespaceOffset < 0 {
 		return nil
 	}
+	// Fast path: no namespaces have ever been banned in this DB lifetime
+	// (the common production case). Skip the slice + lookup + lock entirely.
+	// isBanned is called on every iterator step and every Txn.Get/modify, so
+	// avoiding the RLock here matters when NamespaceOffset is enabled but no
+	// bans are active.
+	if !db.bannedNamespaces.hasAny.Load() {
+		return nil
+	}
 	if len(key) <= db.opt.NamespaceOffset+8 {
 		return nil
 	}

@@ -46,6 +46,14 @@ var (
 	// ErrBannedKey is returned if the read/write key belongs to any banned namespace.
 	ErrBannedKey = stderrors.New("Key is using the banned prefix")
 
+	// ErrKeyOnlyMode is returned by Item.Value and Item.ValueCopy when the
+	// containing iterator was created with IteratorOptions.KeyOnly=true.
+	// In that mode the iterator never copies the value bytes into the Item
+	// (the main reason to use KeyOnly is to avoid that per-item copy on
+	// key-only scans), so value access is unavailable on those items.
+	ErrKeyOnlyMode = stderrors.New(
+		"Item value is unavailable in KeyOnly iterator mode")
+
 	// ErrThresholdZero is returned if threshold is set to zero, and value log GC is called.
 	// In such a case, GC can't be run.
 	ErrThresholdZero = stderrors.New(

@@ -43,6 +43,11 @@ type Item struct {
 	status   prefetchStatus
 	meta     byte // We need to store meta to know about bitValuePointer.
 	userMeta byte
+	// keyOnly is true when the parent iterator was created with
+	// IteratorOptions.KeyOnly. The iterator skips copying value bytes into
+	// this item, so Item.Value/ValueCopy and the size estimators must
+	// short-circuit instead of touching the (nil) vptr.
+	keyOnly bool
 }
 
 // String returns a string representation of Item
@@ -81,6 +86,9 @@ func (item *Item) Version() uint64 {
 // instead, or copy it yourself. Value might change once discard or commit is called.
 // Use ValueCopy if you want to do a Set after Get.
 func (item *Item) Value(fn func(val []byte) error) error {
+	if item.keyOnly {
+		return ErrKeyOnlyMode
+	}
 	item.wg.Wait()
 	if item.status == prefetched {
 		if item.err == nil && fn != nil {
@@ -108,6 +116,9 @@ func (item *Item) Value(fn func(val []byte) error) error {
 // This function is useful in long running iterate/update transactions to avoid a write deadlock.
 // See Github issue: https://github.com/dgraph-io/badger/issues/315
 func (item *Item) ValueCopy(dst []byte) ([]byte, error) {
+	if item.keyOnly {
+		return nil, ErrKeyOnlyMode
+	}
 	item.wg.Wait()
 	if item.status == prefetched {
 		return y.SafeCopy(dst, item.val), item.err
@@ -213,7 +224,14 @@ func (item *Item) prefetchValue() {
 // This can be called while iterating through a store to quickly estimate the
 // size of a range of key-value pairs (without fetching the corresponding
 // values).
+//
+// When the iterator was created with IteratorOptions.KeyOnly=true, the
+// value bytes (and value pointer for vlog entries) are not retained on
+// the item, so this returns the key size only.
 func (item *Item) EstimatedSize() int64 {
+	if item.keyOnly {
+		return int64(len(item.key))
+	}
 	if !item.hasValue() {
 		return 0
 	}
@@ -235,7 +253,13 @@ func (item *Item) KeySize() int64 {
 //
 // This can be called to quickly estimate the size of a value without fetching
 // it.
+//
+// When the iterator was created with IteratorOptions.KeyOnly=true the value
+// length is not retained on the item; this returns 0.
 func (item *Item) ValueSize() int64 {
+	if item.keyOnly {
+		return 0
+	}
 	if !item.hasValue() {
 		return 0
 	}
@@ -312,6 +336,17 @@ type IteratorOptions struct {
 	AllVersions    bool // Fetch all valid versions of the same key.
 	InternalAccess bool // Used to allow internal access to badger keys.
 
+	// KeyOnly tells the iterator that the caller will not access value bytes
+	// from any item. When set, the iterator skips copying value bytes into
+	// the Item, saving a per-item memcpy on key-only forward scans (e.g.
+	// dgraph's has() predicate evaluator and index scans). The trade-off:
+	// Item.Value and Item.ValueCopy return ErrKeyOnlyMode, Item.ValueSize
+	// reports 0 and Item.EstimatedSize reports the key size only. Item.Key,
+	// Version, UserMeta, ExpiresAt and IsDeletedOrExpired work normally.
+	//
+	// PrefetchValues is forced to false when KeyOnly is true.
+	KeyOnly bool
+
 	// The following option is used to narrow down the SSTables that iterator
 	// picks up. If Prefix is specified, only tables which could have this
 	// prefix are picked based on their range of keys.
@@ -433,7 +468,18 @@ type Iterator struct {
 	data  list
 	waste list
 
-	lastKey []byte // Used to skip over multiple versions of the same key.
+	// lastKey stores the user-key (no 8-byte timestamp suffix) of the most
+	// recently considered candidate, used to skip subsequent older versions
+	// of the same user-key on AllVersions=false forward scans. Storing the
+	// user-key only avoids one ParseKey per same-key compare and a per-item
+	// 8-byte memcpy on the update.
+	lastKey []byte
+
+	// canSeeInternalKeys is true when this iterator can possibly surface a
+	// badger-internal key (e.g. "!badger!banned"). When false, parseItem
+	// can skip the per-step bytes.HasPrefix(key, badgerPrefix) check.
+	// Computed once at construction from opt.Prefix.
+	canSeeInternalKeys bool
 
 	closed  bool
 	scanned int // Used to estimate the size of data scanned by iterator.
@@ -464,6 +510,12 @@ func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator {
 		panic(ErrDBClosed)
 	}
 
+	// KeyOnly disables value access, so prefetching values is nonsensical.
+	// Force PrefetchValues off so the prefetch goroutine is never started.
+	if opt.KeyOnly {
+		opt.PrefetchValues = false
+	}
+
 	y.NumIteratorsCreatedAdd(txn.db.opt.MetricsEnabled, 1)
 
 	// Keep track of the number of active iterators.
@@ -482,14 +534,26 @@ func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator {
 	}
 	iters = txn.db.lc.appendIterators(iters, &opt) // This will increment references.
 	res := &Iterator{
-		txn:    txn,
-		iitr:   table.NewMergeIterator(iters, opt.Reverse),
-		opt:    opt,
-		readTs: txn.readTs,
+		txn:                txn,
+		iitr:               table.NewMergeIterator(iters, opt.Reverse),
+		opt:                opt,
+		readTs:             txn.readTs,
+		canSeeInternalKeys: canSeeInternalKeys(opt.Prefix),
 	}
 	return res
 }
 
+// canSeeInternalKeys tells whether an iterator bounded by prefix could ever
+// yield a badger-internal key. Every internal key begins with badgerPrefix,
+// so a non-empty prefix whose first byte differs from badgerPrefix[0] rules
+// them all out. This is a conservative first-byte test: it may answer true
+// for a prefix that ends up matching no internal key, which is still safe.
+func canSeeInternalKeys(prefix []byte) bool {
+	// No prefix means an unbounded scan, which can reach any key.
+	unbounded := len(prefix) == 0
+	return unbounded || prefix[0] == badgerPrefix[0]
+}
+
 // NewKeyIterator is just like NewIterator, but allows the user to iterate over all versions of a
 // single key. Internally, it sets the Prefix option in provided opt, and uses that prefix to
 // additionally run bloom filter lookups before picking tables from the LSM tree.
@@ -616,11 +680,18 @@ func (it *Iterator) parseItem() bool {
 		}
 	}
 
-	isInternalKey := bytes.HasPrefix(key, badgerPrefix)
-	// Skip badger keys.
-	if !it.opt.InternalAccess && isInternalKey {
-		mi.Next()
-		return false
+	// Detect badger-internal keys. When canSeeInternalKeys is false (the
+	// common case for prefix-bounded user scans whose prefix cannot collide
+	// with badgerPrefix), we know the current key cannot be internal and
+	// elide the per-step bytes.HasPrefix(key, badgerPrefix) probe.
+	var isInternalKey bool
+	if it.canSeeInternalKeys {
+		isInternalKey = bytes.HasPrefix(key, badgerPrefix)
+		// Skip badger keys.
+		if !it.opt.InternalAccess && isInternalKey {
+			mi.Next()
+			return false
+		}
 	}
 
 	// Skip any versions which are beyond the readTs.
@@ -640,8 +711,9 @@ func (it *Iterator) parseItem() bool {
 	if it.opt.AllVersions {
 		// Return deleted or expired values also, otherwise user can't figure out
 		// whether the key was deleted.
+		vs := mi.Value()
 		item := it.newItem()
-		it.fill(item)
+		it.fill(item, key, &vs)
 		setItem(item)
 		mi.Next()
 		return true
@@ -650,7 +722,18 @@ func (it *Iterator) parseItem() bool {
 	// If iterating in forward direction, then just checking the last key against current key would
 	// be sufficient.
 	if !it.opt.Reverse {
-		if y.SameKey(it.lastKey, key) {
+		// lastKey holds the user-key only. Compare against the user-key
+		// portion of the current full key (last 8 bytes are the ts).
+		// bytes.Equal already short-circuits on length mismatch, but the
+		// explicit length check lets the compiler hoist the bounds check
+		// out of the user-key slice and keeps the hot path branch-tight.
+		//
+		// len(key) >= 8 is a badger-wide invariant: every key in the LSM is
+		// stored with an 8-byte timestamp suffix via y.KeyWithTs, and
+		// y.ParseTs(key) above already relies on this (it indexes
+		// key[len(key)-8:]). No defensive check is needed here.
+		ukLen := len(key) - 8
+		if ukLen == len(it.lastKey) && bytes.Equal(key[:ukLen], it.lastKey) {
 			mi.Next()
 			return false
 		}
@@ -659,19 +742,24 @@ func (it *Iterator) parseItem() bool {
 		// Consider keys: a 5, b 7 (del), b 5. When iterating, lastKey = a.
 		// Then we see b 7, which is deleted. If we don't store lastKey = b, we'll then return b 5,
 		// which is wrong. Therefore, update lastKey here.
-		it.lastKey = y.SafeCopy(it.lastKey, mi.Key())
+		it.lastKey = y.SafeCopy(it.lastKey, key[:ukLen])
 	}
 
 FILL:
-	// If deleted, advance and return.
+	// Invariant on entry to FILL: `key` is mi.Key() at the *current* iitr
+	// position. The only goto FILL (below, reverse path) refreshes `key`
+	// after mi.Next(); the fall-through entry from above never advances the
+	// iterator between `key := mi.Key()` and reaching FILL. fill() can
+	// therefore safely reuse the caller-supplied key without re-calling
+	// mi.Key().
 	vs := mi.Value()
 	if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) {
 		mi.Next()
 		return false
 	}
 
 	item := it.newItem()
-	it.fill(item)
+	it.fill(item, key, &vs)
 	// fill item based on current cursor position. All Next calls have returned, so reaching here
 	// means no Next was called.
 
@@ -681,9 +769,11 @@ FILL:
 		return true
 	}
 
-	// Reverse direction.
-	nextTs := y.ParseTs(mi.Key())
-	mik := y.ParseKey(mi.Key())
+	// Reverse direction. Refresh key after the Next() above: the iterator has
+	// advanced, so the earlier `key` no longer describes the current position.
+	key = mi.Key()
+	nextTs := y.ParseTs(key)
+	mik := y.ParseKey(key)
 	if nextTs <= it.readTs && bytes.Equal(mik, item.key) {
 		// This is a valid potential candidate.
 		goto FILL
@@ -693,17 +783,32 @@ FILL:
 	return true
 }
 
-func (it *Iterator) fill(item *Item) {
-	vs := it.iitr.Value()
+// fill populates item from the current iterator position. Callers pass the
+// already-fetched key and ValueStruct so fill does not pay the per-item cost
+// of calling mi.Key() / mi.Value() (and decoding ValueStruct) a second time
+// on the hot iterator path. vs is passed by pointer to avoid copying the
+// ~40-byte ValueStruct on every kept item.
+func (it *Iterator) fill(item *Item, key []byte, vs *y.ValueStruct) {
 	item.meta = vs.Meta
 	item.userMeta = vs.UserMeta
 	item.expiresAt = vs.ExpiresAt
+	item.keyOnly = it.opt.KeyOnly
 
-	item.version = y.ParseTs(it.iitr.Key())
-	item.key = y.SafeCopy(item.key, y.ParseKey(it.iitr.Key()))
+	item.version = y.ParseTs(key)
+	item.key = y.SafeCopy(item.key, y.ParseKey(key))
 
-	item.vptr = y.SafeCopy(item.vptr, vs.Value)
 	item.val = nil
+	if it.opt.KeyOnly {
+		// Don't copy vs.Value: KeyOnly callers have promised not to read
+		// it, and the SafeCopy is the largest per-item memmove on the
+		// key-only forward-scan hot path. nil out any leftover capacity
+		// from a previous item that was reused via the iterator's
+		// freelist; callers that ignore the contract will at least see a
+		// nil vptr rather than stale bytes.
+		item.vptr = nil
+	} else {
+		item.vptr = y.SafeCopy(item.vptr, vs.Value)
+	}
 	if it.opt.PrefetchValues {
 		item.wg.Add(1)
 		go func() {